<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2024.1371222</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>A comprehensive evaluation of the potential of three next-generation short-read-based plant pan-genome construction strategies for the identification of novel non-reference sequence</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Jiang</surname>
<given-names>Meiye</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Chen</surname>
<given-names>Meili</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<uri xlink:href="https://loop.frontiersin.org/people/2664390"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zeng</surname>
<given-names>Jingyao</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Du</surname>
<given-names>Zhenglin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Xiao</surname>
<given-names>Jingfa</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/27143"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>National Genomics Data Center, Beijing Institute of Genomics, Chinese Academy of Sciences and China National Center for Bioinformation</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>CAS Key Laboratory of Genome Sciences and Information, Beijing Institute of Genomics, Chinese Academy of Sciences and China National Center for Bioinformation</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>College of Life Sciences, University of Chinese Academy of Sciences</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Nunzio D&#x2019;Agostino, University of Naples Federico II, Italy</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Bruno Contreras-Moreira, Spanish National Research Council (CSIC), Spain</p>
<p>Rub&#xe9;n Sancho, Spanish National Research Council (CSIC), Spain</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Jingfa Xiao, <email xlink:href="mailto:xiaojingfa@big.ac.cn">xiaojingfa@big.ac.cn</email>; Zhenglin Du, <email xlink:href="mailto:duzhl@qq.com">duzhl@qq.com</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>19</day>
<month>03</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>15</volume>
<elocation-id>1371222</elocation-id>
<history>
<date date-type="received">
<day>16</day>
<month>01</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>27</day>
<month>02</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Jiang, Chen, Zeng, Du and Xiao</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Jiang, Chen, Zeng, Du and Xiao</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Pan-genome studies are important for understanding plant evolution and guiding the breeding of crops by capturing all genomic diversity of a certain species. Three short-read-based strategies for plant pan-genome construction include iterative individual, iterative pooling, and map-to-pan. Their performance is very different under various conditions, but comprehensive evaluations have yet to be conducted. Here, we evaluate the performance of these three pan-genome construction strategies for plants under different sequencing depths and sample sizes. We also examine the influence of the length and repeat content percentage of novel sequences on the three pan-genome construction strategies. In addition, we compare the computational resource consumption among the three strategies. Our findings indicate that map-to-pan has the greatest recall but the lowest precision. In contrast, the two iterative strategies have superior precision but lower recall. Factors of sample numbers, novel sequence length, and the percentage of novel sequences&#x2019; repeat content adversely affect the performance of all three strategies. Increased sequencing depth improves map-to-pan&#x2019;s performance, while not affecting the two iterative strategies. For computational resource consumption, map-to-pan demands considerably more than the two iterative strategies. Overall, the iterative strategy, especially the iterative pooling strategy, is optimal when the sequencing depth is less than 20X. Map-to-pan is preferable when the sequencing depth exceeds 20X despite its higher computational resource consumption.</p>
</abstract>
<kwd-group>
<kwd>plant pan-genome</kwd>
<kwd>short-read-based construction strategies</kwd>
<kwd>evaluation</kwd>
<kwd>map-to-pan</kwd>
<kwd>iterative</kwd>
</kwd-group>
<counts>
<fig-count count="4"/>
<table-count count="2"/>
<equation-count count="0"/>
<ref-count count="44"/>
<page-count count="12"/>
<word-count count="5807"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Plant Bioinformatics</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>In 2005, Tettelin et&#xa0;al. introduced the pan-genome concept to encompass the entire gene set in <italic>Streptococcus agalactiae</italic> (<xref ref-type="bibr" rid="B36">Tettelin et&#xa0;al., 2005</xref>). Since then, this concept has gained widespread application in characterizing the collective genes of a species, encompassing core, dispensable, and private components. The advancement of sequencing technology, especially the prevalent next-generation short-read sequencing, has enabled large-scale pan-genome analysis in plants, extending beyond its initial application in microbes. By 2007, the pan-genome concept was introduced to maize (<xref ref-type="bibr" rid="B27">Morgante et&#xa0;al., 2007</xref>). After that, plenty of studies have delved into the plant pan-genomes of diverse species, such as poplar (<xref ref-type="bibr" rid="B40">Zhang et&#xa0;al., 2019</xref>), <italic>Brachypodium distachyon</italic> (<xref ref-type="bibr" rid="B11">Gordon et&#xa0;al., 2017</xref>), <italic>Brassica oleracea</italic> (<xref ref-type="bibr" rid="B9">Golicz et&#xa0;al., 2016</xref>), <italic>Brassica napus</italic> (<xref ref-type="bibr" rid="B16">Hurgobin et&#xa0;al., 2018</xref>), pepper (<xref ref-type="bibr" rid="B28">Ou et&#xa0;al., 2018</xref>), Medicago (<xref ref-type="bibr" rid="B44">Zhou et&#xa0;al., 2017</xref>), rice (<xref ref-type="bibr" rid="B41">Zhao et&#xa0;al., 2018</xref>), soybean (<xref ref-type="bibr" rid="B22">Li et&#xa0;al., 2014</xref>), hexaploid bread wheat (<xref ref-type="bibr" rid="B26">Montenegro et&#xa0;al., 2017</xref>), tomato (<xref ref-type="bibr" rid="B8">Gao et&#xa0;al., 2019</xref>), and sunflower (<xref ref-type="bibr" rid="B14">H&#xfc;bner et&#xa0;al., 2019</xref>). 
These plant pan-genomics studies are pivotal in pinpointing key novel non-reference genes or sequences related to processes like signaling (<xref ref-type="bibr" rid="B9">Golicz et&#xa0;al., 2016</xref>), defense mechanisms (<xref ref-type="bibr" rid="B11">Gordon et&#xa0;al., 2017</xref>), resistance pathways (<xref ref-type="bibr" rid="B2">Bayer et&#xa0;al., 2019</xref>), important agricultural traits (<xref ref-type="bibr" rid="B8">Gao et&#xa0;al., 2019</xref>), and heterosis (<xref ref-type="bibr" rid="B38">Zhang et&#xa0;al., 2016</xref>).</p>
<p>Microbial pan-genome studies have benefited from well-established toolkits like Roary (<xref ref-type="bibr" rid="B30">Page et&#xa0;al., 2015</xref>), PGAP (<xref ref-type="bibr" rid="B43">Zhao et&#xa0;al., 2012</xref>), PanGP (<xref ref-type="bibr" rid="B42">Zhao et&#xa0;al., 2014</xref>), PanOCT (<xref ref-type="bibr" rid="B6">Fouts et&#xa0;al., 2012</xref>), and PANNOTATOR (<xref ref-type="bibr" rid="B32">Santos et&#xa0;al., 2013</xref>), whereas there is no uniform strategy or pipeline for plant pan-genome construction. There are three plant pan-genome construction strategies based on next-generation sequencing short-reads. They can be summarized as the iterative individual (<xref ref-type="bibr" rid="B9">Golicz et&#xa0;al., 2016</xref>; <xref ref-type="bibr" rid="B16">Hurgobin et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B14">H&#xfc;bner et&#xa0;al., 2019</xref>), the iterative pooling (<xref ref-type="bibr" rid="B26">Montenegro et&#xa0;al., 2017</xref>), and the map-to-pan (<xref ref-type="bibr" rid="B12">Hu et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B34">Sun et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B44">Zhou et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B28">Ou et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B8">Gao et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B31">Qin et&#xa0;al., 2021</xref>). All three strategies construct a pan-genome based on a high-quality reference genome. For map-to-pan, the whole genome of each accession included in the pan-genome analysis is assembled and then aligned to the reference genome to obtain non-redundant novel sequences not existing in the reference genome. Unlike map-to-pan, the two iterative strategies first extract the reads that are unmapped or poorly mapped to the reference genome. In the iterative pooling method, unmapped or poorly mapped reads from each accession are pooled and assembled in a metagenomic way. 
In the iterative individual approach, unmapped or poorly mapped reads are assembled directly for each accession, and the resulting contigs are then pooled and redundancy is removed. The two iterative strategies are used for pan-genome construction with large-scale samples due to their low requirements for sequencing depth and computational resources. In contrast, whole genome sequencing and assembly are needed in map-to-pan, so map-to-pan is suitable for pan-genome construction with a few samples. Some pan-genome studies have incorporated long reads from third-generation sequencing platforms, like in rice (<xref ref-type="bibr" rid="B31">Qin et&#xa0;al., 2021</xref>), soybean (<xref ref-type="bibr" rid="B23">Liu et&#xa0;al., 2020</xref>), sorghum (<xref ref-type="bibr" rid="B35">Tao et&#xa0;al., 2021</xref>), maize (<xref ref-type="bibr" rid="B15">Hufford et&#xa0;al., 2021</xref>), and <italic>Raphanus sativus</italic> (<xref ref-type="bibr" rid="B39">Zhang et&#xa0;al., 2021</xref>), but their widespread adoption is constrained by high sequencing expenses, especially in plant pan-genome projects with large-scale samples. Given the vast availability of published short-read sequencing data for numerous plant species, it remains common to construct plant pan-genomes based on next-generation short-reads.</p>
<p>Here, we thoroughly benchmark these three strategies for plant pan-genome construction, factoring in different sequencing depths and the number of samples included. We also compare the efficiency of these three strategies in recovering novel non-reference sequences with different lengths and repetitive content percentages. Additionally, we compare computational resource consumption among these three strategies, encompassing both time and memory. Our in-depth evaluation aims to shed light on the effectiveness of these three pan-genome construction strategies under varying conditions and guide researchers in choosing the optimal pan-genome construction strategy.</p>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Data sets</title>
<p>Our research collected 20 high-quality chromosome-level genome assemblies, gene annotation files, gene sequences, protein sequences, and PacBio long reads from the rice XI subtype (<xref ref-type="bibr" rid="B31">Qin et&#xa0;al., 2021</xref>) (<xref ref-type="supplementary-material" rid="s10">
<bold>Supplementary Table&#xa0;1</bold>
</xref>). We categorized these samples into five groups with 5, 8, 10, 15, and 20 samples, respectively. The group with 8 samples included all subtypes from XI-1B. It was used for benchmarking the influence of various sequencing depths, lengths, and repeat content percentages of novel sequences on these three strategies. The information from the other four groups was compared to examine how the sample number included affected the performance of these three strategies.</p>
<p>The ART-Illumina read simulation tool (<xref ref-type="bibr" rid="B13">Huang et&#xa0;al., 2012</xref>) was used to generate the simulated next-generation sequencing short-reads with depths of 5X, 10X, 20X, 30X, and 50X, with 20 high-quality chromosome-level genome assemblies as the reference. To evaluate the limitations of simulated reads, the real data of next-generation sequencing short-reads for the 9311 sample was downloaded from GSA (<ext-link ext-link-type="uri" xlink:href="https://ngdc.cncb.ac.cn/gsa/">https://ngdc.cncb.ac.cn/gsa/</ext-link>) under Project ID PRJCA002103 and RunID CRR279354. These sequences were aligned to the reference genome using BWA-MEM (<xref ref-type="bibr" rid="B19">Li, 2013</xref>). MSU was used as a reference genome, and its genome sequence was downloaded from RiceRC (<ext-link ext-link-type="uri" xlink:href="https://ricerc.sicau.edu.cn/RiceRC/download/downloadBefore">https://ricerc.sicau.edu.cn/RiceRC/download/downloadBefore</ext-link>). This genome assembly produced by the Rice Genome Annotation Project was initially located at the Institute for Genomic Research. It is now at Michigan State University (MSU) (<xref ref-type="bibr" rid="B29">Ouyang et&#xa0;al., 2007</xref>). Finally, sequencing depth, genome coverage, and other characteristics were calculated using the BAMDST toolkit (<ext-link ext-link-type="uri" xlink:href="https://github.com/shiquan/bamdst">https://github.com/shiquan/bamdst</ext-link>). We generated the simulated sequencing data according to the average depth of real data for each chromosome. The characteristics of simulated data were calculated by the BAMDST toolkit and then compared with the characteristics of real data.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Construction of the testing data set</title>
<p>Three pan-genome construction strategies, iterative individual, iterative pooling, and map-to-pan, utilized simulated short reads to create a test dataset for each group with different sample sizes (<xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>). Each strategy underwent identical data pre-processing, which involved eliminating reads with over five Ns, trimming adapters, removing low-quality bases from the 5&#x2019; and 3&#x2019; ends when the quality score was consistently below 20, and discarding reads shorter than 30 bp. All pre-processing tasks were executed using a Perl script developed in-house, which was deposited in BioCode with ID BT007415 (<ext-link ext-link-type="uri" xlink:href="https://ngdc.cncb.ac.cn/biocode/tools/BT007415">https://ngdc.cncb.ac.cn/biocode/tools/BT007415</ext-link>).</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Workflow of evaluation for three plant pan-genome construction strategies based on next-generation short-reads.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1371222-g001.tif"/>
</fig>
<p>For map-to-pan, high-quality reads were first collected for whole genome assembly using SOAPdenovo2 (<xref ref-type="bibr" rid="B25">Luo et&#xa0;al., 2012</xref>) through the <italic>eupan assemble linearK</italic> model in the EUPAN toolkit (<xref ref-type="bibr" rid="B12">Hu et&#xa0;al., 2017</xref>). The iterative k-mer was set to a range between 15 and 127 to optimize the assembly outcome. Second, the whole genome assembly of each sample was aligned to the reference genome via the MUMmer software (<xref ref-type="bibr" rid="B18">Kurtz et&#xa0;al., 2004</xref>). Those sequences not aligned with the reference genome with 90% identity and 90% coverage simultaneously were recognized as candidate novel non-reference sequences. Subsequently, each sample&#x2019;s novel sequences were combined, and redundancy was eliminated using CD-HIT (<xref ref-type="bibr" rid="B7">Fu et&#xa0;al., 2012</xref>).</p>
<p>For the iterative individual, high-quality reads were initially mapped to the reference genome using BWA-MEM (<xref ref-type="bibr" rid="B19">Li, 2013</xref>). Unmapped reads and poorly mapped reads (those with an edit distance of &#x2265; 8) were extracted for assembly by MEGAHIT (<xref ref-type="bibr" rid="B20">Li et&#xa0;al., 2015</xref>). Then, the contigs assembled from each sample were merged, and redundancy was removed with CD-HIT (<xref ref-type="bibr" rid="B7">Fu et&#xa0;al., 2012</xref>). For iterative pooling, high-quality reads were initially mapped to the reference genome using BWA-MEM (<xref ref-type="bibr" rid="B19">Li, 2013</xref>). Unmapped reads and poorly mapped reads (those with an edit distance of &#x2265; 8) were extracted and pooled. The pooled unmapped and poorly mapped reads were assembled using MEGAHIT (<xref ref-type="bibr" rid="B20">Li et&#xa0;al., 2015</xref>).</p>
<p>For both iterative methods, an edit distance threshold of 8 was used to select poorly mapped reads. The length of almost all simulated reads was 83 bp, so if the edit distance was 8 or greater, the mapping rate of a read to the reference genome was no more than ~90%. Such reads may be from highly diverse genomic regions of subspecies compared with the reference genome. Therefore, these reads were also collected and combined with the unmapped reads for novel sequence assembly in the two iterative methods.</p>
<p>Unlike map-to-pan, in which SOAPdenovo2 was used for assembly, we employed MEGAHIT to assemble those unmapped or poorly mapped reads in both iterative strategies to maximize the utilization of these reads. Since MEGAHIT was often utilized for microbial metagenome assembly, it performed better when reads exhibited greater heterogeneity, especially in iterative pooling, where unmapped or poorly mapped reads were pooled together for assembly.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Construction of the validated data set</title>
<p>The plant pan-genome consists of the gene-centric and sequence-centric pan-genome (<xref ref-type="bibr" rid="B10">Golicz et&#xa0;al., 2020</xref>). Here, novel genes identified from gene-based pan-genome and insertions identified from sequence-based pan-genome were combined as the validated data set.</p>
<p>For gene-centric pan-genome construction, there are two kinds of strategies: synteny-based, such as in rice (<xref ref-type="bibr" rid="B31">Qin et&#xa0;al., 2021</xref>), and gene clustering-based, such as in <italic>Brachypodium distachyon</italic> (<xref ref-type="bibr" rid="B11">Gordon et&#xa0;al., 2017</xref>) using GET_HOMOLOGUES-EST (<xref ref-type="bibr" rid="B4">Contreras-Moreira et&#xa0;al., 2017</xref>), soybean (<xref ref-type="bibr" rid="B23">Liu et&#xa0;al., 2020</xref>) using OrthoMCL (<xref ref-type="bibr" rid="B21">Li et&#xa0;al., 2003</xref>), and rice (<xref ref-type="bibr" rid="B33">Shang et&#xa0;al., 2022</xref>) using OrthoFinder (<xref ref-type="bibr" rid="B5">Emms and Kelly, 2019</xref>). Besides, GENESPACE can cluster genes across multiple genomes (<xref ref-type="bibr" rid="B24">Lovell et&#xa0;al., 2022</xref>). Here, we used a synteny-based method. Protein sequences related to the longest gene transcript and information on the gene location for each of the 20 samples from Qin et&#xa0;al. (<xref ref-type="bibr" rid="B31">Qin et&#xa0;al., 2021</xref>) were used for the gene-based pan-genome construction for each of the 5 groups. All genes of the nuclear genome&#x2019;s 12 chromosomes from MSU (V.7.0 <ext-link ext-link-type="uri" xlink:href="http://rice.plantbiology.msu.edu">http://rice.plantbiology.msu.edu</ext-link>) were used as the base. Genes from a new genome were aligned against a reference gene set using BLASTP software (<xref ref-type="bibr" rid="B1">Altschul et&#xa0;al., 1990</xref>) and gene synteny was analyzed using MCScanX software (<xref ref-type="bibr" rid="B37">Wang et&#xa0;al., 2012</xref>). Those genes that did not show synteny with the reference gene set were considered novel genes. These novel genes were then added to the former reference gene set to form a new reference gene set. These steps were repeated until all samples were included. 
The reference gene set and identified novel genes from the final step were combined as the pan-gene set. Novel genes from each step were combined and then aligned to the MSU reference genome using MUMmer (<xref ref-type="bibr" rid="B18">Kurtz et&#xa0;al., 2004</xref>). Genes with high similarity (identity &#x2265; 90% and coverage &#x2265; 90%) with the MSU reference genome were discarded to exclude the false positives. The remaining gene set was used for further analysis.</p>
<p>To compare the consistency of the gene-based pan-genome from the synteny-based method and gene-clustering-based methods, OrthoFinder was used to construct the gene-based pan-genome with the reference genome and extra 5, 8, 10, 15, and 20 samples. Those gene groups not containing genes from MSU were considered novel gene groups that did not exist in the reference genome.</p>
<p>The sequence-based pan-genome was constructed as a complement to the gene-based pan-genome. Here, insertions compared with the reference genome from each sample for each of the 5 groups were considered novel sequences absent from the reference genome. PacBio long reads of each sample were first mapped to the MSU reference genome by pbmm2 software (<ext-link ext-link-type="uri" xlink:href="https://github.com/PacificBiosciences/pbmm2">https://github.com/PacificBiosciences/pbmm2</ext-link>) with default parameters. After this, structural variations were called and genotyped using pbsv software (<ext-link ext-link-type="uri" xlink:href="https://github.com/PacificBiosciences/pbsv">https://github.com/PacificBiosciences/pbsv</ext-link>) using default parameters. Entries related to insertions were extracted. Then, these insertions were merged at the group level using SURVIVOR software (<xref ref-type="bibr" rid="B17">Jeffares et&#xa0;al., 2017</xref>). Insertions that were &#x2264; 50 bp in length or had &#x2264; 20 supporting reads were excluded. To eliminate the false positives introduced during insertion identification, the remaining insertion sequences were then aligned to the genome of each sample in each of the 5 groups. Those insertions not having a high similarity (identity &#x2265; 90% and coverage &#x2265; 90%) with the genome sequences were excluded.</p>
<p>The RepeatMasker tool (<xref ref-type="bibr" rid="B3">Chen, 2004</xref>) was employed for the validated data set to detect repetitive elements, using rice as the model species.</p>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Recall and precision definition</title>
<p>The sequences from the testing data set were aligned to sequences from the validated data set using the MUMmer software (<xref ref-type="bibr" rid="B18">Kurtz et&#xa0;al., 2004</xref>). When different sequences from the testing data sets were aligned to the same sequences from the validated data set, and they had an overlap of 90% or more, these sequences from the testing data sets and their recovered regions for sequences from the validated data set were combined. For each sequence from the validated data set, its coverage was defined as the ratio of recovered length by sequences from the testing data set to its whole length. If the coverage was &#x2265; 0.5, this sequence from the validated data set was considered a recovered sequence. The recall value was defined as the ratio of the number of recovered ones to the total number of sequences from the validated data set.</p>
<p>For each of the 5 groups, sequences from the testing data set were aligned to all genomes in that group. Those sequences with a high similarity (90% identity and 90% coverage) were considered as precise sequences. The precision value was defined as the ratio of the number of precise ones to the total number of sequences from the testing data set.</p>
</sec>
</sec>
<sec id="s3" sec-type="results">
<label>3</label>
<title>Results</title>
<sec id="s3_1">
<label>3.1</label>
<title>The characteristics of the testing and validated data set</title>
<p>The characteristics of the testing data set. All the simulated next-generation short-reads with sequencing depths of 5X, 10X, 20X, 30X, and 50X for 20 samples have a high-quality read rate of &#x2265;99% (<xref ref-type="supplementary-material" rid="s10">
<bold>Supplementary Table&#xa0;2</bold>
</xref>). By comparing the characteristics between simulated and real data, we find that the simulated reads have almost identical or even higher genome coverage than the real data under the same sequencing depth (<xref ref-type="supplementary-material" rid="s10">
<bold>Supplementary Table&#xa0;3</bold>
</xref>). This indicates the suitability of simulated data for evaluation. However, there are some biases in simulated data. For example, the rate of singletons and read pairs mapping to different chromosomes of simulated data is lower than in real data (<xref ref-type="supplementary-material" rid="s10">
<bold>Supplementary Table&#xa0;4</bold>
</xref>). These simulated reads after preprocessing are used to construct the testing data set using three strategies for each of the 5 groups (<xref ref-type="supplementary-material" rid="s10">
<bold>Supplementary Table&#xa0;5</bold>
</xref>). For map-to-pan, optimal k-mers used for whole genome assembly for different samples are different, highlighting the necessity for an iterative k-mer strategy (<xref ref-type="supplementary-material" rid="SF1">
<bold>Supplementary Figure&#xa0;1</bold>
</xref>). When sequencing depth increases, the length of assembled contigs of map-to-pan increases, while sequencing depth has no significant influence on both iterative methods (<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2A</bold>
</xref>).</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>
<bold>(A)</bold> The average and maximum lengths of assembled contigs for three strategies across varied sequencing depths. <bold>(B)</bold> A heatmap of the overlapping number of insertions between paired samples in the group consisting of 8 samples. <bold>(C)</bold> A pie chart showing the percentage of insertions found in genic versus intergenic regions and the distribution of insertion numbers as samples increase in the group consisting of 8 samples. <bold>(D)</bold> A heatmap of the presence and absence profile for insertions across samples in the group consisting of 8 samples. The distribution for the repeat content percentage <bold>(E)</bold> and length <bold>(F)</bold> of novel sequences from the validated data set for the group consisting of 8 samples. The distribution for the count of novel genes <bold>(G)</bold> and insertions <bold>(H)</bold> with different lengths and repeat content percentages in the group consisting of 8 samples.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1371222-g002.tif"/>
</fig>
<p>The characteristics of the validated data set. For the gene-based pan-genome, the ratio of core genes decreases as the sample size increases, and this ratio stabilizes at around 50% when the sample size reaches 6 or more (<xref ref-type="supplementary-material" rid="s10">
<bold>Supplementary Table&#xa0;6</bold>
</xref>). For the group with MSU and the other 8 samples, the synteny-based method can find 18,500 (91.67%) of 20,179 gene groups from OrthoFinder. After filtering, all 13,078 novel genes identified from the synteny-based method are included in the results from OrthoFinder. This further demonstrates the usability of synteny-based methods in novel gene identification. For the sequence-based pan-genome of the 8 samples, the insertion counts differ among samples, and their overlaps with each other are not uniform (<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2B</bold>
</xref>). Insertions are predominantly localized in intergenic regions, indicating that insertions can be used as a complement to novel genes (<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2C</bold>
</xref>). The insertions have different distribution patterns among different samples, further supported by the insertion presence and absence profile (<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2D</bold>
</xref>). The characteristics of sequence-based pan-genome are consistently observed in the other 4 groups (<xref ref-type="supplementary-material" rid="SF2">
<bold>Supplementary Figure&#xa0;2</bold>
</xref>). The summary of novel genes and insertions for each of the 5 groups is shown in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>. Insertions have a higher repeat percentage than the novel genes (<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2E</bold>
</xref>), and retroelements and DNA transposons emerge as the predominant repeat elements in them (<xref ref-type="supplementary-material" rid="s10">
<bold>Supplementary Table&#xa0;7</bold>
</xref>). However, their overall lengths are less than the novel genes (<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2F</bold>
</xref>). The repeat percentage of novel genes is the highest at the longest and shortest ones (<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2G</bold>
</xref>), while for insertions, they consistently show a high repeat percentage for all lengths (<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2H</bold>
</xref>).</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Statistics of novel genes and insertions from the validated data set for each of the 5 groups.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Type</th>
<th valign="middle" align="center">Sample Number</th>
<th valign="middle" align="center"># Seqs</th>
<th valign="middle" align="center">Total Size (bp)</th>
<th valign="middle" align="center">Mean Length (bp)</th>
<th valign="middle" align="center">Repeat Percentage</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="5" align="center">Novel Genes</td>
<td valign="top" align="center">5</td>
<td valign="top" align="center">9,697</td>
<td valign="top" align="center">39,114,313</td>
<td valign="top" align="center">4033.70</td>
<td valign="top" align="center">46.02%</td>
</tr>
<tr>
<td valign="top" align="center">8</td>
<td valign="top" align="center">13,078</td>
<td valign="top" align="center">51,527,357</td>
<td valign="top" align="center">3940.00</td>
<td valign="top" align="center">46.19%</td>
</tr>
<tr>
<td valign="top" align="center">10</td>
<td valign="top" align="center">15,306</td>
<td valign="top" align="center">59,557,869</td>
<td valign="top" align="center">3891.10</td>
<td valign="top" align="center">46.30%</td>
</tr>
<tr>
<td valign="top" align="center">15</td>
<td valign="top" align="center">19,901</td>
<td valign="top" align="center">79,273,953</td>
<td valign="top" align="center">3983.40</td>
<td valign="top" align="center">46.38%</td>
</tr>
<tr>
<td valign="top" align="center">20</td>
<td valign="top" align="center">24,792</td>
<td valign="top" align="center">98,210,643</td>
<td valign="top" align="center">3961.40</td>
<td valign="top" align="center">46.38%</td>
</tr>
<tr>
<td valign="middle" rowspan="5" align="center">Insertions</td>
<td valign="top" align="center">5</td>
<td valign="top" align="center">13,082</td>
<td valign="top" align="center">12,528,436</td>
<td valign="top" align="center">957.70</td>
<td valign="top" align="center">44.44%</td>
</tr>
<tr>
<td valign="top" align="center">8</td>
<td valign="top" align="center">15,047</td>
<td valign="top" align="center">16,504,941</td>
<td valign="top" align="center">1096.90</td>
<td valign="top" align="center">44.77%</td>
</tr>
<tr>
<td valign="top" align="center">10</td>
<td valign="top" align="center">17,109</td>
<td valign="top" align="center">20,891,729</td>
<td valign="top" align="center">1221.10</td>
<td valign="top" align="center">45.11%</td>
</tr>
<tr>
<td valign="top" align="center">15</td>
<td valign="top" align="center">18,756</td>
<td valign="top" align="center">25,039,572</td>
<td valign="top" align="center">1335.00</td>
<td valign="top" align="center">45.24%</td>
</tr>
<tr>
<td valign="top" align="center">20</td>
<td valign="top" align="center">19,959</td>
<td valign="top" align="center">27,876,840</td>
<td valign="top" align="center">1396.70</td>
<td valign="top" align="center">45.37%</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Evaluation of the influence of sequencing depth on three pan-genome construction strategies</title>
<p>Testing and validated data sets from the group with 8 samples are utilized to evaluate the different efficiency of three pan-genome construction strategies under different sequencing depths. For the coverage of novel genes from the validated data set under all different sequencing depths (<xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3A</bold>
</xref>) and insertions from the validated data set under 20X or more sequencing depth (<xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3B</bold>
</xref>), the difference is significant between map-to-pan and the other two iterative strategies, highlighting the different performance of map-to-pan and the other two iterative strategies. The difference is significant between iterative individual and iterative pooling for the coverage of novel genes under 10X or less sequencing depth (<xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3A</bold>
</xref>) and insertions (<xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3B</bold>
</xref>) under all different sequencing depths. Iterative pooling has a slightly higher average coverage for novel sequences from the validated data set than iterative individual, especially when sequencing depth is 10X or less. The main reason is that iterative pooling gathered all unmapped or poorly mapped reads for assembly, comparable to increasing the sequencing depth.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>The impact of sequencing depth on three strategies. <bold>(A)</bold> The distribution for recovered coverage of sequences from the testing dataset to novel genes from the validated dataset for the three strategies across varied sequencing depths. <bold>(B)</bold> The distribution for recovered coverage of sequences from the testing dataset to insertions from the validated dataset for the three strategies across varied sequencing depths. <bold>(C)</bold> Recall distribution for the three strategies across various sequencing depths. <bold>(D)</bold> Precision distribution for the three strategies across various sequencing depths. <bold>(E)</bold>&#xa0;Distribution of assembled length, categorized by false and true tags, for the three pan-genome construction strategies of the plant. NS means P &gt; 0.05, * means P&#xa0;&#x2264;&#xa0;0.05, ** means P &lt; 0.01, *** means P &lt; 0.001.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1371222-g003.tif"/>
</fig>
<p>Map-to-pan has the highest recall value, and the other two iterative strategies have nearly identical lower recall values (<xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3C</bold>
</xref>). Specifically, the recall value of both novel genes and insertions from the validated data set is lower than 0.25 for two iterative strategies under all sequencing depths. For map-to-pan, the recall value of novel genes from the validated data set is around 0.5, and of insertions from the validated data set is around 0.75 under 50X sequencing depth.</p>
<p>Conversely, map-to-pan has the lowest precision value, and the other two iterative strategies have almost identical precision values (<xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3D</bold>
</xref>). The sequences that are not precise are mainly short sequences for map-to-pan, whereas they have a consistent distribution across all lengths for the other two iterative strategies (<xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3E</bold>
</xref>).</p>
<p>Overall, higher sequencing depths improve map-to-pan performance, including its coverage and recall for novel sequences from the validated data set (<xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3A&#x2013;C</bold>
</xref>), and precision (<xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3D</bold>
</xref>). However, there is no obvious evidence to support an influence of sequencing depth on the other two iterative strategies.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Impact of sample size on three pan-genome construction strategies</title>
<p>In pan-genome research, including more samples will introduce more genomic diversity and biological information unless the current pan-genome of certain species is closed. A closed pan-genome means adding new genomes or samples will not induce the increase in pan-genome size, which depends on the frequency of gene exchange between subspecies and whether enough samples are included. Therefore, the number of samples included is vital in pan-genome construction.</p>
<p>For sequences from the map-to-pan strategy, the difference in their coverage for novel genes from the validated data set is significant among different sample sizes with all sequencing depths. At the same time, there is no significance for both iterative strategies (<xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4A</bold>
</xref>). Conversely, for sequences from these three strategies, their coverage for insertions from the validated data set is similar among different sample sizes, except for the map-to-pan strategy under 50X sequencing depth (<xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4B</bold>
</xref>).</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>The impact of the number of samples included on three strategies. <bold>(A)</bold> The distribution for recovered coverage of sequences from the testing dataset to novel genes from the validated dataset for the three strategies across various sample numbers included. <bold>(B)</bold> The distribution for recovered coverage of sequences from the testing dataset to insertions from the validated dataset for the three strategies across various sample numbers included. <bold>(C)</bold> Recall distribution for the three strategies across various sample numbers included. <bold>(D)</bold> Precision distribution for the three strategies across various sample numbers included. NS means P &gt; 0.05, * means P &#x2264; 0.05, ** means P &lt; 0.01, *** means P &lt; 0.001.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1371222-g004.tif"/>
</fig>
<p>Recall and precision values are further used to evaluate sample size influence on these three strategies. For map-to-pan, their recall value for novel genes decreases as sample size increases, while for insertions, their recall value increases as sample size increases (<xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4C</bold>
</xref>). For two iterative strategies, the sample size does not significantly influence their recall value for both novel genes and insertions from the validated data set. There is no obvious difference between iterative individual and iterative pooling.</p>
<p>Higher sequencing depth can improve the coverage and recall for novel sequences from the validated data set of map-to-pan with an expanded sample size but does not affect either iterative strategy. This indicates the limited capability of iterative strategies for novel sequence identification, no matter the sample size or sequencing depth. Map-to-pan has the lowest precision value under different sample sizes. While there is a positive correlation between its precision value and sample size, such a relationship is not observed for the two iterative methods (<xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4D</bold>
</xref>).</p>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Comparison of three pan-genome construction methods&#x2019; performance with the different novel sequence length</title>
<p>Novel sequences from the validated data set are divided into four length-based categories: SS, S, M, and L for both novel genes and insertions (<xref ref-type="supplementary-material" rid="s10">
<bold>Supplementary Table&#xa0;8</bold>
</xref>). SS-tagged novel sequences have lengths from 50 bp to 100 bp, S-tagged novel sequences have lengths from 100 bp to 1000 bp, M-tagged novel sequences have lengths from 1000 bp to 10000 bp, L-tagged novel sequences have lengths larger than 10000 bp. Most novel genes fall in the M category, whereas most insertions are in the S category.</p>
<p>For sequences from all three strategies, there is a negative relationship between their coverage for novel sequences from the validated data set and the length of the novel sequences from the validated data set for both novel genes and insertions (<xref ref-type="supplementary-material" rid="SF3">
<bold>Supplementary Figure&#xa0;3A, B</bold>
</xref>). Increased sequencing depth improves the recovered coverage of sequences from map-to-pan for novel sequences from the validated data set (<xref ref-type="supplementary-material" rid="SF3">
<bold>Supplementary Figure&#xa0;3A, B</bold>
</xref>) and the length of recovered novel sequences from the validated data set, especially for insertions (<xref ref-type="supplementary-material" rid="SF3">
<bold>Supplementary Figure&#xa0;3C</bold>
</xref>). The overall recall value is lower for the SS and L categories than the S and M categories for all three strategies (<xref ref-type="supplementary-material" rid="SF3">
<bold>Supplementary Figure&#xa0;3D</bold>
</xref>). The recall value drops as the length of novel sequences from the validated data set increases for two iterative strategies under all sequencing depths and for map-to-pan under 10X or less sequencing depth. Increased sequencing depth improves the map-to-pan&#x2019;s recall for novel sequences with different lengths but has no significant effect on the two iterative methods.</p>
<p>Regarding recall value, the map-to-pan strategy outperforms the other two iterative strategies for different length categories except for L. Additionally, no significant difference exists between the individual and pooling iterative strategy across all length categories.</p>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>Diverse efficiency of three pan-genome construction methods in response to novel sequences&#x2019; repeat content percentage</title>
<p>Novel genes and insertions from the validated data set are divided into ten groups based on their repeat content percentage, using intervals of 0.10. The majority of these genes and insertions are found within the [0, 0.1] and (0.9, 1] intervals (<xref ref-type="supplementary-material" rid="s10">
<bold>Supplementary Table&#xa0;9</bold>
</xref>).</p>
<p>For sequences from all three pan-genome construction strategies, their recovered coverage of novel sequences from the validated data set decreases as the repeat content percentage increases (<xref ref-type="supplementary-material" rid="SF4">
<bold>Supplementary Figure&#xa0;4A, 4B</bold>
</xref>). Novel sequences with repeat percentages in the ranges of [0, 0.25] and [0.75, 1] are more easily identified by these three methods (<xref ref-type="supplementary-material" rid="SF4">
<bold>Supplementary Figure&#xa0;4C</bold>
</xref>).</p>
<p>The recall value is negatively associated with the repeat content percentage for the two iterative strategies under all sequencing depths and for the map-to-pan technique under 10X or less sequencing depth (<xref ref-type="supplementary-material" rid="SF4">
<bold>Supplementary Figure&#xa0;4D</bold>
</xref>). Sequencing depth can improve the recall value of map-to-pan for novel sequences with different repeat content percentages but has no significant effect on the two iterative methods. Overall, the map-to-pan strategy has a higher recall value than the other two iterative strategies, especially for those novel sequences with higher repeat percentages. The distinction between the iterative individual and iterative pooling strategies is subtle under different repeat content percentages.</p>
</sec>
<sec id="s3_6">
<label>3.6</label>
<title>Time and memory consumption comparison among three pan-genome construction methods</title>
<p>The map-to-pan strategy demands considerably greater computational resources regarding memory and time than the other two iterative methods (<xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>). The main computational burden for the map-to-pan strategy arises from assembling the whole genome for every sample included. At a sequencing depth of 30X, it uses about 63GB of memory and takes approximately 212 minutes for each sample, utilizing 4 CPUs. Assembling unmapped or poorly mapped reads for the iterative individual strategy uses only around 10MB and takes about 18 minutes per sample. For the iterative pooling strategy, assembling pooled unmapped or poorly mapped reads consumes nearly 10MB of memory and takes about 115 minutes to construct a pan-genome with 8 samples, operating on 4 CPUs. The second highest computational demand for the map-to-pan strategy comes from aligning the assembled genome of each sample to the reference genome. In the case of the two iterative methods, only the assembly of unmapped or poorly mapped reads is aligned to the reference genome, thus requiring significantly less memory and time than map-to-pan.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Memory and time requirements for three pan-genome construction strategies at the sequencing depth of 30X for the 9311 sample.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="bottom" rowspan="2" align="center">Steps</th>
<th valign="middle" colspan="3" align="center">Map-to-pan</th>
<th valign="middle" colspan="3" align="center">Iterative Individual</th>
<th valign="middle" colspan="3" align="center">Iterative Pooling</th>
</tr>
<tr>
<th valign="middle" align="center">Mem</th>
<th valign="middle" align="center">Time</th>
<th valign="middle" align="center">CPU</th>
<th valign="middle" align="center">Mem</th>
<th valign="middle" align="center">Time</th>
<th valign="middle" align="center">CPU</th>
<th valign="middle" align="center">Mem</th>
<th valign="middle" align="center">Time</th>
<th valign="middle" align="center">CPU</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">Filter low-quality reads</td>
<td valign="middle" align="center">1.3M/sample</td>
<td valign="middle" align="center">~47mins/sample</td>
<td valign="middle" align="center">1/sample</td>
<td valign="middle" align="center">1.3M/sample</td>
<td valign="middle" align="center">~47mins/sample</td>
<td valign="middle" align="center">1/sample</td>
<td valign="middle" align="center">1.3M/sample</td>
<td valign="middle" align="center">~47mins/sample</td>
<td valign="middle" align="center">1/sample</td>
</tr>
<tr>
<td valign="middle" align="center">Map to reference genome and extract unmapped reads</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">5.4G/sample</td>
<td valign="middle" align="center">~208mins/sample</td>
<td valign="middle" align="center">4/sample</td>
<td valign="middle" align="center">5.4G/sample</td>
<td valign="middle" align="center">~208mins/sample</td>
<td valign="middle" align="center">4/sample</td>
</tr>
<tr>
<td valign="middle" align="center">MEGAHIT assembles individual unmapped reads</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">10M/sample</td>
<td valign="middle" align="center">~18mins/sample</td>
<td valign="middle" align="center">4/sample</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
</tr>
<tr>
<td valign="middle" align="center">Individual unmapped reads pooling and assemble for 8 samples</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">10M/sample</td>
<td valign="middle" align="center">~115mins/sample</td>
<td valign="middle" align="center">4/sample</td>
</tr>
<tr>
<td valign="middle" align="center">Pool assembled contigs from individual unmapped reads and remove redundancy</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">350M/sample</td>
<td valign="middle" align="center">~2mins/sample</td>
<td valign="middle" align="center">4/sample</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
</tr>
<tr>
<td valign="middle" align="center">Whole genome assembly</td>
<td valign="middle" align="center">~63G/sample</td>
<td valign="middle" align="center">~212mins/sample</td>
<td valign="middle" align="center">4/sample</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
</tr>
<tr>
<td valign="middle" align="center">Map whole genome assembly to reference</td>
<td valign="middle" align="center">~480M/sample</td>
<td valign="middle" align="center">~29mins/sample</td>
<td valign="middle" align="center">4/sample</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
</tr>
<tr>
<td valign="middle" align="center">Extract unaligned contigs</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">~1min/sample</td>
<td valign="middle" align="center">1/sample</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
</tr>
<tr>
<td valign="middle" align="center">Pool unaligned contigs and remove redundancy</td>
<td valign="middle" align="center">~860M/sample</td>
<td valign="middle" align="center">~13mins/sample</td>
<td valign="middle" align="center">4/sample</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
<td valign="middle" align="center">&#x2026;</td>
</tr>
<tr>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
</tr>
<tr>
<td valign="middle" align="center">In total</td>
<td valign="middle" align="center">63G/sample</td>
<td valign="middle" align="center">~5hrs/sample</td>
<td valign="middle" align="center">4/sample</td>
<td valign="middle" align="center">5.4G/sample</td>
<td valign="middle" align="center">~4.5hrs/sample</td>
<td valign="middle" align="center">4/sample</td>
<td valign="middle" align="center">5.4G/sample</td>
<td valign="middle" align="center">~4hrs/sample</td>
<td valign="middle" align="center">4/sample</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The computational resources are evaluated based on the 9311 sample with 30X sequencing depth if a single sample is considered. If population statistics are needed, 8 samples, including 9311, G8, IR64, J4155, R527, S548, Y3551, and Y58S, are evaluated. All information is just based on 30X sequencing depth; if more sequencing depth and more samples are analyzed, then the time and memory will increase correspondingly. At 20X sequencing depth, for whole genome assembly mapping to reference, time and memory are also larger than that with 30X sequencing depth due to its large assembled genome size with a high false positive rate.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>For both iterative methods, the most resource-intensive step is the alignment of whole-genome sequencing reads from each sample included in the pan-genome construction to the reference genome. This step requires about 5.4GB of memory and an estimated 208 minutes per sample when using 4 CPUs for each sample.</p>
</sec>
</sec>
<sec id="s4" sec-type="discussion">
<label>4</label>
<title>Discussion</title>
<p>The pan-genome study proves effective for plant genomic studies because it aims to encompass all genomic diversity of a certain species, which is important for the deep understanding of evolution and providing more novel genomic targets for breeding. It aids in identifying crucial novel non-reference genes or sequences associated with signaling (<xref ref-type="bibr" rid="B9">Golicz et&#xa0;al., 2016</xref>), defense mechanisms (<xref ref-type="bibr" rid="B11">Gordon et&#xa0;al., 2017</xref>), resistance pathways (<xref ref-type="bibr" rid="B2">Bayer et&#xa0;al., 2019</xref>), vital agricultural attributes (<xref ref-type="bibr" rid="B8">Gao et&#xa0;al., 2019</xref>), and heterosis (<xref ref-type="bibr" rid="B38">Zhang et&#xa0;al., 2016</xref>). Currently, three strategies based on next-generation sequencing short-reads are utilized for constructing the plant pan-genome; they can be summarized as iterative individual, iterative pooling, and map-to-pan. They have different performances under different conditions. This diversity complicates the integration or comparison of pan-genome information for the same species from different projects and makes it difficult for users to select the optimal pan-genome construction strategy. Hence, we performed the first comprehensive evaluation of these three strategies considering the sequencing depths, sample sizes, length and repeat content percentage of novel sequence, and computational resource consumption.</p>
<p>Our findings indicate that: (1) map-to-pan has the highest recall but lowest precision value, whereas the two iterative strategies have lower recall but higher precision values; (2) the number of samples, the length of novel sequences, and the percentage of repeat content are inversely related to the recall value of these three pan-genome construction strategies, primarily because an increased number of samples brings more complexity, and new sequences with larger length and a higher percentage of repeat content are challenging to be assembled just based on next-generation short-reads; (3) higher sequencing depth can enhance the performance of map-to-pan, but it doesn&#x2019;t affect the other two iterative strategies; (4) regarding the consumption of computational resources, map-to-pan requires significantly more than the other two iterative strategies, particularly at higher sequencing depths. Generally, the iterative method, particularly the iterative pooling method, is optimal when the sequencing depth is lower than 20X, considering recall and precision value. However, map-to-pan performs better with sequencing depths greater than 20X, even though it demands more computational memory and time.</p>
<p>However, there are some limitations in our evaluation. First, we only included a single species (rice) in our assessment. These three short-reads-based strategies for plant pan-genome construction may perform better in species with simpler genomes, such as <italic>Arabidopsis thaliana</italic>, and worse in species with more complex genomes, such as barley. Secondly, certain assembly and mapping software are used for these three strategies in our evaluation, while the choice of different software may also impact the evaluation results. Thirdly, we only used a synteny-based method for gene-based pan-genome construction. The core gene ratio differs slightly between the OrthoFinder and synteny-based methods. Fourthly, the choice of assessment data also influences the evaluation results. Here, we selected simulated data for evaluation, which may not fully characterize the results from real data. Meanwhile, we evaluated the performance of pan-genome construction strategies based on short reads. Still, it would be better to construct the pan-genome by a combination of short and long reads, such as in rice (<xref ref-type="bibr" rid="B31">Qin et&#xa0;al., 2021</xref>), soybean (<xref ref-type="bibr" rid="B23">Liu et&#xa0;al., 2020</xref>), sorghum (<xref ref-type="bibr" rid="B35">Tao et&#xa0;al., 2021</xref>), maize (<xref ref-type="bibr" rid="B15">Hufford et&#xa0;al., 2021</xref>), and <italic>Raphanus sativus</italic> (<xref ref-type="bibr" rid="B39">Zhang et&#xa0;al., 2021</xref>).</p>
</sec>
<sec id="s5" sec-type="data-availability">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: Whole genome sequences, gene annotation files, gene sequences, and protein sequences of 20 rice samples are from Qin et al. (Qin et al., 2021). They can be downloaded from the RiceRC database via <uri xlink:href="https://ricerc.sicau.edu.cn/">https://ricerc.sicau.edu.cn/</uri>. The PacBio long reads and real next-generation short reads of the 9311 sample are obtained from GSA under Project ID (PRJCA002103) via <uri xlink:href="https://ngdc.cncb.ac.cn/gsa/">https://ngdc.cncb.ac.cn/gsa/</uri>. The Perl script used for data preprocessing is available via <uri xlink:href="https://ngdc.cncb.ac.cn/biocode/tools/BT007415">https://ngdc.cncb.ac.cn/biocode/tools/BT007415</uri>.</p>
</sec>
<sec id="s6" sec-type="author-contributions">
<title>Author contributions</title>
<p>MJ: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Resources, Software, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. MC: Writing&#xa0;&#x2013; review &amp; editing. JZ: Writing &#x2013; review &amp; editing. ZD: Supervision, Writing &#x2013; review &amp; editing. JX: Project administration, Supervision, Writing &#x2013; review &amp; editing.</p>
</sec>
</body>
<back>
<sec id="s7" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. This work was supported by the Strategic Priority Research Program of the Chinese Academy of Sciences (XDB38030400 to JX); National Natural Science Foundation of China (32170669 to JX); National Key Research Program of China (2020YFC0848900 to JX); the Youth Innovation Promotion Association of the Chinese Academy of Sciences (2022098 to JZ).</p>
</sec>
<sec id="s8" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s9" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors&#xa0;and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s10" sec-type="supplementary-material">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fpls.2024.1371222/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fpls.2024.1371222/full#supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet_1.zip" id="SF1" mimetype="application/zip">
<label>Supplementary Figure&#xa0;1</label>
<caption>
<p>The optimal k-mer used in the whole genome assembly for each of the 20 rice samples using <italic>eupan assemble linearK</italic> model from the EUPAN toolkit.</p>
</caption>
</supplementary-material>
<supplementary-material xlink:href="DataSheet_1.zip" id="SF2" mimetype="application/zip">
<label>Supplementary Figure&#xa0;2</label>
<caption>
<p>The heatmap of the overlapping number of insertions between paired samples in groups consisting of 5 <bold>(A)</bold>, 10 <bold>(B)</bold>, 15 <bold>(C)</bold>, and 20 <bold>(D)</bold> samples. The heatmap of the presence and absence profile of insertions across samples in the group consisting of 5 <bold>(E)</bold>, 10 <bold>(F)</bold>, 15 <bold>(G)</bold>, and 20 <bold>(H)</bold> samples. The distribution of insertion numbers as samples increase in the group consisting of 5 <bold>(I)</bold>, 10 <bold>(J)</bold>, 15 <bold>(K)</bold>, and 20 <bold>(L)</bold> samples. The pie chart shows the percentage of insertions found in genic versus intergenic regions in the group consisting of 5 <bold>(M)</bold>, 10 <bold>(N)</bold>, 15 <bold>(O)</bold>, and 20 <bold>(P)</bold> samples.</p>
</caption>
</supplementary-material>
<supplementary-material xlink:href="DataSheet_1.zip" id="SF3" mimetype="application/zip">
<label>Supplementary Figure&#xa0;3</label>
<caption>
<p>The effect of the length for novel sequences on three strategies. <bold>(A)</bold> The distribution for recovered coverage of sequences from the testing dataset to novel genes from the validated dataset for the three strategies across various lengths. <bold>(B)</bold> The distribution for recovered coverage of sequences from the testing dataset to insertions from the validated dataset for the three strategies across various lengths. <bold>(C)</bold> Density of recovered length of sequences from the testing dataset to novel genes and insertions from the validated dataset for the three strategies across various lengths of novel sequences from the validated data set. <bold>(D)</bold> Recall distribution for the three strategies across various lengths of novel sequences from the validated data set. Length categories are defined as SS: 50-100bp; S: 100-1,000bp; M: 1,000-100,000bp; L: &gt;100,000bp.</p>
</caption>
</supplementary-material>
<supplementary-material xlink:href="DataSheet_1.zip" id="SF4" mimetype="application/zip">
<label>Supplementary Figure&#xa0;4</label>
<caption>
<p>The effect of repeat content percentage for novel sequences on three strategies. <bold>(A)</bold> The distribution for recovered coverage of sequences from the testing dataset to novel genes from the validated dataset for the three strategies across various repeat content percentages of novel sequences from the validated dataset. <bold>(B)</bold> The distribution for recovered coverage of sequences from the testing dataset to insertions from the validated dataset for the three strategies across various repeat content percentages of novel sequences from the validated dataset. <bold>(C)</bold> Density of recovered repeat content percentages across the three pan-genome construction strategies. <bold>(D)</bold> Recall distribution for the three strategies across various repeat content percentages of novel sequences from the validated data set. 0 means novel sequences from the validated data set have repeat content percentage [0,0.1], 1 means novel sequences from the validated data set have repeat content percentage (0.1,0.2], 2 means novel sequences from the validated data set have repeat content percentage (0.2,0.3], 3 means novel sequences from the validated data set have repeat content percentage (0.3,0.4], 4 means novel sequences from the validated data set have repeat content percentage (0.4,0.5], 5 means novel sequences from the validated data set have repeat content percentage (0.5,0.6], 6 means novel sequences from the validated data set have repeat content percentage (0.6,0.7], 7 means novel sequences from the validated data set have repeat content percentage (0.7,0.8], 8 means novel sequences from the validated data set have repeat content percentage (0.8,0.9], 9 means novel sequences from the validated data set have repeat content percentage (0.9,1].</p>
</caption>
</supplementary-material>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Altschul</surname> <given-names>S. F.</given-names>
</name>
<name>
<surname>Gish</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Miller</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Myers</surname> <given-names>E. W.</given-names>
</name>
<name>
<surname>Lipman</surname> <given-names>D. J.</given-names>
</name>
</person-group> (<year>1990</year>). <article-title>Basic local alignment search tool</article-title>. <source>J. Mol. Biol.</source> <volume>215</volume>, <fpage>403</fpage>&#x2013;<lpage>410</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/S0022-2836(05)80360-2</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bayer</surname> <given-names>P. E.</given-names>
</name>
<name>
<surname>Golicz</surname> <given-names>A. A.</given-names>
</name>
<name>
<surname>Tirnaz</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Chan</surname> <given-names>C. K. K.</given-names>
</name>
<name>
<surname>Edwards</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Batley</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Variation in abundance of predicted resistance genes in the Brassica oleracea pangenome</article-title>. <source>Plant Biotechnol. J.</source> <volume>17</volume>, <fpage>789</fpage>&#x2013;<lpage>800</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1111/pbi.13015</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>N.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>Using RepeatMasker to identify repetitive elements in genomic sequences</article-title>. <source>Curr. Protoc. Bioinforma</source> <volume>5</volume>, <fpage>4.10.1</fpage>&#x2013;<lpage>4.10.14</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/0471250953.bi0410s05</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Contreras-Moreira</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Cantalapiedra</surname> <given-names>C. P.</given-names>
</name>
<name>
<surname>Garc&#xed;a-Pereira</surname> <given-names>M. J.</given-names>
</name>
<name>
<surname>Gordon</surname> <given-names>S. P.</given-names>
</name>
<name>
<surname>Vogel</surname> <given-names>J. P.</given-names>
</name>
<name>
<surname>Igartua</surname> <given-names>E.</given-names>
</name>
<etal/>
</person-group>. (<year>2017</year>). <article-title>Analysis of plant pan-genomes and transcriptomes with GET_HOMOLOGUES-EST, a clustering solution for sequences of the same species</article-title>. <source>Front. Plant Sci.</source> <volume>8</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2017.00184</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Emms</surname> <given-names>D. M.</given-names>
</name>
<name>
<surname>Kelly</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>OrthoFinder: phylogenetic orthology inference for comparative genomics</article-title>. <source>Genome Biol.</source> <volume>20</volume>, <fpage>238</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s13059-019-1832-y</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fouts</surname> <given-names>D. E.</given-names>
</name>
<name>
<surname>Brinkac</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Beck</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Inman</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Sutton</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>PanOCT: Automated clustering of orthologs using conserved gene neighborhood for pan-genomic analysis of bacterial strains and closely related species</article-title>. <source>Nucleic Acids Res.</source> <volume>40</volume>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/nar/gks757</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Niu</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>CD-HIT: Accelerated for clustering the next-generation sequencing data</article-title>. <source>Bioinformatics</source> <volume>28</volume>, <fpage>3150</fpage>&#x2013;<lpage>3152</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/bioinformatics/bts565</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gao</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Gonda</surname> <given-names>I.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Bao</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Tieman</surname> <given-names>D. M.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>The tomato pan-genome uncovers new genes and a rare allele regulating fruit flavor</article-title>. <source>Nat. Genet.</source> <volume>51</volume>, <fpage>1044</fpage>&#x2013;<lpage>1051</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41588-019-0410-2</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Golicz</surname> <given-names>A. A.</given-names>
</name>
<name>
<surname>Bayer</surname> <given-names>P. E.</given-names>
</name>
<name>
<surname>Barker</surname> <given-names>G. C.</given-names>
</name>
<name>
<surname>Edger</surname> <given-names>P. P.</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>H. R.</given-names>
</name>
<name>
<surname>Martinez</surname> <given-names>P. A.</given-names>
</name>
<etal/>
</person-group>. (<year>2016</year>). <article-title>The pangenome of an agronomically important crop plant Brassica oleracea</article-title>. <source>Nat. Commun.</source> <volume>7</volume>, <fpage>1</fpage>&#x2013;<lpage>8</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/ncomms13390</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Golicz</surname> <given-names>A. A.</given-names>
</name>
<name>
<surname>Bayer</surname> <given-names>P. E.</given-names>
</name>
<name>
<surname>Bhalla</surname> <given-names>P. L.</given-names>
</name>
<name>
<surname>Batley</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Edwards</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Pangenomics comes of age: from bacteria to plant and animal applications</article-title>. <source>Trends Genet.</source> <volume>36</volume>, <fpage>132</fpage>&#x2013;<lpage>145</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.tig.2019.11.006</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gordon</surname> <given-names>S. P.</given-names>
</name>
<name>
<surname>Contreras-Moreira</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Woods</surname> <given-names>D. P.</given-names>
</name>
<name>
<surname>Des Marais</surname> <given-names>D. L.</given-names>
</name>
<name>
<surname>Burgess</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Shu</surname> <given-names>S.</given-names>
</name>
<etal/>
</person-group>. (<year>2017</year>). <article-title>Extensive gene content variation in the Brachypodium distachyon pan-genome correlates with population structure</article-title>. <source>Nat. Commun.</source> <volume>8</volume>, <fpage>2184</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41467-017-02292-8</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>K. C.</given-names>
</name>
<name>
<surname>Chu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2017</year>). <article-title>EUPAN enables pan-genome studies of a large number of eukaryotic genomes</article-title>. <source>Bioinformatics</source> <volume>33</volume>, <fpage>2408</fpage>&#x2013;<lpage>2409</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/bioinformatics/btx170</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Myers</surname> <given-names>J. R.</given-names>
</name>
<name>
<surname>Marth</surname> <given-names>G. T.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>ART: a next-generation sequencing read simulator</article-title>. <source>Bioinformatics</source> <volume>28</volume>, <fpage>593</fpage>&#x2013;<lpage>594</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/bioinformatics/btr708</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>H&#xfc;bner</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Bercovich</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Todesco</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Mandel</surname> <given-names>J. R.</given-names>
</name>
<name>
<surname>Odenheimer</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Ziegler</surname> <given-names>E.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>Sunflower pan-genome analysis shows that hybridization altered gene content and disease resistance</article-title>. <source>Nat. Plants</source> <volume>5</volume>, <fpage>54</fpage>&#x2013;<lpage>62</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41477-018-0329-0</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hufford</surname> <given-names>M. B.</given-names>
</name>
<name>
<surname>Seetharam</surname> <given-names>A. S.</given-names>
</name>
<name>
<surname>Woodhouse</surname> <given-names>M. R.</given-names>
</name>
<name>
<surname>Chougule</surname> <given-names>K. M.</given-names>
</name>
<name>
<surname>Ou</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>
<italic>De novo</italic> assembly, annotation, and comparative analysis of 26 diverse maize genomes</article-title>. <source>Science</source> <volume>373</volume>, <fpage>655</fpage>&#x2013;<lpage>662</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1126/science.abg5289</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hurgobin</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Golicz</surname> <given-names>A. A.</given-names>
</name>
<name>
<surname>Bayer</surname> <given-names>P. E.</given-names>
</name>
<name>
<surname>Chan</surname> <given-names>C. K. K.</given-names>
</name>
<name>
<surname>Tirnaz</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Dolatabadian</surname> <given-names>A.</given-names>
</name>
<etal/>
</person-group>. (<year>2018</year>). <article-title>Homoeologous exchange is a major cause of gene presence/absence variation in the amphidiploid Brassica napus</article-title>. <source>Plant Biotechnol. J.</source> <volume>16</volume>, <fpage>1265</fpage>&#x2013;<lpage>1274</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1111/pbi.12867</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jeffares</surname> <given-names>D. C.</given-names>
</name>
<name>
<surname>Jolly</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Hoti</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Speed</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Shaw</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Rallis</surname> <given-names>C.</given-names>
</name>
<etal/>
</person-group>. (<year>2017</year>). <article-title>Transient structural variations have strong effects on quantitative traits and reproductive isolation in fission yeast</article-title>. <source>Nat. Commun.</source> <volume>8</volume>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/ncomms14061</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kurtz</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Phillippy</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Delcher</surname> <given-names>A. L.</given-names>
</name>
<name>
<surname>Smoot</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Shumway</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Antonescu</surname> <given-names>C.</given-names>
</name>
<etal/>
</person-group>. (<year>2004</year>). <article-title>Versatile and open software for comparing large genomes</article-title>. <source>Genome Biol.</source> <volume>5</volume>, <elocation-id>R12</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/gb-2004-5-2-r12</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM</article-title>. <source>ArXiv preprint</source>. <volume>arXiv:1303.3997</volume>.</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>C. M.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Sadakane</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Lam</surname> <given-names>T. W.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>MEGAHIT: An ultra-fast single-node solution for large and complex metagenomics assembly via succinct de Bruijn graph</article-title>. <source>Bioinformatics</source> <volume>31</volume>, <fpage>1674</fpage>&#x2013;<lpage>1676</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/bioinformatics/btv033</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Stoeckert</surname> <given-names>C. J. J.</given-names>
</name>
<name>
<surname>Roos</surname> <given-names>D. S.</given-names>
</name>
</person-group> (<year>2003</year>). <article-title>OrthoMCL: identification of ortholog groups for eukaryotic genomes</article-title>. <source>Genome Res.</source> <volume>13</volume>, <fpage>2178</fpage>&#x2013;<lpage>2189</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1101/gr.1224503</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>Y. H.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Jin</surname> <given-names>L. G.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2014</year>). <article-title>
<italic>De novo</italic> assembly of soybean wild relatives for pan-genome analysis of diversity and agronomic traits</article-title>. <source>Nat. Biotechnol.</source> <volume>32</volume>, <fpage>1045</fpage>&#x2013;<lpage>1052</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/nbt.2979</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Du</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Shen</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Peng</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>S.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>Pan-genome of wild and cultivated soybeans</article-title>. <source>Cell</source> <volume>182</volume>, <fpage>162</fpage>&#x2013;<lpage>176.e13</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.cell.2020.05.023</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lovell</surname> <given-names>J. T.</given-names>
</name>
<name>
<surname>Sreedasyam</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Schranz</surname> <given-names>M. E.</given-names>
</name>
<name>
<surname>Wilson</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Carlson</surname> <given-names>J. W.</given-names>
</name>
<name>
<surname>Harkess</surname> <given-names>A.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>GENESPACE tracks regions of interest and gene copy number variation across multiple genomes</article-title>. <source>Elife</source> <volume>11</volume>, <fpage>1</fpage>&#x2013;<lpage>20</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.7554/eLife.78526</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Luo</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2012</year>). <article-title>SOAPdenovo2: an&#xa0;empirically improved memory-efficient short-read de novo assembler</article-title>. <source>Gigascience</source> <volume>1</volume>&#xa0;(<issue>1</issue>), <fpage>18</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/2047-217X-1-18</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Montenegro</surname> <given-names>J. D.</given-names>
</name>
<name>
<surname>Golicz</surname> <given-names>A. A.</given-names>
</name>
<name>
<surname>Bayer</surname> <given-names>P. E.</given-names>
</name>
<name>
<surname>Hurgobin</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>H. T.</given-names>
</name>
<name>
<surname>Chan</surname> <given-names>C. K. K.</given-names>
</name>
<etal/>
</person-group>. (<year>2017</year>). <article-title>The pangenome of hexaploid bread wheat</article-title>. <source>Plant J.</source> <volume>90</volume>, <fpage>1007</fpage>&#x2013;<lpage>1013</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1111/tpj.13515</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Morgante</surname> <given-names>M.</given-names>
</name>
<name>
<surname>De Paoli</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Radovic</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Transposable elements and the plant pan-genomes</article-title>. <source>Curr. Opin. Plant Biol.</source> <volume>10</volume>, <fpage>149</fpage>&#x2013;<lpage>155</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.pbi.2007.02.001</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ou</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Lv</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>X.</given-names>
</name>
<etal/>
</person-group>. (<year>2018</year>). <article-title>Pan-genome of cultivated pepper (Capsicum) and its use in gene presence&#x2013;absence variation analyses</article-title>. <source>New Phytol.</source> <volume>220</volume>, <fpage>360</fpage>&#x2013;<lpage>363</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1111/nph.15413</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ouyang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Hamilton</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Campbell</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Childs</surname> <given-names>K.</given-names>
</name>
<etal/>
</person-group>. (<year>2007</year>). <article-title>The TIGR Rice Genome Annotation Resource: improvements and new features</article-title>. <source>Nucleic Acids Res.</source> <volume>35</volume>, <fpage>D883</fpage>&#x2013;<lpage>D887</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/nar/gkl976</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Page</surname> <given-names>A. J.</given-names>
</name>
<name>
<surname>Cummins</surname> <given-names>C. A.</given-names>
</name>
<name>
<surname>Hunt</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Wong</surname> <given-names>V. K.</given-names>
</name>
<name>
<surname>Reuter</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Holden</surname> <given-names>M. T. G.</given-names>
</name>
<etal/>
</person-group>. (<year>2015</year>). <article-title>Roary: Rapid large-scale prokaryote pan genome analysis</article-title>. <source>Bioinformatics</source> <volume>31</volume>, <fpage>3691</fpage>&#x2013;<lpage>3693</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/bioinformatics/btv421</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qin</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Du</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>Pan-genome analysis of 33 genetically diverse rice accessions reveals hidden genomic variations</article-title>. <source>Cell</source> <volume>184</volume>, <fpage>3542</fpage>&#x2013;<lpage>3558.e16</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.cell.2021.04.046</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Santos</surname> <given-names>A. R.</given-names>
</name>
<name>
<surname>Barbosa</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Fiaux</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Zurita-Turk</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Chaitankar</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Kamapantula</surname> <given-names>B.</given-names>
</name>
<etal/>
</person-group>. (<year>2013</year>). <article-title>PANNOTATOR: An automated tool for annotation of pan-genomes</article-title>. <source>Genet. Mol. Res.</source> <volume>12</volume>, <fpage>2982</fpage>&#x2013;<lpage>2989</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.4238/2013.August.16.2</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>X.</given-names>
</name>
<name>
<surname>He</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Song</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>A super pan-genomic landscape of rice</article-title>. <source>Cell Res.</source> <volume>32</volume>, <fpage>878</fpage>&#x2013;<lpage>896</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41422-022-00685-z</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>W.</given-names>
</name>
<etal/>
</person-group>. (<year>2017</year>). <article-title>RPAN: Rice pan-genome browser for &#x223c;3000 rice genomes</article-title>. <source>Nucleic Acids Res.</source> <volume>45</volume>, <fpage>597</fpage>&#x2013;<lpage>605</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/nar/gkw958</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Cruickshank</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Teng</surname> <given-names>F.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>Extensive variation within the pan-genome of cultivated and wild sorghum</article-title>. <source>Nat. Plants</source> <volume>7</volume>, <fpage>766</fpage>&#x2013;<lpage>773</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41477-021-00925-x</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tettelin</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Masignani</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Cieslewicz</surname> <given-names>M. J.</given-names>
</name>
<name>
<surname>Donati</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Medini</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Ward</surname> <given-names>N. L.</given-names>
</name>
<etal/>
</person-group>. (<year>2005</year>). <article-title>Genome analysis of multiple pathogenic isolates of Streptococcus agalactiae: implications for the microbial &#x201c;pan-genome.&#x201d;</article-title>
<source>Proc. Natl. Acad. Sci. U.S.A.</source> <volume>102</volume>, <fpage>13950</fpage>&#x2013;<lpage>13955</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1073/pnas.0506758102</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Debarry</surname> <given-names>J. D.</given-names>
</name>
<name>
<surname>Tan</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<etal/>
</person-group>. (<year>2012</year>). <article-title>MCScanX: A toolkit for detection and evolutionary analysis of gene synteny and collinearity</article-title>. <source>Nucleic Acids Res.</source> <volume>40</volume>, <fpage>1</fpage>&#x2013;<lpage>14</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/nar/gkr1293</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>L. L.</given-names>
</name>
<name>
<surname>Xing</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Kudrna</surname> <given-names>D. A.</given-names>
</name>
<name>
<surname>Yao</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Copetti</surname> <given-names>D.</given-names>
</name>
<etal/>
</person-group>. (<year>2016</year>). <article-title>Extensive sequence divergence between the reference genomes of two elite indica rice varieties Zhenshan 97 and Minghui 63</article-title>. <source>Proc. Natl. Acad. Sci. U.S.A.</source> <volume>113</volume>, <fpage>E5163</fpage>&#x2013;<lpage>E5171</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1073/pnas.1611012113</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Qiu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>W.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>Pan-genome of Raphanus highlights genetic variation and introgression among domesticated, wild, and weedy radishes</article-title>. <source>Mol. Plant</source> <volume>14</volume>, <fpage>2032</fpage>&#x2013;<lpage>2055</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.molp.2021.08.005</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Diao</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Ding</surname> <given-names>C. J.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>The poplar pangenome provides insights into the evolutionary history of the genus</article-title>. <source>Commun. Biol.</source> <volume>2</volume>, <fpage>215</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s42003-019-0474-7</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Tian</surname> <given-names>Q.</given-names>
</name>
<etal/>
</person-group>. (<year>2018</year>). <article-title>Pan-genome analysis highlights the extent of genomic variation in cultivated and wild rice</article-title>. <source>Nat. Genet.</source> <volume>50</volume>, <fpage>278</fpage>&#x2013;<lpage>284</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41588-018-0041-z</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Jia</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Ling</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2014</year>). <article-title>PanGP: A tool for quickly analyzing bacterial pan-genome profile</article-title>. <source>Bioinformatics</source> <volume>30</volume>, <fpage>1297</fpage>&#x2013;<lpage>1299</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/bioinformatics/btu017</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Xiao</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>PGAP: Pan-genomes analysis pipeline</article-title>. <source>Bioinformatics</source> <volume>28</volume>, <fpage>416</fpage>&#x2013;<lpage>418</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/bioinformatics/btr655</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Silverstein</surname> <given-names>K. A. T.</given-names>
</name>
<name>
<surname>Ramaraj</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Guhlin</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Denny</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2017</year>). <article-title>Exploring structural variation and gene family architecture with <italic>De Novo</italic> assemblies of 15 Medicago genomes</article-title>. <source>BMC Genomics</source> <volume>18</volume>, <fpage>1</fpage>&#x2013;<lpage>14</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s12864-017-3654-1</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>