<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Archiving and Interchange DTD v2.3 20070202//EN" "archivearticle.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="methods-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Mar. Sci.</journal-id>
<journal-title>Frontiers in Marine Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Mar. Sci.</abbrev-journal-title>
<issn pub-type="epub">2296-7745</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmars.2023.1087447</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Marine Science</subject>
<subj-group>
<subject>Methods</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Symbiont-screener: A reference-free tool to separate host sequences from symbionts for error-prone long reads</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Xu</surname>
<given-names>Mengyang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2043512"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Guo</surname>
<given-names>Lidong</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2080576"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Qi</surname>
<given-names>Yanwei</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/312147"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Shi</surname>
<given-names>Chengcheng</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/856373"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Xiaochuan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2180303/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Chen</surname>
<given-names>Jianwei</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/284009"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Han</surname>
<given-names>Jinglin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Deng</surname>
<given-names>Li</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Liu</surname>
<given-names>Xin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/761542"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Fan</surname>
<given-names>Guangyi</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/656418"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>BGI-Qingdao, BGI-Shenzhen</institution>, <addr-line>Qingdao</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>BGI-Shenzhen</institution>, <addr-line>Shenzhen</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>College of Life Sciences, University of Chinese Academy of Sciences</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>State Key Laboratory of Agricultural Genomics, BGI-Shenzhen</institution>, <addr-line>Shenzhen</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Neil Ross McEwan, Robert Gordon University, United Kingdom</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Matthew Hegarty, Aberystwyth University, United Kingdom; Shuyuan Wang, Harbin Medical University, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Guangyi Fan, <email xlink:href="mailto:fanguangyi@genomics.cn">fanguangyi@genomics.cn</email>;  Xin Liu, <email xlink:href="mailto:liuxin@genomics.cn">liuxin@genomics.cn</email>
</p>
</fn>
<fn fn-type="equal" id="fn003">
<p>&#x2020;These authors have contributed equally to this work</p>
</fn>
<fn fn-type="other" id="fn002">
<p>This article was submitted to Microbial Symbioses, a section of the journal Frontiers in Marine Science</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>30</day>
<month>01</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>10</volume>
<elocation-id>1087447</elocation-id>
<history>
<date date-type="received">
<day>02</day>
<month>11</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>16</day>
<month>01</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Xu, Guo, Qi, Shi, Liu, Chen, Han, Deng, Liu and Fan</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Xu, Guo, Qi, Shi, Liu, Chen, Han, Deng, Liu and Fan</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Metagenomic sequencing facilitates large-scale constitutional analysis and functional characterization of complex microbial communities without cultivation. Recent advances in long-read sequencing techniques utilize long-range information to simplify repeat-aware metagenomic assembly puzzles and complex genome binning tasks. However, it remains methodologically challenging to remove host-derived DNA sequences from the microbial community at the read resolution due to high sequencing error rates and the absence of reference genomes. We here present Symbiont-Screener (<uri xlink:href="https://github.com/BGI-Qingdao/Symbiont-Screener">https://github.com/BGI-Qingdao/Symbiont-Screener</uri>), a reference-free approach to identifying high-confidence host&#x2019;s long reads from symbionts and contaminants and overcoming the low sequencing accuracy according to a trio-based screening model. The remaining host&#x2019;s sequences are then automatically grouped by unsupervised clustering. When applied to both simulated and real long-read datasets, it maintains higher precision and recall rates of identifying the host&#x2019;s raw reads compared to other tools and hence promises the high-quality reconstruction of the host genome and associated metagenomes. Furthermore, we leveraged both PacBio HiFi and nanopore long reads to separate the host&#x2019;s sequences on a real host-microbe system, an algal-bacterial sample, and retrieved an obvious improvement of host assembly in terms of assembly contiguity, completeness, and purity. More importantly, the residual symbiotic microbiomes illustrate improved genomic profiling and assemblies after the screening, which elucidates a solid basis of data for downstream bioinformatic analyses, thus providing a novel perspective on symbiotic research.</p>
</abstract>
<kwd-group>
<kwd>symbiosis</kwd>
<kwd>decontamination</kwd>
<kwd>metagenomic sequencing</kwd>
<kwd>long reads</kwd>
<kwd>bioinformatics</kwd>
<kwd>alignment-free</kwd>
<kwd>reference-free</kwd>
<kwd>
<italic>de novo</italic> assembly</kwd>
</kwd-group>
<contract-sponsor id="cn001">National Natural Science Foundation of China<named-content content-type="fundref-id">10.13039/501100001809</named-content>
</contract-sponsor>
<contract-sponsor id="cn002">National Key Research and Development Program of China<named-content content-type="fundref-id">10.13039/501100012166</named-content>
</contract-sponsor>
<counts>
<fig-count count="3"/>
<table-count count="0"/>
<equation-count count="2"/>
<ref-count count="54"/>
<page-count count="10"/>
<word-count count="5610"/>
</counts>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Powered by advanced biotechnologies and big data analytics, genome sequences become the significant biological basis and genomic resources of modern life science. Simultaneous genome sequencing of the host-symbiont ecosystem reveals the bioinformatic information of both the host species and associated microbial communities, thus prompting the symbiotic studies to move from gene-centric to genome-centric fields (<xref ref-type="bibr" rid="B51">Xie et&#xa0;al., 2016</xref>; <xref ref-type="bibr" rid="B52">Xie et&#xa0;al., 2020</xref>).</p>
<p>However, valuable insights into the construction and dynamics of the symbiotic ecosystem require successful separation of the host, symbiont, and contaminant data (<xref ref-type="bibr" rid="B13">Cornet and Baurain, 2022</xref>). Complicated and laborious experimental approaches have been developed to isolate host sequences from prokaryotic contamination (<xref ref-type="bibr" rid="B3">Arimoto et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B9">Cheng et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B46">Wang et&#xa0;al., 2020</xref>). Current bioinformatic methods rely on the species differentiation in statistical features (<xref ref-type="bibr" rid="B50">Woyke et&#xa0;al., 2006</xref>; <xref ref-type="bibr" rid="B2">Alneberg et&#xa0;al., 2014</xref>), or nucleotide and protein similarity of pre-assemblies to known genomes or public databases (<xref ref-type="bibr" rid="B12">Coghlan et&#xa0;al., 2019</xref>). Unfortunately, most methods designed for next-generation sequencing (NGS) short reads provide incomplete or inaccurate information and are facing challenges such as strong dependence on public data libraries or pre-assembly quality. It has been demonstrated that public genomic data may contain foreign sequences, leading to erroneous genetic characteristics (<xref ref-type="bibr" rid="B32">Neimark, 2015</xref>; <xref ref-type="bibr" rid="B43">Steinegger and Salzberg, 2020</xref>; <xref ref-type="bibr" rid="B15">Douvlataniotis et&#xa0;al., 2020</xref>). Moreover, those sequences that cannot be aligned to the reference genomes usually provide more critical findings, for instance, the identification of novel COVID-19 variants (<xref ref-type="bibr" rid="B39">Ricker et&#xa0;al., 2012</xref>; <xref ref-type="bibr" rid="B21">Kim et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B8">Cheng et&#xa0;al., 2022</xref>).</p>
<p>Short-read assemblies cannot resolve the highly nonuniform coverage of the composing species and the presence of long intra-genomic and inter-genomic repeats (<xref ref-type="bibr" rid="B23">Kolmogorov et&#xa0;al., 2020</xref>), resulting in uncompleted draft genomes with gaps (<xref ref-type="bibr" rid="B17">Fraser et&#xa0;al., 2002</xref>; <xref ref-type="bibr" rid="B54">Xu et&#xa0;al., 2020</xref>). Third-generation sequencing (TGS) long reads, including Pacific Biosciences (PacBio) and Oxford Nanopore Technologies (ONT), provide unique long-range information, which straightforwardly simplifies complicated biomedical problems (<xref ref-type="bibr" rid="B31">Nagarajan and Pop, 2013</xref>; <xref ref-type="bibr" rid="B38">Rhoads and Au, 2015</xref>; <xref ref-type="bibr" rid="B36">Qi et&#xa0;al., 2022</xref>). The reconstruction of the complete genome sequence of hosts and microbes enables the analysis of the symbiotic relationship and microbial diversity, including the detection of horizontal gene transfer of mobile elements, large-scale structural rearrangements, and search for biosynthetic gene clusters (<xref ref-type="bibr" rid="B11">Chin et&#xa0;al., 2013</xref>; <xref ref-type="bibr" rid="B4">Bertrand et&#xa0;al., 2019</xref>). Long-read metagenomic classifiers such as Centrifuge (<xref ref-type="bibr" rid="B22">Kim et&#xa0;al., 2016</xref>), Kraken2 (<xref ref-type="bibr" rid="B49">Wood et&#xa0;al., 2019</xref>), and MetaMaps (<xref ref-type="bibr" rid="B14">Dilthey et&#xa0;al., 2019</xref>) can build indexed databases according to acknowledged reference genomes and NCBI taxonomy, and assign corresponding long reads to the host. 
Meanwhile, MetaProb (<xref ref-type="bibr" rid="B18">Girotto et&#xa0;al., 2016</xref>), BusyBee (<xref ref-type="bibr" rid="B27">Laczny et&#xa0;al., 2017</xref>) and MetaBCC-LR (<xref ref-type="bibr" rid="B47">Wickramarachchi et&#xa0;al., 2020</xref>) do not require references to classify long reads based on the unsupervised clustering results of <italic>k</italic>-mer coverage or oligonucleotide composition, but cannot indicate which cluster belongs to the host. However, the relatively high sequencing error rates of TGS data might be greater than the genetic difference between organisms, resulting in a low capture ratio of the host&#x2019;s data and large computational consumption (<xref ref-type="bibr" rid="B5">Bharti and Grimm, 2019</xref>). It becomes almost impossible to classify highly similar sequences shared by both the host and symbionts, for instance, symbiotic algae in a floating island of seaweeds (<xref ref-type="bibr" rid="B44">Thiel and Gutow, 2005</xref>; <xref ref-type="bibr" rid="B40">Roth&#xe4;usler et&#xa0;al., 2012</xref>). Besides, it is even more complex for <italic>de novo</italic> projects without sufficient prior knowledge, that is, without reference genomes or libraries.</p>
<p>The combination of TGS&#x2019;s unprecedented read lengths and the trio&#x2019;s global inherited information has been demonstrated to improve the genome assembly and further reconstruct haplotypes (<xref ref-type="bibr" rid="B25">Koren et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B16">Ebert et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B53">Xu et&#xa0;al., 2021</xref>). This idea also motivates us to introduce a reference-free and alignment-free way to solve the screening problem prior to assembly. Laboratory cultivation of sexually reproducing diploid host with associated microorganisms enables us to gather the parent-offspring pedigree information without loss of symbiotic microbial information. In this work, we established a novel screening model of TGS raw reads according to the transmissibility of heterozygous variants in the trio of host species, the stability of symbiotic relations, and the randomness of contaminant sources. Based on this model, Symbiont-Screener selects high-confidence host&#x2019;s reads. Then it captures more host sequences by an unsupervised clustering algorithm. The final data, in which most of the foreign genomes have been screened out, can recover a high-quality host genome. On the other hand, the residual microbial long reads can enhance the variation profiling, metagenomic assemblies, and taxonomic binning.</p>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Trio-based screening model</title>
<p>The design of Symbiont-Screener focuses on the stepwise purification of the host&#x2019;s sequences with sufficient precision and recall rates to reconstruct the genome by combining the advantage of long read lengths with trio-binning markers and minimizing the effect of high error rates (<xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>). The mixed sample possibly comprises steady symbionts and random DNA contaminants induced by laboratory pollution or artificial experimental errors other than the host genome. Among them, symbionts sharing highly similar sequences with the host are the most difficult to isolate. According to the species sources and the relation of parent-offspring trios, the possible foreign genomes in the offspring&#x2019;s data can be categorized into four types: the offspring-only (OC), perhaps random contaminants; shared by father and offspring (POC); shared by mother and offspring (MOC); and shared by all three (SC), perhaps steady symbionts. Theoretically, the trio-specific markers inherited from contaminated parents allow the identification and reconstruction of the host chromosomes, and meanwhile, SC can be filtered out since their markers will no longer occur in the parent-specific marker libraries after set operations. The set difference, however, cannot remove parent-specific foreign genomes, POC or MOC. On the other hand, the intersection of marker libraries for all individuals ascertains the host autosomes along with SC. Nevertheless, the foreign species shared only by one parent (POC or MOC) will be discarded. Finally, none of the markers will be enriched in the sequences of OC. Parental samples of the trio are required to be collected and sequenced to provide paternal- and maternal-specific markers. In principle, the set operations of characteristic marker libraries can eliminate most of the foreign species if the host heterozygosity, read length, and read single-base accuracy meet certain thresholds.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Workflow of Symbiont-Screener. The input sequencing long reads of the host-microbe mixture can be categorized based on whether they contain characteristic markers or not. The tool starts by calculating and matching the parent-specific and shared <italic>strobemers</italic>. The second step is to classify long reads according to the species differentiation of characteristic marker densities, and detect the high-confidence host&#x2019;s data, which is sufficient for the relatively more accurate PacBio reads (single-base accuracy &#x2265;95%). Then, for ONT long reads with higher error rates (single-base accuracy &lt;95%), all the long reads including those without any matched markers are clustered by the features of characteristic <italic>strobemer</italic> density, GC content, and trinucleotide composition. Next, the clusters which belong to the host are detected by high-confidence long reads preselected in Step 2, and other remaining clusters are labeled as metagenomic long reads. The final output includes one high-quality haplotype-collapsed assembly or two haplotype-resolved assemblies for the diploid host as well as complete metagenome-assembled assemblies for the host-associated microbiomes.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-10-1087447-g001.tif"/>
</fig>
<p>The read length of current PacBio or ONT data meets the requirement of this model, although the high sequencing error rate remains the greatest obstacle. Theoretically, the expected number of characteristic <italic>k</italic>-mers (discriminability) in a single read is proportional to the read length, heterozygosity ratio, and <italic>k</italic>th power of single-base sequencing accuracy. In addition to the use of error-tolerant <italic>strobemers</italic> (<xref ref-type="bibr" rid="B41">Sahlin, 2021</xref>), we identify raw long-read data based on their genomic features, which are further clustered with the unsupervised Bayesian Gaussian Mixture model (BGMM) method after principal component analysis (PCA) dimensionality reduction. In practice, the parent-specific and shared <italic>strobemers</italic>, and the species differentiation in GC content and oligonucleotide frequencies constitute the main features in the algorithm. The procedure of genome binning is equivalent to the read clustering based on the Mahalanobis distances to centers in the 36-dimensional feature space. Moreover, the characteristic markers can be used to identify which read cluster belongs to the host.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Generation of characteristic markers</title>
<p>According to the above model, characteristic markers existing in the paternal group but not in the maternal group are defined as paternal-only markers, while those only existing in the maternal group are defined as maternal-only. Meanwhile, markers shared by both parents are defined as shared. We removed markers in low- and high-frequency regions and then ran set operations to calculate characteristic markers (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Figure&#xa0;1</bold>
</xref>). <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Figure&#xa0;2</bold>
</xref> shows the marker category of reference assemblies for the host, symbionts, and contaminants, which reflects the feasibility of this screening model in identifying different types of foreign species.</p>
<p>Plots of normalized densities of parent-specific and shared markers demonstrate that only host long reads synchronously own abundant parent-specific and shared markers (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Figure&#xa0;3</bold>
</xref>). Thus, the high-confidence host reads can be determined by the following formula</p>
<disp-formula>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&gt;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#x2016;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&gt;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mtext>&amp;&amp;</mml:mtext>
<mml:mo>&#xa0;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&gt;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>x<sub>1,i</sub>
</italic> and <italic>x<sub>2,i</sub>
</italic> refer to the parent-only marker densities in the <italic>i</italic>th long read, <italic>x<sub>3,i</sub>
</italic> refers to the shared marker density, while <italic>C<sub>1</sub>
</italic> and <italic>C<sub>2</sub>
</italic> refer to the thresholds for the parent-only and shared marker densities, respectively.</p>
<statement id="algo1">
<label>Algorithm 1. ReadSourceType</label>
<p>
<preformat>&#xD;<bold>Input</bold> Long read <bold>LR</bold>, marker set <bold>POK, MOK</bold>, and <bold>SK &#xD;&#xD; Output</bold> the source type of read <bold>LR</bold>&#xD;&#xD;1 flag_pok = <bold>
<italic>KmerLookup</italic> ( LR , POK )</bold>&#xD;&#xD;2 flag_mok = <bold>
<italic>KmerLookup</italic> ( LR , MOK )</bold>&#xD;3 flag_sk = <bold>
<italic>KmerLookup</italic> ( LR , SK )</bold>&#xD;4 <italic>
<bold>if</bold>
</italic> ( ( flag_pok || flag_mok ) &amp;&amp; flag_sk )&#xD;&#xD;5 <bold>
<italic>return</italic>
</bold> Host&#xD;&#xD;6 <bold>
<italic>else if</italic>
</bold> ( flag_pok &amp;&amp; ! ( flag_mok &amp;&amp; flag_sk ) )&#xD;&#xD;7 <bold>
<italic>return</italic>
</bold> POC&#xD;&#xD;8 <bold>
<italic>else if</italic>
</bold> ( flag_mok &amp;&amp; ! ( flag_pok &amp;&amp; flag_sk ) )&#xD;&#xD;9 <bold>
<italic>return</italic>
</bold> MOC&#xD;&#xD;10 <bold>
<italic>else if</italic>
</bold> ( ! ( flag_pok || flag_mok || flag_sk ) )&#xD;&#xD;11 <bold>
<italic>return OC</italic>
</bold>&#xD;&#xD;12 <bold>
<italic>else if</italic>
</bold> ( ! ( flag_pok || flag_mok ) &amp;&amp; flag_sk )&#xD;&#xD;13 <bold>
<italic>return SC</italic>
</bold>&#xD;&#xD;14 <bold>
<italic>end</italic>
</bold>&#xD;</preformat>
</p>
</statement>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>strobemer vs. k-mer</title>
<p>Characteristic markers can be <italic>k</italic>-mers as used in conventional trio-binning and genome-binning approaches (<xref ref-type="bibr" rid="B25">Koren et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B16">Ebert et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B53">Xu et&#xa0;al., 2021</xref>), or error-tolerant <italic>strobemers</italic> (<xref ref-type="bibr" rid="B41">Sahlin, 2021</xref>). The utilization of <italic>k</italic>-mers can statistically capture parent-specific markers in raw data as long as the reads are sufficiently long and the host homologous chromosomes have enough heterozygous sites (<xref ref-type="bibr" rid="B25">Koren et&#xa0;al., 2018</xref>). However, the captured ratio is still severely limited by the sequencing errors for such a complex application of screening. Therefore, we chose the <italic>strobemer</italic> implementation for error-tolerant indexing and matching, which provides more evenly distributed matches with higher genome coverage and is less sensitive to sequencing errors, especially insertions and deletions (<xref ref-type="bibr" rid="B41">Sahlin, 2021</xref>). In <italic>k</italic>-mer mode, we applied meryl (<xref ref-type="bibr" rid="B37">Rhie et&#xa0;al., 2020</xref>) to generate and count 21-mers. In <italic>strobemer</italic> mode, we first used Jellyfish (<xref ref-type="bibr" rid="B29">Marcais and Kingsford, 2011</xref>) to build large canonical <italic>k</italic>-mers (<italic>k</italic>=40) of two contaminated parental datasets, and then transformed them to <italic>strobemers</italic> [<italic>randstrobes</italic> (20,10,10,30)] by custom C++ scripts.</p>
<p>To benchmark the effect of <italic>k</italic>-mers and <italic>strobemers</italic>, we generated random sequences with a fixed length of 100 kbp and varied the mutation (error) rates. The mutation spots were randomly selected, in which the error probabilities of substitutions, insertions, and deletions were equal. Benchmarking of matched markers under different sampling protocols with mutation rates of 1%, 5%, and 10% was listed in <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;1</bold>
</xref>. Each test was independently run 100 times. Either <italic>minstrobes</italic> or <italic>randstrobes</italic> (two types of <italic>strobemers</italic>) match more mutation spots than <italic>k</italic>-mers, especially for higher error rates. This benchmark is available at <uri xlink:href="https://github.com/BGI-Qingdao/strobemer_cpptest">https://github.com/BGI-Qingdao/strobemer_cpptest</uri>. <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Figure&#xa0;4</bold>
</xref> shows the precision-recall curve with trio-binning <italic>strobemers</italic> and <italic>k</italic>-mers for three simulated datasets. The implementation of <italic>strobemers</italic> obtains relatively better performance on average.</p>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Unsupervised clustering of remaining reads</title>
<p>The following procedure of screening relies on the raw read clustering analogous to the genomic binning. The high-dimensional feature space for the unsupervised clustering of long reads consists of characteristic marker densities as Feature 1-3, the GC content and canonical 3-mer frequencies as Feature 4-36 (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;2</bold>
</xref>). The preprocessing of PCA decomposes those features into new <italic>K</italic> independent variables of a 36-dimensional matrix. We assume that the <italic>K</italic> principal components belong to the same parametric family of Gaussian distribution but with various parameters (mean, variance). The preprocessing of whitening has been employed to reduce information redundancy. The BGMM algorithm is selected according to the applicable geometry and running speed from the comparison of clustering algorithms in scikit-learn (<xref ref-type="bibr" rid="B35">Pedregosa et&#xa0;al., 2011</xref>). Briefly, the Bayesian framework infers the posterior distribution of the parameters <inline-formula>
<mml:math display="inline" id="im1">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3d5;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">&#x2dc;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">&#x2dc;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">&#x2dc;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>, and the expectation&#x2013;maximization algorithm is utilized to update these parameters.</p>
<disp-formula>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>q</mml:mi> <mml:mo>|</mml:mo>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>K</mml:mi>
</mml:munderover>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3d5;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">&#x2dc;</mml:mo>
</mml:mover>
<mml:mi>&#x25aa;</mml:mi>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">&#x2dc;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">&#x2dc;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im2">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3d5;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">&#x2dc;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">&#x2dc;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">&#x2dc;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> are the weight, mean and variance of the <italic>i</italic>th component. The application programming interface of scikit-learn (<xref ref-type="bibr" rid="B35">Pedregosa et&#xa0;al., 2011</xref>) was used to implement these steps. This unsupervised machine learning algorithm requires no training; therefore, no reference genomes or high-quality public databases are needed.</p>
<p>We illustrated the differences in screening results before and after clustering in the <italic>k</italic>-mer mode in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2A</bold>
</xref> and <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Figure&#xa0;5</bold>
</xref>. Overall, the clustering increased classification F1-scores by improving the recall rates but sacrificing fractional precision. Note that we only used long reads longer than 5,000 bp for clustering since the inaccurate sequence statistics of shorter reads might mislead the results. The error-tolerant <italic>strobemer</italic> mode usually did not need further clustering, as the recall rate was already satisfactory. We did not cluster the <italic>k</italic>-mer-based result of the real ONT dataset, because the performance was adequate and extra clustering could not improve it further.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Performance of screening over different benchmarking datasets. <bold>(A)</bold> Visualization of identified host reads by trio-binning markers and unsupervised clustering after dimension reduction. We arbitrarily selected 20,000 long reads to avoid image blurring. Each point refers to one long read. For each dataset, the top panel is colored by screening results <italic>via</italic> Symbiont-Screener, while the bottom panel is colored by the original species. Note that the simulated PacBio HiFi and CLR reads with higher single-base accuracy are classified by trio-binning markers only. The other three datasets are further identified by clustering. <bold>(B)</bold> Comparisons of screening precision and recall rates over five simulated and real PacBio and ONT long-read datasets. Note that for the <italic>de novo</italic> tool, MetaBCC-LR, we benchmarked the result based on the extracted clusters with more than 5,000 reads and precision &gt;0.5 as the host clusters cannot be identified without the reference genome. <bold>(C)</bold> Distinct <italic>k</italic>-mer completeness for each species in long-read data (top) and final assemblies (bottom) after screening. The low <italic>k</italic>-mer completeness of Contaminant 1 and 2 before filtering is because of the 1&#xd7; input. On average, distinct <italic>k</italic>-mers for other foreign species including Symbiont3 sharing highly similar sequences with the host are reduced from 99.6% to 0.5% in raw long-read data and from 99.4% to 0.2% in assemblies, while &gt;98.2% of host&#x2019;s <italic>k</italic>-mers are retained in both raw data and following assemblies.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-10-1087447-g002.tif"/>
</fig>
<p>Besides, the randomness of BGMM clustering may affect the final classification of host reads. Thus, we independently ran the clustering procedure multiple times (default 10) and derived consensus results. The best host read cluster was annotated according to whether this cluster contained the most preselected high-confidence host reads with the smallest variance. The final host group was automatically produced by merging the raw reads that repeatedly occurred in the best and second-best clusters. <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Figure&#xa0;6</bold>
</xref> illustrates the frequencies of occurrence of different species in the best and second-best clusters for 10 runs of BGMM clustering. In the dumbbell-shaped overall profile, the foreign genomic data aggregate in the low-frequency region while the host&#x2019;s stay in the high-frequency region. Users can determine the frequency threshold based on the position of the second peak in the high-frequency region to extract host data for <italic>de novo</italic> projects.</p>
</sec>
<sec id="s2_5">
<label>2.5</label>
<title>Reconstruction of the host and metagenomic genomes</title>
<p>TGS assemblers, for instance, Canu, Flye, and metaFlye (<xref ref-type="bibr" rid="B26">Koren et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B24">Kolmogorov et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B23">Kolmogorov et&#xa0;al., 2020</xref>) can reconstruct the host chromosomes and symbiotic microbial genomes. We employed default parameters to assemble PacBio HiFi, PacBio CLR, and ONT data types. Note that although the trio-binning markers can partition paternal and maternal reads, we did not show the result of haplotype-resolved assemblies because the host sequencing coverage depth in our datasets is insufficient.</p>
</sec>
<sec id="s2_6">
<label>2.6</label>
<title>Benchmarking datasets</title>
<p>We used the following five long-read datasets for evaluation.</p>
<p>
<italic>Dataset1</italic>: Simulated PacBio HiFi dataset: human chromosome 19 of HG002 as the host, two bacteria from the Unified Human Gastrointestinal Genome (UHGG) collection (<xref ref-type="bibr" rid="B1">Almeida et&#xa0;al., 2021</xref>) as inter-phyla symbionts, additional eight UHGG bacteria as random contaminants, and mouse chromosome 7 as a symbiont sharing similar sequences with the host genome. PacBio HiFi long reads were simulated by PBSIM2 (<xref ref-type="bibr" rid="B34">Ono et&#xa0;al., 2021</xref>) based on the reference genomes with an average read length of 10 kbp and an average error rate of 1% (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;3</bold>
</xref>).</p>
<p>
<italic>Dataset2</italic>: Simulated PacBio CLR dataset: same composition as <italic>Dataset1</italic>. The average read length is 10 kbp and the average error rate is 5%.</p>
<p>
<italic>Dataset3</italic>: Simulated ONT dataset: same composition as <italic>Dataset1</italic>. The average read length is 30 kbp and the average error rate is 15%.</p>
<p>
<italic>Dataset4</italic>: Real PacBio RSII CLR dataset: human chromosome 19 of HG002 as the host, two bacteria of mock microbial datasets from the ZymoBIOMICS Microbial Community Standards as inter-phyla symbionts, and the other eight bacteria and yeasts as random contaminants. In this case, we used chimpanzee chromosome 21 as a challenging symbiont with highly similar sequences (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;4</bold>
</xref>). The human HG002 raw reads were downloaded from NCBI GIAB, and those mapped to chromosome 19 of the HG002 reference assembly (GCA_011064465.1) were extracted (<xref ref-type="bibr" rid="B42">Shumate et&#xa0;al., 2020</xref>). The chimpanzee raw reads were downloaded from (<xref ref-type="bibr" rid="B28">Logsdon et&#xa0;al., 2021</xref>), and those mapped to the chromosome 21 reference (GCA_000001515.5) were extracted. The mock raw reads were downloaded from (<xref ref-type="bibr" rid="B30">McIntyre et&#xa0;al., 2019</xref>).</p>
<p>
<italic>Dataset5</italic>: Real ONT PromethION dataset: same composition as <italic>Dataset4</italic>. The human and chimpanzee raw reads were downloaded and extracted in the same way. The mock data were downloaded from (<xref ref-type="bibr" rid="B33">Nicholls et&#xa0;al., 2019</xref>).</p>
<p>We used default error profiles (substitution: insertion: deletion=6: 50: 54 for PacBio and 23:31:46 for ONT) provided by PBSIM2 for all simulations. 50&#xd7; host (human chromosome 19) data were simulated using the HG002 reference (GCA_011064465.1), while 50&#xd7; mouse chromosome 7 data were simulated based on the GRCm39 reference (GCA_000001635.9). Other bacteria were simulated based on UHGG references with various coverages. For the real PacBio and ONT data, we also gained 50&#xd7; human and 50&#xd7; chimpanzee raw reads, and maintained the same proportions of 10 bacteria and yeasts in the mock sequencing data.</p>
<p>All the long reads were annotated by their species names before binning to benchmark the precision and recall rates of screening. The recall rate is defined as the ratio of the number of correctly identified host reads to the total number of host reads in the mixed input. The precision rate is defined as the ratio of the number of correctly identified host reads to the total number of final extracted reads.</p>
</sec>
<sec id="s2_7">
<label>2.7</label>
<title>Evaluation methods</title>
<p>To assess the effect of screening on the host genome assembly for three simulated datasets, we generated species-specific non-repeating canonical 21-mers (distinct <italic>k</italic>-mers) according to the reference genomes. We calculated the completeness and contamination rates in the mixed and purified long-read raw data and assemblies, respectively (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;5</bold>
</xref>). The completeness is defined as the ratio of distinct <italic>k</italic>-mer numbers belonging to each species in reads or assemblies to the total number in the reference. The contamination rate is defined as the ratio of the distinct <italic>k</italic>-mer number belonging to other species to the total distinct <italic>k</italic>-mer number of the host reference assembly.</p>
<p>The host reference assembly was used to validate the assemblies before and after screening. The assembly statistics were reported by QUAST (<xref ref-type="bibr" rid="B20">Gurevich et&#xa0;al., 2013</xref>) with default parameters except -m 1000.</p>
<p>The metagenomic analysis of host-associated microbial community is supposed to be improved after screening if we ignore the random contamination. However, the evaluation of this effect does not apply to our benchmarking datasets.</p>
</sec>
<sec id="s2_8">
<label>2.8</label>
<title>Parameters of other tools</title>
<p>We used recommended or default parameters for MetaProb, BusyBee, MetaBCC-LR, Centrifuge, Kraken2, and MetaMaps. However, MetaProb and BusyBee cannot support the large data size. We performed parameter sweeps for MetaBCC-LR to obtain its best performance (--sample-count 1%, --bin-size 10). Note that the host clusters generated by MetaBCC-LR cannot be detected as the host without references. In addition, the host data are usually split into several clusters. Thus, we extracted all clusters with more than 5,000 reads and precision &gt;0.5 for evaluation. Centrifuge, Kraken2, and MetaMaps are reference-based classifiers. We first built indexed databases according to host and foreign reference assemblies, as well as NCBI taxonomy. Next, we classified datasets and extracted all reads assigned to the host species for benchmarking.</p>
</sec>
</sec>
<sec id="s3" sec-type="results">
<label>3</label>
<title>Results</title>
<sec id="s3_1">
<label>3.1</label>
<title>Screening of simulated and real datasets</title>
<p>We applied Symbiont-Screener to three simulated and two real long-read datasets with varying read lengths and error rates, covering PacBio HiFi, PacBio CLR, and ONT types. Each dataset consists of a host species, inter-phyla symbionts, a symbiont with similar sequences, and several random microbial contaminants.</p>
<p>
<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2A</bold>
</xref> shows the host&#x2019;s data identified by characteristic markers and further clustered by other genomic features. Compared with the corresponding ground truth, the parent-specific and shared <italic>strobemers</italic> precisely detect host long reads for three simulated datasets. Limited by the sequencing errors, the simulated ONT dataset requires species differentiation in GC content and trinucleotide frequencies to obtain more host data as supplementary features. Long reads sufficiently close to the preselected reads in the space after dimension reduction are also marked as host&#x2019;s and extracted for the following assembly. Real long-read datasets for symbiotic samples are rare. Thus, we chose chimpanzee to imitate an indistinguishable symbiont, sharing approximately 98% of the genome with the host, human, which is a conundrum of alignment-based screening. Although a few foreign reads are misidentified by trio-binning markers in the real PacBio dataset, they are further corrected by genomic signatures. The relatively higher error rate accompanied by the high sequence similarity leads to the lower accuracy of screening for real ONT dataset. Besides, we evaluated the contribution of each feature to the final clustering results (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Figure&#xa0;7</bold>
</xref>). For three simulated datasets, the relative importance score for characteristic <italic>strobemers</italic> is 24.4% on average, among which the parent-specific markers are more important. None of their contributions are negligible in the clustering, which proves those features are highly complementary.</p>
<p>We have also tested several state-of-the-art reference-free or reference-based tools for screening using the same datasets. <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2B</bold>
</xref> shows the classification precision and recall of the <italic>strobemer</italic> mode, <italic>k</italic>-mer mode, MetaBCC-LR, Centrifuge, Kraken2, and MetaMaps. Hampered by the computing performance, neither MetaProb nor BusyBee can support clustering the entire data. Overall, Symbiont-Screener outperforms MetaBCC-LR for all five datasets. The implementation of trio-binning <italic>strobemers</italic> and clustering allows Symbiont-Screener to surmount the obstacle of high error rates, thus extracting more host long reads. By contrast, Centrifuge, Kraken2, and MetaMaps benefit from accurate and complete reference genomes as well as taxonomic relationships, and obtain relatively higher precision and recall rates, especially for the simulated datasets. Nevertheless, for more complex relations in the real PacBio dataset where the similar symbiont, chimpanzee shares approximately 98% of the genome with the host, the highest F1-scores are found in both modes of Symbiont-Screener. It indicates that the trio-binning information is a qualified substitute for reference genomes or public databases if they are not available.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Effect on host assembly</title>
<p>The effect of screening on the final assembly was first assessed by the species-specific genomic non-repeating canonical 21-mers. The distinct <italic>k</italic>-mer completeness ratio indicated the ability to reconstruct the whole host genome. Meanwhile, the distinct <italic>k</italic>-mer contamination ratio represented the assembly accuracy.</p>
<p>For three simulated datasets, the filtered long reads obtained up to 99.4% of the host&#x2019;s 42,988,682 distinct <italic>k</italic>-mers regardless of repeats or non-ACGT bases on average (<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2C</bold>
</xref>), thus ensuring the high quality of genome assembly (~99.2% completeness). On the contrary, <italic>k</italic>-mer completeness was significantly reduced to 0.3% in raw reads and 0.1% in final assembly for all ten bacteria after screening. For the challenging symbiont with highly similar sequences, the most difficult component to be cleaned, only 2.4% of <italic>k</italic>-mers were retained in the results. They could not support the foreign genome reconstruction in the following host assembly.</p>
<p>The QUAST-based evaluations also reflected the advantage after a nearly perfect screening. <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;7</bold>
</xref> showed the Canu assembly statistics for three simulated and two real datasets. The assembled total length and unaligned length were significantly reduced as the foreign genomes were removed after the screening, while the genome fraction and misassemblies remained almost the same. The comparison was nearly consistent with that of Flye assemblies as shown in <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;8</bold>
</xref>.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Application to a red seaweed with symbionts</title>
<p>We applied Symbiont-Screener to an economically important red seaweed, <italic>Neoporphyra haitanensis</italic>, to demonstrate the success of screening results in the natural world. Previous studies have shown a complex relationship between the host algae and the associated metagenomes, involving the microbial components, functional microbial lineages, and the exchange of diverse chemical currencies, which mainly rely on the sequence alignments of short reads or genome assemblies (<xref ref-type="bibr" rid="B6">Brawley et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B45">Wang et&#xa0;al., 2022</xref>). Here, we sequenced 51.6 Gb ONT long reads (read N50 = 25.2 kb) and 23.2 Gb PacBio HiFi reads (read N50 = 18.3 kb) and separated the host seaweed sequences from the symbionts. Symbiont-Screener employed the characteristic <italic>strobemers</italic> to automatically identify host raw reads without reference genomes. The 108,837,094 characteristic <italic>strobemers</italic> were generated by the trio pedigree relations of the lab-cultured parents and offspring. The clustering procedure further gathered host long reads to overcome the limit of high sequencing error rate. Finally, all identified host raw reads were assembled by Flye, while the remaining associated bacteria were assembled by metaFlye. We regarded all foreign genomes as symbiotic bacteria. We also applied metaFlye to the assembly of the mixed data for comparison.</p>
<p>The total length of the reconstructed seaweed genome was 45,172,822 bp, consisting of 59 contigs (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;9</bold>
</xref>). The assembly contiguity reached a chromosomal level with a contig N50 of 7,218,067 bp, compared to the previously published closely-related species (<xref ref-type="bibr" rid="B7">Chen et&#xa0;al., 2022</xref>). The 81.454% genome fraction against the 53.3 Mbp closely-related genome implied the assembly completeness. Only 2,084,431 bp were unaligned. The significant difference of the chromatin contacts from pair-wise Hi-C reads confirmed the thorough isolation of the host, which further constructed 5 complete chromosomes (<xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3A</bold>
</xref>). On the other hand, the metagenome-assembled genomes involved 608 contigs with a contig N50 of 4,376,601 bp. The total genome size was 260,496,743 bp, of which only 3.829% could be aligned to the closely-related reference genome. GC-depth plot is an alternative method to benchmark the screening result. Multiple peaks in the preliminary assembly of the whole mixed data indicated different species with various GC contents and covered read depths (<xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3B</bold>
</xref>). In contrast, the purified host assembly after screening presented a more concentrated peak with a more convergent distribution of GC ratio.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Application to an algal-bacterial ecosystem. <bold>(A)</bold> Hi-C interaction heatmaps of purified (top) and mixed (bottom) assemblies. <bold>(B)</bold> GC-Coverage plots for simulated PacBio HiFi assemblies of purified (top) and mixed (bottom) data. <bold>(C)</bold> Differences in the taxonomic profiling of associated microbial community structures at the phylum level based on the indexed RefSeq database between purified (top) and mixed (bottom) data. <bold>(D)</bold> Bandage visualization of assembly graphs with connections for the mixed and purified data. Note that the primary assembled sequences are annotated by Kraken2.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-10-1087447-g003.tif"/>
</fig>
<p>Additionally, the metagenomes were binned and taxonomically annotated by Kraken2 using the indexed NCBI RefSeq database k2_pluspfp_20200919, which illustrated that the sequences annotated as eukaryote were dramatically eliminated after screening (<xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3C</bold>
</xref>). The profiling results also disclosed a bias of different sequencing platforms, possibly due to more misalignments induced by the relatively higher error rates of ONT. Bandage (<xref ref-type="bibr" rid="B48">Wick et&#xa0;al., 2015</xref>) was used to visualize the <italic>de novo</italic> assembly graphs with sequence connections. In total, 14 complete, closed and circularized metagenomes were assembled (<xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3D</bold>
</xref>). Although the 322,277,090 bp-long mixed assembly reconstructed an additional circularized genome, it was annotated as eukaryote. We investigated the mixed assembly and found 84.4 Mbp contigs could be aligned to the closely-related reference of the host seaweed, implying the host assembly errors. The high-quality genomes of <italic>Neoporphyra haitanensis</italic> and associated bacteria might provide a comprehensive approach for elucidating genome coevolution and the influence of symbiotic metagenomes to the adaptation of <italic>Pyropia</italic> to intertidal zone habitats.</p>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Performance</title>
<p>We benchmarked the performance of Symbiont-Screener on a Linux system with Intel Core Processor (Broadwell, IBRS), 15 CPU cores and 30 threads. We individually recorded the CPU and memory usage for each assembly and calculated the percentage of saved consumption after screening. <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Figure&#xa0;8</bold>
</xref> shows the computational consumption for the three simulated PacBio HiFi, PacBio CLR and ONT datasets, indicating that screening saves considerable CPU and memory usage.</p>
</sec>
</sec>
<sec id="s4" sec-type="discussion">
<label>4</label>
<title>Discussion</title>
<p>We introduce a novel but accurate model for screening that classifies reliable host raw long reads from the mixed sample according to the trio-binning information, which is computationally efficient without the requirement of reference genomes or sequence alignments. Symbiont-Screener further utilizes other supplementary features to directly cluster error-prone long reads. The multi-dimensional clustering system is open-ended and accepts additional features such as remaining genomic markers after sterilization to avoid overreliance on the presence or accuracy of one specific feature. Moreover, the trio-binning markers support the haplotype-resolved partitioning and genome assembly of extracted host&#x2019;s long reads. We did not show the haplotype-resolved assemblies due to the insufficient sequencing coverage depth of host&#x2019;s data.</p>
<p>The application of this algorithm requires parental sequencing data with or without symbionts and contamination for sexually reproducing diploid or allotetraploid species. Therefore, the patient&#x2019;s parents need to provide their clean or contaminated DNA samples for the microbial pathogen identification in clinical applications. For samples of animals or plants collected from the wild, sexual reproduction in the field or laboratory culture is required to eliminate symbionts and random contaminants. If parental data are unobtainable, then reference assemblies of closely-related species if available, or parental lines for cross-bred crops can be used to mark long reads corresponding to the conserved genomic regions instead.</p>
</sec>
<sec id="s5" sec-type="data-availability">
<title>Data availability statement</title>
<p>The simulated PacBio HiFi, PacBio CLR and ONT datasets have been deposited in the CNGB Sequence Archive (CNSA, <uri xlink:href="https://db.cngb.org/cnsa">https://db.cngb.org/cnsa</uri>) under the accession number CNP0001829. We downloaded real PacBio CLR and ONT ultra-long data of HG002/NA24385 as host from GIAB (<uri xlink:href="ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/PacBio_MtSinai_NIST">ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/PacBio_MtSinai_NIST</uri> and <uri xlink:href="ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/Ultralong_OxfordNanopore">ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/Ultralong_OxfordNanopore</uri>). The real PacBio and ONT datasets for chimpanzee are available in the NCBI under the accession number PRJNA659034. PacBio data for the mock microbial community from ZymoBIOMICS Microbial Community Standards are extracted, which are publicly available from (<xref ref-type="bibr" rid="B30">McIntyre et al., 2019</xref>). The ONT data for the same mock standard are obtained from (<xref ref-type="bibr" rid="B33">Nicholls et al., 2019</xref>). The algal-bacterial data have been deposited in the CNSA under the accession number CNP0003571. But restrictions apply to the availability of these algal-bacterial data, which are not publicly available. Data are however available from the corresponding author upon request. The RefSeq-based database used for the Kraken2 analysis can be downloaded at <uri xlink:href="https://genome-idx.s3.amazonaws.com/kraken/k2_pluspfp_20200919.tar.gz">https://genome-idx.s3.amazonaws.com/kraken/k2_pluspfp_20200919.tar.gz</uri>. The source code used in this manuscript is available at <uri xlink:href="https://github.com/BGI-Qingdao/Symbiont-Screener">https://github.com/BGI-Qingdao/Symbiont-Screener</uri>.</p>
</sec>
<sec id="s6" sec-type="author-contributions">
<title>Author contributions</title>
<p>MX and LG performed software design and implementation. MX, LG, YQ, CS, XCL, JC, and JH contributed to data modeling, data curation, and benchmarking. MX wrote the draft manuscript, and LG, LD, and GF contributed to manuscript editing. XL and GF supervised the project. MX and GF secured funding. All authors read and approved the final manuscript. All authors contributed to the article and approved the submitted version.</p>
</sec>
</body>
<back>
<sec id="s7" sec-type="funding-information">
<title>Funding</title>
<p>This work was supported by the National Natural Science Foundation of China (Grant No. 32100514); and the National Key Research and Development Program of China (Grant No. 2018YFD0900301-05).</p>
</sec>
<ack>
<title>Acknowledgments</title>
<p>We thank Kristoffer Sahlin for help with implementation of <italic>strobemers</italic>. We thank Xiaobin Liu for fruitful discussions during the development and performance test. The data that support the findings of this study have been deposited into CNGB Nucleotide Sequence Archive (CNSA) (<xref ref-type="bibr" rid="B19">Guo et&#xa0;al., 2020</xref>) of China National GeneBank DataBase (CNGBdb) (<xref ref-type="bibr" rid="B10">Chen et&#xa0;al., 2020</xref>) with accession number CNP0001829.</p>
</ack>
<sec id="s8" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s9" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s10" sec-type="supplementary-material">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fmars.2023.1087447/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fmars.2023.1087447/full#supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet_1.docx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Almeida</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Nayfach</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Boland</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Strozzi</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Beracochea</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Shi</surname> <given-names>Z. J.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>A unified catalog of 204,938 reference genomes from the human gut microbiome</article-title>. <source>Nat. Biotechnol.</source> <volume>39</volume>, <fpage>105</fpage>&#x2013;<lpage>114</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41587-020-0603-3</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alneberg</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Bjarnason</surname> <given-names>B. S.</given-names>
</name>
<name>
<surname>De Bruijn</surname> <given-names>I.</given-names>
</name>
<name>
<surname>Schirmer</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Quick</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Ijaz</surname> <given-names>U. Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2014</year>). <article-title>Binning metagenomic contigs by coverage and composition</article-title>. <source>Nat. Methods</source> <volume>11</volume>, <fpage>1144</fpage>&#x2013;<lpage>1146</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nmeth.3103</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Arimoto</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Hikosaka-Katayama</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Hikosaka</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Tagawa</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Inoue</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Ueki</surname> <given-names>T.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>A draft nuclear-genome assembly of the acoel flatworm praesagittifera naikaiensis</article-title>. <source>Gigascience</source> <volume>8</volume>. doi: <pub-id pub-id-type="doi">10.1093/gigascience/giz023</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bertrand</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Shaw</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Kalathiyappan</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Ng</surname> <given-names>A. H. Q.</given-names>
</name>
<name>
<surname>Kumar</surname> <given-names>M. S.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>C.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>Hybrid metagenomic assembly enables high-resolution analysis of resistance determinants and mobile elements in human microbiomes</article-title>. <source>Nat. Biotechnol.</source> <volume>37</volume>, <fpage>937</fpage>&#x2013;<lpage>944</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41587-019-0191-2</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bharti</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Grimm</surname> <given-names>D. G.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Current challenges and best-practice protocols for microbiome analysis</article-title>. <source>Briefings Bioinf.</source> <volume>22</volume>, <fpage>178</fpage>&#x2013;<lpage>193</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bib/bbz155</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brawley</surname> <given-names>S. H.</given-names>
</name>
<name>
<surname>Blouin</surname> <given-names>N. A.</given-names>
</name>
<name>
<surname>Ficko-Blean</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Wheeler</surname> <given-names>G. L.</given-names>
</name>
<name>
<surname>Lohr</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Goodson</surname> <given-names>H. V.</given-names>
</name>
<etal/>
</person-group>. (<year>2017</year>). <article-title>Insights into the red algae and eukaryotic evolution from the genome of Porphyra umbilicalis (Bangiophyceae, Rhodophyta)</article-title>. <source>Proc. Natl. Acad. Sci. U.S.A.</source> <volume>114</volume>, <fpage>E6361</fpage>&#x2013;<lpage>E6370</lpage>. doi: <pub-id pub-id-type="doi">10.1073/pnas.1703088114</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Chu</surname> <given-names>J. S.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>R.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Insights into the ancient adaptation to intertidal environments by red algae based on a genomic and multiomics investigation of Neoporphyra haitanensis</article-title>. <source>Mol. Biol. Evol.</source> <volume>39</volume>. doi: <pub-id pub-id-type="doi">10.1093/molbev/msab315</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname> <given-names>X.-W.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>W.-J.</given-names>
</name>
<name>
<surname>Zong</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>X.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Identification of SARS-CoV-2 variants and their clinical significance in Hefei, China</article-title>. <source>Front. Med.</source> <volume>8</volume>. doi: <pub-id pub-id-type="doi">10.3389/fmed.2021.784632</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Xian</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Marin</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Keller</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>T.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>Genomes of subaerial Zygnematophyceae provide insights into land plant evolution</article-title>. <source>Cell</source> <volume>179</volume>, <fpage>1057</fpage>&#x2013;<lpage>1067.e14</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cell.2019.10.019</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>F. Z.</given-names>
</name>
<name>
<surname>You</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>L. N.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>X. Q.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>F.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>CNGBdb: China National GeneBank DataBase</article-title>. <source>Hereditas</source> <volume>42</volume>, <fpage>799</fpage>&#x2013;<lpage>809</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.16288/j.yczz.20-080</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chin</surname> <given-names>C. S.</given-names>
</name>
<name>
<surname>Alexander</surname> <given-names>D. H.</given-names>
</name>
<name>
<surname>Marks</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Klammer</surname> <given-names>A. A.</given-names>
</name>
<name>
<surname>Drake</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Heiner</surname> <given-names>C.</given-names>
</name>
<etal/>
</person-group>. (<year>2013</year>). <article-title>Nonhybrid, finished microbial genome assemblies from long-read SMRT sequencing data</article-title>. <source>Nat. Methods</source> <volume>10</volume>, <fpage>563</fpage>&#x2013;<lpage>569</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nmeth.2474</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Coghlan</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Tyagi</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Cotton</surname> <given-names>J. A.</given-names>
</name>
<name>
<surname>Holroyd</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Rosa</surname> <given-names>B. A.</given-names>
</name>
<name>
<surname>Tsai</surname> <given-names>I. J.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>Comparative genomics of the major parasitic worms</article-title>. <source>Nat. Genet.</source> <volume>51</volume>, <fpage>163</fpage>&#x2013;<lpage>174</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41588-018-0262-1</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cornet</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Baurain</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Contamination detection in genomic data: More is not enough</article-title>. <source>Genome Biol.</source> <volume>23</volume>, <fpage>60</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s13059-022-02619-9</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dilthey</surname> <given-names>A. T.</given-names>
</name>
<name>
<surname>Jain</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Koren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Phillippy</surname> <given-names>A. M.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Strain-level metagenomic assignment and compositional estimation for long reads with MetaMaps</article-title>. <source>Nat. Commun.</source> <volume>10</volume>, <fpage>3066</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41467-019-10934-2</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Douvlataniotis</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Bensberg</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Lentini</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Gylemo</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Nestor</surname> <given-names>C. E.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>No evidence for DNA <italic>N</italic><sup>6</sup>-methyladenine in mammals</article-title>. <source>Sci. Adv.</source> <volume>6</volume>, <elocation-id>eaay3335</elocation-id>. doi: <pub-id pub-id-type="doi">10.1126/sciadv.aay3335</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ebert</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Audano</surname> <given-names>P. A.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Rodriguez-Martin</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Porubsky</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Bonder</surname> <given-names>M. J.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>Haplotype-resolved diverse human genomes and integrated analysis of structural variation</article-title>. <source>Science</source> <volume>372</volume>. doi: <pub-id pub-id-type="doi">10.1126/science.abf7117</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fraser</surname> <given-names>C. M.</given-names>
</name>
<name>
<surname>Eisen</surname> <given-names>J. A.</given-names>
</name>
<name>
<surname>Nelson</surname> <given-names>K. E.</given-names>
</name>
<name>
<surname>Paulsen</surname> <given-names>I. T.</given-names>
</name>
<name>
<surname>Salzberg</surname> <given-names>S. L.</given-names>
</name>
</person-group> (<year>2002</year>). <article-title>The value of complete microbial genome sequencing (you get what you pay for)</article-title>. <source>J. Bacteriol.</source> <volume>184</volume>, <fpage>6403</fpage>&#x2013;<lpage>5; discussion 6405</lpage>. doi: <pub-id pub-id-type="doi">10.1128/JB.184.23.6403-6405.2002</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Girotto</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Pizzi</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Comin</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>MetaProb: accurate metagenomic reads binning based on probabilistic sequence signatures</article-title>. <source>Bioinformatics</source> <volume>32</volume>, <fpage>i567</fpage>&#x2013;<lpage>i575</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btw466</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Guo</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>K.</given-names>
</name>
<name>
<surname>You</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>CNSA: a data repository for archiving omics data</article-title>. <source>Database</source> <volume>2020</volume>. doi: <pub-id pub-id-type="doi">10.1093/database/baaa055</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gurevich</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Saveliev</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Vyahhi</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Tesler</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>QUAST: Quality assessment tool for genome assemblies</article-title>. <source>Bioinformatics</source> <volume>29</volume>, <fpage>1072</fpage>&#x2013;<lpage>1075</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btt086</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>J. Y.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>J. S.</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>J. W.</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>V. N.</given-names>
</name>
<name>
<surname>Chang</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>The architecture of SARS-CoV-2 transcriptome</article-title>. <source>Cell</source> <volume>181</volume>, <fpage>914</fpage>&#x2013;<lpage>921.e10</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cell.2020.04.011</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Song</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Breitwieser</surname> <given-names>F. P.</given-names>
</name>
<name>
<surname>Salzberg</surname> <given-names>S. L.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Centrifuge: Rapid and sensitive classification of metagenomic sequences</article-title>. <source>Genome Res.</source> <volume>26</volume>, <fpage>1721</fpage>&#x2013;<lpage>1729</lpage>. doi: <pub-id pub-id-type="doi">10.1101/gr.210641.116</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kolmogorov</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Bickhart</surname> <given-names>D. M.</given-names>
</name>
<name>
<surname>Behsaz</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Gurevich</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Rayko</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Shin</surname> <given-names>S. B.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>metaFlye: Scalable long-read metagenome assembly using repeat graphs</article-title>. <source>Nat. Methods</source> <volume>17</volume>, <fpage>1103</fpage>&#x2013;<lpage>1110</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41592-020-00971-x</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kolmogorov</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Pevzner</surname> <given-names>P. A.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Assembly of long, error-prone reads using repeat graphs</article-title>. <source>Nat. Biotechnol.</source> <volume>37</volume>, <fpage>540</fpage>&#x2013;<lpage>546</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41587-019-0072-8</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Koren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Rhie</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Walenz</surname> <given-names>B. P.</given-names>
</name>
<name>
<surname>Dilthey</surname> <given-names>A. T.</given-names>
</name>
<name>
<surname>Bickhart</surname> <given-names>D. M.</given-names>
</name>
<name>
<surname>Kingan</surname> <given-names>S. B.</given-names>
</name>
<etal/>
</person-group>. (<year>2018</year>). <article-title><italic>De novo</italic> assembly of haplotype-resolved genomes with trio binning</article-title>. <source>Nat. Biotechnol.</source> <volume>36</volume>, <fpage>1174</fpage>&#x2013;<lpage>1182</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nbt.4277</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Koren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Walenz</surname> <given-names>B. P.</given-names>
</name>
<name>
<surname>Berlin</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Miller</surname> <given-names>J. R.</given-names>
</name>
<name>
<surname>Bergman</surname> <given-names>N. H.</given-names>
</name>
<name>
<surname>Phillippy</surname> <given-names>A. M.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Canu: Scalable and accurate long-read assembly <italic>via</italic> adaptive k-mer weighting and repeat separation</article-title>. <source>Genome Res.</source> <volume>27</volume>, <fpage>722</fpage>&#x2013;<lpage>736</lpage>. doi: <pub-id pub-id-type="doi">10.1101/gr.215087.116</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Laczny</surname> <given-names>C. C.</given-names>
</name>
<name>
<surname>Kiefer</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Galata</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Fehlmann</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Backes</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Keller</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>BusyBee web: Metagenomic data analysis by bootstrapped supervised binning and annotation</article-title>. <source>Nucleic Acids Res.</source> <volume>45</volume>, <fpage>W171</fpage>&#x2013;<lpage>W179</lpage>. doi: <pub-id pub-id-type="doi">10.1093/nar/gkx348</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Logsdon</surname> <given-names>G. A.</given-names>
</name>
<name>
<surname>Vollger</surname> <given-names>M. R.</given-names>
</name>
<name>
<surname>Hsieh</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Mao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Liskovykh</surname> <given-names>M. A.</given-names>
</name>
<name>
<surname>Koren</surname> <given-names>S.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>The structure, function and evolution of a complete human chromosome 8</article-title>. <source>Nature</source> <volume>593</volume>, <fpage>101</fpage>&#x2013;<lpage>107</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41586-021-03420-7</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Marcais</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Kingsford</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>A fast, lock-free approach for efficient parallel counting of occurrences of k-mers</article-title>. <source>Bioinformatics</source> <volume>27</volume>, <fpage>764</fpage>&#x2013;<lpage>770</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btr011</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>McIntyre</surname> <given-names>A. B. R.</given-names>
</name>
<name>
<surname>Alexander</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Grigorev</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Bezdan</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Sichtig</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Chiu</surname> <given-names>C. Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>Single-molecule sequencing detection of N6-methyladenine in microbial reference materials</article-title>. <source>Nat. Commun.</source> <volume>10</volume>, <fpage>579</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41467-019-08289-9</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nagarajan</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Pop</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Sequence assembly demystified</article-title>. <source>Nat. Rev. Genet.</source> <volume>14</volume>, <fpage>157</fpage>&#x2013;<lpage>167</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nrg3367</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Neimark</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Line of attack</article-title>. <source>Science</source> <volume>347</volume>, <fpage>938</fpage>&#x2013;<lpage>940</lpage>. doi: <pub-id pub-id-type="doi">10.1126/science.347.6225.938</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nicholls</surname> <given-names>S. M.</given-names>
</name>
<name>
<surname>Quick</surname> <given-names>J. C.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Loman</surname> <given-names>N. J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Ultra-deep, long-read nanopore sequencing of mock microbial community standards</article-title>. <source>GigaScience</source> <volume>8</volume>. doi: <pub-id pub-id-type="doi">10.1093/gigascience/giz043</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ono</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Asai</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Hamada</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>PBSIM2: a simulator for long-read sequencers with a novel generative model of quality scores</article-title>. <source>Bioinformatics</source> <volume>37</volume>, <fpage>589</fpage>&#x2013;<lpage>595</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btaa835</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pedregosa</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Varoquaux</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Gramfort</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Michel</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Thirion</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Grisel</surname> <given-names>O.</given-names>
</name>
<etal/>
</person-group>. (<year>2011</year>). <article-title>Scikit-learn: Machine learning in Python</article-title>. <source>J. Mach. Learn. Res.</source> <volume>12</volume>, <fpage>2825</fpage>&#x2013;<lpage>2830</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1201.0490</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qi</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Gu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Cheng</surname> <given-names>X.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>MetaTrass: A high-quality metagenome assembler of the human gut microbiome by cobarcoding sequencing reads</article-title>. <source>iMeta</source> <volume>1</volume>, <elocation-id>e46</elocation-id>. doi: <pub-id pub-id-type="doi">10.1002/imt2.46</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rhie</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Walenz</surname> <given-names>B. P.</given-names>
</name>
<name>
<surname>Koren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Phillippy</surname> <given-names>A. M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Merqury: reference-free quality, completeness, and phasing assessment for genome assemblies</article-title>. <source>Genome Biol.</source> <volume>21</volume>, <fpage>245</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s13059-020-02134-9</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rhoads</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Au</surname> <given-names>K. F.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>PacBio sequencing and its applications</article-title>. <source>Genomics Proteomics Bioinf.</source> <volume>13</volume>, <fpage>278</fpage>&#x2013;<lpage>289</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.gpb.2015.08.002</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ricker</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Qian</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Fulthorpe</surname> <given-names>R. R.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>The limitations of draft assemblies for understanding prokaryotic adaptation and evolution</article-title>. <source>Genomics</source> <volume>100</volume>, <fpage>167</fpage>&#x2013;<lpage>175</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ygeno.2012.06.009</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Roth&#xe4;usler</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Gutow</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Thiel</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Floating Seaweeds and Their Communities</article-title>. In: Wiencke, C., Bischof, K. (eds) <source>Seaweed Biology. Ecological Studies</source>, vol. <volume>219</volume> (<publisher-loc>Berlin, Heidelberg</publisher-loc>: <publisher-name>Springer</publisher-name>). doi: <pub-id pub-id-type="doi">10.1007/978-3-642-28451-9_17</pub-id></citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sahlin</surname> <given-names>K.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Effective sequence similarity detection with strobemers</article-title>. <source>Genome Res.</source> <volume>31</volume>, <fpage>2080</fpage>&#x2013;<lpage>2094</lpage>. doi: <pub-id pub-id-type="doi">10.1101/gr.275648.121</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shumate</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Zimin</surname> <given-names>A. V.</given-names>
</name>
<name>
<surname>Sherman</surname> <given-names>R. M.</given-names>
</name>
<name>
<surname>Puiu</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Wagner</surname> <given-names>J. M.</given-names>
</name>
<name>
<surname>Olson</surname> <given-names>N. D.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>Assembly and annotation of an Ashkenazi human reference genome</article-title>. <source>Genome Biol.</source> <volume>21</volume>, <fpage>129</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s13059-020-02047-7</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Steinegger</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Salzberg</surname> <given-names>S. L.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Terminating contamination: large-scale search identifies more than 2,000,000 contaminated entries in GenBank</article-title>. <source>Genome Biol.</source> <volume>21</volume>, <fpage>115</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s13059-020-02023-1</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Thiel</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Gutow</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>The ecology of rafting in the marine environment. II. The rafting organisms and community</article-title>. <source>Oceanography Mar. Biol.</source> <volume>43</volume>, <fpage>279</fpage>&#x2013;<lpage>418</lpage>. doi: <pub-id pub-id-type="doi">10.1201/9781420037449.ch7</pub-id>
</citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Mo</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Mao</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Metagenome-assembled genomes from Pyropia haitanensis microbiome provide insights into the potential metabolic functions to the seaweed</article-title>. <source>Front. Microbiol.</source> <volume>13</volume>. doi: <pub-id pub-id-type="doi">10.3389/fmicb.2022.857901</pub-id>
</citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Bi</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Cao</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Zelzion</surname> <given-names>E.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>Pyropia yezoensis genome reveals diverse mechanisms of carbon acquisition in the intertidal environment</article-title>. <source>Nat. Commun.</source> <volume>11</volume>, <fpage>4028</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41467-020-17689-1</pub-id>
</citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wickramarachchi</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Mallawaarachchi</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Rajan</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>MetaBCC-LR: metagenomics binning by coverage and composition for long reads</article-title>. <source>Bioinformatics</source> <volume>36</volume>, <fpage>i3</fpage>&#x2013;<lpage>i11</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btaa441</pub-id>
</citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wick</surname> <given-names>R. R.</given-names>
</name>
<name>
<surname>Schultz</surname> <given-names>M. B.</given-names>
</name>
<name>
<surname>Zobel</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Holt</surname> <given-names>K. E.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Bandage: interactive visualization of <italic>de novo</italic> genome assemblies</article-title>. <source>Bioinformatics</source> <volume>31</volume>, <fpage>3350</fpage>&#x2013;<lpage>3352</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btv383</pub-id>
</citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wood</surname> <given-names>D. E.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Langmead</surname> <given-names>B.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Improved metagenomic analysis with Kraken 2</article-title>. <source>Genome Biol.</source> <volume>20</volume>, <fpage>257</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s13059-019-1891-0</pub-id>
</citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Woyke</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Teeling</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Ivanova</surname> <given-names>N. N.</given-names>
</name>
<name>
<surname>Huntemann</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Richter</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Gloeckner</surname> <given-names>F. O.</given-names>
</name>
<etal/>
</person-group>. (<year>2006</year>). <article-title>Symbiosis insights through metagenomic analysis of a microbial consortium</article-title>. <source>Nature</source> <volume>443</volume>, <fpage>950</fpage>&#x2013;<lpage>955</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nature05192</pub-id>
</citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xie</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Yi</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>T.</given-names>
</name>
<etal/>
</person-group>. (<year>2016</year>). <article-title>Metagenomic analysis reveals symbiotic relationship among bacteria in Microcystis-dominated community</article-title>. <source>Front. Microbiol.</source> <volume>7</volume>. doi: <pub-id pub-id-type="doi">10.3389/fmicb.2016.00056</pub-id>
</citation>
</ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xie</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Igarashi</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Jin</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>F.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>PacBio long reads improve metagenomic assemblies, gene catalogs, and genome binning</article-title>. <source>Front. Genet.</source> <volume>11</volume>. doi: <pub-id pub-id-type="doi">10.3389/fgene.2020.516269</pub-id>
</citation>
</ref>
<ref id="B53">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Du</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Peters</surname> <given-names>B. A.</given-names>
</name>
<name>
<surname>Deng</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>Accurate haplotype-resolved assembly reveals the origin of structural variants for human trios</article-title>. <source>Bioinformatics</source> <volume>37</volume>, <fpage>2095</fpage>&#x2013;<lpage>2102</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btab068</pub-id>
</citation>
</ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Gu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>O.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Peters</surname> <given-names>B. A.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>TGS-GapCloser: A fast and accurate gap closer for large genomes with low coverage of error-prone long reads</article-title>. <source>GigaScience</source> <volume>9</volume>. doi: <pub-id pub-id-type="doi">10.1093/gigascience/giaa094</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>