<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Microbiol.</journal-id>
<journal-title>Frontiers in Microbiology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Microbiol.</abbrev-journal-title>
<issn pub-type="epub">1664-302X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmicb.2024.1485073</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Microbiology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Optimizing microbiome reference databases with PacBio full-length 16S rRNA sequencing for enhanced taxonomic classification and biomarker discovery</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Han</surname> <given-names>Hyejung</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Choi</surname> <given-names>Yoon Hee</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Kim</surname> <given-names>Si Yeong</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Park</surname> <given-names>Jung Hwa</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Chung</surname> <given-names>Jin</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/963098/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Na</surname> <given-names>Hee Sam</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1040791/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Department of Oral Microbiology, School of Dentistry, Pusan National University</institution>, <addr-line>Yangsan</addr-line>, <country>Republic of Korea</country></aff>
<aff id="aff2"><sup>2</sup><institution>Department of Internal Medicine, Dongnam Institute of Radiological and Medical Sciences</institution>, <addr-line>Busan</addr-line>, <country>Republic of Korea</country></aff>
<author-notes>
<fn fn-type="edited-by" id="fn0001">
<p>Edited by: Zhangran Chen, Xiamen University, China</p>
</fn>
<fn fn-type="edited-by" id="fn0002">
<p>Reviewed by: Maozhen Han, Anhui Medical University, China</p>
<p>Babak Pakbin, Texas A&#x0026;M University, United States</p>
<p>Gongchao Jing, Chinese Academy of Sciences (CAS), China</p>
<p>Congmin Xu, Georgia Institute of Technology, United States</p>
</fn>
<corresp id="c001">&#x002A;Correspondence: Jin Chung, <email>jchung@pusan.ac.kr</email>; Hee Sam Na, <email>heesamy@pusan.ac.kr</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>25</day>
<month>11</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>15</volume>
<elocation-id>1485073</elocation-id>
<history>
<date date-type="received">
<day>23</day>
<month>08</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>28</day>
<month>10</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2024 Han, Choi, Kim, Park, Chung and Na.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Han, Choi, Kim, Park, Chung and Na</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec id="sec1">
<title>Background</title>
<p>The study of the human microbiome is crucial for understanding disease mechanisms, identifying biomarkers, and guiding preventive measures. Advances in sequencing platforms, particularly 16S rRNA sequencing, have revolutionized microbiome research. Despite the benefits, large microbiome reference databases (DBs) pose challenges, including computational demands and potential inaccuracies. This study aimed to determine if full-length 16S rRNA sequencing data produced by PacBio could be used to optimize reference DBs and be applied to Illumina V3-V4 targeted sequencing data for microbial study.</p>
</sec>
<sec id="sec2">
<title>Methods</title>
<p>Oral and gut microbiome data (PRJNA1049979) were retrieved from NCBI. DADA2 was applied to full-length 16S rRNA PacBio data to obtain amplicon sequencing variants (ASVs). The RDP reference DB was used to assign the ASVs, which were then used as a reference DB to train the classifier. QIIME2 was used for V3-V4 targeted Illumina data analysis. BLAST was used to analyze alignment statistics. Linear discriminant analysis Effect Size (LEfSe) was employed for discriminant analysis.</p>
</sec>
<sec id="sec3">
<title>Results</title>
<p>ASVs produced by PacBio showed coverage of the oral microbiome similar to the Human Oral Microbiome Database. A phylogenetic tree was trimmed at various thresholds to obtain an optimized reference DB. This established method was then applied to gut microbiome data, and the optimized gut microbiome reference DB provided improved taxa classification and biomarker discovery efficiency.</p>
</sec>
<sec id="sec4">
<title>Conclusion</title>
<p>Full-length 16S rRNA sequencing data produced by PacBio can be used to construct a microbiome reference DB. Utilizing an optimized reference DB can increase the accuracy of microbiome classification and enhance biomarker discovery.</p>
</sec>
</abstract>
<kwd-group>
<kwd>oral microbiome</kwd>
<kwd>gut microbiome</kwd>
<kwd>PacBio</kwd>
<kwd>Illumina</kwd>
<kwd>next generation sequencing</kwd>
<kwd>reference database</kwd>
</kwd-group>
<counts>
<fig-count count="6"/>
<table-count count="2"/>
<equation-count count="0"/>
<ref-count count="46"/>
<page-count count="12"/>
<word-count count="6556"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Microbial Symbioses</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec5">
<title>Introduction</title>
<p>The study of the human microbiome serves several important purposes, encompassing a wide range of medical objectives. It can help identify the imbalances associated with various diseases, such as inflammatory bowel disease (IBD), diabetes, obesity, and cardiovascular diseases (<xref ref-type="bibr" rid="ref24">Li et al., 2016</xref>; <xref ref-type="bibr" rid="ref22">Jie et al., 2017</xref>; <xref ref-type="bibr" rid="ref17">Haneishi et al., 2023</xref>). Studying the human microbiome enables the identification of microbial biomarkers for early diagnosis, prognosis, and disease monitoring (<xref ref-type="bibr" rid="ref5">Boppana et al., 2024</xref>). It can also detect pathogenic microorganisms that may contribute to infections or chronic diseases (<xref ref-type="bibr" rid="ref14">Dong et al., 2024</xref>). Additionally, microbiome profiles can be used to predict the risk of developing various diseases (<xref ref-type="bibr" rid="ref19">He et al., 2024</xref>).</p>
<p>The development of sequencing platforms has revolutionized the study of microbial communities. The gold standard for studying the taxonomic composition of a bacterial community is the sequencing of the 16S rRNA gene (<xref ref-type="bibr" rid="ref45">Woese and Fox, 1977</xref>). 16S rRNA gene is around 1,500&#x2009;bp long and has 9 variable regions that collect the main evolutionary changes among microbial taxa (<xref ref-type="bibr" rid="ref38">Stackebrandt and Goebel, 1994</xref>). Compared to whole genome sequencing (WGS), 16S rRNA sequencing is more cost-effective, making it accessible for large-scale studies and routine analysis. Also, the methodologies for 16S rRNA gene amplification, sequencing, and analysis are well-established, providing a robust framework for researchers (<xref ref-type="bibr" rid="ref3">Bolyen et al., 2019</xref>).</p>
<p>There are extensive public databases (DBs) (e.g., SILVA, Greengenes, RDP) for reference, facilitating accurate taxonomic assignment (<xref ref-type="bibr" rid="ref43">Wang et al., 2007</xref>; <xref ref-type="bibr" rid="ref30">Quast et al., 2013</xref>; <xref ref-type="bibr" rid="ref12">DeSantis et al., 2006</xref>; <xref ref-type="bibr" rid="ref10">Cole et al., 2014</xref>). While large microbiome reference DBs offer numerous advantages, such as improved resolution and comprehensive taxonomic coverage, they also come with certain disadvantages. Large DBs require significant computational power and memory for searching and aligning sequences. The sheer volume of data in large reference DBs can lead to longer processing times for sequence alignment and classification (<xref ref-type="bibr" rid="ref2">Baker, 2010</xref>). Large DBs often contain redundant sequences or highly similar entries, which can complicate classification and lead to ambiguities in taxonomic assignments. The likelihood of incorporating erroneous or misannotated sequences increases, which can reduce the accuracy of taxonomic classifications and potentially lead to false conclusions (<xref ref-type="bibr" rid="ref33">Sczyrba et al., 2017</xref>). Thus, selecting an optimal reference DB is crucial for microbiome studies. An optimal reference DB ensures accurate identification and classification of microbial taxa, reducing the chances of misidentification or ambiguous results, which is essential for understanding the true composition of the microbiome (<xref ref-type="bibr" rid="ref28">Balvo&#x010D;i&#x016B;t&#x0117; and Huson, 2017</xref>).</p>
<p>The oral and gut microbiomes are the two most commonly studied human microbiomes. Studying the oral microbiome has several advantages over the gut microbiome. The oral microbiome typically has a lower microbial diversity compared to the gut microbiome (<xref ref-type="bibr" rid="ref20">Human Microbiome Project Consortium, 2012</xref>). Also, the oral microbiome has been extensively studied, resulting in well-characterized reference DBs such as the Human Oral Microbiome Database (HOMD) specifically tailored for oral bacteria, which facilitates more accurate taxonomic assignment (<xref ref-type="bibr" rid="ref13">Dewhirst et al., 2010</xref>).</p>
<p>For microbiome studies, the Illumina platform has been widely used. Illumina platforms can sequence millions of reads per run, making them suitable for large-scale studies. The cost of sequencing per base is relatively low, and the platform provides high accuracy with low error rates. However, the typical sequencing read length is rather short (2&#x2009;&#x00D7;&#x2009;300&#x2009;bp) and cannot cover the full length of the 16S rRNA gene, which could lead to potential misclassification or ambiguous taxonomic assignment (<xref ref-type="bibr" rid="ref32">Satam et al., 2023</xref>). PacBio and Nanopore can provide long-read sequences to overcome this limitation. In particular, the PacBio system can provide improved sequencing quality with the development of circular consensus sequencing (CCS) protocols, which generate highly accurate long high-fidelity reads, also known as HiFi reads (<xref ref-type="bibr" rid="ref44">Wenger et al., 2019</xref>). Callahan et al. demonstrated that PacBio HiFi could offer single-nucleotide resolution with the DADA2 approach based on Amplicon Sequence Variant (ASV) classification (<xref ref-type="bibr" rid="ref7">Callahan et al., 2019</xref>). Thus, we hypothesized that full-length 16S rRNA sequencing data produced by PacBio could be used to optimize reference databases in human microbiome studies.</p>
<p>Recently, there have been several studies that simultaneously utilized the PacBio and Illumina platforms for microbiome study and compared their performance (<xref ref-type="bibr" rid="ref6">Buetas et al., 2024</xref>; <xref ref-type="bibr" rid="ref37">Souza et al., 2023</xref>; <xref ref-type="bibr" rid="ref23">Katiraei et al., 2022</xref>). In particular, She et al. performed microbiome analysis on 53 sites of 7 surface human organs using both Illumina V3-V4 short read sequencing and PacBio 16S rRNA full-length sequencing (<xref ref-type="bibr" rid="ref35">She et al., 2024</xref>). In this study, we tested if full-length 16S rRNA sequencing data produced by PacBio could be used to serve as a reference DB and compared it with a commonly used reference DB (e.g., HOMD) for coverage and classification performance against V3-V4 short read sequencing data. To validate the method, we applied the optimization method to gut microbiome data. An optimized reference DB was constructed with ASVs, and it was compared to the SILVA and Greengenes reference DBs in taxonomy assignment and biomarker discovery against Illumina V3-V4 short read sequencing data.</p>
</sec>
<sec sec-type="materials|methods" id="sec6">
<title>Materials and methods</title>
<sec id="sec7">
<title>Data</title>
<p>The raw sequencing data were retrieved from NCBI GenBank BioProject ID PRJNA1049979. For the oral microbiome study, 32 samples were sequenced by PacBio and 198 samples were sequenced by the Illumina platform. For the gut microbiome study, 45 samples were sequenced by PacBio and 128 samples were sequenced by Illumina. A summary of sampling sites and sample numbers is shown in <xref ref-type="table" rid="tab1">Tables 1</xref>, <xref ref-type="table" rid="tab2">2</xref>.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Summary of sampling site, sample number and read counts during PacBio data preprocessing.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="middle">Platform</th>
<th align="left" valign="middle">Organ</th>
<th align="left" valign="middle">Site</th>
<th align="left" valign="middle">Sample (<italic>n</italic>)</th>
<th align="left" valign="middle">Input</th>
<th align="left" valign="middle">Primers</th>
<th align="left" valign="middle">Filtered</th>
<th align="left" valign="middle">Denoised</th>
<th align="left" valign="middle">Non-chimera</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top" rowspan="5">PacBio</td>
<td align="left" valign="middle">Oral</td>
<td align="left" valign="middle">Oral (pooled)</td>
<td align="left" valign="middle">32</td>
<td align="left" valign="middle">12,951&#x2009;&#x00B1;&#x2009;910</td>
<td align="left" valign="middle">9,945&#x2009;&#x00B1;&#x2009;1,080</td>
<td align="left" valign="middle">8,684&#x2009;&#x00B1;&#x2009;1,781</td>
<td align="left" valign="middle">8,098&#x2009;&#x00B1;&#x2009;1,910</td>
<td align="left" valign="middle">7,954&#x2009;&#x00B1;&#x2009;1,895</td>
</tr>
<tr>
<td align="left" valign="middle">Large Intestine</td>
<td align="left" valign="middle">ANAL</td>
<td align="left" valign="middle">14</td>
<td align="left" valign="middle">12,911&#x2009;&#x00B1;&#x2009;1,029</td>
<td align="left" valign="middle">10,596&#x2009;&#x00B1;&#x2009;977</td>
<td align="left" valign="middle">10,167&#x2009;&#x00B1;&#x2009;1,412</td>
<td align="left" valign="middle">9,789&#x2009;&#x00B1;&#x2009;1,331</td>
<td align="left" valign="middle">9,644&#x2009;&#x00B1;&#x2009;1,301</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="3">Small Intestine</td>
<td align="left" valign="middle">IIC</td>
<td align="left" valign="middle">10</td>
<td align="left" valign="middle">13,475&#x2009;&#x00B1;&#x2009;1,048</td>
<td align="left" valign="middle">9,548&#x2009;&#x00B1;&#x2009;1,102</td>
<td align="left" valign="middle">8,805&#x2009;&#x00B1;&#x2009;1,439</td>
<td align="left" valign="middle">8,261&#x2009;&#x00B1;&#x2009;1,550</td>
<td align="left" valign="middle">7,892&#x2009;&#x00B1;&#x2009;1,366</td>
</tr>
<tr>
<td align="left" valign="middle">IICP</td>
<td align="left" valign="middle">7</td>
<td align="left" valign="middle">12,753&#x2009;&#x00B1;&#x2009;751</td>
<td align="left" valign="middle">9,641&#x2009;&#x00B1;&#x2009;1,216</td>
<td align="left" valign="middle">8,361&#x2009;&#x00B1;&#x2009;1,253</td>
<td align="left" valign="middle">7,542&#x2009;&#x00B1;&#x2009;1,596</td>
<td align="left" valign="middle">6,959&#x2009;&#x00B1;&#x2009;1,476</td>
</tr>
<tr>
<td align="left" valign="middle">JEJ100</td>
<td align="left" valign="middle">14</td>
<td align="left" valign="middle">12,734&#x2009;&#x00B1;&#x2009;616</td>
<td align="left" valign="middle">9,701&#x2009;&#x00B1;&#x2009;836</td>
<td align="left" valign="middle">7,316&#x2009;&#x00B1;&#x2009;1,380</td>
<td align="left" valign="middle">6,555&#x2009;&#x00B1;&#x2009;1,328</td>
<td align="left" valign="middle">6,380&#x2009;&#x00B1;&#x2009;1,321</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Summary of sampling site, sample number and read counts during Illumina data preprocessing.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="middle">Platform</th>
<th align="left" valign="middle">Organ</th>
<th align="left" valign="middle">Site</th>
<th align="left" valign="middle">Sample (<italic>n</italic>)</th>
<th align="left" valign="middle">Input</th>
<th align="left" valign="middle">Primers</th>
<th align="left" valign="middle">Filtered</th>
<th align="left" valign="middle">Denoised</th>
<th align="left" valign="middle">Non-chimera</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top" rowspan="10">Illumina</td>
<td align="left" valign="top" rowspan="6">Oral</td>
<td align="left" valign="middle">LC</td>
<td align="left" valign="middle">33</td>
<td align="left" valign="middle">89,809&#x2009;&#x00B1;&#x2009;10,028</td>
<td align="left" valign="middle">68,550&#x2009;&#x00B1;&#x2009;9,666</td>
<td align="left" valign="middle">66,691&#x2009;&#x00B1;&#x2009;9,475</td>
<td align="left" valign="middle">22,814&#x2009;&#x00B1;&#x2009;10,563</td>
<td align="left" valign="middle">5,418&#x2009;&#x00B1;&#x2009;1,953</td>
</tr>
<tr>
<td align="left" valign="middle">LL</td>
<td align="left" valign="middle">33</td>
<td align="left" valign="middle">96,880&#x2009;&#x00B1;&#x2009;12,073</td>
<td align="left" valign="middle">65,875&#x2009;&#x00B1;&#x2009;9,331</td>
<td align="left" valign="middle">58,444&#x2009;&#x00B1;&#x2009;9,685</td>
<td align="left" valign="middle">17,969&#x2009;&#x00B1;&#x2009;10,157</td>
<td align="left" valign="middle">6,117&#x2009;&#x00B1;&#x2009;2,832</td>
</tr>
<tr>
<td align="left" valign="middle">LM</td>
<td align="left" valign="middle">33</td>
<td align="left" valign="middle">89,682&#x2009;&#x00B1;&#x2009;7,948</td>
<td align="left" valign="middle">70,752&#x2009;&#x00B1;&#x2009;8,397</td>
<td align="left" valign="middle">66,226&#x2009;&#x00B1;&#x2009;9,156</td>
<td align="left" valign="middle">23,383&#x2009;&#x00B1;&#x2009;11,379</td>
<td align="left" valign="middle">6,803&#x2009;&#x00B1;&#x2009;2,598</td>
</tr>
<tr>
<td align="left" valign="middle">RC</td>
<td align="left" valign="middle">33</td>
<td align="left" valign="middle">88,192&#x2009;&#x00B1;&#x2009;9,343</td>
<td align="left" valign="middle">65,483&#x2009;&#x00B1;&#x2009;6,869</td>
<td align="left" valign="middle">63,748&#x2009;&#x00B1;&#x2009;6,766</td>
<td align="left" valign="middle">18,269&#x2009;&#x00B1;&#x2009;9,126</td>
<td align="left" valign="middle">3,552&#x2009;&#x00B1;&#x2009;1,626</td>
</tr>
<tr>
<td align="left" valign="middle">UL</td>
<td align="left" valign="middle">33</td>
<td align="left" valign="middle">92,227&#x2009;&#x00B1;&#x2009;11,036</td>
<td align="left" valign="middle">66,759&#x2009;&#x00B1;&#x2009;8,188</td>
<td align="left" valign="middle">61,644&#x2009;&#x00B1;&#x2009;8,331</td>
<td align="left" valign="middle">17,002&#x2009;&#x00B1;&#x2009;8,369</td>
<td align="left" valign="middle">4,131&#x2009;&#x00B1;&#x2009;1,818</td>
</tr>
<tr>
<td align="left" valign="middle">UM</td>
<td align="left" valign="middle">33</td>
<td align="left" valign="middle">100,157&#x2009;&#x00B1;&#x2009;13,446</td>
<td align="left" valign="middle">67,975&#x2009;&#x00B1;&#x2009;10,536</td>
<td align="left" valign="middle">63,106&#x2009;&#x00B1;&#x2009;9,812</td>
<td align="left" valign="middle">20,821&#x2009;&#x00B1;&#x2009;11,657</td>
<td align="left" valign="middle">6,522&#x2009;&#x00B1;&#x2009;3,475</td>
</tr>
<tr>
<td align="left" valign="top">Large intestine</td>
<td align="left" valign="middle">ANAL</td>
<td align="left" valign="middle">33</td>
<td align="left" valign="middle">88,919&#x2009;&#x00B1;&#x2009;6,446</td>
<td align="left" valign="middle">77,305&#x2009;&#x00B1;&#x2009;6,658</td>
<td align="left" valign="middle">74,090&#x2009;&#x00B1;&#x2009;7,276</td>
<td align="left" valign="middle">43,646&#x2009;&#x00B1;&#x2009;12,858</td>
<td align="left" valign="middle">8,647&#x2009;&#x00B1;&#x2009;2,256</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="3">Small intestine</td>
<td align="left" valign="middle">IIC</td>
<td align="left" valign="middle">31</td>
<td align="left" valign="middle">97,218&#x2009;&#x00B1;&#x2009;11,533</td>
<td align="left" valign="middle">67,811&#x2009;&#x00B1;&#x2009;9,166</td>
<td align="left" valign="middle">65,068&#x2009;&#x00B1;&#x2009;9,595</td>
<td align="left" valign="middle">38,822&#x2009;&#x00B1;&#x2009;13,853</td>
<td align="left" valign="middle">7,271&#x2009;&#x00B1;&#x2009;1,772</td>
</tr>
<tr>
<td align="left" valign="middle">IICP</td>
<td align="left" valign="middle">33</td>
<td align="left" valign="middle">90,417&#x2009;&#x00B1;&#x2009;12,012</td>
<td align="left" valign="middle">63,174&#x2009;&#x00B1;&#x2009;11,567</td>
<td align="left" valign="middle">60,082&#x2009;&#x00B1;&#x2009;10,734</td>
<td align="left" valign="middle">16,895&#x2009;&#x00B1;&#x2009;13,204</td>
<td align="left" valign="middle">4,465&#x2009;&#x00B1;&#x2009;2,614</td>
</tr>
<tr>
<td align="left" valign="middle">JEJ100</td>
<td align="left" valign="middle">31</td>
<td align="left" valign="middle">96,860&#x2009;&#x00B1;&#x2009;24,706</td>
<td align="left" valign="middle">74,051&#x2009;&#x00B1;&#x2009;14,430</td>
<td align="left" valign="middle">62,612&#x2009;&#x00B1;&#x2009;12,818</td>
<td align="left" valign="middle">9,310&#x2009;&#x00B1;&#x2009;8,573</td>
<td align="left" valign="middle">3,166&#x2009;&#x00B1;&#x2009;1,791</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="sec8">
<title>Bioinformatic analysis, statistical analysis, and visualization</title>
<p>For PacBio 16S full-length sequencing data, the DADA2 algorithm was applied to dereplicate the reads and filter chimeric sequences. The ASVs were taxonomically assigned using the RDP DB. Rarefaction analyses were conducted using the vegan package.</p>
<p>To run the stand-alone Basic Local Alignment Search Tool (BLAST) toolkit for alignment statistics, BLAST reference DBs were constructed with PacBio ASVs and eHOMD, respectively. BLAST was performed against Illumina V3-V4 short read sequencing data to determine the alignment score, length of nucleotide identity and percentage of identity.</p>
<p>A phylogenetic tree was constructed using <italic>align-to-tree-mafft-fasttree</italic> implemented in QIIME2 and visualized using iTOL (<xref ref-type="bibr" rid="ref21">Letunic and Bork, 2021</xref>). Phylogenetic tree trimming was performed using <italic>drop.tip</italic> in the ape package.</p>
<p>For the gut microbiome, ASVs from PacBio sequencing data were trimmed and used to construct the reference DB. For Illumina 16S V3-V4 sequencing data, raw paired-end reads of 16S rRNA gene sequences were quality-filtered and analyzed using QIIME2 software with default parameters (version 2023.9.0) (<xref ref-type="bibr" rid="ref16">Hall and Beiko, 2018</xref>) and associated plugins. Microbial community metrics, including <italic>&#x03B1;</italic>-diversity and <italic>&#x03B2;</italic>-diversity, were calculated using the phyloseq R package. &#x03B1;-Diversity was evaluated by the Chao1 index and Shannon&#x2019;s index. &#x03B2;-Diversity was measured by Bray-Curtis distance, and principal coordinates analysis (PCoA) was used for ordination analysis. Bacterial taxonomy was determined by a pre-trained Naive Bayes classifier using either the Greengenes DB, SILVA DB or optimized PacBio reference DB.</p>
<p>Differentially enriched microbes were analyzed using Linear discriminant analysis (LDA) Effect Size (LEfSe) (<xref ref-type="bibr" rid="ref34">Segata et al., 2011</xref>), a methodology for performing differential abundance analysis of microbiome data. LDA scores over 3 were considered significant. The code is available at <ext-link xlink:href="http://doi.org/10.5281/zenodo.13937633" ext-link-type="uri">http://doi.org/10.5281/zenodo.13937633</ext-link>.</p>
</sec>
</sec>
<sec sec-type="results" id="sec9">
<title>Results</title>
<sec id="sec10">
<title>Analysis of oral microbiome data</title>
<p>A total of 569,845 reads from the 32 oral samples were generated by PacBio long read sequencing. The mean number of sequences per sample was 12,951&#x2009;&#x00B1;&#x2009;910, and the average read length was 1,457.7&#x2009;&#x00B1;&#x2009;18.2 (1,392&#x2013;1,595). After removing sequencing errors and chimeras, a total of 349,997 reads remained, for an average of 7,954&#x2009;&#x00B1;&#x2009;1,895 reads per sample (<xref ref-type="table" rid="tab1">Table 1</xref>).</p>
<p>The average number of ASVs detected in each sample was 247.4&#x2009;&#x00B1;&#x2009;91.0 (34&#x2013;440). To assess the diversity and adequacy of sequencing depth, a rarefaction curve was plotted for each sample. The rarefaction curves demonstrated good depth of coverage, leveling off at approximately 5,000 reads (<xref ref-type="fig" rid="fig1">Figure 1A</xref>). Since the human microbiome is highly diverse and variable among individuals, we randomly combined oral samples to test if combining samples could increase the coverage. When 4, 8, 16, and 32 samples were randomly combined, the average number of ASVs found in each combination was 940&#x2009;&#x00B1;&#x2009;167, 1,783&#x2009;&#x00B1;&#x2009;228, 3,267.5&#x2009;&#x00B1;&#x2009;74, and 5,950, respectively. Thus, the number of ASVs detected increased as the number of samples combined increased (<xref ref-type="fig" rid="fig1">Figure 1B</xref>).</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p><bold>(A)</bold> Rarefaction curve for each oral sample. <bold>(B)</bold> Rarefaction curve for randomly combined oral samples. <bold>(C)</bold> BLAST search result on Illumina V3-V4 oral microbiome data using various reference databases.</p>
</caption>
<graphic xlink:href="fmicb-15-1485073-g001.tif"/>
</fig>
<p>Although combining more samples produces a greater number of ASVs, it also increases the effort and budget required for the analysis. Therefore, determining an optimal number of samples is essential. We constructed BLAST reference DBs with various combinations of samples and compared the results against the eHOMD, a reference commonly used for oral microbiome analysis. For PacBio data, the proportion of successful BLAST searches increased with the number of ASVs in the DB. Comparing eHOMD and PacBio_4, which had a similar number of ASVs, the proportion of read counts with high identity (&#x003E;97%) was significantly higher in eHOMD. The PacBio sample combination that showed comparable BLAST search performance to eHOMD was PacBio_16. Furthermore, PacBio_32, which had six times more ASVs than eHOMD, showed only a slight improvement (<xref ref-type="fig" rid="fig1">Figure 1C</xref>). Thus, using a DB with more ASVs did not necessarily result in the detection of higher identity.</p>
<p>To improve BLAST efficiency, we selected ASVs (+Pac) that showed high identity (&#x003E;97%) by PacBio_32 while eHOMD showed less than 97% identity. Generally, species are clustered by sequence homology above 97% (<xref ref-type="bibr" rid="ref46">Yarza et al., 2014</xref>). Since biomarkers are typically identified at the species level, we selected 97% as the specificity threshold. If the BLAST search results show high identity in both eHOMD and PacBio_32, this indicates a good match, regardless of the reference DB. However, if the BLAST search results show high identity in PacBio_32 but low identity (below 97%) in eHOMD, it suggests that the ASV in PacBio_32 may serve as a better reference, which is not found in eHOMD. When eHOMD was combined with 130 ASVs (eHOMD+Pac), the highest taxonomic assignment efficiency in BLAST searches was achieved compared to other DBs (<xref ref-type="fig" rid="fig1">Figure 1C</xref>).</p>
</sec>
<sec id="sec11">
<title>Phylogenetic tree-based optimization of PacBio ASVs</title>
<p>When a phylogenetic tree was constructed using PacBio_32 ASVs combined with eHOMD sequences, most of the trees included sequences from both DBs, suggesting that both DBs covered similar taxa (<xref ref-type="fig" rid="fig2">Figure 2</xref>).</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Phylogenetic trees of eHOMD reference sequences combined with ASVs obtained from PacBio. In the inner ring, colors represent phyla assigned by eHOMD. In the outer ring, bar height represents the number of oral samples in which the corresponding ASV is present.</p>
</caption>
<graphic xlink:href="fmicb-15-1485073-g002.tif"/>
</fig>
<p>Given the substantial size difference between the PacBio ASVs and eHOMD, we sought to optimize the PacBio ASVs. To select representative sequences among similar ASVs, we employed the <italic>drop_tip</italic> function from the vegan package with various thresholds to remove terminal branches (<xref ref-type="fig" rid="fig3">Figures 3A</xref>&#x2013;<xref ref-type="fig" rid="fig3">E</xref>). After constructing the BLAST DB with ASVs trimmed with various thresholds, we performed BLAST searches against Illumina V3-V4 oral microbiome data. As the threshold value increased, the number of ASVs included in the BLAST DB decreased, and the proportion of read counts with high identity (&#x003E;97%) also decreased. With a trimming threshold of 0.0005, the number of ASVs in the BLAST reference DB was reduced by approximately 50%, yet the BLAST search performance remained similar to that of the original ASVs (<xref ref-type="fig" rid="fig3">Figure 3F</xref>). This approach allows for the efficient optimization of DB size while maintaining taxonomic assignment accuracy.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Phylogenetic trees of gut ASVs obtained from PacBio reads were trimmed with various thresholds. <bold>(A)</bold> Total ASVs, <bold>(B)</bold> threshold 0.0005, <bold>(C)</bold> threshold 0.001, <bold>(D)</bold> threshold 0.002, <bold>(E)</bold> threshold 0.001, <bold>(F)</bold> BLAST search result on Illumina V3-V4 gut microbiome data using ASVs trimmed at various thresholds as the reference DB.</p>
</caption>
<graphic xlink:href="fmicb-15-1485073-g003.tif"/>
</fig>
</sec>
<sec id="sec12">
<title>Analysis of gut microbiome data</title>
<p>Among various sampling sites in the gut, samples from small intestine (IIC, IICP, and JEJ100) and large intestine (ANAL) were selected for the analysis in this study. A total of 583,036 reads from the 45 gut samples were generated by PacBio long read sequencing. The mean number of sequences per sample was 12,956&#x2009;&#x00B1;&#x2009;900. After removing sequencing errors and chimeras, a total of 351,966 reads remained, for an average of 7,821&#x2009;&#x00B1;&#x2009;1,879 reads per sample (<xref ref-type="table" rid="tab2">Table 2</xref>). The PacBio reference DB was constructed by optimizing the ASVs based on oral microbiome results. After constructing the phylogenetic tree, tree tips were trimmed using a threshold of 0.0005. A total of 126 samples were tested from Illumina V3-V4 sequencing data.</p>
<p>Alpha diversity was measured to determine within microbiome diversity. The Chao1 index, reflecting richness, and Shannon index, reflecting evenness, were significantly different among gut sampling sites (<xref ref-type="fig" rid="fig4">Figures 4A</xref>,<xref ref-type="fig" rid="fig4">B</xref>). To compare bacterial community structure, beta-diversity analyses were performed on the corresponding samples. In the Bray Curtis-based principal coordinates analysis (PCoA), gut microbial community structure showed significant difference depending on the sampling sites (<xref ref-type="fig" rid="fig4">Figure 4C</xref>).</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Bacterial community comparisons among gut sampling sites. Alpha diversity was used to describe the microbial richness and evenness within samples using the <bold>(A)</bold> Chao1 and <bold>(B)</bold> Shannon index. <bold>(C)</bold> Beta diversity of gut microbiome depending on sampling sites. Principal coordinate analysis (PCoA) of the Bray-Curtis distance was performed to determine the microbial community structure. &#x002A;<italic>p</italic>&#x2009;&#x003C;&#x2009;0.05, &#x002A;&#x002A;<italic>p</italic>&#x2009;&#x003C;&#x2009;0.01, &#x002A;&#x002A;&#x002A;<italic>p</italic>&#x2009;&#x003C;&#x2009;0.001.</p>
</caption>
<graphic xlink:href="fmicb-15-1485073-g004.tif"/>
</fig>
<p>The V3-V4 paired reads were taxonomically assigned by a pre-trained Naive Bayes classifier using either the Greengene DB, the SILVA DB, or a DB constructed from gut PacBio ASVs. At genus level, the overall relative abundance showed similar proportions regardless of the DB. However, there were some differences depending on the reference DB. The abundance of <italic>Ruminococcus</italic> was much higher in ANAL, IIC, and IICP using the Greengene DB, while it showed a low proportion using the PacBio DB. The abundance of <italic>Clostridium</italic> was much higher in ANAL samples using the Greengene DB compared to other references (<xref ref-type="fig" rid="fig5">Figure 5A</xref>). In addition, when alpha diversity was measured at genus level, SILVA showed significantly higher indexes compared to PacBio and Greengene (<xref ref-type="supplementary-material" rid="SM1">Supplementary Figures S1A,C</xref>).</p>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>Average relative abundance of microbiome depending on various reference database. <bold>(A)</bold> Genus level, <bold>(B)</bold> <italic>Bacteroides</italic> at species level. <bold>(C)</bold> <italic>Prevotella</italic> at species level.</p>
</caption>
<graphic xlink:href="fmicb-15-1485073-g005.tif"/>
</fig>
<p>At the species level, we compared the abundance of <italic>Bacteroides</italic> and <italic>Prevotella</italic>. For <italic>Bacteroides</italic>, Greengene and SILVA could not classify more than 50% to the species level and named them as <italic>Bacteroides</italic>, while PacBio distinguished most of the <italic>Bacteroides</italic> to the specific species. Moreover, some species were only found in PacBio. For example, <italic>B. cellulosilyticus, B. dorei, B. thetaiotaomicron</italic> and <italic>B. xylanisolvens</italic> were assigned using PacBio DB in all gut sampling sites, whereas they were not found in the other two DBs. Similarly, <italic>B. clarus</italic> was only found in PacBio in IIC (<xref ref-type="fig" rid="fig5">Figure 5B</xref>). In <italic>Prevotella</italic>, there was some discrepancy in the proportion of the bacteria depending on the sampling site. The abundance of <italic>Prevotella</italic> was lower in IICP and JEJ compared to other reference DBs. However, PacBio DB distinguished most of the <italic>Prevotella</italic> to the specific species, while Greengene and SILVA failed to assign to the specific species. Also, PacBio was able to assign eight more <italic>Prevotella</italic> species. <italic>P. bergensis, P. corporis</italic> and <italic>P. timonensis</italic> were only found in ANAL. <italic>P. intermedia</italic> and <italic>P. loescheii</italic> were found in various small intestines (<xref ref-type="fig" rid="fig5">Figure 5C</xref>). In addition, when alpha diversity was measured at species level, PacBio showed significantly higher indexes compared to SILVA and Greengene (<xref ref-type="supplementary-material" rid="SM1">Supplementary Figures S1B,D</xref>). Taken together, an improvement in species assignment was observed when the PacBio DB was used across all four gut microbiome samples compared to the other two DBs.</p>
</sec>
<sec id="sec13">
<title>Species taxa comparison depending on reference DB</title>
<p>Finally, LEfSe was applied to evaluate the differential analysis in bacterial species abundance among gut sampling sites using the taxa assigned by each reference DB. Despite analyzing the same raw data, the results demonstrated a clear difference in the identification of significant taxa depending on the reference DBs. PacBio identified significantly more species compared to the other two reference DBs. The number of significant taxa varied depending on the DB. Five species were found significant across all reference DBs: <italic>Bacteroides caccae, B. fragilis, B. plebeius, Bifidobacterium bifidum</italic>, and <italic>Campylobacter ureolyticus</italic>. Additionally, 30 species overlapped between the Greengenes and PacBio DBs, while 11 species overlapped between the SILVA and PacBio DBs. <italic>Prevotella pallens</italic> was identified as significant by both Greengenes and SILVA DBs. There were some unique taxa identified as significant depending on the reference DB. The Greengenes DB found 11 unique species, PacBio identified 114 unique species, and SILVA detected 32 unique species. Interestingly, the Greengene DB identified 4 significant <italic>Clostridium</italic> species, while the SILVA DB identified 4 significant <italic>Clostridiales</italic> bacterium and PacBio DB identified 6 unique <italic>Clostridium</italic> species. Additionally, the PacBio DB identified several genera with multiple unique significant taxa, including 7 unique <italic>Anaerococcus</italic> species, 9 <italic>Bacteroides</italic> species, 5 <italic>Corynebacterium</italic> species, 8 <italic>Eubacterium</italic> species, 7 <italic>Peptoniphilus</italic> species, and 8 <italic>Prevotella</italic> species (<xref ref-type="fig" rid="fig6">Figure 6</xref>).
Taken together, marked variations were observed in the identification of bacterial species depending on the reference DBs, with PacBio demonstrating the highest number of unique and significant taxa, suggesting the importance of the reference DB.</p>
<fig position="float" id="fig6">
<label>Figure 6</label>
<caption>
<p>Comparisons of microbiota among various gut sampling sites that presented significantly different depending on reference database. <bold>(A)</bold> Greengene, <bold>(B)</bold> PacBio ASVs, <bold>(C)</bold> SILVA. The analysis was performed using linear discriminant analysis (LDA) and effect size analysis. LDA score&#x2009;&#x003E;&#x2009;3.0 are displayed.</p>
</caption>
<graphic xlink:href="fmicb-15-1485073-g006.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="discussion" id="sec14">
<title>Discussion</title>
<p>Sequencing of the 16S rRNA gene is a widely accepted standard for analyzing the taxonomic composition of bacterial communities (<xref ref-type="bibr" rid="ref45">Woese and Fox, 1977</xref>). Extensive public databases (e.g., SILVA, Greengenes, RDP) facilitate taxonomic assignment. Optimizing reference databases is crucial for human microbiome studies to ensure accurate identification and classification of microbial taxa, thereby reducing the chances of misidentification or ambiguous results (<xref ref-type="bibr" rid="ref28">Monika Balvo&#x010D;i&#x016B;t&#x0117; et al., 2017</xref>). Recent advancements in PacBio technology can generate highly accurate, long high-fidelity reads, offering single-nucleotide resolution (<xref ref-type="bibr" rid="ref44">Wenger et al., 2019</xref>; <xref ref-type="bibr" rid="ref7">Callahan et al., 2019</xref>). In this study, we tested whether 16S full-length sequencing data produced by PacBio could be used to construct a reference database and evaluated its application using Illumina V3-V4 targeted short read sequencing data in human microbiome studies.</p>
<p>To evaluate whether PacBio long-read sequencing data could be used to construct a microbiome reference database, we used oral microbiome data for testing. The oral microbiome typically has lower microbial diversity compared to the gut microbiome (<xref ref-type="bibr" rid="ref20">Human Microbiome Project C, 2012</xref>) and has been extensively studied, resulting in well-characterized reference DBs (18). First, we plotted the number of ASVs obtained from an individual to determine the minimum number of samples required to represent a population. For an individual oral sample, the average ASV count was 247, ranging from 34 to 440. Since combining and resequencing samples was not feasible, we randomly combined samples to simulate mixtures. When 4, 8, 16, and 32 samples were randomly combined, the number of ASVs detected increased gradually with the number of samples in the group (<xref ref-type="fig" rid="fig1">Figure 1B</xref>). To assess their efficiency in classifying Illumina data, we constructed a BLAST reference database using the PacBio ASVs obtained from various combinations. A stand-alone BLAST search was performed against Illumina data, and the results were compared against eHOMD to obtain discrete statistics. BLAST operates by aligning query sequences to a database of sequences, identifying regions of similarity using a heuristic algorithm to find high-scoring sequence alignments quickly. It produces a list of sequences in the database that are most similar to the query sequence, along with alignment scores and statistics, including identical nucleotide length and percentage (<xref ref-type="bibr" rid="ref1">Altschul et al., 1990</xref>; <xref ref-type="bibr" rid="ref8">Camacho et al., 2009</xref>).</p>
<p>When comparing eHOMD and PacBio_4 (4 samples mixed), which had a similar number of reference counts, the proportion of high identity (&#x003E;97%) was significantly higher using eHOMD, while the overall positively blasted (&#x003E;90%) percentage of reads was over 95% for both. Generally, sequence identity of 97, 95, and 90% or less for 16S rRNA genes is considered distinctive for species, genera, and family, respectively (<xref ref-type="bibr" rid="ref46">Yarza et al., 2014</xref>; <xref ref-type="bibr" rid="ref40">Tindall et al., 2010</xref>). The combination that showed comparable high identity performance to eHOMD was PacBio_16 (16 samples mixed). Thus, a minimum of 4 samples was sufficient to determine 95% of reads at the family level, while at least 16 samples were required to determine 95% of reads at the species level. Given that PacBio_32 included nearly 6,000 ASVs compared to eHOMD&#x2019;s 1,032 sequences, we tested whether the eHOMD could be enhanced by adding ASVs from PacBio_32. Specifically, we filtered Illumina reads that showed less than 97% identity against eHOMD but higher than 97% identity against PacBio ASVs. We identified 130 ASVs, and the database created by combining eHOMD with these 130 ASVs (eHOMD+Pac) demonstrated the highest taxonomic assignment performance (<xref ref-type="fig" rid="fig1">Figure 1C</xref>). Taken together, with sufficient samples, PacBio full-length sequencing data can be utilized to construct a reference DB from scratch for oral microbiome studies.</p>
<p>To investigate any discrepancies in microbiome coverage between the PacBio DB and eHOMD, a phylogenetic tree was constructed using PacBio_32 ASV sequences combined with eHOMD. Phylogenetic analysis, which can be used for biological classification (<xref ref-type="bibr" rid="ref11">de Queiroz and Gauthier, 1994</xref>) and predicting characteristics of clonal populations and unstudied species (<xref ref-type="bibr" rid="ref29">Pearson et al., 2009</xref>), revealed that most of the trees included sequences from both databases, suggesting that both databases cover similar taxa (<xref ref-type="fig" rid="fig2">Figure 2</xref>).</p>
<p>Given the substantial size difference between the PacBio_32 ASVs and eHOMD databases, we aimed to optimize the PacBio ASVs. One method to optimize the database is by constructing a phylogenetic tree, trimming closely related branches, and retaining the representative taxa (<xref ref-type="bibr" rid="ref27">Mikula, 2018</xref>). To find the optimal condition, terminal branches were trimmed at various thresholds. When these trimmed ASVs were used to BLAST Illumina sequencing data, a negative correlation was observed between the threshold and identity outcome. With a trimming threshold of 0.0005, the number of ASVs in the BLAST reference database was reduced by 50%, while the BLAST search performance remained similar to that of the PacBio_32 ASVs (<xref ref-type="fig" rid="fig3">Figure 3F</xref>). Taken together, this approach allows for efficient database optimization while maintaining high taxonomic assignment accuracy.</p>
<p>To evaluate whether PacBio ASVs could be applied to other less-studied microbiomes, we tested them against gut microbiome data. The gut microbiome, particularly in the small intestine, presents unique challenges. The microbial community composition in the small intestine differs from that in fecal or oral samples, often containing a higher proportion of fastidious and less well-characterized bacteria, which complicates taxonomic identification (<xref ref-type="bibr" rid="ref39">Thadepalli et al., 1979</xref>; <xref ref-type="bibr" rid="ref42">Villmones et al., 2022</xref>). Obtaining samples from the small intestine typically requires invasive procedures such as endoscopy or intubation, which are more complex, costly, and uncomfortable for patients compared to non-invasive fecal or oral sample collection (<xref ref-type="bibr" rid="ref4">Booijink et al., 2007</xref>). Additionally, the small intestine has a lower microbial biomass compared to the colon, making it more difficult to obtain sufficient microbial DNA for analysis (<xref ref-type="bibr" rid="ref18">Hayashi et al., 2005</xref>). We constructed an optimized gut microbiome reference DB using gut PacBio ASVs.</p>
<p>A pre-trained Naive Bayes classifier was prepared using the Greengene DB, SILVA DB, and gut PacBio ASVs. Gut Illumina V3-V4 paired-reads microbiome data from the ileum, jejunum, and anus were taxonomically assigned by each classifier. Unlike BLAST, the Naive Bayes classifier assigns taxonomy to rRNA sequences by calculating the probability of the sequence belonging to a particular taxon. It is fast and efficient for classifying large numbers of sequences and provides taxonomic assignments with confidence scores, which depend on the quality and comprehensiveness of the training DB (<xref ref-type="bibr" rid="ref43">Wang et al., 2007</xref>). At the genus level, the overall relative abundance showed similar proportions regardless of the DB used (<xref ref-type="fig" rid="fig5">Figure 5A</xref>). At the species level, classifiers trained with Greengene and SILVA DBs assigned more than 50% of the operational taxonomic units (OTUs) as <italic>Bacteroides</italic>, while the classifier trained with the PacBio DB distinguished most OTUs to specific species. Moreover, some species were only identified by the PacBio DB-trained classifier. Our results support that a well-curated, microbiome-specific DB can improve the reliability of 16S sequencing analyses and taxonomic annotations (<xref ref-type="bibr" rid="ref31">Ritari et al., 2015</xref>; <xref ref-type="bibr" rid="ref36">Sierra et al., 2020</xref>). Taken together, an improvement in species assignment was observed when using the PacBio DB across all four gut microbiome samples compared to the other two DBs.</p>
<p>One of the primary purposes of microbiome studies is to discover biomarkers for diseases (<xref ref-type="bibr" rid="ref15">Hajjo et al., 2022</xref>). Biomarker discovery can provide a deeper understanding of disease mechanisms (<xref ref-type="bibr" rid="ref9">Cani, 2018</xref>) and can be applied to disease prediction and treatment (<xref ref-type="bibr" rid="ref41">Veziant et al., 2021</xref>; <xref ref-type="bibr" rid="ref26">Marcos-Zambrano et al., 2021</xref>). We applied LEfSe to evaluate the biomarker discovery efficiency using classifiers trained with various reference DBs. The choice of reference DB significantly impacted the identification of significant taxa. The PacBio reference DB identified significantly more species compared to the other reference DBs. Although further validation is necessary, having more candidate species increases the likelihood of identifying important taxa.</p>
<p>In addition, recent advancements in the accuracy of sequencing long DNA reads using Nanopore technology, particularly in homopolymer regions, may present a new potential method for preparing microbiome reference DBs (<xref ref-type="bibr" rid="ref25">Mantas Sereika et al., 2022</xref>).</p>
</sec>
<sec sec-type="conclusions" id="sec15">
<title>Conclusion</title>
<p>In conclusion, full-length 16S rRNA sequencing data produced by PacBio can be used to construct an optimized microbiome reference database that demonstrates coverage and efficiency comparable to the well-established eHOMD in oral microbiome studies. Applying these optimization methods to gut microbiome data indicated that this approach could be extended to other microbiomes, enhancing the accuracy of microbiome classification and improving biomarker discovery.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec16">
<title>Data availability statement</title>
<p>The raw sequencing data have been retrieved from NCBI GenBank BioProject ID PRJNA1049979. For oral microbiome study, 32 samples were sequenced by Pacbio and 198 samples were sequenced by Illumina platform. For gut microbiome study, 45 samples were sequenced by Pacbio and 128 samples were sequenced by Illumina. Summary of sampling site and sample number is shown in <xref ref-type="table" rid="tab1">Tables 1</xref>, <xref ref-type="table" rid="tab2">2</xref>.</p>
</sec>
<sec sec-type="author-contributions" id="sec17">
<title>Author contributions</title>
<p>HH: Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing, Methodology, Software. YC: Data curation, Funding acquisition, Investigation, Software, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. SK: Methodology, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing, Visualization. JP: Validation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. JC: Supervision, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. HN: Conceptualization, Supervision, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec sec-type="funding-information" id="sec18">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. This research was supported by Basic Science Research Program through the National Research Foundation of Korea (NRF), funded by the Ministry of Education (NRF-2017M3A9B6062021, NRF-2023R1A2C2002783). This work was supported by a 2-Year Research Grant of Pusan National University.</p>
</sec>
<sec sec-type="COI-statement" id="sec19">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="sec20">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec21">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/fmicb.2024.1485073/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/fmicb.2024.1485073/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Supplementary_file_1.docx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Altschul</surname> <given-names>S. F.</given-names></name> <name><surname>Gish</surname> <given-names>W.</given-names></name> <name><surname>Miller</surname> <given-names>W.</given-names></name> <name><surname>Myers</surname> <given-names>E. W.</given-names></name> <name><surname>Lipman</surname> <given-names>D. J.</given-names></name></person-group> (<year>1990</year>). <article-title>Basic local alignment search tool</article-title>. <source>J. Mol. Biol.</source> <volume>215</volume>, <fpage>403</fpage>&#x2013;<lpage>410</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S0022-2836(05)80360-2</pub-id></citation>
</ref>
<ref id="ref2">
<citation citation-type="journal"><person-group person-group-type="author">
<name><surname>Baker</surname> <given-names>M.</given-names></name>
</person-group> (<year>2010</year>). <article-title>Next-generation sequencing: adjusting to data overload</article-title>. <source>Nat. Methods</source> <volume>7</volume>, <fpage>495</fpage>&#x2013;<lpage>499</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nmeth0710-495</pub-id></citation>
</ref>
<ref id="ref3">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bolyen</surname> <given-names>E.</given-names></name> <name><surname>Rideout</surname> <given-names>J. R.</given-names></name> <name><surname>Dillon</surname> <given-names>M. R.</given-names></name> <name><surname>Bokulich</surname> <given-names>N. A.</given-names></name> <name><surname>Abnet</surname> <given-names>C. C.</given-names></name> <name><surname>Al-Ghalith</surname> <given-names>G. A.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Reproducible, interactive, scalable and extensible microbiome data science using QIIME 2</article-title>. <source>Nat. Biotechnol.</source> <volume>37</volume>, <fpage>852</fpage>&#x2013;<lpage>857</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41587-019-0209-9</pub-id>, PMID: <pub-id pub-id-type="pmid">31341288</pub-id></citation>
</ref>
<ref id="ref4">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Booijink</surname> <given-names>C. C.</given-names></name> <name><surname>Zoetendal</surname> <given-names>E. G.</given-names></name> <name><surname>Kleerebezem</surname> <given-names>M.</given-names></name> <name><surname>de Vos</surname> <given-names>W. M.</given-names></name></person-group> (<year>2007</year>). <article-title>Microbial communities in the human small intestine: coupling diversity to metagenomics</article-title>. <source>Future Microbiol.</source> <volume>2</volume>, <fpage>285</fpage>&#x2013;<lpage>295</lpage>. doi: <pub-id pub-id-type="doi">10.2217/17460913.2.3.285</pub-id>, PMID: <pub-id pub-id-type="pmid">17661703</pub-id></citation>
</ref>
<ref id="ref5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Boppana</surname> <given-names>K.</given-names></name> <name><surname>Almansouri</surname> <given-names>N. E.</given-names></name> <name><surname>Bakkannavar</surname> <given-names>S.</given-names></name> <name><surname>Faheem</surname> <given-names>Y.</given-names></name> <name><surname>Jaiswal</surname> <given-names>A.</given-names></name> <name><surname>Shergill</surname> <given-names>K.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Alterations in gut microbiota as early biomarkers for predicting inflammatory bowel disease onset and progression: a systematic review</article-title>. <source>Cureus</source> <volume>16</volume>:<fpage>e58080</fpage>. doi: <pub-id pub-id-type="doi">10.7759/cureus.58080</pub-id>, PMID: <pub-id pub-id-type="pmid">38741828</pub-id></citation>
</ref>
<ref id="ref6">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Buetas</surname> <given-names>E.</given-names></name> <name><surname>Jordan-Lopez</surname> <given-names>M.</given-names></name> <name><surname>Lopez-Roldan</surname> <given-names>A.</given-names></name> <name><surname>D'Auria</surname> <given-names>G.</given-names></name> <name><surname>Martinez-Priego</surname> <given-names>L.</given-names></name> <name><surname>De Marco</surname> <given-names>G.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Full-length 16S rRNA gene sequencing by PacBio improves taxonomic resolution in human microbiome samples</article-title>. <source>BMC Genomics</source> <volume>25</volume>:<fpage>310</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12864-024-10213-5</pub-id>, PMID: <pub-id pub-id-type="pmid">38528457</pub-id></citation>
</ref>
<ref id="ref7">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Callahan</surname> <given-names>B. J.</given-names></name> <name><surname>Wong</surname> <given-names>J.</given-names></name> <name><surname>Heiner</surname> <given-names>C.</given-names></name> <name><surname>Oh</surname> <given-names>S.</given-names></name> <name><surname>Theriot</surname> <given-names>C. M.</given-names></name> <name><surname>Gulati</surname> <given-names>A. S.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>High-throughput amplicon sequencing of the full-length 16S rRNA gene with single-nucleotide resolution</article-title>. <source>Nucleic Acids Res.</source> <volume>47</volume>:<fpage>e103</fpage>. doi: <pub-id pub-id-type="doi">10.1093/nar/gkz569</pub-id>, PMID: <pub-id pub-id-type="pmid">31269198</pub-id></citation>
</ref>
<ref id="ref8">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Camacho</surname> <given-names>C.</given-names></name> <name><surname>Coulouris</surname> <given-names>G.</given-names></name> <name><surname>Avagyan</surname> <given-names>V.</given-names></name> <name><surname>Ma</surname> <given-names>N.</given-names></name> <name><surname>Papadopoulos</surname> <given-names>J.</given-names></name> <name><surname>Bealer</surname> <given-names>K.</given-names></name> <etal/></person-group>. (<year>2009</year>). <article-title>BLAST+: architecture and applications</article-title>. <source>BMC Bioinform.</source> <volume>10</volume>:<fpage>421</fpage>. doi: <pub-id pub-id-type="doi">10.1186/1471-2105-10-421</pub-id>, PMID: <pub-id pub-id-type="pmid">20003500</pub-id></citation>
</ref>
<ref id="ref9">
<citation citation-type="journal"><person-group person-group-type="author">
<name><surname>Cani</surname> <given-names>P. D.</given-names></name>
</person-group> (<year>2018</year>). <article-title>Human gut microbiome: hopes, threats and promises</article-title>. <source>Gut</source> <volume>67</volume>, <fpage>1716</fpage>&#x2013;<lpage>1725</lpage>. doi: <pub-id pub-id-type="doi">10.1136/gutjnl-2018-316723</pub-id>, PMID: <pub-id pub-id-type="pmid">29934437</pub-id></citation>
</ref>
<ref id="ref10">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cole</surname> <given-names>J. R.</given-names></name> <name><surname>Wang</surname> <given-names>Q.</given-names></name> <name><surname>Fish</surname> <given-names>J. A.</given-names></name> <name><surname>Chai</surname> <given-names>B.</given-names></name> <name><surname>McGarrell</surname> <given-names>D. M.</given-names></name> <name><surname>Sun</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2014</year>). <article-title>Ribosomal database project: data and tools for high throughput rRNA analysis</article-title>. <source>Nucleic Acids Res.</source> <volume>42</volume>, <fpage>D633</fpage>&#x2013;<lpage>D642</lpage>. doi: <pub-id pub-id-type="doi">10.1093/nar/gkt1244</pub-id>, PMID: <pub-id pub-id-type="pmid">24288368</pub-id></citation>
</ref>
<ref id="ref11">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>de Queiroz</surname> <given-names>K.</given-names></name> <name><surname>Gauthier</surname> <given-names>J.</given-names></name></person-group> (<year>1994</year>). <article-title>Toward a phylogenetic system of biological nomenclature</article-title>. <source>Trends Ecol. Evol.</source> <volume>9</volume>, <fpage>27</fpage>&#x2013;<lpage>31</lpage>. doi: <pub-id pub-id-type="doi">10.1016/0169-5347(94)90231-3</pub-id></citation>
</ref>
<ref id="ref12">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>DeSantis</surname> <given-names>T. Z.</given-names></name> <name><surname>Hugenholtz</surname> <given-names>P.</given-names></name> <name><surname>Larsen</surname> <given-names>N.</given-names></name> <name><surname>Rojas</surname> <given-names>M.</given-names></name> <name><surname>Brodie</surname> <given-names>E. L.</given-names></name> <name><surname>Keller</surname> <given-names>K.</given-names></name> <etal/></person-group>. (<year>2006</year>). <article-title>Greengenes, a chimera-checked 16S rRNA gene database and workbench compatible with ARB</article-title>. <source>Appl. Environ. Microbiol.</source> <volume>72</volume>, <fpage>5069</fpage>&#x2013;<lpage>5072</lpage>. doi: <pub-id pub-id-type="doi">10.1128/AEM.03006-05</pub-id>, PMID: <pub-id pub-id-type="pmid">16820507</pub-id></citation>
</ref>
<ref id="ref13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dewhirst</surname> <given-names>F. E.</given-names></name> <name><surname>Chen</surname> <given-names>T.</given-names></name> <name><surname>Izard</surname> <given-names>J.</given-names></name> <name><surname>Paster</surname> <given-names>B. J.</given-names></name> <name><surname>Tanner</surname> <given-names>A. C.</given-names></name> <name><surname>Yu</surname> <given-names>W. H.</given-names></name> <etal/></person-group>. (<year>2010</year>). <article-title>The human oral microbiome</article-title>. <source>J. Bacteriol.</source> <volume>192</volume>, <fpage>5002</fpage>&#x2013;<lpage>5017</lpage>. doi: <pub-id pub-id-type="doi">10.1128/JB.00542-10</pub-id>, PMID: <pub-id pub-id-type="pmid">20656903</pub-id></citation>
</ref>
<ref id="ref14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dong</surname> <given-names>T.</given-names></name> <name><surname>Liang</surname> <given-names>Y.</given-names></name> <name><surname>Xie</surname> <given-names>J.</given-names></name> <name><surname>Fan</surname> <given-names>W.</given-names></name> <name><surname>Chen</surname> <given-names>H.</given-names></name> <name><surname>Han</surname> <given-names>X.</given-names></name></person-group> (<year>2024</year>). <article-title>Integrative analyses identify opportunistic pathogens of patients with lower respiratory tract infections based on metagenomic next-generation sequencing</article-title>. <source>Heliyon</source> <volume>10</volume>:<fpage>e30896</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.heliyon.2024.e30896</pub-id>, PMID: <pub-id pub-id-type="pmid">38765026</pub-id></citation>
</ref>
<ref id="ref15">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hajjo</surname> <given-names>R.</given-names></name> <name><surname>Sabbah</surname> <given-names>D. A.</given-names></name> <name><surname>Al Bawab</surname> <given-names>A. Q.</given-names></name></person-group> (<year>2022</year>). <article-title>Unlocking the potential of the human microbiome for identifying disease diagnostic biomarkers</article-title>. <source>Diagnostics</source> <volume>12</volume>:<fpage>1742</fpage>. doi: <pub-id pub-id-type="doi">10.3390/diagnostics12071742</pub-id>, PMID: <pub-id pub-id-type="pmid">35885645</pub-id></citation>
</ref>
<ref id="ref16">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hall</surname> <given-names>M.</given-names></name> <name><surname>Beiko</surname> <given-names>R. G.</given-names></name></person-group> (<year>2018</year>). <article-title>16S rRNA gene analysis with QIIME2</article-title>. <source>Methods Mol. Biol.</source> <volume>1849</volume>, <fpage>113</fpage>&#x2013;<lpage>129</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-1-4939-8728-3_8</pub-id></citation>
</ref>
<ref id="ref17">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Haneishi</surname> <given-names>Y.</given-names></name> <name><surname>Furuya</surname> <given-names>Y.</given-names></name> <name><surname>Hasegawa</surname> <given-names>M.</given-names></name> <name><surname>Picarelli</surname> <given-names>A.</given-names></name> <name><surname>Rossi</surname> <given-names>M.</given-names></name> <name><surname>Miyamoto</surname> <given-names>J.</given-names></name></person-group> (<year>2023</year>). <article-title>Inflammatory bowel diseases and gut microbiota</article-title>. <source>Int. J. Mol. Sci.</source> <volume>24</volume>:<fpage>3817</fpage>. doi: <pub-id pub-id-type="doi">10.3390/ijms24043817</pub-id>, PMID: <pub-id pub-id-type="pmid">36835245</pub-id></citation>
</ref>
<ref id="ref18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hayashi</surname> <given-names>H.</given-names></name> <name><surname>Takahashi</surname> <given-names>R.</given-names></name> <name><surname>Nishi</surname> <given-names>T.</given-names></name> <name><surname>Sakamoto</surname> <given-names>M.</given-names></name> <name><surname>Benno</surname> <given-names>Y.</given-names></name></person-group> (<year>2005</year>). <article-title>Molecular analysis of jejunal, ileal, caecal and recto-sigmoidal human colonic microbiota using 16S rRNA gene libraries and terminal restriction fragment length polymorphism</article-title>. <source>J. Med. Microbiol.</source> <volume>54</volume>, <fpage>1093</fpage>&#x2013;<lpage>1101</lpage>. doi: <pub-id pub-id-type="doi">10.1099/jmm.0.45935-0</pub-id>, PMID: <pub-id pub-id-type="pmid">16192442</pub-id></citation>
</ref>
<ref id="ref19">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>He</surname> <given-names>B.</given-names></name> <name><surname>Cao</surname> <given-names>Y.</given-names></name> <name><surname>Zhuang</surname> <given-names>Z.</given-names></name> <name><surname>Deng</surname> <given-names>Q.</given-names></name> <name><surname>Qiu</surname> <given-names>Y.</given-names></name> <name><surname>Pan</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>The potential value of oral microbial signatures for prediction of oral squamous cell carcinoma based on machine learning algorithms</article-title>. <source>Head Neck</source> <volume>46</volume>, <fpage>1660</fpage>&#x2013;<lpage>1670</lpage>. doi: <pub-id pub-id-type="doi">10.1002/hed.27795</pub-id>, PMID: <pub-id pub-id-type="pmid">38695435</pub-id></citation>
</ref>
<ref id="ref20">
<citation citation-type="journal"><person-group person-group-type="author">
<collab id="coll1">Human Microbiome Project Consortium</collab>
</person-group> (<year>2012</year>). <article-title>Structure, function and diversity of the healthy human microbiome</article-title>. <source>Nature</source> <volume>486</volume>, <fpage>207</fpage>&#x2013;<lpage>214</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nature11234</pub-id>, PMID: <pub-id pub-id-type="pmid">22699609</pub-id></citation>
</ref>
<ref id="ref21">
<citation citation-type="journal"><person-group person-group-type="author">
<name><surname>Letunic</surname> <given-names>I.</given-names></name>
<name><surname>Bork</surname> <given-names>P.</given-names></name>
</person-group> (<year>2021</year>). <article-title>Interactive tree of life (iTOL) v5: an online tool for phylogenetic tree display and annotation</article-title>. <source>Nucleic Acids Res.</source> <volume>49</volume>, <fpage>W293</fpage>&#x2013;<lpage>W296</lpage>. doi: <pub-id pub-id-type="doi">10.1093/nar/gkab301</pub-id></citation>
</ref>
<ref id="ref22">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jie</surname> <given-names>Z.</given-names></name> <name><surname>Xia</surname> <given-names>H.</given-names></name> <name><surname>Zhong</surname> <given-names>S. L.</given-names></name> <name><surname>Feng</surname> <given-names>Q.</given-names></name> <name><surname>Li</surname> <given-names>S.</given-names></name> <name><surname>Liang</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>The gut microbiome in atherosclerotic cardiovascular disease</article-title>. <source>Nat. Commun.</source> <volume>8</volume>:<fpage>845</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41467-017-00900-1</pub-id></citation>
</ref>
<ref id="ref23">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Katiraei</surname> <given-names>S.</given-names></name> <name><surname>Anvar</surname> <given-names>Y.</given-names></name> <name><surname>Hoving</surname> <given-names>L.</given-names></name> <name><surname>Berb&#x00E9;e</surname> <given-names>J. F. P.</given-names></name> <name><surname>van Harmelen</surname> <given-names>V.</given-names></name> <name><surname>Willems van Dijk</surname> <given-names>K.</given-names></name></person-group> (<year>2022</year>). <article-title>Evaluation of full-length versus V4-region 16S rRNA sequencing for phylogenetic analysis of mouse intestinal microbiota after a dietary intervention</article-title>. <source>Curr. Microbiol.</source> <volume>79</volume>:<fpage>276</fpage>. doi: <pub-id pub-id-type="doi">10.1007/s00284-022-02956-9</pub-id>, PMID: <pub-id pub-id-type="pmid">35907023</pub-id></citation>
</ref>
<ref id="ref24">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>L.</given-names></name> <name><surname>Wang</surname> <given-names>Z.</given-names></name> <name><surname>He</surname> <given-names>P.</given-names></name> <name><surname>Ma</surname> <given-names>S.</given-names></name> <name><surname>Du</surname> <given-names>J.</given-names></name> <name><surname>Jiang</surname> <given-names>R.</given-names></name></person-group> (<year>2016</year>). <article-title>Construction and analysis of functional networks in the gut microbiome of type 2 diabetes patients</article-title>. <source>Genom. Proteom. Bioinform.</source> <volume>14</volume>, <fpage>314</fpage>&#x2013;<lpage>324</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.gpb.2016.02.005</pub-id>, PMID: <pub-id pub-id-type="pmid">27746285</pub-id></citation>
</ref>
<ref id="ref25">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sereika</surname> <given-names>M.</given-names></name> <name><surname>Kirkegaard</surname> <given-names>R. H.</given-names></name> <name><surname>Karst</surname> <given-names>S. M.</given-names></name> <name><surname>Michaelsen</surname> <given-names>T. Y.</given-names></name> <name><surname>S&#x00F8;rensen</surname> <given-names>E. A.</given-names></name> <name><surname>Wollenberg</surname> <given-names>R. D.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Oxford Nanopore R10.4 long-read sequencing enables the generation of near-finished bacterial genomes from pure cultures and metagenomes without short-read or reference polishing</article-title>. <source>Nat. Methods</source> <volume>19</volume>, <fpage>823</fpage>&#x2013;<lpage>826</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41592-022-01539-7</pub-id></citation>
</ref>
<ref id="ref26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Marcos-Zambrano</surname> <given-names>L. J.</given-names></name> <name><surname>Karaduzovic-Hadziabdic</surname> <given-names>K.</given-names></name> <name><surname>Loncar Turukalo</surname> <given-names>T.</given-names></name> <name><surname>Przymus</surname> <given-names>P.</given-names></name> <name><surname>Trajkovik</surname> <given-names>V.</given-names></name> <name><surname>Aasmets</surname> <given-names>O.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Applications of machine learning in human microbiome studies: a review on feature selection, biomarker identification, disease prediction and treatment</article-title>. <source>Front. Microbiol.</source> <volume>12</volume>:<fpage>634511</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fmicb.2021.634511</pub-id></citation>
</ref>
<ref id="ref27">
<citation citation-type="other"><person-group person-group-type="author">
<name><surname>Mikula</surname> <given-names>O.</given-names></name>
</person-group> (<year>2018</year>). <article-title>Cutting tree branches to pick OTUs: A novel method of provisional species delimitation</article-title>. <source>bioRxiv.</source> <fpage>419887</fpage>. [Preprint].</citation>
</ref>
<ref id="ref28">
<citation citation-type="journal"><person-group person-group-type="author">
<name><surname>Balvo&#x010D;i&#x016B;t&#x0117;</surname> <given-names>M.</given-names></name>
<name><surname>Huson</surname> <given-names>D. H.</given-names></name>
</person-group> (<year>2017</year>). <article-title>SILVA, RDP, Greengenes, NCBI and OTT &#x2014; how do these taxonomies compare?</article-title> <source>BMC Genomics</source> <volume>18</volume>:<fpage>114</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12864-017-3501-4</pub-id></citation>
</ref>
<ref id="ref29">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pearson</surname> <given-names>T.</given-names></name> <name><surname>Okinaka</surname> <given-names>R. T.</given-names></name> <name><surname>Foster</surname> <given-names>J. T.</given-names></name> <name><surname>Keim</surname> <given-names>P.</given-names></name></person-group> (<year>2009</year>). <article-title>Phylogenetic understanding of clonal populations in an era of whole genome sequencing</article-title>. <source>Infect. Genet. Evol.</source> <volume>9</volume>, <fpage>1010</fpage>&#x2013;<lpage>1019</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.meegid.2009.05.014</pub-id></citation>
</ref>
<ref id="ref30">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Quast</surname> <given-names>C.</given-names></name> <name><surname>Pruesse</surname> <given-names>E.</given-names></name> <name><surname>Yilmaz</surname> <given-names>P.</given-names></name> <name><surname>Gerken</surname> <given-names>J.</given-names></name> <name><surname>Schweer</surname> <given-names>T.</given-names></name> <name><surname>Yarza</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2013</year>). <article-title>The SILVA ribosomal RNA gene database project: improved data processing and web-based tools</article-title>. <source>Nucleic Acids Res.</source> <volume>41</volume>, <fpage>D590</fpage>&#x2013;<lpage>D596</lpage>. doi: <pub-id pub-id-type="doi">10.1093/nar/gks1219</pub-id>, PMID: <pub-id pub-id-type="pmid">23193283</pub-id></citation>
</ref>
<ref id="ref31">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ritari</surname> <given-names>J.</given-names></name> <name><surname>Salojarvi</surname> <given-names>J.</given-names></name> <name><surname>Lahti</surname> <given-names>L.</given-names></name> <name><surname>de Vos</surname> <given-names>W. M.</given-names></name></person-group> (<year>2015</year>). <article-title>Improved taxonomic assignment of human intestinal 16S rRNA sequences by a dedicated reference database</article-title>. <source>BMC Genomics</source> <volume>16</volume>:<fpage>1056</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12864-015-2265-y</pub-id>, PMID: <pub-id pub-id-type="pmid">26651617</pub-id></citation>
</ref>
<ref id="ref32">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Satam</surname> <given-names>H.</given-names></name> <name><surname>Joshi</surname> <given-names>K.</given-names></name> <name><surname>Mangrolia</surname> <given-names>U.</given-names></name> <name><surname>Waghoo</surname> <given-names>S.</given-names></name> <name><surname>Zaidi</surname> <given-names>G.</given-names></name> <name><surname>Rawool</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Next-generation sequencing technology: current trends and advancements</article-title>. <source>Biology</source> <volume>12</volume>:<fpage>997</fpage>. doi: <pub-id pub-id-type="doi">10.3390/biology12070997</pub-id></citation>
</ref>
<ref id="ref33">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sczyrba</surname> <given-names>A.</given-names></name> <name><surname>Hofmann</surname> <given-names>P.</given-names></name> <name><surname>Belmann</surname> <given-names>P.</given-names></name> <name><surname>Koslicki</surname> <given-names>D.</given-names></name> <name><surname>Janssen</surname> <given-names>S.</given-names></name> <name><surname>Droge</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Critical assessment of metagenome interpretation&#x2014;a benchmark of metagenomics software</article-title>. <source>Nat. Methods</source> <volume>14</volume>, <fpage>1063</fpage>&#x2013;<lpage>1071</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nmeth.4458</pub-id>, PMID: <pub-id pub-id-type="pmid">28967888</pub-id></citation>
</ref>
<ref id="ref34">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Segata</surname> <given-names>N.</given-names></name> <name><surname>Izard</surname> <given-names>J.</given-names></name> <name><surname>Waldron</surname> <given-names>L.</given-names></name> <name><surname>Gevers</surname> <given-names>D.</given-names></name> <name><surname>Miropolsky</surname> <given-names>L.</given-names></name> <name><surname>Garrett</surname> <given-names>W. S.</given-names></name> <etal/></person-group>. (<year>2011</year>). <article-title>Metagenomic biomarker discovery and explanation</article-title>. <source>Genome Biol.</source> <volume>12</volume>:<fpage>R60</fpage>. doi: <pub-id pub-id-type="doi">10.1186/gb-2011-12-6-r60</pub-id>, PMID: <pub-id pub-id-type="pmid">21702898</pub-id></citation>
</ref>
<ref id="ref35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>She</surname> <given-names>J. J.</given-names></name> <name><surname>Liu</surname> <given-names>W. X.</given-names></name> <name><surname>Ding</surname> <given-names>X. M.</given-names></name> <name><surname>Guo</surname> <given-names>G.</given-names></name> <name><surname>Han</surname> <given-names>J.</given-names></name> <name><surname>Shi</surname> <given-names>F. Y.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Defining the biogeographical map and potential bacterial translocation of microbiome in human 'surface organs'</article-title>. <source>Nat. Commun.</source> <volume>15</volume>:<fpage>427</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41467-024-44720-6</pub-id>, PMID: <pub-id pub-id-type="pmid">38199995</pub-id></citation>
</ref>
<ref id="ref36">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sierra</surname> <given-names>M. A.</given-names></name> <name><surname>Li</surname> <given-names>Q.</given-names></name> <name><surname>Pushalkar</surname> <given-names>S.</given-names></name> <name><surname>Paul</surname> <given-names>B.</given-names></name> <name><surname>Sandoval</surname> <given-names>T. A.</given-names></name> <name><surname>Kamer</surname> <given-names>A. R.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>The influences of bioinformatics tools and reference databases in analyzing the human oral microbial community</article-title>. <source>Genes (Basel)</source> <volume>11</volume>:<fpage>878</fpage>. doi: <pub-id pub-id-type="doi">10.3390/genes11080878</pub-id>, PMID: <pub-id pub-id-type="pmid">32756341</pub-id></citation>
</ref>
<ref id="ref37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Souza</surname> <given-names>A. K.</given-names></name> <name><surname>Zangirolamo</surname> <given-names>A. F.</given-names></name> <name><surname>Droher</surname> <given-names>R. G.</given-names></name> <name><surname>Bonato</surname> <given-names>F. G. C.</given-names></name> <name><surname>Alfieri</surname> <given-names>A. A.</given-names></name> <name><surname>Carvalho da Costa</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Investigation of the vaginal microbiota of dairy cows through genetic sequencing of short (Illumina) and long (PacBio) reads and associations with gestational status</article-title>. <source>PLoS One</source> <volume>18</volume>:<fpage>e0290026</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pone.0290026</pub-id>, PMID: <pub-id pub-id-type="pmid">37611040</pub-id></citation>
</ref>
<ref id="ref38">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Stackebrandt</surname> <given-names>E.</given-names></name> <name><surname>Goebel</surname> <given-names>B. M.</given-names></name></person-group> (<year>1994</year>). <article-title>Taxonomic note: a place for DNA-DNA reassociation and 16S rRNA sequence analysis in the present species definition in bacteriology</article-title>. <source>Int. J. Syst. Evol. Microbiol.</source> <volume>44</volume>, <fpage>846</fpage>&#x2013;<lpage>849</lpage>. doi: <pub-id pub-id-type="doi">10.1099/00207713-44-4-846</pub-id></citation>
</ref>
<ref id="ref39">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Thadepalli</surname> <given-names>H.</given-names></name> <name><surname>Lou</surname> <given-names>M. A.</given-names></name> <name><surname>Bach</surname> <given-names>V. T.</given-names></name> <name><surname>Matsui</surname> <given-names>T. K.</given-names></name> <name><surname>Mandal</surname> <given-names>A. K.</given-names></name></person-group> (<year>1979</year>). <article-title>Microflora of the human small intestine</article-title>. <source>Am. J. Surg.</source> <volume>138</volume>, <fpage>845</fpage>&#x2013;<lpage>850</lpage>. doi: <pub-id pub-id-type="doi">10.1016/0002-9610(79)90309-X</pub-id></citation>
</ref>
<ref id="ref40">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tindall</surname> <given-names>B. J.</given-names></name> <name><surname>Rossello-Mora</surname> <given-names>R.</given-names></name> <name><surname>Busse</surname> <given-names>H. J.</given-names></name> <name><surname>Ludwig</surname> <given-names>W.</given-names></name> <name><surname>Kampfer</surname> <given-names>P.</given-names></name></person-group> (<year>2010</year>). <article-title>Notes on the characterization of prokaryote strains for taxonomic purposes</article-title>. <source>Int. J. Syst. Evol. Microbiol.</source> <volume>60</volume>, <fpage>249</fpage>&#x2013;<lpage>266</lpage>. doi: <pub-id pub-id-type="doi">10.1099/ijs.0.016949-0</pub-id>, PMID: <pub-id pub-id-type="pmid">19700448</pub-id></citation>
</ref>
<ref id="ref41">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Veziant</surname> <given-names>J.</given-names></name> <name><surname>Villeger</surname> <given-names>R.</given-names></name> <name><surname>Barnich</surname> <given-names>N.</given-names></name> <name><surname>Bonnet</surname> <given-names>M.</given-names></name></person-group> (<year>2021</year>). <article-title>Gut microbiota as potential biomarker and/or therapeutic target to improve the management of cancer: focus on colibactin-producing <italic>Escherichia coli</italic> in colorectal cancer</article-title>. <source>Cancers (Basel)</source> <volume>13</volume>:<fpage>2215</fpage>. doi: <pub-id pub-id-type="doi">10.3390/cancers13092215</pub-id>, PMID: <pub-id pub-id-type="pmid">34063108</pub-id></citation>
</ref>
<ref id="ref42">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Villmones</surname> <given-names>H. C.</given-names></name> <name><surname>Svanevik</surname> <given-names>M.</given-names></name> <name><surname>Ulvestad</surname> <given-names>E.</given-names></name> <name><surname>Stenstad</surname> <given-names>T.</given-names></name> <name><surname>Anthonisen</surname> <given-names>I. L.</given-names></name> <name><surname>Nygaard</surname> <given-names>R. M.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Investigating the human jejunal microbiota</article-title>. <source>Sci. Rep.</source> <volume>12</volume>:<fpage>1682</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-022-05723-9</pub-id></citation>
</ref>
<ref id="ref43">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>Q.</given-names></name> <name><surname>Garrity</surname> <given-names>G. M.</given-names></name> <name><surname>Tiedje</surname> <given-names>J. M.</given-names></name> <name><surname>Cole</surname> <given-names>J. R.</given-names></name></person-group> (<year>2007</year>). <article-title>Naive Bayesian classifier for rapid assignment of rRNA sequences into the new bacterial taxonomy</article-title>. <source>Appl. Environ. Microbiol.</source> <volume>73</volume>, <fpage>5261</fpage>&#x2013;<lpage>5267</lpage>. doi: <pub-id pub-id-type="doi">10.1128/AEM.00062-07</pub-id>, PMID: <pub-id pub-id-type="pmid">17586664</pub-id></citation>
</ref>
<ref id="ref44">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wenger</surname> <given-names>A. M.</given-names></name> <name><surname>Peluso</surname> <given-names>P.</given-names></name> <name><surname>Rowell</surname> <given-names>W. J.</given-names></name> <name><surname>Chang</surname> <given-names>P. C.</given-names></name> <name><surname>Hall</surname> <given-names>R. J.</given-names></name> <name><surname>Concepcion</surname> <given-names>G. T.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Accurate circular consensus long-read sequencing improves variant detection and assembly of a human genome</article-title>. <source>Nat. Biotechnol.</source> <volume>37</volume>, <fpage>1155</fpage>&#x2013;<lpage>1162</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41587-019-0217-9</pub-id>, PMID: <pub-id pub-id-type="pmid">31406327</pub-id></citation>
</ref>
<ref id="ref45">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Woese</surname> <given-names>C. R.</given-names></name> <name><surname>Fox</surname> <given-names>G. E.</given-names></name></person-group> (<year>1977</year>). <article-title>Phylogenetic structure of the prokaryotic domain: the primary kingdoms</article-title>. <source>Proc. Natl. Acad. Sci. USA</source> <volume>74</volume>, <fpage>5088</fpage>&#x2013;<lpage>5090</lpage>. doi: <pub-id pub-id-type="doi">10.1073/pnas.74.11.5088</pub-id>, PMID: <pub-id pub-id-type="pmid">270744</pub-id></citation>
</ref>
<ref id="ref46">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yarza</surname> <given-names>P.</given-names></name> <name><surname>Yilmaz</surname> <given-names>P.</given-names></name> <name><surname>Pruesse</surname> <given-names>E.</given-names></name> <name><surname>Glockner</surname> <given-names>F. O.</given-names></name> <name><surname>Ludwig</surname> <given-names>W.</given-names></name> <name><surname>Schleifer</surname> <given-names>K. H.</given-names></name> <etal/></person-group>. (<year>2014</year>). <article-title>Uniting the classification of cultured and uncultured bacteria and archaea using 16S rRNA gene sequences</article-title>. <source>Nat. Rev. Microbiol.</source> <volume>12</volume>, <fpage>635</fpage>&#x2013;<lpage>645</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nrmicro3330</pub-id>, PMID: <pub-id pub-id-type="pmid">25118885</pub-id></citation>
</ref>
</ref-list>
</back>
</article>