<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Microbiol.</journal-id>
<journal-title>Frontiers in Microbiology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Microbiol.</abbrev-journal-title>
<issn pub-type="epub">1664-302X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmicb.2023.1118158</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Microbiology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Combination of whole genome sequencing and supervised machine learning provides unambiguous identification of <italic>eae</italic>-positive Shiga toxin-producing <italic>Escherichia coli</italic></article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes" equal-contrib="yes">
<name><surname>Vorimore</surname> <given-names>Fabien</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<xref ref-type="author-notes" rid="fn001"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2128905/overview"/>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Jaudou</surname> <given-names>Sandra</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="fn001"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2296688/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Tran</surname> <given-names>Mai-Lan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2297131/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Richard</surname> <given-names>Hugues</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/46224/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Fach</surname> <given-names>Patrick</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/157477/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Delannoy</surname> <given-names>Sabine</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/165093/overview"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>ANSES, Laboratory for Food Safety, Genomics Platform IdentyPath</institution>, <addr-line>Maisons-Alfort</addr-line>, <country>France</country></aff>
<aff id="aff2"><sup>2</sup><institution>ANSES, Laboratory for Food Safety, COLiPATH Unit</institution>, <addr-line>Maisons-Alfort</addr-line>, <country>France</country></aff>
<aff id="aff3"><sup>3</sup><institution>Bioinformatics Unit, Genome Competence Center (MF1), Robert Koch Institute</institution>, <addr-line>Berlin</addr-line>, <country>Germany</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Abani Kumar Pradhan, University of Maryland, College Park, United States</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Zachary R. Stromberg, Pacific Northwest National Laboratory (DOE), United States; Patrick Murigu Kamau Njage, Technical University of Denmark, Denmark</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Fabien Vorimore <email>fabien.vorimore&#x00040;anses.fr</email></corresp>
<fn fn-type="equal" id="fn001"><p>&#x02020;These authors have contributed equally to this work</p></fn></author-notes>
<pub-date pub-type="epub">
<day>12</day>
<month>05</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>14</volume>
<elocation-id>1118158</elocation-id>
<history>
<date date-type="received">
<day>07</day>
<month>12</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>21</day>
<month>04</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2023 Vorimore, Jaudou, Tran, Richard, Fach and Delannoy.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Vorimore, Jaudou, Tran, Richard, Fach and Delannoy</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license> </permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>The objective of this study was to develop, using a genome-wide machine learning approach, an unambiguous model to predict the presence of highly pathogenic STEC in <italic>E. coli</italic> read assemblies derived from complex samples containing potentially multiple <italic>E. coli</italic> strains. Our approach has taken into account the high genomic plasticity of <italic>E. coli</italic> and utilized the stratification of STEC and <italic>E. coli</italic> pathogroups classification based on the serotype and virulence factors to identify specific combinations of biomarkers for improved characterization of <italic>eae</italic>-positive STEC (also named EHEC for enterohemorrhagic <italic>E. coli</italic>), which are associated with bloody diarrhea and hemolytic uremic syndrome (HUS) in humans.</p></sec>
<sec>
<title>Methods</title>
<p>The Machine Learning (ML) approach was used in this study on a large curated dataset composed of 1,493 <italic>E. coli</italic> genome sequences and 1,178 Coding Sequences (CDS). Feature selection has been performed using eight classification algorithms, resulting in a reduction of the number of CDS to six. From this reduced dataset, the eight ML models were trained with hyper-parameter tuning and cross-validation steps.</p></sec>
<sec>
<title>Results and discussion</title>
<p>It is remarkable that, using only these six genes, EHEC can be clearly identified from <italic>E. coli</italic> read assemblies obtained from <italic>in silico</italic> mixtures and complex samples such as milk metagenomes. These various combinations of discriminative biomarkers can be implemented as novel marker genes for the unambiguous EHEC characterization from different <italic>E. coli</italic> strain mixtures as well as from raw milk metagenomes.</p></sec></abstract>
<kwd-group>
<kwd>machine learning</kwd>
<kwd>Shiga toxin-producing <italic>Escherichia coli</italic></kwd>
<kwd>food safety</kwd>
<kwd>metagenomics</kwd>
<kwd>raw milk</kwd>
</kwd-group>
<counts>
<fig-count count="4"/>
<table-count count="3"/>
<equation-count count="0"/>
<ref-count count="52"/>
<page-count count="13"/>
<word-count count="9333"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Food Microbiology</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1. Introduction</title>
<p>Shiga toxin-producing <italic>Escherichia coli</italic> (STEC) are important zoonotic pathogens comprising more than 400 serotypes (Beutin and Fach, <xref ref-type="bibr" rid="B2">2015</xref>). Pathogenic STEC strains such as enterohemorrhagic <italic>E. coli</italic> (EHEC) may cause hemorrhagic colitis (HC) and hemolytic-uremic syndrome (HUS) in humans. However, it remains difficult to fully define human pathogenic STEC or identify virulence factors for STEC that clearly foresee their capacity to cause human disease (European Food Safety Authority and European Centre for Disease Prevention and Control, <xref ref-type="bibr" rid="B13">2021</xref>). The production of Shiga toxin (<italic>stx</italic> genes) by highly pathogenic STEC (i.e., EHEC) is the major virulence factor responsible for HUS, but many <italic>E. coli</italic> strains that produce Shiga toxin do not cause HUS. Therefore, the identification of virulent STEC strains based solely on the presence of <italic>stx</italic> genes may be misleading. Shiga toxins comprise a growing family of genes with a vast type diversity (Scheutz et al., <xref ref-type="bibr" rid="B45">2012</xref>). The Stx family splits into two major branches, Stx1 and Stx2, which are immunologically not cross-reactive and show about 55% difference in their amino acid sequences (M&#x000FC;thing et al., <xref ref-type="bibr" rid="B38">2009</xref>). In addition to producing one or both types of Shiga toxin, typical EHEC strains harbor a genomic pathogenicity island called the &#x0201C;locus of enterocyte effacement&#x0201D; (LEE). This locus was first identified in enteropathogenic <italic>E. coli</italic> (EPEC), a leading cause of infant diarrhea in developing countries. The LEE carries genes encoding proteins involved in the pathogenicity of <italic>E. coli</italic> strains, as they participate in bacterial colonization of the gut and destruction of the intestinal mucosa (Nataro and Kaper, <xref ref-type="bibr" rid="B39">1998</xref>). 
For example, the intimin-encoding gene (<italic>eae</italic>) is directly involved in the attaching and effacing (A/E) process and serves as an indicator for the A/E function in the bacteria (Zhang et al., <xref ref-type="bibr" rid="B52">2002</xref>). As mentioned above, prediction of STEC pathogenicity using available markers is challenging, but strains positive for Shiga toxin (in particular the <italic>stx2</italic> genes) and <italic>eae</italic> (intimin production) genes have been shown to be associated with a higher risk of causing more severe illness than other virulence factor combinations (European Food Safety Authority, <xref ref-type="bibr" rid="B14">2007</xref>, <xref ref-type="bibr" rid="B15">2013</xref>). STEC are traditionally considered to be zoonotic pathogens that are primarily food- and water-borne, with the main reservoir being the digestive tract of mammals, particularly ruminants (Gill et al., <xref ref-type="bibr" rid="B19">2022</xref>). Consumption of contaminated food, such as undercooked ground meat and unpasteurized dairy products, is the principal source of infection. Current methods for EHEC identification in feed and food samples rely on the molecular detection of <italic>stx, eae</italic>, and the top five or top seven EHEC serogroups, followed by strain isolation, as described in the ISO/TS 13136:2012 (EU) and MLG5C.02 (US) reference methods (International Organization for Standardization, <xref ref-type="bibr" rid="B27">2012</xref>; European Food Safety Authority and European Centre for Disease Prevention and Control, <xref ref-type="bibr" rid="B13">2021</xref>). The strain isolation step is necessary to demonstrate that both genes are present in the same strain. Indeed, the major challenge for EHEC identification based on <italic>stx</italic> and <italic>eae</italic> genes detection is that these genes are located on mobile genetic elements, and can be carried by non-pathogenic <italic>E. 
coli</italic> strains simultaneously present in the food matrix, as well as other <italic>Enterobacteriaceae</italic> (Herold et al., <xref ref-type="bibr" rid="B23">2004</xref>) or even free bacteriophages (Imamovic et al., <xref ref-type="bibr" rid="B25">2009</xref>). The high rates of unconfirmed presumptive positive results observed in food safety tests are a global challenge for the regulatory agencies and industry quality control laboratories performing STEC testing (Delannoy et al., <xref ref-type="bibr" rid="B11">2016</xref>, <xref ref-type="bibr" rid="B12">2022</xref>). It remains a desirable goal for the industry and decision makers to develop cost-effective sensitive detection tests that can guarantee the highest level of food safety. Our objective here was to refine the EHEC diagnostic systems for better identification and characterization of highly pathogenic STEC from any kind of food sample. This work was based on the hypothesis that the co-occurrence of the <italic>stx</italic> and <italic>eae</italic> genes in the same genome would imply the presence of other (variable) genes and should create complex genetic signatures. We took advantage of a Genome Wide Association Study program (GWAS) to explore a large number of <italic>E. coli</italic> assemblies available from public databases (Franz et al., <xref ref-type="bibr" rid="B17">2014</xref>) and generated a complex matrix summarizing presence and absence for groups of orthologous genes. Machine learning (ML) methods perform admirably in detecting predictive patterns hidden within high dimensional data (Lupolova et al., <xref ref-type="bibr" rid="B33">2016</xref>; Moradigaravand et al., <xref ref-type="bibr" rid="B37">2018</xref>). Supervised learning was used to create ML models that can precisely predict the co-occurrence of <italic>stx</italic> and <italic>eae</italic> genes in a genome or an assembly. 
After testing on simple <italic>in silico</italic> mixtures of strains, we successfully applied these models on long-read metagenomic sequencing data of artificially <italic>eae</italic>-positive STEC contaminated raw milk samples.</p></sec>
<sec sec-type="materials and methods" id="s2">
<title>2. Materials and methods</title>
<sec>
<title>2.1. Genomic data collection</title>
<p>Available <italic>E. coli</italic> genomes (<italic>n</italic> = 31,230) were retrieved from the GenBank database during the database construction. Based on the genome sequence completeness (full <italic>E. coli</italic> genomes were included in priority), the country of isolation and the <italic>E. coli</italic> pathotype, 1,425 genomes were selected to maximize the diversity. Sixty-eight additional genomes sequenced and assembled in a previous study by Jaudou and colleagues (Jaudou et al., <xref ref-type="bibr" rid="B29">2023</xref>), were added to reach a total of 1,493 genomes. The genome accession numbers and the metadata associated with the selected genomes are reported in <xref ref-type="supplementary-material" rid="SM1">Supplementary Table 1</xref>. All the genomes were screened against a custom database (Available at <ext-link ext-link-type="uri" xlink:href="https://github.com/fabgenomics/ML_EHEC">https://github.com/fabgenomics/ML_EHEC</ext-link>) containing all the <italic>stx</italic> subtypes, <italic>eae</italic> and O-group genes using abricate v1.0.1. (<ext-link ext-link-type="uri" xlink:href="https://github.com/tseemann/abricate">https://github.com/tseemann/abricate</ext-link>). The phylogroup of each genome was determined using the EzClermont phylotyping tool available at <ext-link ext-link-type="uri" xlink:href="https://github.com/nickp60/EzClermont">https://github.com/nickp60/EzClermont</ext-link>.</p>
</sec>
<sec>
<title>2.2. Genome annotation and classification</title>
<p>The selected genomes were annotated using the rapid prokaryotic genome annotation software prokka v1.13.3 (Seemann, <xref ref-type="bibr" rid="B47">2014</xref>) using the <monospace>proteins</monospace> option with the reference genome of <italic>E. coli</italic> O157:H7 str. Sakai (NC_002695.2) (Hayashi et al., <xref ref-type="bibr" rid="B22">2001</xref>). Resulting General Feature Format (.gff) files were processed through a Pangenome analysis pipeline using panaroo v1.2.7 with <monospace>--clean-mode strict</monospace>, <monospace>--remove-invalid-genes</monospace> and <monospace>--merge_paralogs</monospace> options (Tonkin-Hill et al., <xref ref-type="bibr" rid="B51">2020</xref>). Panaroo collapses genes into putative families with a family sequence identity level of 70% by default and creates groups. The gene_presence_absence.Rtab table provided by the panaroo output contains all the groups and genes and the presence/absence information from each genome. These groups and genes were renamed using a custom script (Available at <ext-link ext-link-type="uri" xlink:href="https://github.com/fabgenomics/ML_EHEC">https://github.com/fabgenomics/ML_EHEC</ext-link>) in which we used the information in the panaroo output file matrix (gene_presence_absence.csv available at <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.7129021">https://doi.org/10.5281/zenodo.7129021</ext-link>) to retrieve the corresponding locus tag number from the <italic>E. coli</italic> O157:H7 Sakai strain annotation (ECs number) relative to the group or the gene (CDS). Only groups renamed with the locus tag number by our custom script were retained for further analysis. We created a CDS presence/absence matrix (ECs_presence_absence.csv available at <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.7129021">https://doi.org/10.5281/zenodo.7129021</ext-link>), by adding a new column based on the pathotype of each genome. 
Genomes that were found to be <italic>stx</italic>&#x0002B;/<italic>eae</italic>&#x0002B; were assigned to the EHEC pathotype and noted 1 in the table and all the other genomes (STEC for <italic>stx</italic>&#x0002B;/<italic>eae</italic>-, EPEC for <italic>stx</italic>-/<italic>eae</italic>&#x0002B; and the abbreviation COM was used for <italic>stx</italic>-/<italic>eae</italic>-) were considered non-EHEC pathotype and noted 0. A phylogenetic tree was reconstructed using IQ-TREE v2.0.3 (Minh et al., <xref ref-type="bibr" rid="B36">2020</xref>) with the Generalised Time Reversible (GTR) model on the core genome alignment produced by panaroo. The tree was annotated with the molecular serogroups using the CLC Genomics Workbench v21 (QIAGEN, Aarhus, Denmark).</p>
</sec>
<sec>
<title>2.3. Machine learning model training and evaluation</title>
<p>Before evaluating the performance of the different classifiers, the CDS presence/absence matrix was filtered on non-informative features. Any CDS with less than 10% variance on its presence/absence vector was removed, as these loci do not contain useful information to test machine learning algorithms. This step removed 3,603 CDS, resulting in a dataset with 1,178 CDS. Then, to avoid possible data leakage between training and testing datasets, we grouped the samples based on their similarity. For each pair of samples, their CDS presence/absence vector was used to compute a Hamming distance (<italic>i.e</italic>. the number of differences). Any two samples with a Hamming distance lower than or equal to <italic>D</italic> were allocated to the same cluster. We considered possible values for <italic>D</italic> of 5, 10, 20, 50, 100 and 200. Note that the resulting clusters have extremely homogeneous pathotypes (at <italic>D</italic> &#x0003D; 100, only one cluster consists of a mixture of EHEC and non-EHEC samples). The cluster table is available at <ext-link ext-link-type="uri" xlink:href="https://github.com/fabgenomics/ML_EHEC">https://github.com/fabgenomics/ML_EHEC</ext-link>. The main dataset (<italic>n</italic> = 1,493) was then subsampled to keep only one genome from each cluster. Then, each subsampled dataset was randomly split using 80% of the samples for training/validation and the remaining 20% for testing. The <monospace>train_test_split</monospace> module from Sklearn was used, with the stratify option to control the proportion of EHEC in both datasets. Eight classification algorithms were trained on each dataset using 10-fold Cross-Validation: Decision Tree, Extra Tree, Gradient Boosting, LGBMClassifier, Logistic Regression, Random Forest, XGBClassifier and Support Vector Machine. The evaluation metric used was the function <monospace>cross_val_score</monospace> from the sklearn library. For all the cross-validation scores (10 folds <italic>i.e</italic>. 
10 scores), the mean accuracy was calculated (<xref ref-type="supplementary-material" rid="SM2">Supplementary Table 2</xref> and <xref ref-type="fig" rid="F1">Figure 1</xref>). Further analyses were performed using a distance <italic>D</italic> &#x0003D; 100 for clustering (dataset Cluster-D100). This implies that two samples in this dataset differ by at least 8.5% (100/1178 CDS) of their gene content. A module from Sklearn library v0.23.1 (<monospace>RandomUnderSampler</monospace>) was used to randomly select a number of non-EHEC genomes equal to the number of remaining EHEC genomes from the cluster analysis. The Cluster-D100 dataset was randomly split with a ratio of 80/20% for training and testing datasets respectively and the stratify option. Eight classification algorithms were used to select the most important features with the <monospace>SelectFromModel</monospace> library from Sklearn. The most important CDS are listed in <xref ref-type="table" rid="T1">Table 1</xref>. We arbitrarily chose to select the six most important features to create a new reduced dataset. With this resulting matrix, hyper-parameter tuning was done on each of the eight models using <monospace>RandomizedSearchCV</monospace> and <monospace>GridSearchCV</monospace> (scoring on roc_auc metric) and cross-validation steps (<italic>n</italic> &#x0003D; 5) when the option was available. Finally, each classifier was retrained with its best hyper-parameter and evaluated on the testing dataset previously set aside (accuracy, precision, recall and F1-score are obtained using the <monospace>classification_report</monospace> from Sklearn, see <xref ref-type="supplementary-material" rid="SM3">Supplementary Table 3</xref>). To understand which gene combination led to the prediction of the EHEC pathotype, we generated all 2<sup>6</sup> &#x0003D; 64 combinations of the six genes' presence/absence and computed, for each ML model, the probability of the EHEC pathotype. 
We then kept the cases where the probability was &#x02265;0.7 and transformed the set of gene combinations into simplified Boolean expressions (using a Boolean algebra solver). The results are reported in <xref ref-type="fig" rid="F2">Figure 2</xref>. Charts of the training pipeline and the prediction pipeline are presented in <xref ref-type="fig" rid="F3">Figures 3A</xref>, <xref ref-type="fig" rid="F3">B</xref>, respectively.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Influence of sample clustering on classification performance. For the different classification algorithms, the average accuracy (over 10-fold cross-validation) is reported for different values for the clustering distance <italic>D</italic> (cluster sizes: D5 = 1,489, D10 = 1,470, D20 = 1,410, D50 = 1,203, D100 = 756, D200 = 137).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-14-1118158-g0001.tif"/>
</fig>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Top six most important features extracted from the training of the eight classification models.</p></caption> 
<table frame="box" rules="all">
<thead>
<tr style="background-color:#8f9496">
<th valign="top" align="left"><bold>Rank</bold></th>
<th valign="top" align="center"><bold>Locus_tag</bold></th>
<th valign="top" align="center"><bold>Gene ID</bold></th>
<th valign="top" align="center"><bold>Gene name</bold></th>
<th valign="top" align="center"><bold>Encoded protein</bold></th>
<th valign="top" align="center"><bold>Number of models using the gene for EHEC prediction</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">1</td>
<td valign="top" align="center">ECs_1056</td>
<td valign="top" align="center">62675958</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">Phage excisionase</td>
<td valign="top" align="center">8</td>
</tr> <tr>
<td valign="top" align="left">2</td>
<td valign="top" align="center">ECs_1812</td>
<td valign="top" align="center">912909</td>
<td valign="top" align="center"><italic>nleA/espI</italic></td>
<td valign="top" align="center">T3SS secreted effector NleA/EspI</td>
<td valign="top" align="center">7</td>
</tr> <tr>
<td valign="top" align="left">3</td>
<td valign="top" align="center">ECs_1824</td>
<td valign="top" align="center">912888</td>
<td valign="top" align="center"><italic>nleG</italic></td>
<td valign="top" align="center">T3SS secreted effector NleG</td>
<td valign="top" align="center">5</td>
</tr> <tr>
<td valign="top" align="left">4</td>
<td valign="top" align="center">ECs_3858</td>
<td valign="top" align="center">916318</td>
<td valign="top" align="center"><italic>nleE</italic></td>
<td valign="top" align="center">T3SS secreted effector NleE</td>
<td valign="top" align="center">4</td>
</tr> <tr>
<td valign="top" align="left">5</td>
<td valign="top" align="center">ECs_1815</td>
<td valign="top" align="center">912903</td>
<td valign="top" align="center"><italic>nleF</italic></td>
<td valign="top" align="center">T3SS secreted effector NleF</td>
<td valign="top" align="center">4</td>
</tr>
<tr>
<td valign="top" align="left">6</td>
<td valign="top" align="center">ECs_1561</td>
<td valign="top" align="center">913337</td>
<td valign="top" align="center"><italic>espN</italic></td>
<td valign="top" align="center">T3SS secreted effector EspN</td>
<td valign="top" align="center">4</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The Gene ID of each locus is provided, and when known, the gene name as well.</p>
</table-wrap-foot>
</table-wrap>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Gene combinations that are predicted as EHEC. For each classifier, the combinations of genes predicting the EHEC pathotype are given in columns. A black cell means the gene is present and a grey cell that there is no constraint. For instance, a decision tree predicts the EHEC pathotype if either ECs_1056 and ECs_1812 are present (column 4), or if ECs_1812, ECs_1815, and ECs_1824 are present (column 5). The number of possible combinations for each of the classifiers are ET: 3, DT: 2, SVM: 3, LR: 9, GB: 2, RF: 1.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-14-1118158-g0002.tif"/>
</fig>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p><bold>(A)</bold> Training pipeline for the eight models. <bold>(B)</bold> Pipeline of the prediction on <italic>in-silico</italic> mixtures and artificially contaminated raw milk.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-14-1118158-g0003.tif"/>
</fig>
</sec>
<sec>
<title>2.4. Evaluation of the eight models on <italic>in silico</italic> mixtures of <italic>E. coli</italic></title>
<p>From a previous study conducted by Jaudou et al. (<xref ref-type="bibr" rid="B29">2023</xref>), raw ONT MinION reads from two STEC (ECA279 (O174:H2) = SRR18191627 and 97HMPL652 (O110:H9) = SRR18191587), one <italic>stx</italic>-negative <italic>eae</italic>-positive <italic>E. coli</italic> (2142-O103 (O103:H25) = SRR18191529), one <italic>eae</italic>-positive STEC (<italic>E. coli</italic> 12-1 (O157:H7) = SRR18191640) and one commensal <italic>E. coli</italic> [i.e., negative for both <italic>stx</italic> and <italic>eae</italic> (NC809 (O41:H7) = SRR18191621)] were collected. One <italic>stx</italic>-negative <italic>eae</italic>-positive <italic>E. coli</italic> strain (KK072/05 - O156:H8) was newly sequenced during this study following the same protocol described by Jaudou <italic>et al</italic>. (Jaudou et al., <xref ref-type="bibr" rid="B29">2023</xref>) and the raw ONT MinION reads were deposited to the NCBI database under the number SRX18376762. Raw reads were subsampled using Rasusa v0.6.0 (Hall, <xref ref-type="bibr" rid="B21">2022</xref>) with 5.5Mb for the target number of bases and 10, 20, 30, 40 and 50x for the coverage. From these subsampled reads, twenty-five <italic>in silico</italic> mixtures were generated by concatenating into a 1:1 ratio the same coverage of subsampled reads. Details of the different mixtures are presented in the <xref ref-type="table" rid="T2">Table 2</xref>. From these mixtures, an assembly was generated using metaFlye v2.9-b1768 (Kolmogorov et al., <xref ref-type="bibr" rid="B31">2020</xref>) with <monospace>nano-raw</monospace> and <monospace>meta</monospace> parameters. The resulting assemblies were annotated with the same parameters as described in the Genome annotation and classification paragraph (Section 2.2). The produced GFF file of each annotation was integrated individually in the pangenome graph generated with the 1,493 genomes using the panaroo-integrate command from the panaroo program. 
From the new gene_presence_absence.csv and the gene_presence_absence.Rtab generated by the panaroo-integrate command, the row corresponding to the added mixture was extracted using a newly developed Python script (Available at <ext-link ext-link-type="uri" xlink:href="https://github.com/fabgenomics/ML_EHEC">https://github.com/fabgenomics/ML_EHEC</ext-link>) to reconstruct the CDS presence/absence table. We extracted only the features required for the tested model and when a feature was absent, we created it and introduced a 0 value. The data extracted were used to perform the predictions (<xref ref-type="fig" rid="F3">Figure 3B</xref>). The <monospace>predict_proba</monospace> method from all the algorithms was used to estimate the probability that the sample is an EHEC.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Prediction of the class probabilities on the 25 generated mixtures of pure <italic>E. coli</italic> cultures.</p></caption> 
<table frame="box" rules="all">
<thead>
<tr style="background-color:#8f9496">
<th valign="top" align="left"><bold>Strains and genome coverage used for <italic>in-silico</italic> mixture</bold></th>
<th valign="top" align="center"><bold><italic>E. coli</italic> Pathotype mixture<sup>&#x0002A;</sup></bold></th>
<th valign="top" align="center"><bold>Class<sup>&#x0002A;&#x0002A;</sup></bold></th>
<th valign="top" align="center"><bold>LGBM</bold></th>
<th valign="top" align="center"><bold>LR</bold></th>
<th valign="top" align="center"><bold>DT</bold></th>
<th valign="top" align="center"><bold>XGB</bold></th>
<th valign="top" align="center"><bold>RF</bold></th>
<th valign="top" align="center"><bold>SVM</bold></th>
<th valign="top" align="center"><bold>GB</bold></th>
<th valign="top" align="center"><bold>ET</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">ECA279 &#x0002B; NC809 10x</td>
<td valign="top" align="center">STEC-COM</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.04</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.03</td>
</tr> <tr>
<td valign="top" align="left">ECA279 &#x0002B; NC809 20x</td>
<td valign="top" align="center">STEC-COM</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.04</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.03</td>
</tr> <tr>
<td valign="top" align="left">ECA279 &#x0002B; NC809 30x</td>
<td valign="top" align="center">STEC-COM</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.04</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.03</td>
</tr> <tr>
<td valign="top" align="left">ECA279 &#x0002B; NC809 40x</td>
<td valign="top" align="center">STEC-COM</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.04</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.03</td>
</tr> <tr>
<td valign="top" align="left">ECA279 &#x0002B; NC809 50x</td>
<td valign="top" align="center">STEC-COM</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.04</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.03</td>
</tr> <tr>
<td valign="top" align="left">97HMPL652 &#x0002B; 2142-O103 10x</td>
<td valign="top" align="center">STEC-EPEC</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.22</td>
<td valign="top" align="center">0.27</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.27</td>
<td valign="top" align="center">0.46</td>
<td valign="top" align="center">0.21</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.47</td>
</tr> <tr>
<td valign="top" align="left">97HMPL652 &#x0002B; 2142-O103 20x</td>
<td valign="top" align="center">STEC-EPEC</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.22</td>
<td valign="top" align="center">0.27</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.27</td>
<td valign="top" align="center">0.46</td>
<td valign="top" align="center">0.21</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.47</td>
</tr> <tr>
<td valign="top" align="left">97HMPL652 &#x0002B; 2142-O103 30x</td>
<td valign="top" align="center">STEC-EPEC</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.96</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.87</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.88</td>
</tr> <tr>
<td valign="top" align="left">97HMPL652 &#x0002B; 2142-O103 40x</td>
<td valign="top" align="center">STEC-EPEC</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.73</td>
<td valign="top" align="center">0.72</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.73</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.77</td>
</tr> <tr>
<td valign="top" align="left">97HMPL652 &#x0002B; 2142-O103 50x</td>
<td valign="top" align="center">STEC-EPEC</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.73</td>
<td valign="top" align="center">0.72</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.73</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.77</td>
</tr> <tr>
<td valign="top" align="left">97HMPL652 &#x0002B; KK072/05 10x</td>
<td valign="top" align="center">STEC-EPEC</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.05</td>
<td valign="top" align="center">0.20</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">0.26</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.25</td>
</tr> <tr>
<td valign="top" align="left">97HMPL652 &#x0002B; KK072/05 20x</td>
<td valign="top" align="center">STEC-EPEC</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.05</td>
<td valign="top" align="center">0.20</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">0.26</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.25</td>
</tr> <tr>
<td valign="top" align="left">97HMPL652 &#x0002B; KK072/05 30x</td>
<td valign="top" align="center">STEC-EPEC</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.05</td>
<td valign="top" align="center">0.20</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">0.26</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.25</td>
</tr> <tr>
<td valign="top" align="left">97HMPL652 &#x0002B; KK072/05 40x</td>
<td valign="top" align="center">STEC-EPEC</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.05</td>
<td valign="top" align="center">0.20</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">0.26</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.25</td>
</tr> <tr>
<td valign="top" align="left">97HMPL652 &#x0002B; KK072/05 50x</td>
<td valign="top" align="center">STEC-EPEC</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.05</td>
<td valign="top" align="center">0.20</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">0.26</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.25</td>
</tr> <tr>
<td valign="top" align="left">Ecoli12-1 &#x0002B; NC809 10x</td>
<td valign="top" align="center">EHEC-COM</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.98</td>
</tr> <tr>
<td valign="top" align="left">Ecoli12-1 &#x0002B; NC809 20x</td>
<td valign="top" align="center">EHEC-COM</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.98</td>
</tr> <tr>
<td valign="top" align="left">Ecoli12-1 &#x0002B; NC809 30x</td>
<td valign="top" align="center">EHEC-COM</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.98</td>
</tr> <tr>
<td valign="top" align="left">Ecoli12-1 &#x0002B; NC809 40x</td>
<td valign="top" align="center">EHEC-COM</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.92</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.82</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.86</td>
</tr> <tr>
<td valign="top" align="left">Ecoli12-1 &#x0002B; NC809 50x</td>
<td valign="top" align="center">EHEC-COM</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.98</td>
</tr> <tr>
<td valign="top" align="left">ECA279 &#x0002B; Ecoli12-1 10x</td>
<td valign="top" align="center">STEC-EHEC</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.92</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.82</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.86</td>
</tr> <tr>
<td valign="top" align="left">ECA279 &#x0002B; Ecoli12-1 20x</td>
<td valign="top" align="center">STEC-EHEC</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.92</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.82</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.86</td>
</tr> <tr>
<td valign="top" align="left">ECA279 &#x0002B; Ecoli12-1 30x</td>
<td valign="top" align="center">STEC-EHEC</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.92</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.82</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.86</td>
</tr> <tr>
<td valign="top" align="left">ECA279 &#x0002B; Ecoli12-1 40x</td>
<td valign="top" align="center">STEC-EHEC</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.92</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.82</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.86</td>
</tr> <tr>
<td valign="top" align="left">ECA279 &#x0002B; Ecoli12-1 50x</td>
<td valign="top" align="center">STEC-EHEC</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.92</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.82</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.86</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p><sup>&#x0002A;</sup>EHEC, enterohemorrhagic <italic>E. coli</italic>; STEC, Shiga toxin-producing <italic>Escherichia coli</italic>; EPEC, enteropathogenic <italic>E. coli</italic>; COM, commensal <italic>Escherichia coli</italic>.</p>
<p><sup>&#x0002A;&#x0002A;</sup>Non-EHEC = 0; EHEC = 1.</p>
<p>LGBM, LGBMClassifier; LR, Logistic Regression; DT, Decision Tree; XGB, XGBClassifier; RF, Random Forest; SVM, Support Vector Machine; GB, Gradient Boosting; ET, Extra Tree.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec>
<title>2.5. Evaluation of the eight models on experimentally-contaminated raw milk</title>
<p>Eight metagenomes from artificially contaminated raw milks described in a previous study (Jaudou et al., <xref ref-type="bibr" rid="B28">2022</xref>) were downloaded from the GenBank public database (<xref ref-type="table" rid="T3">Table 3</xref>). The estimated level of contamination was 0.5x10<sup>3</sup>, 0.5x10<sup>2</sup> and 0.5x10<sup>1</sup> CFU.mL<sup>&#x02212;1</sup> of EHEC O26 plus one EHEC-free milk. Raw reads were processed using the STECmetadetector pipeline developed by Jaudou et al. (<xref ref-type="bibr" rid="B28">2022</xref>) available at <ext-link ext-link-type="uri" xlink:href="https://gitlab.com/Bfr_bioinformatics/STECmetadetector">https://gitlab.com/Bfr_bioinformatics/STECmetadetector</ext-link> and the extracted <italic>E. coli</italic> reads were assembled using metaFlye v2.9-b1768 with the same parameters as described in Section 2.4. The resulting assemblies were annotated with the same parameters as described in the Genome annotation and classification paragraph (Section 2.2). The resulting GFF file was treated with the same process as the <italic>in-silico</italic> mixture GFF file and the EHEC predictions were performed, as described in Section 2.3 (<xref ref-type="fig" rid="F3">Figure 3B</xref>).</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Prediction of the class probabilities on the milk metagenomes.</p></caption> 
<table frame="box" rules="all">
<thead>
<tr style="background-color:#8f9496">
<th valign="top" align="left"><bold>Strain</bold></th>
<th valign="top" align="center"><bold>Accession number<sup>&#x0002A;</sup></bold></th>
<th valign="top" align="center"><bold>EHEC spiking level<sup>&#x0002A;&#x0002A;</sup></bold></th>
<th valign="top" align="center"><bold>Class<sup>&#x0002A;&#x0002A;&#x0002A;</sup></bold></th>
<th valign="top" align="center"><bold>LGBM</bold></th>
<th valign="top" align="center"><bold>LR</bold></th>
<th valign="top" align="center"><bold>DT</bold></th>
<th valign="top" align="center"><bold>XGB</bold></th>
<th valign="top" align="center"><bold>RF</bold></th>
<th valign="top" align="center"><bold>SVM</bold></th>
<th valign="top" align="center"><bold>GB</bold></th>
<th valign="top" align="center"><bold>ET</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">4712-O26</td>
<td valign="top" align="center">SRR19090780</td>
<td valign="top" align="center">0.5x10<sup>3</sup></td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.98</td>
</tr> <tr>
<td valign="top" align="left">6423-O26</td>
<td valign="top" align="center">SRR19090775</td>
<td valign="top" align="center">0.5x10<sup>3</sup></td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.98</td>
</tr> <tr>
<td valign="top" align="left">4712-O26</td>
<td valign="top" align="center">SRR19090792</td>
<td valign="top" align="center">0.5x10<sup>2</sup></td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.98</td>
</tr> <tr>
<td valign="top" align="left">6423-O26</td>
<td valign="top" align="center">SRR19090774</td>
<td valign="top" align="center">0.5x10<sup>2</sup></td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.98</td>
</tr> <tr>
<td valign="top" align="left">4712-O26</td>
<td valign="top" align="center">SRR19090778</td>
<td valign="top" align="center">0.5x10<sup>1</sup></td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.98</td>
</tr> <tr>
<td valign="top" align="left">6423-O26</td>
<td valign="top" align="center">SRR19090772</td>
<td valign="top" align="center">0.5x10<sup>1</sup></td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.98</td>
</tr> <tr>
<td valign="top" align="left">6423-O26</td>
<td valign="top" align="center">SRR19090769</td>
<td valign="top" align="center">0.5x10<sup>1</sup></td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.98</td>
</tr>
<tr>
<td valign="top" align="left">EHEC-neg</td>
<td valign="top" align="center">SRR19090777</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.15</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.03</td>
<td valign="top" align="center">0.18</td>
<td valign="top" align="center">0.21</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.13</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><sup>&#x0002A;</sup>From Jaudou et al. (<xref ref-type="bibr" rid="B28">2022</xref>).</p>
<p><sup>&#x0002A;&#x0002A;</sup>CFU.mL<sup>&#x02212;1</sup>.</p>
<p><sup>&#x0002A;&#x0002A;&#x0002A;</sup>Non-EHEC = 0; EHEC = 1.</p>
<p>LGBM, LGBMClassifier; LR, Logistic Regression; DT, Decision Tree; XGB, XGBClassifier; RF, Random Forest; SVM, Support Vector Machine; GB, Gradient Boosting; ET, Extra Tree.</p>
</table-wrap-foot>
</table-wrap></sec></sec>
<sec sec-type="results" id="s3">
<title>3. Results</title>
<sec>
<title>3.1. <italic>Escherichia coli</italic> pathotype assignation based on genomic information</title>
<p>To take advantage of ML to find patterns and preserve its generalization potential, we constituted an <italic>E. coli</italic> database for which we preferentially selected complete <italic>E. coli</italic> genomes with a minimum of required metadata (origin, isolation date and location) and verified their pathotypes (<italic>stx</italic> and <italic>eae</italic> gene presence). A total of 1,493 genomes were downloaded from the GenBank database. The geographic origin of the strains was 33, 26, 23, 7, and 3% from Europe, Asia, America, Africa and Oceania, respectively and the 8% remaining were missing the country of origin. During the genome selection, we tried to maintain equal proportions of <italic>stx</italic>/<italic>eae</italic>-positive strains (i.e., EHEC) and non-EHEC strains based on the metadata provided. Genomes simultaneously positive for at least one <italic>stx</italic> gene and the <italic>eae</italic> gene (<italic>n</italic> = 632) were assigned to the EHEC group. The other genomes (<italic>n</italic> = 861) were assigned to the non-EHEC group (available at <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.7129021">https://doi.org/10.5281/zenodo.7129021</ext-link>). In addition, the custom database reported all the O-group sequences so that the serogroup information, in particular the most frequent EHEC serogroups, was available (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 1</xref>). The top seven most represented serogroups were O26 (<italic>n</italic> = 160), O157 (<italic>n</italic> = 126), O103 (<italic>n</italic> = 66), O121 (<italic>n</italic> = 61), O145 (<italic>n</italic> = 60), O111 (<italic>n</italic> = 36), and O45 (<italic>n</italic> = 9). These serogroups comprised mostly EHEC strains with 126 EHEC O26 strains, 124 EHEC O157 strains, 57 EHEC O103 strains, 55 EHEC O121 strains, 59 EHEC O145 strains, and 31 EHEC O111 strains.
The phylogroup analysis showed that the diversity of the species is well represented. The phylogeny of the final dataset is illustrated in <xref ref-type="fig" rid="F4">Figure 4</xref>.</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>Phylogenetic tree of the 1,493 genomes reconstructed from the core genome alignment. The inner ring corresponds to the pathotype of each strain determined based on <italic>stx</italic> and <italic>eae</italic> presence/absence (0 for all non-EHEC and 1 for the EHEC). The outer ring corresponds to the top 10 O-groups.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-14-1118158-g0004.tif"/>
</fig>
</sec>
<sec>
<title>3.2. Generation of the input dataset</title>
<p>From 37,380 groups generated by panaroo, 13,952 remained with an ECs annotation. Some ECs were duplicated in the table due to the genetic diversity of some genes and the panaroo identity level threshold. We aggregated the results for these genes, which resulted in 4,780 unique CDS and kept the presence/absence information. Before splitting the ECs presence/absence table (Available at <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.7129021">https://doi.org/10.5281/zenodo.7129021</ext-link>), we first selected CDS with enough variation between the samples (see Section 2). This filtering step removed 3,602 CDS resulting in a dataset containing 1,178 CDS. To avoid optimistic performance estimates that would result from near duplicate samples both in training and testing sets, we performed a clustering based on gene content similarity (see Section 2). Clustering at distance <italic>D</italic> ensures that two clusters in the dataset have at least <italic>D</italic> genes that are different. We evaluated the change in accuracy for increasing values of <italic>D</italic>, starting from 5 and up to 200 (<xref ref-type="fig" rid="F1">Figure 1</xref>). Up to <italic>D</italic> &#x0003D; 100 (8.4% of the CDS), the accuracy of all classifiers remains very high (above 97%). This indicates that robust information can be extracted from gene profiles to predict EHEC pathotype with high accuracy. The performances drop by around 5&#x02013;10% for <italic>D</italic> &#x0003D; 200 for an accuracy of around 93%. While still high, this decrease could be attributed to the low number of clusters that remain for prediction (<italic>n</italic> = 137, 15 EHEC and 122 non-EHEC). From these results, a value of <italic>D</italic> &#x0003D; 100 was chosen to build the dataset for further analysis, as it combines both good performance with a sufficient sample size (<italic>n</italic> = 756). However, this dataset was imbalanced. 
To avoid overfitting problems, we subsampled the non-EHEC class to be equal to the EHEC class and ended with a balanced dataset (see Section 2). The matrix was then randomly split into a training dataset containing 80% of the genomes used in the study (<italic>n</italic> = 139) with 69 non-EHEC and 70 EHEC. The testing dataset contained the remaining 20% (<italic>n</italic> = 35) with 18 non-EHEC and 17 EHEC.</p>
</sec>
<sec>
<title>3.3. CDS selection using eight ML classifiers</title>
<p>We conducted a comprehensive analysis of the top features extracted from the training of eight classification models, and the results are summarized in <xref ref-type="table" rid="T1">Table 1</xref>. The table shows the top six most important features, ranked based on the number of times they were used by the models. The most important feature was found to be ECs_1056, which corresponds to a phage excisionase gene. This feature was used by all eight models. The second most important feature was ECs_1812, which corresponds to the <italic>nleA/espI</italic> gene, coding for a type III secretion system (T3SS) secreted effector protein, and was used by seven of the models. The third most important feature was ECs_1824, which corresponds to the <italic>nleG</italic> gene, which encodes another T3SS secreted effector protein, and was used by five of the models. The remaining features, namely ECs_3858, ECs_1815, and ECs_1561, were used by four models each, and correspond to the <italic>nleE, nleF</italic>, and <italic>espN</italic> genes, respectively, all of which encode T3SS secreted effector proteins. Multiple classifiers achieved near-perfect performance when evaluated on the <italic>D</italic>100 dataset. To better understand which combinations of genes contribute to the prediction of the EHEC pathotype, we generated all 64 (2<sup>6</sup>) gene presence/absence profiles and recorded in which cases one of the models predicted an EHEC pathotype with confidence (<xref ref-type="fig" rid="F2">Figure 2</xref> and Section 2). This confirms that some genes, such as ECs_1056 (phage excisionase) and ECs_1824 (<italic>nleG</italic>), are important (they are needed in the majority of the predictions). Decision Tree and Gradient boosting learned the same decision rule: &#x0201C;ECs_1812 (<italic>nleA</italic>) and (ECs_1056 or (ECs_1815 (<italic>nleF</italic>) and ECs_1824))&#x0201D;.
The SVM classifier predicts EHEC for any combination of two of ECs_1056, ECs_1815, and ECs_1824. All those classifiers can predict an EHEC pathotype with as few as two genes. On the other hand, Extra tree and logistic regression make predictions involving three to four genes, showing that they can have different sensitivity.</p>
</sec>
<sec>
<title>3.4. Performance of the eight models on the selected features</title>
<p><xref ref-type="supplementary-material" rid="SM3">Supplementary Table 3</xref> shows the evaluation metrics obtained by training the eight classifiers using the selected six features: ECs_1056, ECs_1812, ECs_1824, ECs_3858, ECs_1815, ECs_1561. All classifiers achieved high accuracy scores, ranging from 0.97 to 1.00, indicating a good performance in predicting the target variable. Logistic Regression, Extra Trees, XGBClassifier, LGBMClassifier, Decision Tree, and SVM achieved perfect accuracy scores of 1.00, while Random Forest and Gradient Boosting achieved a slightly lower accuracy score of 0.97. Extra Tree and Random Forest achieved a precision of 0.94 on the EHEC class and a recall of 0.94 on the non-EHEC class. All the other classifiers achieved perfect precision, recall, and F1-scores, indicating that the selected features were informative and sufficient to discriminate perfectly between the two classes.</p>
</sec>
<sec>
<title>3.5. EHEC prediction on <italic>in silico E. coli</italic> mixtures</title>
<p>We first tested the ability of the different models to predict the presence of an EHEC strain in a mixture of <italic>E. coli</italic> strains. For this purpose, <italic>in silico</italic> mixtures of raw MinION reads were assembled. Assemblies produced using meta-Flye ranged from 5.63 to 7.94 Mb (mean = 6.57 Mb) and the number of contigs from 80 to 155 (mean = 106) (<xref ref-type="supplementary-material" rid="SM4">Supplementary Table 4</xref>). In all mixtures of <italic>E. coli</italic> strains, the assembly size was larger than a normal <italic>E. coli</italic> assembly (4.8&#x02013;5.5 Mb). Shorter assemblies were produced by meta-Flye with the STEC-COM mixture (5.63&#x02013;6.16 Mb). On the contrary, the EHEC-COM mixture produced longer assemblies (7.35&#x02013;7.94 Mb). The eight models were then used to perform predictions on the <italic>in silico</italic> mixtures (<xref ref-type="table" rid="T2">Table 2</xref>). Predictions of the pathotype ranged from 0 to 1 and the average prediction from 0.01 to 0.99. The cut-off for binary classification is usually set to 0.5. Above or equal to this cut-off value the presence of an EHEC is predicted, and below a non-EHEC is predicted. This cut-off of 0.5 was used to report the results presented in this study. The eight classifiers were able to predict the correct class 22 times out of 25 predictions (88%). However, all models incorrectly predicted the presence of an EHEC three times for the STEC-EPEC mixture with the strain 97HMPL652 and 2142-O103 for a coverage of 30x, 40x and 50x, respectively. For all non-EHEC containing mixtures, the highest value was 0.47 for the same STEC-EPEC mixture with the Extra Tree classifier (<xref ref-type="table" rid="T2">Table 2</xref>). For the EHEC class the lowest value was 0.75 for the EHEC-COM mixture and the STEC-EHEC mixture with the Decision Tree classifier and the XGBClassifier.
Taken together, these data indicate that all the classifiers were able to predict with high confidence the presence of an EHEC in <italic>E. coli</italic> mixtures that combine different <italic>E. coli</italic> pathotypes. Only three false positives were predicted for the most difficult mixture combining a STEC and an EPEC.</p>
</sec>
<sec>
<title>3.6. EHEC prediction on artificially-contaminated raw milk</title>
<p>We then tested the performance of the eight models on complex mixtures using artificially contaminated raw milk. A bioinformatic pipeline called STECmetadetector, developed by Jaudou et al. (<xref ref-type="bibr" rid="B28">2022</xref>), was used to specifically extract <italic>E. coli</italic> reads from raw milk samples artificially contaminated with an O26 EHEC strain (from 0 to 500 CFU.mL<sup>&#x02212;1</sup>). The <italic>E. coli</italic> reads were assembled using the same method as described in Section 2 (paragraph 2.5). Assemblies ranged from 5.2 Mb to 6.83 Mb (mean = 5.95 Mb) and the numbers of contigs from 5 to 62 (mean = 15). The eight classifiers were used to predict the pathotype of artificially contaminated raw milks (<xref ref-type="table" rid="T3">Table 3</xref>). All models were able to accurately predict the EHEC pathotype in the sample with high confidence, at the three contamination levels tested, regardless of the strain used for the artificial contamination. Notably, the raw milk used for spiking with the 6423-O26 strain naturally contained commensal <italic>E. coli</italic> of serotype O185:H2 and O8:H19 (Jaudou et al., <xref ref-type="bibr" rid="B28">2022</xref>). Predictions of the pathotype ranged from 0.95 to 1 for the class EHEC for all contamination levels. The negative control (a non-contaminated raw milk) was classified accurately as non-EHEC by all eight classifiers with the highest value of 0.21 for the SVM (<xref ref-type="table" rid="T3">Table 3</xref>).</p></sec></sec>
<sec sec-type="discussion" id="s4">
<title>4. Discussion</title>
<p>The correct detection and identification of highly pathogenic STEC from food remains challenging. Conventional detection methods based on the detection of the <italic>stx</italic> and <italic>eae</italic> genes (as well as genes from the most frequent serogroups) require an isolation step to ensure the correct characterization of the strain. Detection of EHEC in food samples based on the presence of a small number of additional genes that are more specifically associated with strains possessing simultaneously the <italic>stx</italic> and <italic>eae</italic> genes would represent a significant improvement for screening food samples (Delannoy et al., <xref ref-type="bibr" rid="B11">2016</xref>, <xref ref-type="bibr" rid="B12">2022</xref>). With such an approach, the number of presumptive positive samples that require further investigation by isolation and genotypic characterization can be reduced by around 50% (Delannoy et al., <xref ref-type="bibr" rid="B11">2016</xref>, <xref ref-type="bibr" rid="B12">2022</xref>), allowing to save money and time. Still, the amount of unconfirmed presumptive positive samples may be a problem for both the food industry and the decision maker. In a previous study, we showed that long-read metagenomics was efficient in identifying <italic>eae</italic>-positive STEC strains from complex matrices such as raw milk in an isolation-independent way (Jaudou et al., <xref ref-type="bibr" rid="B28">2022</xref>). However, we have highlighted that the presence of multiple <italic>E. coli</italic> strains may hinder the identification of the <italic>eae</italic>-positive STEC due to the assembly-based approach used. We wanted to continue exploring the potential of long-read metagenomics and take full advantage of ML algorithms by applying them to predict the presence of an EHEC strain directly from <italic>E. coli</italic> reads assembly, even in the presence of multiple <italic>E. coli</italic> strains.</p>
<p>As of February 2021, 31,230 <italic>E. coli</italic> genomes were available in the NCBI Genbank database, around 10% of which are genomes of O157:H7 strains. To build our database, we downloaded complete <italic>E. coli</italic> genomes as well as some scaffolded genomes that contained accompanying metadata, while taking care of having the top 10 EHEC serotypes represented, as well as less frequent ones (<xref ref-type="fig" rid="F4">Figure 4</xref>). Because the geographical distribution of certain clones may be skewed, we also included strains originating from all continents. During the genome selection, our objective was to obtain a database constituted at 50% of EHEC genomes (targets) and 50% of non-EHEC genomes (non-targets). When selecting the non-EHEC genomes we were careful to include various pathotypes such as EPEC (<italic>eae</italic>&#x0002B; only), STEC (<italic>stx</italic>&#x0002B; only), commensals (<italic>stx</italic> and <italic>eae</italic> negative) and some Extra-intestinal pathogenic <italic>E. coli</italic> (ExPEC) strains. Despite the size of our database and the precautions we took to build it, our final dataset after dereplication, was composed of 87 EHEC and 87 non-EHEC. We originally included large numbers of genomes for each of the top 10 serotypes observed in clinical cases worldwide in order to be representative of the frequency of isolation of the various serotypes. However, the diversity within each serotype appears limited. Indeed, several studies on various EHEC serotypes have shown that even the most diverse ones (in terms of SNPs) show a high degree of synteny and collinearity between isolates of different clades or lineages (Dallman et al., <xref ref-type="bibr" rid="B9">2015</xref>; Ogura et al., <xref ref-type="bibr" rid="B42">2017</xref>; Nishida et al., <xref ref-type="bibr" rid="B40">2021</xref>). Also, the pool of genes included in the dataset is the pool present in the Sakai annotation. 
Therefore, by reducing the available pangenome and increasing the probability for each strain to possess one version of each CDS, we increased the similarity between the genomes in the dataset. To avoid possible data leakage, we chose to group all the genomes that had less than 100 genes difference in their repertoire (8.4% of the genes considered). This is a drastic filtering step, but it is, to our knowledge, the most reliable to avoid reporting biased performance estimates.</p>
<p>One of the first choices when designing the pipeline is whether to use raw reads or assembled data. Initial tests showed that performing the annotation directly on long reads generated a very large amount of data that was computationally too intensive for the downstream processing (not shown). Based on these results we chose to work with assemblies and used the Flye long-read assembler with the metagenome option in order to deal with highly non-uniform coverage, in particular with low level artificially-EHEC contaminated milks. An early step of the pipeline consists in the annotation of the assembled genomes. The advantage of the annotation software used, prokka, is that a reference genome can be used to standardize the annotation. In our case, we used the O157:H7 Sakai genome as reference over the K12 <italic>E. coli</italic> reference genome because it is an EHEC carrying around 20% more integrated genomic elements than K12 <italic>E. coli</italic>, like pathogenicity islands and phages (Hayashi et al., <xref ref-type="bibr" rid="B22">2001</xref>). To generate the first matrix of gene presence/absence we chose panaroo among GWAS programs such as Roary (Page et al., <xref ref-type="bibr" rid="B43">2015</xref>), PIRATE (Bayliss et al., <xref ref-type="bibr" rid="B1">2019</xref>), or PPanGGoLiN (Gautreau et al., <xref ref-type="bibr" rid="B18">2020</xref>) because it offers the possibility to add one new genome to an existing pangenome graph. This feature is the keystone of our pipeline because it is very important to add the new genome into the existing pangenome graph so as not to modify the original matrix used for training the models. Panaroo collapses genes into putative families with a family sequence identity level of 70% in the default mode. During the analysis of the generated matrix, we identified locus tags that were split in different groups and regrouped them. 
Indeed, the allelic variability of STEC virulence genes can be important (Michelacci et al., <xref ref-type="bibr" rid="B35">2016</xref>).</p>
<p>Other studies have used different algorithms like Support Vector Machine, Gradient Boosting or Random Forest (Lupolova et al., <xref ref-type="bibr" rid="B33">2016</xref>; Njage et al., <xref ref-type="bibr" rid="B41">2019</xref>; Im et al., <xref ref-type="bibr" rid="B24">2021</xref>; Shaik et al., <xref ref-type="bibr" rid="B49">2022</xref>) but the nature of the data and the predictive outcome were different. In this study we used the power of ML to evaluate a high number of genes (1,178 CDS). We successfully decreased the number of genes needed for EHEC presence prediction down to six genes while keeping a high accuracy. It is remarkable that none of these six genes are related to the Shiga toxins. Surprisingly, neither <italic>stx</italic>1 subunit A and B nor <italic>stx</italic>2 subunit A and B are needed to predict an EHEC. Because it is present in all EPEC strains (<italic>eae</italic>-positive <italic>E. coli</italic>, non-target) the absence of <italic>eae</italic> in the six genes scheme is expected. In the reduced set of selected genes, we found five Type 3 Secretion System (T3SS) effectors and a phage excisionase. The T3SS represents an important component of the <italic>E. coli</italic> mobile gene pool. Although the LEE carries constitutive elements of the T3SS, additional effectors are encoded by prophages inserted into the genome (Tobe et al., <xref ref-type="bibr" rid="B50">2006</xref>). A large number of studies have described T3SS effectors as associated virulence markers (Coombes et al., <xref ref-type="bibr" rid="B7">2008</xref>; Konczy et al., <xref ref-type="bibr" rid="B32">2008</xref>; Bugarel et al., <xref ref-type="bibr" rid="B4">2010a</xref>,<xref ref-type="bibr" rid="B5">b</xref>, <xref ref-type="bibr" rid="B6">2011</xref>; Imamovic et al., <xref ref-type="bibr" rid="B26">2010</xref>; Creuzburg et al., <xref ref-type="bibr" rid="B8">2011</xref>). 
Here, the most important features identified for EHEC prediction are located on four genomic islands that harbor putative virulence factors already demonstrated to be present in EHEC strains: Sp4 (ECs 1056 / phage excisionase), Sp6 (ECs 1561 / <italic>espN</italic>), Sp9 (ECs 1812 / <italic>nleA</italic>, ECs 1815 / <italic>nleF</italic>, ECs 1824 / <italic>nleG</italic>) and SpLE3 (ECs 3858 / <italic>nleE</italic>) (Tobe et al., <xref ref-type="bibr" rid="B50">2006</xref>; Rasko et al., <xref ref-type="bibr" rid="B44">2008</xref>; Bugarel et al., <xref ref-type="bibr" rid="B4">2010a</xref>,<xref ref-type="bibr" rid="B5">b</xref>, <xref ref-type="bibr" rid="B6">2011</xref>; Delannoy et al., <xref ref-type="bibr" rid="B10">2013</xref>). The <italic>nleA</italic> gene (ECs 1812 - Sp9), which was found to be the second most important feature in our study, has been shown to play a key role in the virulence of various pathogenic bacteria, including <italic>E. coli</italic> (Rasko et al., <xref ref-type="bibr" rid="B44">2008</xref>). Similarly, the <italic>nleG</italic> gene (ECs 1824 - Sp9), which was the third most important feature, has been shown to be important for the virulence of enterohemorrhagic <italic>E. coli</italic> (Tobe et al., <xref ref-type="bibr" rid="B50">2006</xref>). Other T3SS effectors located in these four genomic islands have previously been shown to be associated with EHEC. For example, the Sp4 genomic island also harbors <italic>espV</italic> (ECs 1127), which, in combination with <italic>espK</italic> (ECs 1568 - Sp6), has been demonstrated to be present in EHEC strains and proposed as a genetic marker to reduce false-positive results in food testing (Delannoy et al., <xref ref-type="bibr" rid="B10">2013</xref>, <xref ref-type="bibr" rid="B11">2016</xref>). 
Similarly, combinations of genes from Sp9 and SpLE3 were demonstrated to be strong signatures of typical EHECs (Bugarel et al., <xref ref-type="bibr" rid="B4">2010a</xref>,<xref ref-type="bibr" rid="B5">b</xref>, <xref ref-type="bibr" rid="B6">2011</xref>). These four genomic islands are recurrently found as harboring important features with all models and were previously experimentally found associated with EHEC. This strongly suggests that these genomic islands are stably associated with both the LEE and the presence of an <italic>stx</italic>-phage and may have co-evolved (Guo et al., <xref ref-type="bibr" rid="B20">2012</xref>). However, the precise order of acquisition of these mobile genetic elements remains to be determined. The only incorrect EHEC predictions of the models using <italic>in silico</italic> mixtures were obtained with the STEC/EPEC mixtures containing the aEPEC strain 2142-O103. Although negative for the <italic>stx</italic> gene, this strain harbors the different genomic islands: Sp4, Sp6, Sp9, and SpLE3. It also belongs to a known EHEC serotype (O103:H25) that has been associated with an HUS outbreak (Schimmer et al., <xref ref-type="bibr" rid="B46">2008</xref>). It is thus likely that this strain represents what had previously been named an EHEC-like or EHEC-LST (Bielaszewska et al., <xref ref-type="bibr" rid="B3">2007</xref>; Mellmann et al., <xref ref-type="bibr" rid="B34">2008</xref>; Bugarel et al., <xref ref-type="bibr" rid="B6">2011</xref>), meaning that it could constitute an EHEC progeny that has lost the <italic>stx</italic> phage at one point. The existence of such EHEC-like strains constitutes a caveat of our approach, as it is impossible for our model to distinguish a STEC/EHEC-like mixture accurately. 
However, from a risk management perspective, it could be beneficial to detect this kind of strains when in the presence of other STEC strains due to the potential of these EHEC-like to acquire the <italic>stx</italic> phage and become typical EHEC (Bielaszewska et al., <xref ref-type="bibr" rid="B3">2007</xref>; Mellmann et al., <xref ref-type="bibr" rid="B34">2008</xref>). Correct and timely identification of EHEC is crucial in food microbiology as well as for surveillance of STEC-mediated disease. The growing genomic sequence data offers additive information that may support the identification of discriminative EHEC markers (Kiel et al., <xref ref-type="bibr" rid="B30">2018</xref>). To extend EHEC diagnostics in the post-genomic era beyond the detection of the O157:H7 and the non-O157 serogroups from the Top 7, we developed suitable pipelines that integrate high throughput sequence data, to predict with high specificity and sensitivity EHEC strains. Different combinations of discriminative genetic markers were identified and validated to target the main STEC subgroup (<italic>eae</italic>-positive STEC) associated with severe human infections and outbreaks worldwide. Our study is in line with recent papers showing the potential and power of GWAS and Machine Learning approaches for designing biomarkers that target foodborne pathogens (Feucherolles et al., <xref ref-type="bibr" rid="B16">2021</xref>; S&#x000E9;vellec et al., <xref ref-type="bibr" rid="B48">2022</xref>). Here, the description of these new EHEC biomarkers is the confirmation that <italic>stx</italic> and <italic>eae</italic> are not the only genetic markers that are the hallmark of EHEC, but that EHEC characterization is much more complex than the simultaneous identification of <italic>stx</italic> and <italic>eae</italic> genes. 
There are in fact associated factors (type III effectors are some of them as shown in this study) which, by their presence or absence, provide a fairly precise predictive model on the co-localization of <italic>stx</italic> and <italic>eae</italic> in a single strain. The new EHEC markers found using ML in our study could predict EHEC with very high accuracy in a large genome dataset and artificially contaminated raw milk metagenomes. The correct prediction of the EHEC strain while co-occurring with another <italic>E. coli</italic> strain at a ratio of 1:1 is remarkable. Most programs that aim at distinguishing strains from the same species rely on coverage differences (<italic>i.e</italic>. for assemblers and binning tools). These findings open the door for the development of new diagnostic tests for a better screening of EHEC in food products. As long as DNA sequence-based diagnostics of mixed populations cannot resolve whether relevant markers like <italic>stx</italic> and <italic>eae</italic> genes are present in the same genome, some risk of generating false-positive results exists. Including the combination of additional EHEC-related markers like those we described here in the detection scheme would support a better hazard characterization of typical EHEC.</p></sec>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found in the article/<xref ref-type="supplementary-material" rid="s8">Supplementary material</xref>.</p></sec>
<sec sec-type="author-contributions" id="s6">
<title>Author contributions</title>
<p>FV, SJ, SD, and PF conceptualized the project. PF and SD were in charge of funding acquisition. SJ downloaded the FASTA files from public databases. SJ and M-LT performed the sequencing and assembly of the artificially contaminated milk samples. FV did the <italic>in silico</italic> mixtures analysis, machine learning model development, database creation, and wrote the original draft. FV, SJ, SD, and HR contributed methodology and resources. HR did the cluster analysis. All authors contributed to manuscript revision, read, and approved the submitted version.</p></sec>
</body>
<back>


<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s7">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>

<sec sec-type="supplementary-material" id="s8">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fmicb.2023.1118158/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fmicb.2023.1118158/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Table_1.XLSX" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table_2.XLSX" id="SM2" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table_3.XLSX" id="SM3" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table_4.xlsx" id="SM4" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bayliss</surname> <given-names>S. C.</given-names></name> <name><surname>Thorpe</surname> <given-names>H. A.</given-names></name> <name><surname>Coyle</surname> <given-names>N. M.</given-names></name> <name><surname>Sheppard</surname> <given-names>S. K.</given-names></name> <name><surname>Feil</surname> <given-names>E. J.</given-names></name></person-group> (<year>2019</year>). <article-title>PIRATE: A fast and scalable pangenomics toolbox for clustering diverged orthologues in bacteria</article-title>. <source>Gigascience</source> 8, giz119. <pub-id pub-id-type="doi">10.1093/gigascience/giz119</pub-id><pub-id pub-id-type="pmid">31598686</pub-id></citation></ref>
<ref id="B2">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Beutin</surname> <given-names>L.</given-names></name> <name><surname>Fach</surname> <given-names>P.</given-names></name></person-group> (<year>2015</year>). <article-title>Detection of Shiga toxin-producing <italic>Escherichia coli</italic> from nonhuman sources and strain typing</article-title>. <source>Microbiol. Spectrum</source> 2, EHEC-0001-2013. <pub-id pub-id-type="doi">10.1128/9781555818791.ch14</pub-id><pub-id pub-id-type="pmid">26103970</pub-id></citation></ref>
<ref id="B3">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bielaszewska</surname> <given-names>M.</given-names></name> <name><surname>Prager</surname> <given-names>R.</given-names></name> <name><surname>Kock</surname> <given-names>R.</given-names></name> <name><surname>Mellmann</surname> <given-names>A.</given-names></name> <name><surname>Zhang</surname> <given-names>W.</given-names></name> <name><surname>Tschape</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2007</year>). <article-title>Shiga toxin gene loss and transfer <italic>in vitro</italic> and <italic>in vivo</italic> during enterohemorrhagic <italic>Escherichia coli</italic> O26 infection in humans</article-title>. <source>Appl. Environ. Microbiol</source>. <volume>73</volume>, <fpage>3144</fpage>&#x02013;<lpage>3150</lpage>. <pub-id pub-id-type="doi">10.1128/AEM.02937-06</pub-id><pub-id pub-id-type="pmid">17400784</pub-id></citation></ref>
<ref id="B4">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bugarel</surname> <given-names>M.</given-names></name> <name><surname>Beutin</surname> <given-names>L.</given-names></name> <name><surname>Fach</surname> <given-names>P.</given-names></name></person-group> (<year>2010a</year>). <article-title>Low-density macroarray targeting non-locus of enterocyte effacement effectors (<italic>nle</italic> genes) and major virulence factors of Shiga toxin-producing <italic>Escherichia coli</italic> (STEC): a new approach for molecular risk assessment of STEC isolates</article-title>. <source>Appl. Environ. Microbiol</source>. <volume>76</volume>, <fpage>203</fpage>&#x02013;<lpage>211</lpage>. <pub-id pub-id-type="doi">10.1128/AEM.01921-09</pub-id><pub-id pub-id-type="pmid">19880649</pub-id></citation></ref>
<ref id="B5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bugarel</surname> <given-names>M.</given-names></name> <name><surname>Beutin</surname> <given-names>L.</given-names></name> <name><surname>Martin</surname> <given-names>A.</given-names></name> <name><surname>Gill</surname> <given-names>A.</given-names></name> <name><surname>Fach</surname> <given-names>P.</given-names></name></person-group> (<year>2010b</year>). <article-title>Micro-array for the identification of Shiga toxin-producing <italic>Escherichia coli</italic> (STEC) seropathotypes associated with hemorrhagic colitis and hemolytic uremic syndrome in humans</article-title>. <source>Int. J. Food Microbiol</source>. <volume>142</volume>, <fpage>318</fpage>&#x02013;<lpage>329</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijfoodmicro.2010.07.010</pub-id><pub-id pub-id-type="pmid">20675003</pub-id></citation></ref>
<ref id="B6">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bugarel</surname> <given-names>M.</given-names></name> <name><surname>Beutin</surname> <given-names>L.</given-names></name> <name><surname>Scheutz</surname> <given-names>F.</given-names></name> <name><surname>Loukiadis</surname> <given-names>E.</given-names></name> <name><surname>Fach</surname> <given-names>P.</given-names></name></person-group> (<year>2011</year>). <article-title>Identification of genetic markers for differentiation of Shiga toxin-producing, enteropathogenic, and avirulent strains of <italic>Escherichia coli</italic> O26</article-title>. <source>Appl. Environ. Microbiol</source>. <volume>77</volume>, <fpage>2275</fpage>&#x02013;<lpage>2281</lpage>. <pub-id pub-id-type="doi">10.1128/AEM.02832-10</pub-id><pub-id pub-id-type="pmid">21317253</pub-id></citation></ref>
<ref id="B7">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Coombes</surname> <given-names>B. K.</given-names></name> <name><surname>Wickham</surname> <given-names>M. E.</given-names></name> <name><surname>Mascarenhas</surname> <given-names>M.</given-names></name> <name><surname>Gruenheid</surname> <given-names>S.</given-names></name> <name><surname>Finlay</surname> <given-names>B. B.</given-names></name> <name><surname>Karmali</surname> <given-names>M. A.</given-names></name></person-group> (<year>2008</year>). <article-title>Molecular analysis as an aid to assess the public health risk of non-O157 Shiga toxin-producing <italic>Escherichia coli</italic> strains</article-title>. <source>Appl. Environ. Microbiol</source>. <volume>74</volume>, <fpage>2153</fpage>&#x02013;<lpage>2160</lpage>. <pub-id pub-id-type="doi">10.1128/AEM.02566-07</pub-id><pub-id pub-id-type="pmid">18245257</pub-id></citation></ref>
<ref id="B8">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Creuzburg</surname> <given-names>K.</given-names></name> <name><surname>Middendorf</surname> <given-names>B.</given-names></name> <name><surname>Mellmann</surname> <given-names>A.</given-names></name> <name><surname>Martaler</surname> <given-names>T.</given-names></name> <name><surname>Holz</surname> <given-names>C.</given-names></name> <name><surname>Fruth</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2011</year>). <article-title>Evolutionary analysis and distribution of type iii effector genes in pathogenic <italic>Escherichia coli</italic> from human, animal and food sources</article-title>. <source>Environ. Microbiol</source>. <volume>13</volume>, <fpage>439</fpage>&#x02013;<lpage>452</lpage>. <pub-id pub-id-type="doi">10.1111/j.1462-2920.2010.02349.x</pub-id><pub-id pub-id-type="pmid">20880329</pub-id></citation></ref>
<ref id="B9">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dallman</surname> <given-names>T. J.</given-names></name> <name><surname>Ashton</surname> <given-names>P. M.</given-names></name> <name><surname>Byrne</surname> <given-names>L.</given-names></name> <name><surname>Perry</surname> <given-names>N. T.</given-names></name> <name><surname>Petrovska</surname> <given-names>L.</given-names></name> <name><surname>Ellis</surname> <given-names>R.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>Applying phylogenomics to understand the emergence of Shiga-toxin-producing <italic>Escherichia coli</italic> O157:H7 strains causing severe human disease in the UK</article-title>. <source>Microbial Genomics</source> <volume>1</volume>, <fpage>e000029</fpage>. <pub-id pub-id-type="doi">10.1099/mgen.0.000029</pub-id><pub-id pub-id-type="pmid">28348814</pub-id></citation></ref>
<ref id="B10">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Delannoy</surname> <given-names>S.</given-names></name> <name><surname>Beutin</surname> <given-names>L.</given-names></name> <name><surname>Fach</surname> <given-names>P.</given-names></name></person-group> (<year>2013</year>). <article-title>Discrimination of enterohemorrhagic <italic>Escherichia coli</italic> (EHEC) from non-EHEC strains based on detection of various combinations of type iii effector genes</article-title>. <source>J. Clin. Microbiol</source>. <volume>51</volume>, <fpage>3257</fpage>&#x02013;<lpage>3262</lpage>. <pub-id pub-id-type="doi">10.1128/JCM.01471-13</pub-id><pub-id pub-id-type="pmid">23884997</pub-id></citation></ref>
<ref id="B11">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Delannoy</surname> <given-names>S.</given-names></name> <name><surname>Chaves</surname> <given-names>B. D.</given-names></name> <name><surname>Ison</surname> <given-names>S. A.</given-names></name> <name><surname>Webb</surname> <given-names>H. E.</given-names></name> <name><surname>Beutin</surname> <given-names>L.</given-names></name> <name><surname>Delaval</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>Revisiting the STEC testing approach: using <italic>espK</italic> and <italic>espV</italic> to make enterohemorrhagic <italic>Escherichia coli</italic> (EHEC) detection more reliable in beef</article-title>. <source>Front. Microbiol</source>. 7, 1. <pub-id pub-id-type="doi">10.3389/fmicb.2016.00001</pub-id><pub-id pub-id-type="pmid">26834723</pub-id></citation></ref>
<ref id="B12">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Delannoy</surname> <given-names>S.</given-names></name> <name><surname>Tran</surname> <given-names>M.-L.</given-names></name> <name><surname>Fach</surname> <given-names>P.</given-names></name></person-group> (<year>2022</year>). <article-title>Insights into the assessment of highly pathogenic Shiga toxin-producing <italic>Escherichia coli</italic> in raw milk and raw milk cheeses by high throughput real-time PCR</article-title>. <source>Int. J. Food Microbiol</source>. 366, 109564. <pub-id pub-id-type="doi">10.1016/j.ijfoodmicro.2022.109564</pub-id><pub-id pub-id-type="pmid">35151054</pub-id></citation></ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><collab>European Food Safety Authority and European Centre for Disease Prevention and Control</collab></person-group>. (<year>2021</year>). <article-title>The European union one health 2019 zoonoses report</article-title>. <source>EFSA J</source>. 19, e06406. <pub-id pub-id-type="doi">10.2903/j.efsa.2021.6406</pub-id><pub-id pub-id-type="pmid">33680134</pub-id></citation></ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><collab>European Food Safety Authority</collab></person-group>. (<year>2007</year>). <article-title>Scientific opinion of the panel on biological hazards (biohaz)-monitoring of verotoxigenic <italic>Escherichia coli</italic> (VTEC) and identification of human pathogenic VTEC types</article-title>. <source>EFSA J</source>. 5, 579. <pub-id pub-id-type="doi">10.2903/j.efsa.2007.579</pub-id></citation>
</ref>
<ref id="B15">
<citation citation-type="journal"><person-group person-group-type="author"><collab>European Food Safety Authority</collab></person-group>. (<year>2013</year>). <article-title>Scientific opinion on VTEC-seropathotype and scientific criteria regarding pathogenicity assessment</article-title>. <source>EFSA J</source>. <volume>11</volume>, <fpage>3138</fpage>&#x02013;<lpage>3244</lpage>. <pub-id pub-id-type="doi">10.2903/j.efsa.2013.3138</pub-id></citation>
</ref>
<ref id="B16">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Feucherolles</surname> <given-names>M.</given-names></name> <name><surname>Nennig</surname> <given-names>M.</given-names></name> <name><surname>Becker</surname> <given-names>S. L.</given-names></name> <name><surname>Martiny</surname> <given-names>D.</given-names></name> <name><surname>Losch</surname> <given-names>S.</given-names></name> <name><surname>Penny</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Combination of MALDI-TOF mass spectrometry and machine learning for rapid antimicrobial resistance screening: the case of <italic>Campylobacter</italic> spp</article-title>. <source>Front. Microbiol</source>. 12, 804484. <pub-id pub-id-type="doi">10.3389/fmicb.2021.804484</pub-id><pub-id pub-id-type="pmid">35250909</pub-id></citation></ref>
<ref id="B17">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Franz</surname> <given-names>E.</given-names></name> <name><surname>Delaquis</surname> <given-names>P.</given-names></name> <name><surname>Morabito</surname> <given-names>S.</given-names></name> <name><surname>Beutin</surname> <given-names>L.</given-names></name> <name><surname>Gobius</surname> <given-names>K.</given-names></name> <name><surname>Rasko</surname> <given-names>D. A.</given-names></name> <etal/></person-group>. (<year>2014</year>). <article-title>Exploiting the explosion of information associated with whole genome sequencing to tackle Shiga toxin-producing <italic>Escherichia coli</italic> (STEC) in global food production systems</article-title>. <source>Int. J. Food Microbiol</source>. <volume>187</volume>, <fpage>57</fpage>&#x02013;<lpage>72</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijfoodmicro.2014.07.002</pub-id><pub-id pub-id-type="pmid">25051454</pub-id></citation></ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gautreau</surname> <given-names>G.</given-names></name> <name><surname>Bazin</surname> <given-names>A.</given-names></name> <name><surname>Gachet</surname> <given-names>M.</given-names></name> <name><surname>Planel</surname> <given-names>R.</given-names></name> <name><surname>Burlot</surname> <given-names>L.</given-names></name> <name><surname>Dubois</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>PPanGGOLiN: depicting microbial diversity via a partitioned pangenome graph</article-title>. <source>PLoS Computat. Biol</source>. 16, e1007732. <pub-id pub-id-type="doi">10.1371/journal.pcbi.1007732</pub-id><pub-id pub-id-type="pmid">34890406</pub-id></citation></ref>
<ref id="B19">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gill</surname> <given-names>A.</given-names></name> <name><surname>Dussault</surname> <given-names>F.</given-names></name> <name><surname>McMahon</surname> <given-names>T.</given-names></name> <name><surname>Petronella</surname> <given-names>N.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Cebelinski</surname> <given-names>E.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Characterization of atypical Shiga toxin gene sequences and description of Stx2j, a new subtype</article-title>. <source>J. Clin. Microbiol</source>. <volume>60</volume>, <fpage>e02229-21</fpage>. <pub-id pub-id-type="doi">10.1128/jcm.02229-21</pub-id><pub-id pub-id-type="pmid">35225693</pub-id></citation></ref>
<ref id="B20">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>F.</given-names></name> <name><surname>Wei</surname> <given-names>W.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Lin</surname> <given-names>H.</given-names></name> <name><surname>Ding</surname> <given-names>H.</given-names></name> <name><surname>Huang</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2012</year>). <article-title>Co-evolution of genomic islands and their bacterial hosts revealed through phylogenetic analyses of 17 groups of homologous genomic islands</article-title>. <source>Genet. Mol. Res</source>. <volume>11</volume>, <fpage>3735</fpage>&#x02013;<lpage>3743</lpage>. <pub-id pub-id-type="doi">10.4238/2012.October.15.5</pub-id><pub-id pub-id-type="pmid">23096693</pub-id></citation></ref>
<ref id="B21">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hall</surname> <given-names>M. B.</given-names></name></person-group> (<year>2022</year>). <article-title>Rasusa: randomly subsample sequencing reads to a specified coverage</article-title>. <source>J. Open Source Softw</source>. 7, 3941. <pub-id pub-id-type="doi">10.21105/joss.03941</pub-id></citation>
</ref>
<ref id="B22">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hayashi</surname> <given-names>T.</given-names></name> <name><surname>Makino</surname> <given-names>K.</given-names></name> <name><surname>Ohnishi</surname> <given-names>M.</given-names></name> <name><surname>Kurokawa</surname> <given-names>K.</given-names></name> <name><surname>Ishii</surname> <given-names>K.</given-names></name> <name><surname>Yokoyama</surname> <given-names>K.</given-names></name> <etal/></person-group>. (<year>2001</year>). <article-title>Complete genome sequence of enterohemorrhagic <italic>Escherichia coli</italic> O157:H7 and genomic comparison with a laboratory strain K-12</article-title>. <source>DNA Res</source>. <volume>8</volume>, <fpage>11</fpage>&#x02013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.1093/dnares/8.1.11</pub-id><pub-id pub-id-type="pmid">11258796</pub-id></citation></ref>
<ref id="B23">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Herold</surname> <given-names>S.</given-names></name> <name><surname>Karch</surname> <given-names>H.</given-names></name> <name><surname>Schmidt</surname> <given-names>H.</given-names></name></person-group> (<year>2004</year>). <article-title>Shiga toxin-encoding bacteriophages-genomes in motion</article-title>. <source>Int. J. Med. Microbiol</source>. <volume>294</volume>, <fpage>115</fpage>&#x02013;<lpage>121</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijmm.2004.06.023</pub-id><pub-id pub-id-type="pmid">15493821</pub-id></citation></ref>
<ref id="B24">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Im</surname> <given-names>H.</given-names></name> <name><surname>Hwang</surname> <given-names>S.-H.</given-names></name> <name><surname>Kim</surname> <given-names>B. S.</given-names></name> <name><surname>Choi</surname> <given-names>S. H.</given-names></name></person-group> (<year>2021</year>). <article-title>Pathogenic potential assessment of the Shiga toxin-producing <italic>Escherichia coli</italic> by a source attribution considered machine learning model</article-title>. <source>Proc. Natl. Acad. Sci. U. S. A</source>. 118, e2018877118. <pub-id pub-id-type="doi">10.1073/pnas.2018877118</pub-id><pub-id pub-id-type="pmid">33986113</pub-id></citation></ref>
<ref id="B25">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Imamovic</surname> <given-names>L.</given-names></name> <name><surname>Jofre</surname> <given-names>J.</given-names></name> <name><surname>Schmidt</surname> <given-names>H.</given-names></name> <name><surname>Serra-Moreno</surname> <given-names>R.</given-names></name> <name><surname>Muniesa</surname> <given-names>M.</given-names></name></person-group> (<year>2009</year>). <article-title>Phage-mediated Shiga toxin 2 gene transfer in food and water</article-title>. <source>Appl. Environ. Microbiol</source>. <volume>75</volume>, <fpage>1764</fpage>&#x02013;<lpage>1768</lpage>. <pub-id pub-id-type="doi">10.1128/AEM.02273-08</pub-id><pub-id pub-id-type="pmid">19168651</pub-id></citation></ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Imamovic</surname> <given-names>L.</given-names></name> <name><surname>Tozzoli</surname> <given-names>R.</given-names></name> <name><surname>Michelacci</surname> <given-names>V.</given-names></name> <name><surname>Minelli</surname> <given-names>F.</given-names></name> <name><surname>Marziano</surname> <given-names>M. L.</given-names></name> <name><surname>Caprioli</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2010</year>). <article-title>OI-57, a genomic island of <italic>Escherichia coli</italic> O157, is present in other seropathotypes of Shiga toxin-producing <italic>E. coli</italic> associated with severe human disease</article-title>. <source>Infect. Immun</source>. <volume>78</volume>, <fpage>4697</fpage>&#x02013;<lpage>4704</lpage>. <pub-id pub-id-type="doi">10.1128/IAI.00512-10</pub-id><pub-id pub-id-type="pmid">20823207</pub-id></citation></ref>
<ref id="B27">
<citation citation-type="journal"><person-group person-group-type="author"><collab>International Organization for Standardization</collab></person-group>. (<year>2012</year>). <source>Microbiology of food and animal feed. Real-time polymerase chain reaction (PCR)-based method for the detection of food-borne pathogens. Horizontal method for the detection of Shiga toxin-producing <italic>Escherichia coli</italic> (STEC) and the determination of O157, O111, O26, O103 and O145 serogroups</source>. ISO/TS 13136:2012. 22 p.</citation>
</ref>
<ref id="B28">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jaudou</surname> <given-names>S.</given-names></name> <name><surname>Deneke</surname> <given-names>C.</given-names></name> <name><surname>Tran</surname> <given-names>M.-L.</given-names></name> <name><surname>Schuh</surname> <given-names>E.</given-names></name> <name><surname>Goehler</surname> <given-names>A.</given-names></name> <name><surname>Vorimore</surname> <given-names>F.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>A step forward for Shiga toxin-producing <italic>Escherichia coli</italic> identification and characterization in raw milk using long-read metagenomics</article-title>. <source>Microbial Genomics</source> 8, mgen000911. <pub-id pub-id-type="doi">10.1099/mgen.0.000911</pub-id><pub-id pub-id-type="pmid">36748417</pub-id></citation></ref>
<ref id="B29">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jaudou</surname> <given-names>S.</given-names></name> <name><surname>Tran</surname> <given-names>M.-L.</given-names></name> <name><surname>Vorimore</surname> <given-names>F.</given-names></name> <name><surname>Fach</surname> <given-names>P.</given-names></name> <name><surname>Delannoy</surname> <given-names>S.</given-names></name></person-group> (<year>2023</year>). <article-title>Hybrid assembly from 75 <italic>E. coli</italic> genomes isolated from French bovine food products between 1995 and 2016</article-title>. <source>Microbiol. Resour. Announc</source>. 12, e01095-22. <pub-id pub-id-type="doi">10.1128/mra.01095-22</pub-id><pub-id pub-id-type="pmid">36722944</pub-id></citation></ref>
<ref id="B30">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kiel</surname> <given-names>M.</given-names></name> <name><surname>Sagory-Zalkind</surname> <given-names>P.</given-names></name> <name><surname>Miganeh</surname> <given-names>C.</given-names></name> <name><surname>Stork</surname> <given-names>C.</given-names></name> <name><surname>Leimbach</surname> <given-names>A.</given-names></name> <name><surname>Sekse</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>Identification of novel biomarkers for priority serotypes of Shiga toxin-producing <italic>Escherichia coli</italic> and the development of multiplex PCR for their detection</article-title>. <source>Front. Microbiol</source>. 9, 1321. <pub-id pub-id-type="doi">10.3389/fmicb.2018.01321</pub-id><pub-id pub-id-type="pmid">29997582</pub-id></citation></ref>
<ref id="B31">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kolmogorov</surname> <given-names>M.</given-names></name> <name><surname>Bickhart</surname> <given-names>D. M.</given-names></name> <name><surname>Behsaz</surname> <given-names>B.</given-names></name> <name><surname>Gurevich</surname> <given-names>A.</given-names></name> <name><surname>Rayko</surname> <given-names>M.</given-names></name> <name><surname>Shin</surname> <given-names>S. B.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>metaFlye: scalable long-read metagenome assembly using repeat graphs</article-title>. <source>Nat. Methods</source> <volume>17</volume>, <fpage>1103</fpage>&#x02013;<lpage>1110</lpage>. <pub-id pub-id-type="doi">10.1038/s41592-020-00971-x</pub-id><pub-id pub-id-type="pmid">33020656</pub-id></citation></ref>
<ref id="B32">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Konczy</surname> <given-names>P.</given-names></name> <name><surname>Ziebell</surname> <given-names>K.</given-names></name> <name><surname>Mascarenhas</surname> <given-names>M.</given-names></name> <name><surname>Choi</surname> <given-names>A.</given-names></name> <name><surname>Michaud</surname> <given-names>C.</given-names></name> <name><surname>Kropinski</surname> <given-names>A. M.</given-names></name> <etal/></person-group>. (<year>2008</year>). <article-title>Genomic O island 122, locus for enterocyte effacement, and the evolution of virulent verocytotoxin-producing <italic>Escherichia coli</italic></article-title>. <source>J. Bacteriol</source>. <volume>190</volume>, <fpage>5832</fpage>&#x02013;<lpage>5840</lpage>. <pub-id pub-id-type="doi">10.1128/JB.00480-08</pub-id><pub-id pub-id-type="pmid">18586943</pub-id></citation></ref>
<ref id="B33">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lupolova</surname> <given-names>N.</given-names></name> <name><surname>Dallman</surname> <given-names>T. J.</given-names></name> <name><surname>Matthews</surname> <given-names>L.</given-names></name> <name><surname>Bono</surname> <given-names>J. L.</given-names></name> <name><surname>Gally</surname> <given-names>D. L.</given-names></name></person-group> (<year>2016</year>). <article-title>Support vector machine applied to predict the zoonotic potential of <italic>E. coli</italic> O157 cattle isolates</article-title>. <source>Proc. Natl. Acad. Sci. U. S. A</source>. <volume>113</volume>, <fpage>11312</fpage>&#x02013;<lpage>11317</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1606567113</pub-id><pub-id pub-id-type="pmid">27647883</pub-id></citation></ref>
<ref id="B34">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mellmann</surname> <given-names>A.</given-names></name> <name><surname>Lu</surname> <given-names>S.</given-names></name> <name><surname>Karch</surname> <given-names>H.</given-names></name> <name><surname>Xu</surname> <given-names>J.</given-names></name> <name><surname>Harmsen</surname> <given-names>D.</given-names></name> <name><surname>Schmidt</surname> <given-names>M. A.</given-names></name> <etal/></person-group>. (<year>2008</year>). <article-title>Recycling of Shiga toxin 2 genes in sorbitol-fermenting enterohemorrhagic <italic>Escherichia coli</italic> O157:NM</article-title>. <source>Appl. Environ. Microbiol</source>. <volume>74</volume>, <fpage>67</fpage>&#x02013;<lpage>72</lpage>. <pub-id pub-id-type="doi">10.1128/AEM.01906-07</pub-id><pub-id pub-id-type="pmid">17981936</pub-id></citation></ref>
<ref id="B35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Michelacci</surname> <given-names>V.</given-names></name> <name><surname>Orsini</surname> <given-names>M.</given-names></name> <name><surname>Knijn</surname> <given-names>A.</given-names></name> <name><surname>Delannoy</surname> <given-names>S.</given-names></name> <name><surname>Fach</surname> <given-names>P.</given-names></name> <name><surname>Caprioli</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>Development of a high resolution virulence allelic profiling (HReVAP) approach based on the accessory genome of <italic>Escherichia coli</italic> to characterize Shiga-toxin producing <italic>E. coli</italic> (STEC)</article-title>. <source>Front. Microbiol</source>. 7, 202. <pub-id pub-id-type="doi">10.3389/fmicb.2016.00202</pub-id><pub-id pub-id-type="pmid">26941726</pub-id></citation></ref>
<ref id="B36">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Minh</surname> <given-names>B. Q.</given-names></name> <name><surname>Schmidt</surname> <given-names>H. A.</given-names></name> <name><surname>Chernomor</surname> <given-names>O.</given-names></name> <name><surname>Schrempf</surname> <given-names>D.</given-names></name> <name><surname>Woodhams</surname> <given-names>M. D.</given-names></name> <name><surname>Von Haeseler</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>IQ-TREE 2: new models and efficient methods for phylogenetic inference in the genomic era</article-title>. <source>Mol. Biol. Evol</source>. <volume>37</volume>, <fpage>1530</fpage>&#x02013;<lpage>1534</lpage>. <pub-id pub-id-type="doi">10.1093/molbev/msaa015</pub-id><pub-id pub-id-type="pmid">32556291</pub-id></citation></ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Moradigaravand</surname> <given-names>D.</given-names></name> <name><surname>Palm</surname> <given-names>M.</given-names></name> <name><surname>Farewell</surname> <given-names>A.</given-names></name> <name><surname>Mustonen</surname> <given-names>V.</given-names></name> <name><surname>Warringer</surname> <given-names>J.</given-names></name> <name><surname>Parts</surname> <given-names>L.</given-names></name></person-group> (<year>2018</year>). <article-title>Prediction of antibiotic resistance in <italic>Escherichia coli</italic> from large-scale pan-genome data</article-title>. <source>PLoS Comput. Biol</source>. 14, e1006258. <pub-id pub-id-type="doi">10.1371/journal.pcbi.1006258</pub-id><pub-id pub-id-type="pmid">30550564</pub-id></citation></ref>
<ref id="B38">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>M&#x000FC;thing</surname> <given-names>J.</given-names></name> <name><surname>Schweppe</surname> <given-names>C. H.</given-names></name> <name><surname>Karch</surname> <given-names>H.</given-names></name> <name><surname>Friedrich</surname> <given-names>A. W.</given-names></name></person-group> (<year>2009</year>). <article-title>Shiga toxins, glycosphingolipid diversity, and endothelial cell injury</article-title>. <source>Thromb. Haemost</source>. <volume>101</volume>, <fpage>252</fpage>&#x02013;<lpage>264</lpage>. <pub-id pub-id-type="doi">10.1160/TH08-05-0317</pub-id><pub-id pub-id-type="pmid">19190807</pub-id></citation></ref>
<ref id="B39">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nataro</surname> <given-names>J. P.</given-names></name> <name><surname>Kaper</surname> <given-names>J. B.</given-names></name></person-group> (<year>1998</year>). <article-title>Diarrheagenic <italic>Escherichia coli</italic></article-title>. <source>Clin. Microbiol. Rev</source>. <volume>11</volume>, <fpage>142</fpage>&#x02013;<lpage>201</lpage>. <pub-id pub-id-type="doi">10.1128/CMR.11.1.142</pub-id><pub-id pub-id-type="pmid">9457432</pub-id></citation></ref>
<ref id="B40">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nishida</surname> <given-names>R.</given-names></name> <name><surname>Nakamura</surname> <given-names>K.</given-names></name> <name><surname>Taniguchi</surname> <given-names>I.</given-names></name> <name><surname>Murase</surname> <given-names>K.</given-names></name> <name><surname>Ooka</surname> <given-names>T.</given-names></name> <name><surname>Ogura</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>The global population structure and evolutionary history of the acquisition of major virulence factor-encoding genetic elements in Shiga toxin-producing <italic>Escherichia coli</italic> O121:H19</article-title>. <source>Microbial Genomics</source> <volume>7</volume>, <fpage>000716</fpage>. <pub-id pub-id-type="doi">10.1099/mgen.0.000716</pub-id><pub-id pub-id-type="pmid">34878971</pub-id></citation></ref>
<ref id="B41">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Njage</surname> <given-names>P. M. K.</given-names></name> <name><surname>Leekitcharoenphon</surname> <given-names>P.</given-names></name> <name><surname>Hald</surname> <given-names>T.</given-names></name></person-group> (<year>2019</year>). <article-title>Improving hazard characterization in microbial risk assessment using next generation sequencing data and machine learning: predicting clinical outcomes in shigatoxigenic <italic>Escherichia coli</italic></article-title>. <source>Int. J. Food Microbiol</source>. <volume>292</volume>, <fpage>72</fpage>&#x02013;<lpage>82</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijfoodmicro.2018.11.016</pub-id><pub-id pub-id-type="pmid">30579059</pub-id></citation></ref>
<ref id="B42">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ogura</surname> <given-names>Y.</given-names></name> <name><surname>Gotoh</surname> <given-names>Y.</given-names></name> <name><surname>Itoh</surname> <given-names>T.</given-names></name> <name><surname>Sato</surname> <given-names>M. P.</given-names></name> <name><surname>Seto</surname> <given-names>K.</given-names></name> <name><surname>Yoshino</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Population structure of <italic>Escherichia coli</italic> O26:H11 with recent and repeated <italic>stx2</italic> acquisition in multiple lineages</article-title>. <source>Microbial Genomics</source> <volume>3</volume>, <fpage>e000141</fpage>. <pub-id pub-id-type="doi">10.1099/mgen.0.000141</pub-id><pub-id pub-id-type="pmid">29208163</pub-id></citation></ref>
<ref id="B43">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Page</surname> <given-names>A. J.</given-names></name> <name><surname>Cummins</surname> <given-names>C. A.</given-names></name> <name><surname>Hunt</surname> <given-names>M.</given-names></name> <name><surname>Wong</surname> <given-names>V. K.</given-names></name> <name><surname>Reuter</surname> <given-names>S.</given-names></name> <name><surname>Holden</surname> <given-names>M. T.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>Roary: rapid large-scale prokaryote pan genome analysis</article-title>. <source>Bioinformatics</source> <volume>31</volume>, <fpage>3691</fpage>&#x02013;<lpage>3693</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btv421</pub-id><pub-id pub-id-type="pmid">26198102</pub-id></citation></ref>
<ref id="B44">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rasko</surname> <given-names>D. A.</given-names></name> <name><surname>Rosovitz</surname> <given-names>M.</given-names></name> <name><surname>Myers</surname> <given-names>G. S.</given-names></name> <name><surname>Mongodin</surname> <given-names>E. F.</given-names></name> <name><surname>Fricke</surname> <given-names>W. F.</given-names></name> <name><surname>Gajer</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2008</year>). <article-title>The pangenome structure of <italic>Escherichia coli</italic>: comparative genomic analysis of <italic>E. coli</italic> commensal and pathogenic isolates</article-title>. <source>J. Bacteriol</source>. <volume>190</volume>, <fpage>6881</fpage>&#x02013;<lpage>6893</lpage>. <pub-id pub-id-type="doi">10.1128/JB.00619-08</pub-id><pub-id pub-id-type="pmid">18676672</pub-id></citation></ref>
<ref id="B45">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Scheutz</surname> <given-names>F.</given-names></name> <name><surname>Teel</surname> <given-names>L. D.</given-names></name> <name><surname>Beutin</surname> <given-names>L.</given-names></name> <name><surname>Pi&#x000E9;rard</surname> <given-names>D.</given-names></name> <name><surname>Buvens</surname> <given-names>G.</given-names></name> <name><surname>Karch</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2012</year>). <article-title>Multicenter evaluation of a sequence-based protocol for subtyping Shiga toxins and standardizing Stx nomenclature</article-title>. <source>J. Clin. Microbiol</source>. <volume>50</volume>, <fpage>2951</fpage>&#x02013;<lpage>2963</lpage>. <pub-id pub-id-type="doi">10.1128/JCM.00860-12</pub-id><pub-id pub-id-type="pmid">22760050</pub-id></citation></ref>
<ref id="B46">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Schimmer</surname> <given-names>B.</given-names></name> <name><surname>Nygard</surname> <given-names>K.</given-names></name> <name><surname>Eriksen</surname> <given-names>H.-M.</given-names></name> <name><surname>Lassen</surname> <given-names>J.</given-names></name> <name><surname>Lindstedt</surname> <given-names>B.-A.</given-names></name> <name><surname>Brandal</surname> <given-names>L. T.</given-names></name> <etal/></person-group>. (<year>2008</year>). <article-title>Outbreak of haemolytic uraemic syndrome in Norway caused by <italic>stx2</italic>-positive <italic>Escherichia coli</italic> O103:H25 traced to cured mutton sausages</article-title>. <source>BMC Infect. Dis</source>. 8, 41. <pub-id pub-id-type="doi">10.1186/1471-2334-8-41</pub-id><pub-id pub-id-type="pmid">18387178</pub-id></citation></ref>
<ref id="B47">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Seemann</surname> <given-names>T.</given-names></name></person-group> (<year>2014</year>). <article-title>Prokka: rapid prokaryotic genome annotation</article-title>. <source>Bioinformatics</source> <volume>30</volume>, <fpage>2068</fpage>&#x02013;<lpage>2069</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btu153</pub-id><pub-id pub-id-type="pmid">24642063</pub-id></citation></ref>
<ref id="B48">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>S&#x000E9;vellec</surname> <given-names>Y.</given-names></name> <name><surname>Ascencio</surname> <given-names>E.</given-names></name> <name><surname>Douarre</surname> <given-names>P.-E.</given-names></name> <name><surname>F&#x000E9;lix</surname> <given-names>B.</given-names></name> <name><surname>Gal</surname> <given-names>L.</given-names></name> <name><surname>Garmyn</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title><italic>Listeria monocytogenes</italic>: investigation of fitness in soil does not support the relevance of ecotypes</article-title>. <source>Front. Microbiol</source>. 13, 917588. <pub-id pub-id-type="doi">10.3389/fmicb.2022.917588</pub-id><pub-id pub-id-type="pmid">35770178</pub-id></citation></ref>
<ref id="B49">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shaik</surname> <given-names>S.</given-names></name> <name><surname>Singh</surname> <given-names>A.</given-names></name> <name><surname>Suresh</surname> <given-names>A.</given-names></name> <name><surname>Ahmed</surname> <given-names>N.</given-names></name></person-group> (<year>2022</year>). <article-title>Genome informatics and machine learning-based identification of antimicrobial resistance-encoding features and virulence attributes in <italic>Escherichia coli</italic> genomes representing globally prevalent lineages, including high-risk clonal complexes</article-title>. <source>mBio</source> <volume>13</volume>, <fpage>e03796-21</fpage>. <pub-id pub-id-type="doi">10.1128/mbio.03796-21</pub-id><pub-id pub-id-type="pmid">35164570</pub-id></citation></ref>
<ref id="B50">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tobe</surname> <given-names>T.</given-names></name> <name><surname>Beatson</surname> <given-names>S. A.</given-names></name> <name><surname>Taniguchi</surname> <given-names>H.</given-names></name> <name><surname>Abe</surname> <given-names>H.</given-names></name> <name><surname>Bailey</surname> <given-names>C. M.</given-names></name> <name><surname>Fivian</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2006</year>). <article-title>An extensive repertoire of type III secretion effectors in <italic>Escherichia coli</italic> O157 and the role of lambdoid phages in their dissemination</article-title>. <source>Proc. Natl. Acad. Sci. U. S. A</source>. <volume>103</volume>, <fpage>14941</fpage>&#x02013;<lpage>14946</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.0604891103</pub-id><pub-id pub-id-type="pmid">16990433</pub-id></citation></ref>
<ref id="B51">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tonkin-Hill</surname> <given-names>G.</given-names></name> <name><surname>MacAlasdair</surname> <given-names>N.</given-names></name> <name><surname>Ruis</surname> <given-names>C.</given-names></name> <name><surname>Weimann</surname> <given-names>A.</given-names></name> <name><surname>Horesh</surname> <given-names>G.</given-names></name> <name><surname>Lees</surname> <given-names>J. A.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Producing polished prokaryotic pangenomes with the Panaroo pipeline</article-title>. <source>Genome Biol</source>. <volume>21</volume>, <fpage>1</fpage>&#x02013;<lpage>21</lpage>. <pub-id pub-id-type="doi">10.1186/s13059-020-02090-4</pub-id><pub-id pub-id-type="pmid">32698896</pub-id></citation></ref>
<ref id="B52">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>W.</given-names></name> <name><surname>Kohler</surname> <given-names>B.</given-names></name> <name><surname>Oswald</surname> <given-names>E.</given-names></name> <name><surname>Beutin</surname> <given-names>L.</given-names></name> <name><surname>Karch</surname> <given-names>H.</given-names></name> <name><surname>Morabito</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2002</year>). <article-title>Genetic diversity of intimin genes of attaching and effacing <italic>Escherichia coli</italic> strains</article-title>. <source>J. Clin. Microbiol</source>. <volume>40</volume>, <fpage>4486</fpage>&#x02013;<lpage>4492</lpage>. <pub-id pub-id-type="doi">10.1128/JCM.40.12.4486-4492.2002</pub-id><pub-id pub-id-type="pmid">12454140</pub-id></citation></ref>
</ref-list> 
</back>
</article>