<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Bioinform.</journal-id>
<journal-title>Frontiers in Bioinformatics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Bioinform.</abbrev-journal-title>
<issn pub-type="epub">2673-7647</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1074212</article-id>
<article-id pub-id-type="doi">10.3389/fbinf.2023.1074212</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Bioinformatics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Adding context to the pneumococcal core genes using bioinformatic analysis of the intergenic pangenome of <italic>Streptococcus pneumoniae</italic>
</article-title>
<alt-title alt-title-type="left-running-head">Nielsen et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fbinf.2023.1074212">10.3389/fbinf.2023.1074212</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Nielsen</surname>
<given-names>Flemming Damgaard</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2036183/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>M&#xf8;ller-Jensen</surname>
<given-names>Jakob</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/319962/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>J&#xf8;rgensen</surname>
<given-names>Mikkel Girke</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/930504/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Department of Biochemistry and Molecular Biology</institution>, <institution>University of Southern Denmark</institution>, <addr-line>Odense</addr-line>, <country>Denmark</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Department of Clinical Microbiology</institution>, <institution>Odense University Hospital</institution>, <addr-line>Odense</addr-line>, <country>Denmark</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/34672">Vasco Ariston De Carvalho Azevedo</ext-link>, Federal University of Minas Gerais, Brazil</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/510904/overview">Mamoon Rashid</ext-link>, (KAIMRC), Saudi Arabia</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/874919/overview">Olli-Pekka Smolander</ext-link>, Tallinn University of Technology, Estonia</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Mikkel Girke J&#xf8;rgensen, <email>mikkelj@bmb.sdu.dk</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Genomic Analysis, a section of the journal Frontiers in Bioinformatics</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>08</day>
<month>02</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>3</volume>
<elocation-id>1074212</elocation-id>
<history>
<date date-type="received">
<day>19</day>
<month>10</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>24</day>
<month>01</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Nielsen, M&#xf8;ller-Jensen and J&#xf8;rgensen.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Nielsen, M&#xf8;ller-Jensen and J&#xf8;rgensen</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>
<bold>Introduction:</bold> Whole genome sequencing offers great opportunities for linking genotypes to phenotypes aiding in our understanding of human disease and bacterial pathogenicity. However, these analyses often overlook non-coding intergenic regions (IGRs). By disregarding the IGRs, crucial information is lost, as genes have little biological function without expression.</p>
<p>
<bold>Methods/Results:</bold> In this study, we present the first complete pangenome of the important human pathogen <italic>Streptococcus pneumoniae</italic> (pneumococcus), spanning both the genes and IGRs. We show that the pneumococcus species retains a small core genome of IGRs that are present across all isolates. Gene expression is highly dependent on these core IGRs, and often several copies of these core IGRs are found across each genome. Core genes and core IGRs show a clear linkage as 81% of core genes are associated with core IGRs. Additionally, we identify a single IGR within the core genome that is always occupied by one of two highly distinct sequences, scattered across the phylogenetic tree.</p>
<p>
<bold>Discussion:</bold> Their distribution indicates that this IGR is transferred between isolates through horizontal regulatory transfer independent of the flanking genes and that each type likely serves different regulatory roles depending on their genetic context.</p>
</abstract>
<kwd-group>
<kwd>genomics</kwd>
<kwd>pangenome</kwd>
<kwd>intergenic region</kwd>
<kwd>horizontal regulatory transfer</kwd>
<kwd>horizontal gene transfer</kwd>
<kwd>computational biology</kwd>
</kwd-group>
<contract-sponsor id="cn001">Sundhed og Sygdom, Det Frie Forskningsr&#xe5;d<named-content content-type="fundref-id">10.13039/100008392</named-content>
</contract-sponsor>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p>
<italic>Streptococcus pneumoniae</italic> (pneumococcus) is the leading cause of sepsis, meningitis and bacterial pneumonia in children worldwide (<xref ref-type="bibr" rid="B18">O&#x2019;Brien et al., 2009</xref>). Widespread antibiotic resistance and the emergence of non-vaccine serotypes are making treatment increasingly difficult. These threats have led the WHO to list pneumococcus as a &#x201c;priority&#x201d; pathogen (<xref ref-type="bibr" rid="B18">O&#x2019;Brien et al., 2009</xref>; <xref ref-type="bibr" rid="B35">Weiser et al., 2018</xref>). This clinical relevance of pneumococcus has, in part, led to great scientific interest and the publication of several thousand sequenced genomes (<xref ref-type="bibr" rid="B17">National Center for Biotechnology Information, 2018</xref>).</p>
<p>The availability of whole genome sequence (WGS) data has made it possible to study the entire pangenome of an organism rather than single isolates. A pangenome consists of the collective gene pool present in a group of organisms belonging to the same clade (<xref ref-type="bibr" rid="B31">Tettelin et al., 2005</xref>). The pangenome can be divided into a core genome, which constitutes genes present in all isolates and the accessory genome as the remaining genes (<xref ref-type="bibr" rid="B31">Tettelin et al., 2005</xref>). The pangenome of pneumococcus is considered at the extreme end of being open, that is, there is no defined limit to its pangenome as new genes are acquired continuously (<xref ref-type="bibr" rid="B6">Donati et al., 2010</xref>). This openness is mainly due to new genes being acquired through horizontal gene transfer (HGT) mediated by pneumococcus&#x2019; natural competence (<xref ref-type="bibr" rid="B34">Vos, 2009</xref>; <xref ref-type="bibr" rid="B4">Chaguza et al., 2015</xref>).</p>
<p>Traditionally, pangenomes are limited to genes thereby excluding the non-coding intergenic regions (IGRs) (<xref ref-type="bibr" rid="B21">Page et al., 2015</xref>; <xref ref-type="bibr" rid="B36">Xiao et al., 2015</xref>). This focus on genes alone leaves out 15% of the genomes and ignores a significant amount of crucial genomic information as IGRs contain several biologically relevant elements such as promoters, terminators, regulatory binding sites and non-coding RNAs (<xref ref-type="bibr" rid="B10">Koonin et al., 2001</xref>; <xref ref-type="bibr" rid="B5">Dagan et al., 2008</xref>; <xref ref-type="bibr" rid="B23">Peters et al., 2011</xref>; <xref ref-type="bibr" rid="B15">McCutcheon and Moran, 2012</xref>; <xref ref-type="bibr" rid="B19">Ochman and Caro-Quintero, 2016</xref>; <xref ref-type="bibr" rid="B8">J&#xf8;rgensen et al., 2020</xref>). To effectively link genotypes to phenotypes through pangenomics, IGRs must be taken into consideration, as genes have little biological function without expression.</p>
<p>Recently, IGRs have attracted more attention as potential drivers of evolution (<xref ref-type="bibr" rid="B16">Molina and Van Nimwegen, 2008</xref>; <xref ref-type="bibr" rid="B20">Oren et al., 2014</xref>; <xref ref-type="bibr" rid="B32">Thorpe et al., 2017</xref>). They persist through purifying selection, also known as negative selection, where unused or unwanted traits are removed. This persistence is true across several diverse bacterial species, in a similar fashion to that of core genes, even when major regulatory elements are excluded (<xref ref-type="bibr" rid="B16">Molina and Van Nimwegen, 2008</xref>; <xref ref-type="bibr" rid="B32">Thorpe et al., 2017</xref>). Small variations in IGRs can lead to great phenotypical impact, for instance, the inversion of a single promoter element was demonstrated to turn a commensal bacterium pathogenic (<xref ref-type="bibr" rid="B29">Somvanshi et al., 2012</xref>).</p>
<p>IGRs may also undergo genetic recombination, a term coined <italic>horizontal regulatory transfer</italic> (HRT) (<xref ref-type="bibr" rid="B25">Ragan and Beiko, 2009</xref>; <xref ref-type="bibr" rid="B14">Matus-Garcia et al., 2012</xref>). HRT can occur with the flanking genes of the IGR, but in some cases, the IGRs are transferred independently of the genes they regulate (<xref ref-type="bibr" rid="B20">Oren et al., 2014</xref>). As much as 32% of the core regulatory regions in <italic>E. coli</italic> and 51% of the overall core IGRs are thought to have been acquired in this manner indicating that HRT is indeed common (<xref ref-type="bibr" rid="B20">Oren et al., 2014</xref>). Another aspect of HRT is regulatory switching where one IGR is replaced with another non-homologous IGR. This leads to two or more conserved IGRs occupying the same genomic space across different isolates of the same species (<xref ref-type="bibr" rid="B25">Ragan and Beiko, 2009</xref>; <xref ref-type="bibr" rid="B14">Matus-Garcia et al., 2012</xref>; <xref ref-type="bibr" rid="B29">Somvanshi et al., 2012</xref>; <xref ref-type="bibr" rid="B20">Oren et al., 2014</xref>; <xref ref-type="bibr" rid="B33">Thorpe et al., 2018</xref>). As much as 13% of the IGRs within the core genome of <italic>E. coli</italic> have undergone regulatory switching (<xref ref-type="bibr" rid="B20">Oren et al., 2014</xref>). Thus, IGRs seemingly contribute to greater variation in the core genome than genes themselves, thereby challenging the view of the bacterial core genome as being relatively stable (<xref ref-type="bibr" rid="B20">Oren et al., 2014</xref>; <xref ref-type="bibr" rid="B3">Caicedo-Montoya et al., 2021</xref>; <xref ref-type="bibr" rid="B7">Hyun et al., 2022</xref>).</p>
<p>In this study we map the complete core genome of pneumococcus and compare the nature of genes and IGRs against each other in the pangenome. We find a clear linkage between core genes and core IGRs, but core genes are associated with different IGRs, indicating that the pneumococcal core genome is less stable than previously thought. Additionally, we identify any potential regulatory switching events within this core genome. To our knowledge we are the first to identify the complete core genome of pneumococcus, both coding and non-coding.</p>
</sec>
<sec sec-type="results" id="s2">
<title>Results</title>
<p>In this study, we map the first complete pangenome of pneumococcus, spanning both genes and IGRs. The identified intergenic core genome is provided in <xref ref-type="sec" rid="s9">Supplementary Appendix SA1</xref>. Additionally, we screen for any regulatory switching events present within the core genome.</p>
<sec id="s2-1">
<title>Many intergenic regions are universally conserved across all pneumococcal isolates</title>
<p>We created a pangenome of 84 different pneumococcal isolates, spanning both genes and the non-coding IGRs. To put the nature of the pneumococcal IGR pangenome into perspective, we performed the same analysis for <italic>S. aureus</italic>. Both species may colonize the human upper respiratory tract, both are opportunistic pathogens and both possess open pangenomes, making them prime candidates for comparison (<xref ref-type="bibr" rid="B11">Laux et al., 2019</xref>). The analysis shows that the otherwise non-coding IGRs of both species are conserved in a similar manner to genes across the pangenome, although unique genes outnumber unique IGRs in both species (<xref ref-type="fig" rid="F1">Figure 1</xref>).</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>The pangenomes of <italic>S. pneumoniae</italic> and <italic>S. aureus</italic>, spanning both intergenic regions (green) and genes (orange), illustrated by Venn diagrams. Both species possess a core genome of both IGRs and genes, defined as being present in &#x3e;95% of isolates. The pangenomes are constructed from 84 unique genomes of each species. <italic>S. pneumoniae</italic> has a core genome of 1,550 genes and 669 IGRs, and an accessory genome of 3,132 genes and 2,683 IGRs. <italic>S. aureus</italic> has a core genome of 2,096 genes and 1,142 IGRs, and an accessory genome of 3,846 genes and 3,322 IGRs.</p>
</caption>
<graphic xlink:href="fbinf-03-1074212-g001.tif"/>
</fig>
<p>While the proportion of core genes roughly scales relative to the size of the genome (pneumococcus 2.1&#xa0;Mbp/<italic>S. aureus</italic> 2.8&#xa0;Mbp), the proportion of core IGRs relative to genome size is lower in pneumococcus (<xref ref-type="fig" rid="F1">Figure 1</xref>). However, pneumococcus seemingly compensates for the lower number of unique IGRs by having multiple copies of several core IGRs in each genome. On average, each core IGR is present 1.23 times in each pneumococcal genome compared to 1.03 times in <italic>S. aureus</italic>. Each core gene is present 1.08 times in each pneumococcal genome and 1.02 times in <italic>S. aureus</italic>; this indicates that the high copy number of pneumococcal core IGRs is quite unusual.</p>
</sec>
<sec id="s2-2">
<title>Core genes and IGRs constitute the majority of each genome</title>
<p>The average pneumococcal genome has 79% of its genes as core genes and 66% of its IGRs as core IGRs. The average <italic>S. aureus</italic> genome is comparatively close to that observed in pneumococcus, here core genes constitute 79% and core IGRs 68% of each genome (<xref ref-type="table" rid="T1">Table 1</xref>).</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>The number of genes and IGRs in the core and accessory genome in selected genomes and across the collected pangenome, as well as the percentage of genes and IGRs that are core.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Species/isolate</th>
<th align="center">Core genes</th>
<th align="center">Core IGRs</th>
<th align="center">Accessory genes</th>
<th align="center">Accessory IGRs</th>
<th align="center">Percentage core genes pr. genome (%)</th>
<th align="center">Percentage core IGRs pr. genome (%)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">
<italic>S. pneumoniae</italic> (<italic>Species average</italic>)</td>
<td align="center">1,670</td>
<td align="center">817</td>
<td align="center">448</td>
<td align="center">425</td>
<td align="center">78.93</td>
<td align="center">65.83</td>
</tr>
<tr>
<td align="left">
<italic>S. pneumoniae</italic>
</td>
<td rowspan="2" align="center">1,672</td>
<td rowspan="2" align="center">810</td>
<td rowspan="2" align="center">352</td>
<td rowspan="2" align="center">388</td>
<td rowspan="2" align="center">82.61</td>
<td rowspan="2" align="center">67.61</td>
</tr>
<tr>
<td align="left">D39</td>
</tr>
<tr>
<td align="left">
<italic>S. pneumoniae</italic>
</td>
<td rowspan="2" align="center">1,674</td>
<td rowspan="2" align="center">809</td>
<td rowspan="2" align="center">345</td>
<td rowspan="2" align="center">388</td>
<td rowspan="2" align="center">82.91</td>
<td rowspan="2" align="center">67.59</td>
</tr>
<tr>
<td align="left">R6</td>
</tr>
<tr>
<td align="left">
<italic>S. pneumoniae</italic>
</td>
<td rowspan="2" align="center">1,703</td>
<td rowspan="2" align="center">820</td>
<td rowspan="2" align="center">447</td>
<td rowspan="2" align="center">447</td>
<td rowspan="2" align="center">79.21</td>
<td rowspan="2" align="center">64.72</td>
</tr>
<tr>
<td align="left">Tigr4</td>
</tr>
<tr>
<td align="left">
<italic>S. aureus (Species average)</italic>
</td>
<td align="center">2,131</td>
<td align="center">1,170</td>
<td align="center">570</td>
<td align="center">544</td>
<td align="center">78.98</td>
<td align="center">68.28</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Despite pneumococcus having fewer unique core IGRs relative to genome size than <italic>S. aureus</italic>, as stated earlier, their copy number is higher in each genome, thus the percentage of core IGRs per genome is roughly equivalent in the two species (<xref ref-type="table" rid="T1">Table 1</xref>). The higher copy number of core IGRs in pneumococcus is also illustrated by the fact that 669 unique core IGRs exist in the pneumococcal core genome (<xref ref-type="fig" rid="F1">Figure 1</xref>) but on average each genome has 817 core IGRs (<xref ref-type="table" rid="T1">Table 1</xref>).</p>
</sec>
<sec id="s2-3">
<title>IGRs are more likely to be unique to a few isolates than genes</title>
<p>The number of unique IGRs in the pneumococcal pangenome increases with the number of isolates analyzed in a similar manner to the number of unique genes (<xref ref-type="fig" rid="F2">Figure 2A</xref>). Overall, fewer unique IGRs are present in the pangenome than genes, part of this is due to the exclusion of IGRs of &#x3c;30&#xa0;bp in length, which are most often intraoperonic (<xref ref-type="bibr" rid="B33">Thorpe et al., 2018</xref>).</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Properties of the pneumococcal pangenome and its intergenic regions (IGRs) <bold>(A)</bold> Number of unique intergenic regions (green) and genes (orange) as a function of the number of isolates included in the pangenome. <bold>(B)</bold> Distribution of unique IGRs (green) and genes (orange) across the streptococcal pangenome, illustrated with a frequency histogram (number of IGRs/genes present in the given number of isolates). Most IGRs and genes are part of the core genome or confined to a small fraction of the isolates.</p>
</caption>
<graphic xlink:href="fbinf-03-1074212-g002.tif"/>
</fig>
<p>Most IGRs are either present in almost all pneumococcus isolates or unique to only a few, that is, they are either very common or very rare (<xref ref-type="fig" rid="F2">Figure 2B</xref>). Pneumococcus genes show a similar distribution across the pangenome, though a larger proportion of IGRs are confined to only a few isolates than genes.</p>
<p>Pneumococcus retains more unique genes than IGRs within its pangenome (<xref ref-type="fig" rid="F2">Figure 2A</xref>), and most unique IGRs are only found in single isolates, making them rare (<xref ref-type="fig" rid="F2">Figure 2B</xref>). This scarcity of unique IGRs could indicate that IGRs experience a higher evolutionary selection threshold than genes, thereby lowering the likelihood of a newly acquired IGR spreading to more isolates through HRT.</p>
</sec>
<sec id="s2-4">
<title>Double regulatory regions are more common in the core genome</title>
<p>IGRs can be categorized according to the orientation of their flanking genes. IGRs that are downstream of two convergently transcribed genes are considered non-regulatory (NR), IGRs that are upstream one gene and downstream another gene are considered single regulatory (SR) and IGRs that are between two divergently transcribed genes are considered double regulatory (DR) (<xref ref-type="fig" rid="F3">Figure 3</xref>).</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>The types of intergenic regions (IGRs) and their distribution in the pneumococcal pangenome, illustrated with a raincloud plot. Each point is a unique IGR of that type plotted against the number of isolates in the pangenome it is present in. Cloud areas are scaled relative to the size of each dataset. Core IGRs are present in &#x3e;95% of isolates (orange) and accessory IGRs are present in &#x3c;95% of isolates (green). The IGRs are categorized according to the orientation of their flanking genes. If the flanking genes are pointing in the same direction, the IGR is categorized as single regulatory (SR); if they face towards the IGR, it is categorized as non-regulatory (NR); and if they face away from the IGR, it is categorized as double regulatory (DR).</p>
</caption>
<graphic xlink:href="fbinf-03-1074212-g003.tif"/>
</fig>
<p>Looking at the distribution of the IGR types across the pneumococcal pangenome, NR and DR regions are rare compared to SR IGRs (<xref ref-type="fig" rid="F3">Figure 3</xref>). DR regions also constitute a greater relative proportion of the core IGRs than seen in the accessory genome.</p>
</sec>
<sec id="s2-5">
<title>Core IGRs are linked to core genes</title>
<p>Next, we analyzed the degree of linkage between core IGRs and core genes, that is, how often a core IGR is directly upstream of a core gene. IGRs and their flanking genes were identified and any IGR directly upstream of the start codon of a gene was selected. The status of the IGR/gene pairs as accessory or core genome was then assessed and the ratio of each combination calculated. On average 81% of core genes in <italic>S. pneumoniae</italic> are associated with a core IGR, whereas only 74% of accessory genes are linked to accessory IGRs (<xref ref-type="table" rid="T2">Table 2</xref>). For comparison, the linkage of core genes to core IGRs is greater in <italic>S. aureus</italic> at 86%, and accessory IGRs are flanking accessory genes 82% of the time.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Genes and their upstream IGRs were analyzed for their distribution in the pangenome. Listed are the percentages of core and accessory genes with a core or accessory IGR immediately upstream.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Species/isolate</th>
<th align="center">Core gene: core IGR (%)</th>
<th align="center">Core gene: accessory IGR (%)</th>
<th align="center">Accessory gene: core IGR (%)</th>
<th align="center">Accessory gene: accessory IGR (%)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">
<italic>S. pneumoniae</italic> (Species average)</td>
<td align="center">80.92</td>
<td align="center">19.08</td>
<td align="center">26.45</td>
<td align="center">73.55</td>
</tr>
<tr>
<td align="left">
<italic>S. pneumoniae</italic>
</td>
<td rowspan="2" align="center">81.06</td>
<td rowspan="2" align="center">18.94</td>
<td rowspan="2" align="center">30.77</td>
<td rowspan="2" align="center">69.23</td>
</tr>
<tr>
<td align="left">D39</td>
</tr>
<tr>
<td align="left">
<italic>S. pneumoniae</italic>
</td>
<td rowspan="2" align="center">80.97</td>
<td rowspan="2" align="center">19.03</td>
<td rowspan="2" align="center">30.39</td>
<td rowspan="2" align="center">69.61</td>
</tr>
<tr>
<td align="left">R6</td>
</tr>
<tr>
<td align="left">
<italic>S. pneumoniae</italic>
</td>
<td rowspan="2" align="center">80.38</td>
<td rowspan="2" align="center">19.62</td>
<td rowspan="2" align="center">26.52</td>
<td rowspan="2" align="center">73.48</td>
</tr>
<tr>
<td align="left">Tigr4</td>
</tr>
<tr>
<td align="left">
<italic>S. aureus (Species average)</italic>
</td>
<td align="center">86.19</td>
<td align="center">13.81</td>
<td align="center">17.62</td>
<td align="center">82.38</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>None of the IGRs of the capsular polysaccharide synthesis (cps) operon were found to be core IGRs; however, the highly conserved flanking genes <italic>dexB</italic> and <italic>aliA</italic> were both associated with core IGRs (<xref ref-type="sec" rid="s9">Supplementary Appendix SA1</xref>).</p>
</sec>
<sec id="s2-6">
<title>A single core IGR shows signs of regulatory switching</title>
<p>Next, we examined the IGR candidates for regulatory switching. Regulatory switching describes when one IGR is replaced by a different non-homologous IGR. The origin of these switched IGRs is not inferred in this analysis, thus they may originate either from within the isolate itself or from a separate species. For this analysis, only switches where the IGRs share no significant sequence homology in a BLASTN search were included.</p>
<p>We detected three switches within the pneumococcus pangenome and only one of these is flanked by core genes. We designated the core switched IGR as csIGR (<xref ref-type="table" rid="T3">Table 3</xref>). While the two versions of the csIGR are highly conserved on their own, with both having a nucleotide identity of &#x3e;99% amongst themselves, aligning the two versions with each other results in an insignificant nucleotide identity of 57%. These results were manually verified with BLASTN, which confirmed that every pneumococcal isolate has one of these two csIGRs, but only in a single copy and always between the same flanking genes.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>The two versions of the csIGR in-between the single copy core genes. csIGR1 is present in 32 of the isolates analyzed and is highly conserved across the genomes with an average nucleotide identity of 99.49%. csIGR2 is present in 52 of the strains analyzed and is likewise highly conserved with an average nucleotide identity of 99.28%. Both IGRs have roughly the same length in base pairs.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left"/>
<th align="left">Length</th>
<th align="left">SNPs</th>
<th align="left">Nuc_identity (%)</th>
<th align="left">Length_identity (%)</th>
<th align="left">No.isolates</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">csIGR1</td>
<td align="char" char=".">215</td>
<td align="char" char=".">1</td>
<td align="char" char=".">99.49</td>
<td align="char" char=".">99.53</td>
<td align="char" char=".">32</td>
</tr>
<tr>
<td align="left">csIGR2</td>
<td align="char" char=".">214</td>
<td align="char" char=".">1</td>
<td align="char" char=".">99.28</td>
<td align="char" char=".">99.05</td>
<td align="char" char=".">52</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The flanking genes were both single copy core genes and were identified in the common lab strains <italic>S. pneumoniae</italic> D39 and TIGR4 (<xref ref-type="fig" rid="F4">Figure 4</xref>). These two strains have distinct csIGR types, with D39 having csIGR1 and TIGR4 having csIGR2 (<xref ref-type="fig" rid="F4">Figure 4</xref>). Interestingly, rather than flanking an operon, the IGRs are predicted to sit in the middle of an operon. Little is known about the flanking genes, other than their status as single copy core genes found in this study. SPD_1559/SP_1749 is considered essential in pneumococcus and is a homologue of <italic>ygeH</italic>, a gene involved in biogenesis of the 30S ribosomal subunit (<xref ref-type="bibr" rid="B22">Pek et al., 2007</xref>; <xref ref-type="bibr" rid="B13">Liu et al., 2017</xref>). The sequences of both csIGR types are provided in <xref ref-type="sec" rid="s9">Supplementary Appendix SA2</xref>. Interestingly, neither of the csIGR types was confined to a specific phylogenetic cluster of pneumococci (<xref ref-type="fig" rid="F5">Figure 5</xref>). The fact that the csIGR types are spread across the phylogenetic tree indicates that their distribution is due to HRT.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>The core switched intergenic region (csIGR) in <italic>S. pneumoniae</italic> D39 and TIGR4. Each type of csIGR is represented in these strains, with D39 having csIGR1 (orange) and TIGR4 having csIGR2 (green). In D39, csIGR1 is flanked by SPD_1558 and SPD_1559. In TIGR4, csIGR2 is flanked by the genes SP_1748 and SP_1749.</p>
</caption>
<graphic xlink:href="fbinf-03-1074212-g004.tif"/>
</fig>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Unrooted phylogenetic tree of the 84 <italic>S. pneumoniae</italic> strains used in this study. The tree is based on SNPs in the core genes. The shading of each label indicates the presence of csIGR1 (orange) or csIGR2 (green). The tree was created using Roary, Fasttree and iTol. The <italic>S. pneumoniae</italic> strain names are specified if applicable, if no clear strain name was given the sequence ID was used.</p>
</caption>
<graphic xlink:href="fbinf-03-1074212-g005.tif"/>
</fig>
<p>We performed a pangenome-wide association study to see if any genes within the accessory genome were significantly co-occurring with the csIGR alleles across the pangenome. However, no genes were exclusively associated with either of the csIGR types. Both sequences were also screened for promoters, riboswitches and homology to known regulatory RNAs with no significant hits. However, the transcribed RNA sequence of both sequences was predicted to form significant secondary structures, the significance of which remains to be elucidated. The predicted secondary structures are provided in <xref ref-type="sec" rid="s9">Supplementary Appendix SA2</xref>.</p>
</sec>
</sec>
<sec sec-type="discussion" id="s3">
<title>Discussion</title>
<p>Here we present the first complete pangenome of pneumococcus, spanning both genes and the non-coding IGRs. A small but conserved IGR core genome in pneumococcus was identified. We find that the pneumococcal core genome consists of 1,550 unique genes and 669 IGRs, whereas the accessory genome consists of 3,132 unique genes and 2,683 IGRs. The number of unique genes surpasses that of unique IGRs in both cases; this is unsurprising as most intraoperonic regions in pneumococcus are less than 30&#xa0;bp in length and are therefore disregarded in the analysis. This also means that most IGRs identified are associated with the flanking genes of operons, i.e., the regulatory regions.</p>
<p>IGRs between two divergently transcribed genes are termed double regulatory (DR). These regions constituted a greater relative part of the core genome than the accessory genome. It is likely because meaningful regulation of two genes is harder to achieve than regulation of single genes, raising the selection threshold for the emergence of beneficial divergence. This increased selection pressure has previously been observed as purifying selection has been shown to be more prominent in DR regions than the other types (<xref ref-type="bibr" rid="B16">Molina and Van Nimwegen, 2008</xref>).</p>
<p>Our analysis reveals that IGRs are highly conserved in pneumococcus. On average, 66% of IGRs in any isolate are shared with all other isolates and 79% of genes in any isolate are shared with all other isolates. A similar trend is seen in <italic>S. aureus</italic>; however, the overall number of unique IGRs is lower in pneumococcus relative to genome size. Instead, our analysis reveals that pneumococcus has several duplicates of some core IGRs across the genome, with core IGRs on average being present 1.25 times in each isolate. This trend is not seen with its core genes and is not observed in either the core genes or core IGRs of <italic>S. aureus</italic>. This suggests that pneumococcus is more rigid with its transcriptional profile as the same regulatory regions might be repeated to a greater degree than observed in <italic>S. aureus</italic>.</p>
<p>We identify a clear linkage between the core IGRs and core genes in pneumococcus: on average 81% of core IGRs are directly upstream of a core gene. This indicates that the transcriptional regulation of the core genome in pneumococcus is mostly conserved across all isolates, but to a lesser degree than seen in <italic>S. aureus</italic>. However, this leaves 19% of core genes being associated with accessory IGRs, indicating some plasticity to the core genome that is otherwise viewed as stable. The greatest difference seen between the two species in this regard is that core IGRs are more often associated with accessory genes in pneumococcus. This might be explained by pneumococcus retaining multiple copies of some core IGRs, making them associated with both core and accessory genes, though this remains to be investigated.</p>
<p>Surprisingly, only three switches were detected in pneumococcus and only one of these was flanked by core genes. In another study, the same analysis was done on a collection of <italic>E. coli</italic> genomes and 61 switches were detected (<xref ref-type="bibr" rid="B33">Thorpe et al., 2018</xref>). This indicates that regulatory switching does not play a major role in pneumococcal disease. It is possible that regulatory switching is more prominent in <italic>E. coli</italic> as it inhabits a great number of different niches compared to pneumococcus (<xref ref-type="bibr" rid="B30">Tenaillon et al., 2010</xref>; <xref ref-type="bibr" rid="B35">Weiser et al., 2018</xref>). Our results show that the pneumococcal core genome is less stable than previously thought. While there is indeed a stable reservoir of highly conserved core genes, their flanking IGRs, which contain most of the regulatory regions responsible for controlling the transcription of these core genes, show greater plasticity. We believe that future studies will benefit from viewing the genes as a &#x201c;package&#x201d; with their upstream IGR, as even core genes maintain different regulatory regions within the pneumococcal species.</p>
</sec>
<sec sec-type="materials|methods" id="s4">
<title>Materials and methods</title>
<sec id="s4-1">
<title>Genomes</title>
<p>All 84 complete <italic>S. pneumoniae</italic> genomes available from the National Center for Biotechnology Information, GenBank resource were downloaded in raw FASTA format. Additionally, 84 randomly selected <italic>S. aureus</italic> genomes were retrieved for comparison with <italic>S. pneumoniae</italic> (12/5/2021). Genomes were then annotated with Prokka (v 1.14.5), using the standard parameters of the software (<xref ref-type="bibr" rid="B27">Seemann, 2014</xref>). The genomes used are listed in <xref ref-type="sec" rid="s9">Supplementary Appendix SA3</xref>.</p>
</sec>
<sec id="s4-2">
<title>Pangenome creation</title>
<p>Initially a pangenome of the coding sequences (CDS) was created using Roary (v3.13.0) (<xref ref-type="bibr" rid="B21">Page et al., 2015</xref>). Then a complementary pangenome of the IGRs was created using Piggy (v1.5), an intergenic pangenome analysis tool that emulates Roary (<xref ref-type="bibr" rid="B33">Thorpe et al., 2018</xref>). Some steps were taken to ensure comparability between the outputs of the software. Roary was set to cluster CDSs with -e -n (to perform alignments using MAFFT (<xref ref-type="bibr" rid="B9">Katoh et al., 2002</xref>)), -i 90 (90% sequence identity cut-off) and -s (to not split paralogs into separate clusters). The settings for running Piggy were set at the standard parameters of the software, except for -len_id 10 (the minimum percentage of length identity to form a cluster). The length identity was reduced for comparability with Roary, as gene clusters generated by Roary only require a sequence length identity of 120&#xa0;bp for clustering CDSs, thus the len_id of 10 is recommended by the creators of Piggy for Roary consistency as IGRs are not erroneously placed into separate clusters (<xref ref-type="bibr" rid="B10">Koonin et al., 2001</xref>; <xref ref-type="bibr" rid="B5">Dagan et al., 2008</xref>; <xref ref-type="bibr" rid="B16">Molina and Van Nimwegen, 2008</xref>; <xref ref-type="bibr" rid="B25">Ragan and Beiko, 2009</xref>; <xref ref-type="bibr" rid="B23">Peters et al., 2011</xref>; <xref ref-type="bibr" rid="B14">Matus-Garcia et al., 2012</xref>; <xref ref-type="bibr" rid="B15">McCutcheon and Moran, 2012</xref>; <xref ref-type="bibr" rid="B29">Somvanshi et al., 2012</xref>; <xref ref-type="bibr" rid="B20">Oren et al., 2014</xref>; <xref ref-type="bibr" rid="B21">Page et al., 2015</xref>; <xref ref-type="bibr" rid="B36">Xiao et al., 2015</xref>; <xref ref-type="bibr" rid="B19">Ochman and Caro-Quintero, 2016</xref>; <xref ref-type="bibr" rid="B32">Thorpe et al., 2017</xref>; <xref ref-type="bibr" rid="B33">Thorpe et al., 2018</xref>; <xref ref-type="bibr" rid="B8">J&#xf8;rgensen et al., 2020</xref>). The randomly assigned locus tags provided by Prokka were translated when necessary, by aligning the GFF files of the GenBank annotated files and Prokka output.</p>
</sec>
<sec id="s4-3">
<title>Core gene and core IGR linkage analysis</title>
<p>The linkage of core genes and core IGRs was quantified using R (v. 4.1.0). The gene_presence_absence file from Roary and the IGR_presence_absence file from Piggy were loaded as dataframes in R. For each gene and IGR cluster in the files their status as a core or accessory gene was identified and assigned. For each genome all IGRs were paired with their upstream gene. Thus, NR regions were removed from the dataset and both flanking genes for DR regions were analyzed separately; if either of the two genes was core, the DR IGR was assigned as flanking a core gene. The R code is provided in <xref ref-type="sec" rid="s9">Supplementary Appendix SA4</xref>.</p>
</sec>
<sec id="s4-4">
<title>Switched intergenic regions analysis</title>
<p>For identification of switched IGRs, a separate analysis using Piggy was performed with -len_id 90 (the minimum percentage of length identity to form a cluster). This was done to perform a more strict analysis of the IGRs, as the higher threshold for forming a cluster ensured that homologous IGRs were not identified as switched IGRs (<xref ref-type="bibr" rid="B33">Thorpe et al., 2018</xref>). IGR switches were identified using the &#x201c;gene-pair&#x201d; method of Piggy; here two or more different IGR sequences that occupy the same space between a specific gene pair are analyzed. The candidate IGR sequences are then aligned with BLASTN with low complexity filtering turned off, and if there are no significant matches between the IGRs they are identified as &#x201c;switched&#x201d;. If there is a significant match, Piggy aligns the sequences using MAFFT and provides the nucleotide identity of the alignments.</p>
<p>The identified switch was validated manually with a BLASTN against all the genomes (data not shown).</p>
</sec>
<sec id="s4-5">
<title>Phylogenetic analysis</title>
<p>A phylogenetic tree of the strains included in this study was created, based on single nucleotide polymorphisms (SNPs) in the core genes. Roary was run separately with the same settings as previously mentioned with the exception of -e (Core gene alignment with PRANK) (<xref ref-type="bibr" rid="B21">Page et al., 2015</xref>). This produced a highly accurate alignment of the core genes within the pangenome. FastTree (v2.1.11) was then run to infer an approximately-maximum-likelihood phylogenetic tree based on SNPs within the core genes (<xref ref-type="bibr" rid="B24">Price et al., 2009</xref>). The resulting newick file was then visualized using iTOL (v5.7) and exported to Adobe Illustrator (<xref ref-type="bibr" rid="B12">Letunic and Bork, 2007</xref>).</p>
</sec>
<sec id="s4-6">
<title>Pangenome wide association study</title>
<p>To identify whether any accessory genes were significantly associated with any of the csIGR alleles, a pangenome-wide association study was performed using Scoary (v1.6.16) (<xref ref-type="bibr" rid="B2">Brynildsrud et al., 2016</xref>). A trait matrix was created as an input for Scoary, indicating which of the two csIGR alleles were present in which genomes. Scoary then sorted the accessory genome provided by the gene_presence_absence file from Roary, scoring each accessory gene according to its co-occurrence with each csIGR.</p>
</sec>
<sec id="s4-7">
<title>csIGR1 and csIGR2 analysis</title>
<p>To assess homology to existing regulatory RNAs, a BLASTN was performed for each csIGR against the RefSeq RNA database. Both sequences were screened for potential riboswitches using Riboswitch Finder (<xref ref-type="bibr" rid="B1">Bengert and Dandekar, 2004</xref>). Any potential promoter or terminator regions were screened for using BPROM and FindTerm (Softberry) (<xref ref-type="bibr" rid="B28">Solovyev et al., 2011</xref>). Secondary structures were predicted for both sequences using the RNAstructure package available through the Mathews lab, at standard parameters (<xref ref-type="bibr" rid="B26">Reuter and Mathews, 2010</xref>).</p>
</sec>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. In total, 84 different pneumococcal genomes and 84 randomly selected <italic>S. aureus</italic> genomes were retrieved from the National Center for Biotechnology Information. Their accession numbers are stated in <xref ref-type="sec" rid="s9">Supplementary Appendix SA3</xref>.</p>
</sec>
<sec id="s6">
<title>Author contributions</title>
<p>FN, MJ and JM-J conceptualized the study. FN and MJ wrote the paper. Project supervised and funded by JM-J and MJ.</p>
</sec>
<sec sec-type="COI-statement" id="s7">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s8">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s9">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fbinf.2023.1074212/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fbinf.2023.1074212/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material>
<label>Supplementary Table S1</label>
<caption>
<p>
<italic>Streptococcus pneumoniae</italic> intergenic core genome.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Table S2</label>
<caption>
<p>Sequences and predicted secondary structures of csIGR1 and csIGR2.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Table S3</label>
<caption>
<p>Genomes used in this study.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Table S4</label>
<caption>
<p>Code.</p>
</caption>
</supplementary-material>
<supplementary-material xlink:href="Table3.XLSX" id="SM1" mimetype="application/XLSX" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table2.DOCX" id="SM2" mimetype="application/DOCX" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table1.XLSX" id="SM3" mimetype="application/XLSX" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table4.DOCX" id="SM4" mimetype="application/DOCX" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bengert</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Dandekar</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>Riboswitch finder--a tool for identification of riboswitch RNAs</article-title>. <source>Nucleic Acids Res.</source> <volume>32</volume>, <fpage>W154</fpage>&#x2013;<lpage>W159</lpage>. <pub-id pub-id-type="doi">10.1093/NAR/GKH352</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brynildsrud</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Bohlin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Scheffer</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Eldholm</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Rapid scoring of genes in microbial pan-genome-wide association studies with Scoary</article-title>. <source>Genome Biol.</source> <volume>17</volume> (<issue>1</issue>), <fpage>238</fpage>. <pub-id pub-id-type="doi">10.1186/s13059-016-1108-8</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Caicedo-Montoya</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Manzo-Ruiz</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>R&#xed;os-Estepa</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Pan-genome of the genus streptomyces and prioritization of biosynthetic gene clusters with potential to produce antibiotic compounds</article-title>. <source>Front. Microbiol.</source> <volume>12</volume>, <fpage>677558</fpage>. <pub-id pub-id-type="doi">10.3389/FMICB.2021.677558</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chaguza</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Cornick</surname>
<given-names>J. E.</given-names>
</name>
<name>
<surname>Everett</surname>
<given-names>D. B.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Mechanisms and impact of genetic recombination in the evolution of <italic>Streptococcus pneumoniae</italic>
</article-title>. <source>Comput. Struct. Biotechnol. J.</source> <volume>13</volume>, <fpage>241</fpage>&#x2013;<lpage>247</lpage>. <pub-id pub-id-type="doi">10.1016/j.csbj.2015.03.007</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dagan</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Artzy-Randrup</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Martin</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>Modular networks and cumulative impact of lateral transfer in prokaryote genome evolution</article-title>. <source>Proc. Natl. Acad. Sci. U. S. A.</source> <volume>105</volume>, <fpage>10039</fpage>&#x2013;<lpage>10044</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.0800679105</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Donati</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Hiller</surname>
<given-names>N. L.</given-names>
</name>
<name>
<surname>Tettelin</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Muzzi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Croucher</surname>
<given-names>N. J.</given-names>
</name>
<name>
<surname>Angiuoli</surname>
<given-names>S. V.</given-names>
</name>
<etal/>
</person-group> (<year>2010</year>). <article-title>Structure and dynamics of the pan-genome of Streptococcus pneumoniae and closely related species</article-title>. <source>Genome Biol.</source> <volume>11</volume>, <fpage>R107</fpage>. <pub-id pub-id-type="doi">10.1186/gb-2010-11-10-r107</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hyun</surname>
<given-names>J. C.</given-names>
</name>
<name>
<surname>Monk</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Palsson</surname>
<given-names>B. O.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Comparative pangenomics: Analysis of 12 microbial pathogen pangenomes reveals conserved global structures of genetic and functional diversity</article-title>. <source>BMC Genomics</source> <volume>23</volume>, <fpage>7</fpage>. <pub-id pub-id-type="doi">10.1186/S12864-021-08223-8</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>J&#xf8;rgensen</surname>
<given-names>M. G.</given-names>
</name>
<name>
<surname>Pettersen</surname>
<given-names>J. S.</given-names>
</name>
<name>
<surname>Kallipolitis</surname>
<given-names>B. H.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>sRNA-mediated control in bacteria: An increasing diversity of regulatory mechanisms</article-title>. <source>Biochimica Biophysica Acta - Gene Regul. Mech.</source> <volume>1863</volume>, <fpage>194504</fpage>. <pub-id pub-id-type="doi">10.1016/j.bbagrm.2020.194504</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Katoh</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Misawa</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Kuma</surname>
<given-names>K. I.</given-names>
</name>
<name>
<surname>Miyata</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2002</year>). <article-title>MAFFT: A novel method for rapid multiple sequence alignment based on fast Fourier transform</article-title>. <source>Nucleic Acids Res.</source> <volume>30</volume>, <fpage>3059</fpage>&#x2013;<lpage>3066</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkf436</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Koonin</surname>
<given-names>E. V.</given-names>
</name>
<name>
<surname>Makarova</surname>
<given-names>K. S.</given-names>
</name>
<name>
<surname>Aravind</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2001</year>). <article-title>Horizontal gene transfer in prokaryotes: Quantification and classification</article-title>. <source>Annu. Rev. Microbiol.</source> <volume>55</volume>, <fpage>709</fpage>&#x2013;<lpage>742</lpage>. <pub-id pub-id-type="doi">10.1146/annurev.micro.55.1.709</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Laux</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Peschel</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Krismer</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>
<italic>Staphylococcus aureus</italic> colonization of the human nose and interaction with other microbiome members</article-title>. <source>Microbiol. Spectr.</source> <volume>7</volume>. <pub-id pub-id-type="doi">10.1128/microbiolspec.gpp3-0029-2018</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Letunic</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Bork</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Interactive Tree Of Life (iTOL): An online tool for phylogenetic tree display and annotation</article-title>. <source>Bioinformatics</source> <volume>23</volume> (<issue>1</issue>), <fpage>127</fpage>&#x2013;<lpage>128</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btl529</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Gallay</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Kjos</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Domenech</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Slager</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kessel</surname>
<given-names>S. P.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>High&#x2010;throughput CRISPRi phenotyping identifies new essential genes in Streptococcus pneumoniae</article-title>. <source>Mol. Syst. Biol.</source> <volume>13</volume>, <fpage>931</fpage>. <pub-id pub-id-type="doi">10.15252/msb.20167449</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Matus-Garcia</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Nijveen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Van Passel</surname>
<given-names>M. W. J.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Promoter propagation in prokaryotes</article-title>. <source>Nucleic Acids Res.</source> <volume>40</volume>, <fpage>10032</fpage>&#x2013;<lpage>10040</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gks787</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>McCutcheon</surname>
<given-names>J. P.</given-names>
</name>
<name>
<surname>Moran</surname>
<given-names>N. A.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Extreme genome reduction in symbiotic bacteria</article-title>. <source>Nat. Rev. Microbiol.</source> <volume>10</volume>, <fpage>13</fpage>&#x2013;<lpage>26</lpage>. <pub-id pub-id-type="doi">10.1038/nrmicro2670</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Molina</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Van Nimwegen</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>Universal patterns of purifying selection at noncoding positions in bacteria</article-title>. <source>Genome Res.</source> <volume>18</volume>, <fpage>148</fpage>&#x2013;<lpage>160</lpage>. <pub-id pub-id-type="doi">10.1101/gr.6759507</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="web">
<collab>National Center for Biotechnology Information</collab>, (<year>2018</year>).&#x201c;<article-title>GenBank and WGS statistics</article-title>,&#x201d; <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/genbank/statistics/">https://www.ncbi.nlm.nih.gov/genbank/statistics/</ext-link> (accessed Dec. 13, 2018)</comment>.</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>O&#x2019;Brien</surname>
<given-names>K. L.</given-names>
</name>
<name>
<surname>Wolfson</surname>
<given-names>L. J.</given-names>
</name>
<name>
<surname>Watt</surname>
<given-names>J. P.</given-names>
</name>
<name>
<surname>Henkle</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Deloria-Knoll</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>McCall</surname>
<given-names>N.</given-names>
</name>
<etal/>
</person-group> (<year>2009</year>). <article-title>Burden of disease caused by Streptococcus pneumoniae in children younger than 5 years: Global estimates</article-title>. <source>Lancet</source> <volume>374</volume>, <fpage>893</fpage>&#x2013;<lpage>902</lpage>. <pub-id pub-id-type="doi">10.1016/S0140-6736(09)61204-6</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ochman</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Caro-Quintero</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Genome size and structure, bacterial</article-title>,&#x201d; in <source>Encyclopedia of evolutionary biology</source> (<publisher-loc>Amsterdam</publisher-loc>: <publisher-name>Elsevier</publisher-name>).</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Oren</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Smith</surname>
<given-names>M. B.</given-names>
</name>
<name>
<surname>Johns</surname>
<given-names>N. I.</given-names>
</name>
<name>
<surname>Kaplan Zeevi</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Biran</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ron</surname>
<given-names>E. Z.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>Transfer of noncoding DNA drives regulatory rewiring in Bacteria</article-title>. <source>Proc. Natl. Acad. Sci. U. S. A.</source> <volume>111</volume>, <fpage>16112</fpage>&#x2013;<lpage>16117</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1413272111</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Page</surname>
<given-names>A. J.</given-names>
</name>
<name>
<surname>Cummins</surname>
<given-names>C. A.</given-names>
</name>
<name>
<surname>Hunt</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wong</surname>
<given-names>V. K.</given-names>
</name>
<name>
<surname>Reuter</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Holden</surname>
<given-names>M. T.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>Roary: Rapid large-scale prokaryote pan genome analysis</article-title>. <source>Bioinformatics</source> <volume>31</volume> (<issue>22</issue>), <fpage>3691</fpage>&#x2013;<lpage>3693</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btv421</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pek</surname>
<given-names>C. L.</given-names>
</name>
<name>
<surname>Morimoto</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Matsuo</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Oshima</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Ogasawara</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>The GTP-binding protein YqeH participates in biogenesis of the 30S ribosome subunit in Bacillus subtilis</article-title>. <source>Genes Genet. Syst.</source> <volume>82</volume>, <fpage>281</fpage>&#x2013;<lpage>289</lpage>. <pub-id pub-id-type="doi">10.1266/ggs.82.281</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Peters</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Vangeloff</surname>
<given-names>A. D.</given-names>
</name>
<name>
<surname>Landick</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Bacterial transcription terminators: The RNA 3&#x2032;-end chronicles</article-title>. <source>J. Mol. Biol.</source> <volume>412</volume>, <fpage>793</fpage>&#x2013;<lpage>813</lpage>. <pub-id pub-id-type="doi">10.1016/j.jmb.2011.03.036</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Price</surname>
<given-names>M. N.</given-names>
</name>
<name>
<surname>Dehal</surname>
<given-names>P. S.</given-names>
</name>
<name>
<surname>Arkin</surname>
<given-names>A. P.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>FastTree: Computing large minimum evolution trees with profiles instead of a distance matrix</article-title>. <source>Mol. Biol. Evol.</source> <volume>26</volume>, <fpage>1641</fpage>&#x2013;<lpage>1650</lpage>. <pub-id pub-id-type="doi">10.1093/molbev/msp077</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ragan</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Beiko</surname>
<given-names>R. G.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Lateral genetic transfer: Open issues</article-title>. <source>Philos. Trans. R. Soc. B Biol. Sci.</source> <volume>364</volume>, <fpage>2241</fpage>&#x2013;<lpage>2251</lpage>. <pub-id pub-id-type="doi">10.1098/rstb.2009.0031</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Reuter</surname>
<given-names>J. S.</given-names>
</name>
<name>
<surname>Mathews</surname>
<given-names>D. H.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>RNAstructure: Software for RNA secondary structure prediction and analysis</article-title>. <source>BMC Bioinforma.</source> <volume>11</volume>, <fpage>129</fpage>. <pub-id pub-id-type="doi">10.1186/1471-2105-11-129</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Seemann</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Prokka: Rapid prokaryotic genome annotation</article-title>. <source>Bioinformatics</source> <volume>30</volume> (<issue>14</issue>), <fpage>2068</fpage>&#x2013;<lpage>2069</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btu153</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Solovyev</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Salamov</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2011</year>). &#x201c;<article-title>Automatic annotation of microbial genomes and metagenomic sequences</article-title>,&#x201d; in <source>Metagenomics and its applications in agriculture, biomedicine and environmental studies</source>. Editor <person-group person-group-type="editor">
<name>
<surname>Li</surname>
<given-names>R. W.</given-names>
</name>
</person-group> (<publisher-loc>New York</publisher-loc>: <publisher-name>Nova Science Publishers</publisher-name>), <fpage>61</fpage>&#x2013;<lpage>78</lpage>.</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Somvanshi</surname>
<given-names>V. S.</given-names>
</name>
<name>
<surname>Sloup</surname>
<given-names>R. E.</given-names>
</name>
<name>
<surname>Crawford</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Martin</surname>
<given-names>A. R.</given-names>
</name>
<name>
<surname>Heidt</surname>
<given-names>A. J.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>K. S.</given-names>
</name>
<etal/>
</person-group> (<year>2012</year>). <article-title>A single promoter inversion switches <italic>Photorhabdus</italic> between pathogenic and mutualistic states</article-title>. <source>Science</source> <volume>337</volume>, <fpage>88</fpage>&#x2013;<lpage>93</lpage>. <pub-id pub-id-type="doi">10.1126/science.1216641</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tenaillon</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Skurnik</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Picard</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Denamur</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>The population genetics of commensal <italic>Escherichia coli</italic></article-title>. <source>Nat. Rev. Microbiol.</source> <volume>8</volume>, <fpage>207</fpage>&#x2013;<lpage>217</lpage>. <pub-id pub-id-type="doi">10.1038/nrmicro2298</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tettelin</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Masignani</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Cieslewicz</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Donati</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Medini</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ward</surname>
<given-names>N. L.</given-names>
</name>
<etal/>
</person-group> (<year>2005</year>). <article-title>Genome analysis of multiple pathogenic isolates of <italic>Streptococcus agalactiae</italic>: Implications for the microbial &#x2018;pan-genome&#x2019;</article-title>. <source>Proc. Natl. Acad. Sci. U. S. A.</source> <volume>102</volume>, <fpage>13950</fpage>&#x2013;<lpage>13955</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.0506758102</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Thorpe</surname>
<given-names>H. A.</given-names>
</name>
<name>
<surname>Bayliss</surname>
<given-names>S. C.</given-names>
</name>
<name>
<surname>Hurst</surname>
<given-names>L. D.</given-names>
</name>
<name>
<surname>Feil</surname>
<given-names>E. J.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Comparative analyses of selection operating on nontranslated intergenic regions of diverse bacterial species</article-title>. <source>Genetics</source> <volume>206</volume>, <fpage>363</fpage>&#x2013;<lpage>376</lpage>. <pub-id pub-id-type="doi">10.1534/genetics.116.195784</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Thorpe</surname>
<given-names>H. A.</given-names>
</name>
<name>
<surname>Bayliss</surname>
<given-names>S. C.</given-names>
</name>
<name>
<surname>Sheppard</surname>
<given-names>S. K.</given-names>
</name>
<name>
<surname>Feil</surname>
<given-names>E. J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Piggy: A rapid, large-scale pan-genome analysis tool for intergenic regions in bacteria</article-title>. <source>GigaScience</source> <volume>7</volume>, <fpage>giy015</fpage>. <pub-id pub-id-type="doi">10.1093/gigascience/giy015</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vos</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Why do bacteria engage in homologous recombination?</article-title> <source>Trends Microbiol.</source> <volume>17</volume>, <fpage>226</fpage>&#x2013;<lpage>232</lpage>. <pub-id pub-id-type="doi">10.1016/j.tim.2009.03.001</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Weiser</surname>
<given-names>J. N.</given-names>
</name>
<name>
<surname>Ferreira</surname>
<given-names>D. M.</given-names>
</name>
<name>
<surname>Paton</surname>
<given-names>J. C.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title><italic>Streptococcus pneumoniae</italic>: Transmission, colonization and invasion</article-title>. <source>Nat. Rev. Microbiol.</source> <volume>16</volume> (<issue>6</issue>), <fpage>355</fpage>&#x2013;<lpage>367</lpage>. <pub-id pub-id-type="doi">10.1038/s41579-018-0001-8</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>A brief review of software tools for pangenomics</article-title>. <source>Genomics, Proteomics Bioinforma.</source> <volume>13</volume> (<issue>1</issue>), <fpage>73</fpage>&#x2013;<lpage>76</lpage>. <pub-id pub-id-type="doi">10.1016/j.gpb.2015.01.007</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>