<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Mol. Biosci.</journal-id>
<journal-title>Frontiers in Molecular Biosciences</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Mol. Biosci.</abbrev-journal-title>
<issn pub-type="epub">2296-889X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">663532</article-id>
<article-id pub-id-type="doi">10.3389/fmolb.2021.663532</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Molecular Biosciences</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>BowSaw: Inferring Higher-Order Trait Interactions Associated With Complex Biological Phenotypes</article-title>
<alt-title alt-title-type="left-running-head">DiMucci et&#x20;al.</alt-title>
<alt-title alt-title-type="right-running-head">Higher-Order Interactions in Complex Phenotypes</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>DiMucci</surname>
<given-names>Demetrius</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1265257/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Kon</surname>
<given-names>Mark</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1200858/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Segr&#xe8;</surname>
<given-names>Daniel</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/203875/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<label>
<sup>1</sup>
</label>Bioinformatics Graduate Program, Boston University, <addr-line>Boston</addr-line>, <addr-line>MA</addr-line>, <country>United&#x20;States</country>
</aff>
<aff id="aff2">
<label>
<sup>2</sup>
</label>Biological Design Center, Boston University, <addr-line>Boston</addr-line>, <addr-line>MA</addr-line>, <country>United&#x20;States</country>
</aff>
<aff id="aff3">
<label>
<sup>3</sup>
</label>Department of Mathematics and Statistics, Boston University, <addr-line>Boston</addr-line>, <addr-line>MA</addr-line>, <country>United&#x20;States</country>
</aff>
<aff id="aff4">
<label>
<sup>4</sup>
</label>Department of Biology, Boston University, <addr-line>Boston</addr-line>, <addr-line>MA</addr-line>, <country>United&#x20;States</country>
</aff>
<aff id="aff5">
<label>
<sup>5</sup>
</label>Department of Biomedical Engineering, Boston University, <addr-line>Boston</addr-line>, <addr-line>MA</addr-line>, <country>United&#x20;States</country>
</aff>
<aff id="aff6">
<label>
<sup>6</sup>
</label>Department of Physics, Boston University, <addr-line>Boston</addr-line>, <addr-line>MA</addr-line>, <country>United&#x20;States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1092788/overview">Frederic Cadet</ext-link>, DSIMB, UMR S-1134, INSERM, Laboratory of Excellence Labex GR, France</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/177339/overview">Tatiana Galochkina</ext-link>, Universit&#xe9; de Paris, France</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/164086/overview">Elodie Laine</ext-link>, Universit&#xe9; Pierre et Marie Curie, France</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Daniel Segr&#xe8;, <email>dsegre@bu.edu</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Biological Modeling and Simulation, a section of the journal Frontiers in Molecular Biosciences</p>
</fn>
<fn fn-type="present-address" id="fn1">
<label>
<sup>
<bold>&#x2020;</bold>
</sup>
</label>
<p>
<bold>Present Address:</bold> The Forsyth Institute, Cambridge, MA, United&#x20;States</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>17</day>
<month>06</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>8</volume>
<elocation-id>663532</elocation-id>
<history>
<date date-type="received">
<day>04</day>
<month>02</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>24</day>
<month>05</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2021 DiMucci, Kon and Segr&#xe8;.</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>DiMucci, Kon and Segr&#xe8;</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these&#x20;terms.</p>
</license>
</permissions>
<abstract>
<p>Machine learning is helping the interpretation of biological complexity by enabling the inference and classification of cellular, organismal and ecological phenotypes based on large datasets, e.g., from genomic, transcriptomic and metagenomic analyses. A number of available algorithms can help search these datasets to uncover patterns associated with specific traits, including disease-related attributes. While, in many instances, treating an algorithm as a black box is sufficient, it is interesting to pursue an enhanced understanding of how system variables end up contributing to a specific output, as an avenue toward new mechanistic insight. Here we address this challenge through a suite of algorithms, named BowSaw, which takes advantage of the structure of a trained random forest algorithm to identify combinations of variables (&#x201c;rules&#x201d;) frequently used for classification. We first apply BowSaw to a simulated dataset and show that the algorithm can accurately recover the sets of variables used to generate the phenotypes through complex Boolean rules, even under challenging noise levels. We next apply our method to data from the integrative Human Microbiome Project and find previously unreported high-order combinations of microbial taxa putatively associated with Crohn&#x2019;s disease. By leveraging the structure of trees within a random forest, BowSaw provides a new way of using decision trees to generate testable biological hypotheses.</p>
</abstract>
<kwd-group>
<kwd>high-order interactions</kwd>
<kwd>microbiome</kwd>
<kwd>epistasis</kwd>
<kwd>random forest</kwd>
<kwd>Boolean rules</kwd>
<kwd>decision tree</kwd>
<kwd>complex phenotypes</kwd>
</kwd-group>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p>The production of large biological data sets with high-throughput techniques has increased the utilization of supervised machine learning algorithms (<xref ref-type="bibr" rid="B21">Goodswen et&#x20;al., 2021</xref>; <xref ref-type="bibr" rid="B39">Reel et&#x20;al., 2021</xref>), including support vector machines (<xref ref-type="bibr" rid="B51">Yang et&#x20;al., 2021</xref>), neural networks (<xref ref-type="bibr" rid="B37">Rampelli et&#x20;al., 2021</xref>) and random forests (<xref ref-type="bibr" rid="B15">Dicker et&#x20;al., 2021</xref>), to produce predictions of complex phenotypes (e.g., healthy vs. disease) from measurable traits (<xref ref-type="bibr" rid="B11">Cesario et&#x20;al., 2021</xref>; <xref ref-type="bibr" rid="B22">Hughes et&#x20;al., 2021</xref>; <xref ref-type="bibr" rid="B30">Marcos-Zambrano et&#x20;al., 2021</xref>). These algorithms use measurements of relevant traits such as gene variants, the presence/absence of microbial taxa, or metabolic consumption variables as predictors. Categorical prediction of phenotypes is typically the end goal of these applications. However, an additional benefit of these algorithms is the potential to extract explanatory classification rules. In this context, a rule is defined as a Boolean function of a set of traits, such that the value of the function is 1 (true) when the traits are associated with a given phenotype. Identifying the relationships between the traits involved in classification rules may yield key insights into the biological processes associated with important phenotypes (<xref ref-type="bibr" rid="B19">Furqan and Siyal, 2016</xref>; <xref ref-type="bibr" rid="B44">Visscher et&#x20;al., 2017</xref>). 
This realization is creating demand for methods that assist in the interpretation of supervised machine learning methods (<xref ref-type="bibr" rid="B2">Azmi et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B33">Nguyen et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B25">Le et&#x20;al., 2020</xref>), especially when the measured traits may be causal agents of disease states, such as genetic variants or microbial taxa (<xref ref-type="bibr" rid="B24">LaPierre et&#x20;al., 2019</xref>). Identifying classification rules associated with a phenotype of interest is valuable because these rules are likely to carry information about the causal mechanisms that generate the phenotype.</p>
<p>Algorithms that are particularly valuable in this respect are those involving decision trees, such as random forests, since decision trees are easily interpretable (<xref ref-type="bibr" rid="B7">Brodley and Friedl, 1997</xref>). Decision trees are rule-based classifiers, where rules arise from a series of &#x201c;yes-no&#x201d; questions that can efficiently divide the data into categorical groups. In a biological context, such rules may arise from sets of genes whose simultaneous modulation could affect a phenotype, or sets of microbial species whose co-occurrence may be associated with a disease state. While in several cases it seems like disease phenotypes are uniquely associated with a single specific pattern [e.g., retinoblastoma (<xref ref-type="bibr" rid="B23">Knudson, 1971</xref>)], there is increasing evidence for cases in which multiple distinct patterns can be associated with (and potentially causing) the same high-level phenotype (<xref ref-type="bibr" rid="B17">Emily et&#x20;al., 2009</xref>; <xref ref-type="bibr" rid="B26">Leem et&#x20;al., 2014</xref>). A particular example we will explore in this work is the multiplicity of distinct microbial presence/absence patterns which may be associated with Crohn&#x2019;s disease (<xref ref-type="bibr" rid="B35">Proctor et&#x20;al., 2019</xref>). Crohn&#x2019;s disease has five clinically defined sub-types (<xref ref-type="bibr" rid="B38">Reading, 2014</xref>) but studies of the associated microbiome do not usually indicate which form of Crohn&#x2019;s disease a donor has been diagnosed with. Each sub-type of the disease may be associated with different microbes, each requiring different treatment regimes. As discussed later, we hypothesize that the different rules associated with a given phenotype label may be related to these different subtypes, with potential therapeutic implications.</p>
<p>The fact that there may be multiple etiologies that generate the same or similar phenotypes complicates the straightforward interpretation of parameter coefficients or variable importance scores (<xref ref-type="bibr" rid="B29">Louppe, 2014</xref>; <xref ref-type="bibr" rid="B49">Wright et&#x20;al., 2016</xref>). Uncovering the multiple interactions between predictive variables as they relate to phenotypic labels remains a challenging statistical endeavor, but one that is of paramount importance. In an ideal situation, one could conduct a best subset search, evaluating all possible classification rules that can be defined using the data and identifying a set of rules that concisely explain the observed associations. This strategy is computationally intractable using a brute force approach: even a relatively small biological data set of 50 features with binary coding would require examining over 2<sup>50</sup> variable sets and many more specific rules (since the specific value of features, 0, 1, or &#x2018;omitted&#x2019;, is important). Identifying the associated rules that a random forest uses to classify a given sample (a specific row of the data matrix) offers the possibility to bypass the brute force approach and enables the development of mechanistic hypotheses for follow-up studies. This challenge, and an overview of the key strategy we propose, are illustrated in <xref ref-type="fig" rid="F1">Figure&#x20;1</xref>. In <xref ref-type="fig" rid="F1">Figure&#x20;1A</xref> we depict a toy model where measured variables (traits) have only two possible values (e.g., present/absent), the high-level phenotype (category) is binary (e.g., no disease/disease), and two distinct Boolean rules can both generate the phenotype. The goal in this case is to identify each of the rules that are associated with the phenotype. 
The multiple Boolean rules obtained in this manner can be thought of as a consensus decision tree that possesses the most informative branches of the forest with respect to a given class label. In this work, we will show how this can be achieved by in-depth analyses of any given random forest (RF) (<xref ref-type="fig" rid="F1">Figure&#x20;1B</xref>).</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>
<bold>(A)</bold> In a hypothetical dataset there are two phenotype labels&#x2013;&#x201c;Disease&#x201d; and &#x201c;No Disease&#x201d;&#x2013;that we wish to discriminate based on input predictor variables. In this example, there are two distinct high-order patterns that both confer the same &#x201c;Disease&#x201d; phenotype. Our goal is to identify a potentially diverse set of patterns (or, in this simplified case, all patterns) that are associated with the &#x201c;Disease&#x201d; label. <bold>(B)</bold> Instead of exhaustively evaluating variable combinations we leverage the structure that emerges from an ensemble of decision trees like those produced by a trained random forest. <bold>(C)</bold> For each sample with the observed phenotype &#x201c;Disease&#x201d; we first identify the vector containing its input values (i). Then we follow the paths it takes down each tree that attempts to predict its class and record the frequency of parent-child variable pairs (ii). Next, we rank parent-child variable pairs in descending order of frequency (iii). Finally, we use a greedy search to construct a sample-specific rule that is fully associated with the &#x201c;Disease&#x201d; phenotype (iv). <bold>(D)</bold> All sample-specific rules are evaluated in order to obtain a consensus set of rules that, combined, account for all samples with the &#x201c;Disease&#x201d; phenotype.</p>
</caption>
<graphic xlink:href="fmolb-08-663532-g001.tif"/>
</fig>
<p>The random forest algorithm intrinsically takes advantage of non-linear relationships between variables and is widely used in the life sciences (<xref ref-type="bibr" rid="B5">Boulesteix et&#x20;al., 2012</xref>; <xref ref-type="bibr" rid="B32">Nguyen et&#x20;al., 2013</xref>; <xref ref-type="bibr" rid="B41">Touw et&#x20;al., 2013</xref>). RFs, when used to distinguish between disease states known to have multiple causes, often result in excellent classifiers (<xref ref-type="bibr" rid="B16">Duvallet et&#x20;al., 2017</xref>; <xref ref-type="bibr" rid="B18">Franzosa et&#x20;al., 2019</xref>). It has also been reported that RFs capture subtle statistical interactions between variables (<xref ref-type="bibr" rid="B29">Louppe, 2014</xref>). Unfortunately, an RF is not straightforwardly interpretable despite its hierarchical structure, and recovering those interactions is notoriously difficult (<xref ref-type="bibr" rid="B49">Wright et&#x20;al., 2016</xref>) due in large part to the method&#x2019;s reliance on ensembles of trees (<xref ref-type="bibr" rid="B6">Breiman, 2001</xref>). The difficulties in interpretation created by these properties have led many to refer to RF as a &#x2018;black box&#x2019; model (<xref ref-type="bibr" rid="B10">Castelvecchi, 2016</xref>).</p>
<p>Identifying the rules that an RF utilizes in classification tasks is an active area of research, and many strategies have been developed to address this problem. Effective strategies have focused on evaluating how individual variables influence the classification probabilities of specific samples (<xref ref-type="bibr" rid="B34">Palczewska et&#x20;al., 2013</xref>; <xref ref-type="bibr" rid="B47">Welling et&#x20;al., 2016</xref>), pruning existing decision rules found in the tree ensemble to produce compact models (<xref ref-type="bibr" rid="B13">Deng, 2019</xref>), computing conditional importance scores (<xref ref-type="bibr" rid="B40">Strobl et&#x20;al., 2008</xref>), or iteratively enriching the most prevalent variable co-occurrences through regularization (<xref ref-type="bibr" rid="B3">Basu et&#x20;al., 2018</xref>). These approaches offer valuable methods for the identification of statistical interactions between variables. However, we and others have observed that while these methods are capable of recovering a true causal rule in simulated data when exactly one such rule is present, the existence of multiple rules associated with one phenotype can confound interpretation efforts (<xref ref-type="bibr" rid="B3">Basu et&#x20;al., 2018</xref>).</p>
<p>Here we describe BowSaw, a new set of algorithms that utilizes variable interactions in a trained RF model in order to extract multiple candidate explanatory rules. With BowSaw, we set out to develop a <italic>post hoc</italic> method intended to aid in the discovery of these rules when the input variables are categorical in nature. The primary approach of BowSaw is to start by approximating a best combination of variables (i.e.,&#x20;a rule) that explains the forest&#x2019;s predictions for individual samples of a given class in the data set and then to curate the collection of best combinations to obtain a concise set of combinations that collectively segregate a class of interest with high precision. For individual samples a rule is identified by systematically quantifying the co-occurrence of specific variable pairs across trees in the forest that attempt to predict the class of the sample (out-of-bag trees) and then using the frequency of these co-occurring variable pairs to guide the construction of a rule that precisely identifies the sample as its observed class. For the entire set of samples, we then curate the collection of all rules identified in this way, in order to produce a small set of rules that are broadly and precisely applicable to samples of the given class&#x20;label.</p>
<p>We first demonstrate that BowSaw can recover true rules (when they exist) by applying the algorithms to simulated data sets of varying complexity. We then apply BowSaw to a study on the role of the gut microbiome in Crohn&#x2019;s disease (<xref ref-type="bibr" rid="B35">Proctor et&#x20;al., 2019</xref>), and show that it can find a previously unreported combination of microbial taxa that is broadly and precisely associated with Crohn&#x2019;s disease samples in the data set. In its current implementation, BowSaw can be applied to any dataset with categorical or discrete predictors with any number of class labels.</p>
</sec>
<sec sec-type="methods" id="s2">
<title>Methods</title>
<sec id="s2-1">
<title>Overview of the Pipeline</title>
<p>Provided with a trained random forest and a training set, BowSaw goes through three steps in order to generate a candidate rule (variable-value combination) for each sample associated with the phenotype of interest. First, for a specific sample, the <italic>Count</italic> algorithm counts the frequency of unique ordered pairs of variables encountered along each of its out-of-bag trees in the forest (<xref ref-type="fig" rid="F1">Figure&#x20;1C</xref>&#x2013;step 2). Second, for that sample, the <italic>Construct</italic> algorithm takes the counts from the first step and generates a list of ordered pairs, ranked by their frequencies, then uses this list as a guide to construct a candidate decision rule (which could consist of two or more variables) that is associated with the observed phenotype at a user defined precision threshold (<xref ref-type="fig" rid="F1">Figure&#x20;1C</xref>&#x2013;steps 3&#x2013;4). Finally, the <italic>Curate</italic> algorithm pools the candidate decision rules from each sample together and greedily selects a subset of rules that collectively account for all of the samples with the desired phenotype (<xref ref-type="fig" rid="F1">Figure&#x20;1D</xref>). Optionally, the <italic>Sub-rule</italic> algorithm can be used to generate pruned versions of candidate rules prior to applying the Curate algorithm in order to obtain a more concise, albeit less specific, set of candidate rules. The Count and Construct algorithms generate the candidate rules for individual samples while the Curate and Sub-rule algorithms produce a combined set of rules that account for all samples with the chosen phenotype.</p>
<p>In the following section, we provide a description of the inputs BowSaw takes and the algorithms that implement these steps along with pseudocode.</p>
</sec>
<sec id="s2-2">
<title>Inputs</title>
<p>BowSaw takes as inputs a dataset, <bold>
<italic>D</italic>
</bold>, composed of <inline-formula id="inf1">
<mml:math id="m1">
<mml:mi>N</mml:mi>
</mml:math>
</inline-formula> observed vectors <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (together with their respective classes <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>) each of <inline-formula id="inf4">
<mml:math id="m4">
<mml:mi>p</mml:mi>
</mml:math>
</inline-formula> categorical variables. There are assumed to be <inline-formula id="inf5">
<mml:math id="m5">
<mml:mi>K</mml:mi>
</mml:math>
</inline-formula> possible class labels for each vector in <bold>
<italic>D</italic>
</bold> which for the purposes of this discussion denote different phenotypes. A random forest is assumed to be trained on <bold>
<italic>D</italic>
</bold> to distinguish the classes <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
<italic>.</italic> Additionally, BowSaw takes as input the feature vector <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> of a specific sample for which the goal is to identify a set of simplified rules associated with the phenotype&#x20;<inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s2-3">
<title>Counting Stubs</title>
<p>Given an RF machine <bold>
<italic>M</italic>
</bold> trained on dataset <bold>
<italic>D</italic>
</bold> and a feature vector <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>p</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="bold-italic">D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
<bold>,</bold> the first sub-routine of our method (the <italic>count algorithm</italic>) proceeds as follows. It starts by identifying among the set of trees in <bold>
<italic>M</italic>
</bold>, those sub-paths (sequences of successive variable indices) encountered by sample <inline-formula id="inf10">
<mml:math id="m10">
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:math>
</inline-formula> as it travels through <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">M</mml:mi>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, its set of out-of-bag trees. An out-of-bag tree is a tree for which <inline-formula id="inf12">
<mml:math id="m12">
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:math>
</inline-formula> was not included in the training set. For a specific path <bold>
<italic>P</italic>
</bold> in <inline-formula id="inf13">
<mml:math id="m13">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">M</mml:mi>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> the sequence of successive variable indices forms a vector <inline-formula id="inf14">
<mml:math id="m14">
<mml:mrow>
<mml:mi mathvariant="bold-italic">v</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>r</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> (note that each <inline-formula id="inf15">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is one of the variables <inline-formula id="inf16">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>). Each stub (ordered pair of sequentially encountered variables <inline-formula id="inf17">
<mml:math id="m17">
<mml:mrow>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>) in all out-of-bag elements along <inline-formula id="inf18">
<mml:math id="m18">
<mml:mi mathvariant="bold-italic">P</mml:mi>
</mml:math>
</inline-formula> for <italic>i &#x3d;</italic> 1, &#x2026;, <italic>r</italic>&#x2212;1 is accounted for in a <inline-formula id="inf19">
<mml:math id="m19">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> matrix <inline-formula id="inf20">
<mml:math id="m20">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">C</mml:mi>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where the element <inline-formula id="inf21">
<mml:math id="m21">
<mml:mrow>
<mml:msubsup>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> records the number of stubs containing the ordered pair of variables <inline-formula id="inf22">
<mml:math id="m22">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf23">
<mml:math id="m23">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> among all paths of <inline-formula id="inf24">
<mml:math id="m24">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">M</mml:mi>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. We restrict the counting to pairs of sequentially encountered variables because higher-order interactions involving 3 or more sequential variables are much rarer and would require many more trees than is necessary to build an acceptable classifier.</p>
<p>
<statement content-type="algorithm" id="Algorithm_1">
<label>Algorithm 1</label>
<p>
<inline-graphic xlink:href="fmolb-08-663532-fx1.tif"/>
</p>
</statement>
</p>
</sec>
<sec id="s2-4">
<title>Constructing a Candidate Rule</title>
<p>A <italic>rule</italic> for classifying a test point <inline-formula id="inf25">
<mml:math id="m25">
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:math>
</inline-formula> will have the form &#x201c;If <inline-formula id="inf26">
<mml:math id="m26">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> then classify <inline-formula id="inf27">
<mml:math id="m27">
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:math>
</inline-formula> to class <inline-formula id="inf28">
<mml:math id="m28">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula>&#x201d;. Here <inline-formula id="inf29">
<mml:math id="m29">
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:math>
</inline-formula> is a designated subcollection of the variable indices <inline-formula id="inf30">
<mml:math id="m30">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf31">
<mml:math id="m31">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the sub-vector of current vector <inline-formula id="inf32">
<mml:math id="m32">
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>p</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> corresponding just to the indices <inline-formula id="inf33">
<mml:math id="m33">
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. The vector <inline-formula id="inf34">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> will denote a pre-defined set of values for the <inline-formula id="inf35">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, with the above rule requirement effectively meaning that each <inline-formula id="inf36">
<mml:math id="m36">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> appearing in the second vector must equal the corresponding <inline-formula id="inf37">
<mml:math id="m37">
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in the first vector. Thus the condition <inline-formula id="inf38">
<mml:math id="m38">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> requires a specific assignment of values to <inline-formula id="inf39">
<mml:math id="m39">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> for <inline-formula id="inf40">
<mml:math id="m40">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and the rule is that if a test vector satisfies this condition, we assign it to class&#x20;<italic>k</italic>.</p>
<p>The second sub-routine (the <italic>construct algorithm</italic>) builds a candidate rule <inline-formula id="inf41">
<mml:math id="m41">
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:math>
</inline-formula>, based (initially) on a fixed training point, say <inline-formula id="inf42">
<mml:math id="m42">
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="bold-italic">D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, in class <inline-formula id="inf43">
<mml:math id="m43">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula>. This is done by first placing all of the stubs <inline-formula id="inf44">
<mml:math id="m44">
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> with non-zero counts <inline-formula id="inf45">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> into a list <inline-formula id="inf46">
<mml:math id="m46">
<mml:mi mathvariant="bold-italic">L</mml:mi>
</mml:math>
</inline-formula> sorted in descending order by their values in&#x20;<inline-formula id="inf47">
<mml:math id="m47">
<mml:mi mathvariant="bold-italic">C</mml:mi>
</mml:math>
</inline-formula>.</p>
<p>We define the candidate rule <inline-formula id="inf48">
<mml:math id="m48">
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:math>
</inline-formula> (based on <inline-formula id="inf49">
<mml:math id="m49">
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:math>
</inline-formula>) through the following steps. We initialize using the first stub <inline-formula id="inf50">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> in the list <inline-formula id="inf51">
<mml:math id="m51">
<mml:mrow>
<mml:mi mathvariant="bold-italic">L</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> together with the two fixed values <inline-formula id="inf52">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> This is the initialized form of the rule <inline-formula id="inf53">
<mml:math id="m53">
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:math>
</inline-formula>, which requires that for any test vector, its values at the above indices <inline-formula id="inf54">
<mml:math id="m54">
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf55">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> match the values of the above fixed training vector <inline-formula id="inf56">
<mml:math id="m56">
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="bold-italic">D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, so that <inline-formula id="inf57">
<mml:math id="m57">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf58">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. For brevity, denote the pair <inline-formula id="inf59">
<mml:math id="m59">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and the corresponding assigned values as <inline-formula id="inf60">
<mml:math id="m60">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
<p>Then the content of rule <inline-formula id="inf61">
<mml:math id="m61">
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:math>
</inline-formula> will be denoted succinctly as <inline-formula id="inf62">
<mml:math id="m62">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:msub>
<mml:mo>&#x21d2;</mml:mo>
<mml:mtext>class&#xa0;</mml:mtext>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. Since ordering of the indices <inline-formula id="inf63">
<mml:math id="m63">
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> does not matter, (as long as the indices are identified), we will henceforth write <inline-formula id="inf64">
<mml:math id="m64">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2192;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>We then update rule <inline-formula id="inf65">
<mml:math id="m65">
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:math>
</inline-formula> as follows. We find all <inline-formula id="inf66">
<mml:math id="m66">
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="bold-italic">D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> that satisfy the initial part of rule <inline-formula id="inf67">
<mml:math id="m67">
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:math>
</inline-formula>, i.e., <inline-formula id="inf68">
<mml:math id="m68">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, that is,&#x20;all training points matching the two indices <inline-formula id="inf69">
<mml:math id="m69">
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> of training sample <inline-formula id="inf70">
<mml:math id="m70">
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:math>
</inline-formula>, and store them as a subcollection <inline-formula id="inf71">
<mml:math id="m71">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">D</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2282;</mml:mo>
<mml:mi mathvariant="bold-italic">D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> of the training set. We call <inline-formula id="inf72">
<mml:math id="m72">
<mml:mi>F</mml:mi>
</mml:math>
</inline-formula> the fraction of data points in <inline-formula id="inf73">
<mml:math id="m73">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">D</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> that have phenotype <inline-formula id="inf74">
<mml:math id="m74">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula>, i.e.,&#x20;match the phenotype of the initial sample <inline-formula id="inf75">
<mml:math id="m75">
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="bold-italic">D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. When <italic>F</italic> is greater than or equal to a user-defined <italic>threshold</italic>, the algorithm terminates and returns <bold>
<italic>R</italic>
</bold>. If <inline-formula id="inf76">
<mml:math id="m76">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x2265;</mml:mo>
<mml:mi mathvariant="italic">threshold</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, we stop and return the current above rule <inline-formula id="inf77">
<mml:math id="m77">
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:math>
</inline-formula>
<bold>.</bold> If <inline-formula id="inf78">
<mml:math id="m78">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mi mathvariant="italic">threshold</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, we continue by choosing the second stub <inline-formula id="inf79">
<mml:math id="m79">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> in the above list <inline-formula id="inf80">
<mml:math id="m80">
<mml:mi mathvariant="bold-italic">L</mml:mi>
</mml:math>
</inline-formula>, and augment the current rule <inline-formula id="inf81">
<mml:math id="m81">
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:math>
</inline-formula> by adding the condition <inline-formula id="inf82">
<mml:math id="m82">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (again written <inline-formula id="inf83">
<mml:math id="m83">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>) and maintaining the assignment of class <inline-formula id="inf84">
<mml:math id="m84">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula> (i.e.,&#x20;the same class as the currently fixed sample <inline-formula id="inf85">
<mml:math id="m85">
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="bold-italic">D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>). If the second stub <inline-formula id="inf86">
<mml:math id="m86">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> happens to overlap with the initial stub <inline-formula id="inf87">
<mml:math id="m87">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, this added condition in the rule <inline-formula id="inf88">
<mml:math id="m88">
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:math>
</inline-formula> will clearly be consistent, being still based on the fixed sample <inline-formula id="inf89">
<mml:math id="m89">
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:math>
</inline-formula>. We augment the current index list <inline-formula id="inf90">
<mml:math id="m90">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to a list <inline-formula id="inf91">
<mml:math id="m91">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, adding to it the two new indices <inline-formula id="inf92">
<mml:math id="m92">
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf93">
<mml:math id="m93">
<mml:mrow>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, so that now <inline-formula id="inf94">
<mml:math id="m94">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>{</mml:mo>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, writing the augmented rule as <inline-formula id="inf95">
<mml:math id="m95">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x21d2;</mml:mo>
<mml:mtext>class&#xa0;</mml:mtext>
<mml:mi>k</mml:mi>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Again defining <inline-formula id="inf96">
<mml:math id="m96">
<mml:mi>F</mml:mi>
</mml:math>
</inline-formula> to be the fraction of the data subset <inline-formula id="inf97">
<mml:math id="m97">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">D</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (matching the more restrictive new rule <inline-formula id="inf98">
<mml:math id="m98">
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:math>
</inline-formula>
<bold>)</bold> with phenotype <inline-formula id="inf99">
<mml:math id="m99">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula>, we stop the algorithm and use the current rule <inline-formula id="inf100">
<mml:math id="m100">
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:math>
</inline-formula> if <inline-formula id="inf101">
<mml:math id="m101">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x2265;</mml:mo>
<mml:mi mathvariant="italic">threshold</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and otherwise augment rule <inline-formula id="inf102">
<mml:math id="m102">
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:math>
</inline-formula> by adding the indices <inline-formula id="inf103">
<mml:math id="m103">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">L</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>j</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> to it, as above, yielding a larger set <inline-formula id="inf104">
<mml:math id="m104">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> of indices and the augmented rule <inline-formula id="inf105">
<mml:math id="m105">
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:math>
</inline-formula>: <inline-formula id="inf106">
<mml:math id="m106">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x21d2;</mml:mo>
<mml:mtext>class&#xa0;</mml:mtext>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, with a more restricted subset <inline-formula id="inf107">
<mml:math id="m107">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">D</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:mo>&#x2282;</mml:mo>
<mml:mi mathvariant="bold-italic">D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and a new value for <inline-formula id="inf108">
<mml:math id="m108">
<mml:mi>F</mml:mi>
</mml:math>
</inline-formula>, now the fraction of <inline-formula id="inf109">
<mml:math id="m109">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">D</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in the class <inline-formula id="inf110">
<mml:math id="m110">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula> of the fixed <inline-formula id="inf111">
<mml:math id="m111">
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="bold-italic">D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. This process continues until the fraction <inline-formula id="inf112">
<mml:math id="m112">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x2265;</mml:mo>
<mml:mi mathvariant="italic">threshold</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, e.g., 100% of the samples in <inline-formula id="inf113">
<mml:math id="m113">
<mml:mi mathvariant="bold-italic">D</mml:mi>
</mml:math>
</inline-formula> that match the current set of indices also match the class <inline-formula id="inf114">
<mml:math id="m114">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula> of the current sample <inline-formula id="inf115">
<mml:math id="m115">
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:math>
</inline-formula>. Alternatively, the algorithm stops when all stubs in <inline-formula id="inf116">
<mml:math id="m116">
<mml:mi mathvariant="bold-italic">L</mml:mi>
</mml:math>
</inline-formula> have been exhausted.</p>
<p>In the examples that follow we have set <italic>threshold</italic> to 1. The rationale for this choice is that we allow overfitting with the intention of pruning the overfit rules in order to find more generalizable forms. We make this choice because, from the perspective of discovery, we assume that it is more desirable to capture as much of a true underlying rule as possible and then prune back to a shorter one than it is to extract a concise rule directly. In practice one might decide to tune the <italic>threshold</italic> on <italic>F</italic> to approximate the overall precision of the model in order to identify less complex rules, or tune it as a hyper-parameter in order to reduce the combinatorial search&#x20;space.</p>
<p>
<statement content-type="algorithm" id="Algorithm_2">
<label>Algorithm 2</label>
<p>
<inline-graphic xlink:href="fmolb-08-663532-fx2.tif"/>
</p>
</statement>
</p>
</sec>
<sec id="s2-5">
<title>Curating Candidate Rules</title>
<p>The <italic>count</italic> and <italic>construct</italic> algorithms are the heart of BowSaw. In our workflow, we apply these algorithms to each sample <inline-formula id="inf117">
<mml:math id="m117">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="bold-italic">D</mml:mi>
<mml:mi mathvariant="bold-italic">k</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf118">
<mml:math id="m118">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">D</mml:mi>
<mml:mi mathvariant="bold-italic">k</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the set of samples in dataset <inline-formula id="inf119">
<mml:math id="m119">
<mml:mi>D</mml:mi>
</mml:math>
</inline-formula> with phenotype <inline-formula id="inf120">
<mml:math id="m120">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula>. At this stage in the algorithm, we have associated a single candidate rule <inline-formula id="inf121">
<mml:math id="m121">
<mml:mi>q</mml:mi>
</mml:math>
</inline-formula> with each vector <inline-formula id="inf122">
<mml:math id="m122">
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="bold-italic">D</mml:mi>
<mml:mi>k</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The union of these candidate rules over all samples in <inline-formula id="inf123">
<mml:math id="m123">
<mml:mi>D</mml:mi>
</mml:math>
</inline-formula> will form a list which we will denote as <inline-formula id="inf124">
<mml:math id="m124">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">Q</mml:mi>
<mml:mi mathvariant="bold-italic">k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which ranks each rule <inline-formula id="inf125">
<mml:math id="m125">
<mml:mi>q</mml:mi>
</mml:math>
</inline-formula> by the size <inline-formula id="inf126">
<mml:math id="m126">
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> of the set <inline-formula id="inf127">
<mml:math id="m127">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> consisting of all samples <inline-formula id="inf128">
<mml:math id="m128">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> consistent with rule <inline-formula id="inf129">
<mml:math id="m129">
<mml:mi>q</mml:mi>
</mml:math>
</inline-formula>. Since <inline-formula id="inf130">
<mml:math id="m130">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">Q</mml:mi>
<mml:mi mathvariant="bold-italic">k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> may include many redundant rules or rules that strictly extend each other, we have another sub-routine (the <italic>curate algorithm</italic>) to generate a concise set of candidate rules that collectively account for all samples <inline-formula id="inf131">
<mml:math id="m131">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">D</mml:mi>
<mml:mi>k</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> in class <inline-formula id="inf132">
<mml:math id="m132">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula>. Briefly, we initialize a list <bold>
<italic>H</italic>
</bold>, with the element <inline-formula id="inf133">
<mml:math id="m133">
<mml:mrow>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">Q</mml:mi>
<mml:mi mathvariant="bold-italic">k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> representing the largest set <inline-formula id="inf134">
<mml:math id="m134">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2282;</mml:mo>
<mml:msup>
<mml:mi>D</mml:mi>
<mml:mi>k</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> of samples. At each stage, the next rule in <inline-formula id="inf135">
<mml:math id="m135">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">Q</mml:mi>
<mml:mi mathvariant="bold-italic">k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is selected so as to be satisfied by the largest number of elements <inline-formula id="inf136">
<mml:math id="m136">
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="bold-italic">D</mml:mi>
<mml:mi>k</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> that do not satisfy any of the previous rules. This rule is then added to <inline-formula id="inf137">
<mml:math id="m137">
<mml:mi mathvariant="bold-italic">H</mml:mi>
</mml:math>
</inline-formula>, with ties resolved randomly. This is continued until every element of <inline-formula id="inf138">
<mml:math id="m138">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">D</mml:mi>
<mml:mi mathvariant="bold-italic">k</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> satisfies at least one rule in <inline-formula id="inf139">
<mml:math id="m139">
<mml:mi>H</mml:mi>
</mml:math>
</inline-formula>.</p>
<p>
<statement content-type="algorithm" id="Algorithm_3">
<label>Algorithm 3</label>
<p>
<inline-graphic xlink:href="fmolb-08-663532-fx3.tif"/>
</p>
</statement>
</p>
</sec>
<sec id="s2-6">
<title>Constructing Sub-Rules</title>
<p>In any given dataset, rules are rarely perfectly associated with specific phenotypes. Given the current list <inline-formula id="inf140">
<mml:math id="m140">
<mml:mi>H</mml:mi>
</mml:math>
</inline-formula> of rules describing phenotype <inline-formula id="inf141">
<mml:math id="m141">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula> as above, we may consider a looser set of rules by creating a new list <inline-formula id="inf142">
<mml:math id="m142">
<mml:mrow>
<mml:msub>
<mml:mi>Q</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> consisting of rules in <inline-formula id="inf143">
<mml:math id="m143">
<mml:mi>H</mml:mi>
</mml:math>
</inline-formula> together with sub-rules satisfying some user-defined minimal complexity criterion and precision thresholds, which serve to exclude low-quality rules from the analysis. Given that a rule is the conjunction of a set of conditions, by sub-rule we mean the conjunction of a subset of these conditions. The list <inline-formula id="inf144">
<mml:math id="m144">
<mml:mrow>
<mml:msub>
<mml:mi>Q</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> can be treated precisely as the list <inline-formula id="inf145">
<mml:math id="m145">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">Q</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> was above, resulting in a new curated list <inline-formula id="inf146">
<mml:math id="m146">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> obtained as earlier, yielding a new candidate rule set which has a reduced likelihood of overfitting the&#x20;data.</p>
<p>Thus, we will require a strategy for selecting a set of candidate sub-rules that account for all samples with desired observed phenotype class <inline-formula id="inf147">
<mml:math id="m147">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula>. Candidate sub-rules are shorter candidate rules (with less complexity, likely less precise, and more broadly applicable) derived from larger candidate rules by keeping one or more (generally <inline-formula id="inf148">
<mml:math id="m148">
<mml:mi>i</mml:mi>
</mml:math>
</inline-formula>) variables. For each candidate rule in <bold>
<italic>H</italic>
</bold>, and complexity level <italic>i</italic>, we include only sub-rules that meet the user-defined complexity criterion, designated as complexity level <inline-formula id="inf149">
<mml:math id="m149">
<mml:mi>i</mml:mi>
</mml:math>
</inline-formula>. We place each of the sub-rules derived from <inline-formula id="inf150">
<mml:math id="m150">
<mml:mi mathvariant="bold-italic">H</mml:mi>
</mml:math>
</inline-formula> at complexity level <inline-formula id="inf151">
<mml:math id="m151">
<mml:mi>i</mml:mi>
</mml:math>
</inline-formula> into a new list <inline-formula id="inf152">
<mml:math id="m152">
<mml:mrow>
<mml:msub>
<mml:mi>Q</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. For each rule in <inline-formula id="inf153">
<mml:math id="m153">
<mml:mrow>
<mml:msub>
<mml:mi>Q</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> its precision is calculated with respect to the class <inline-formula id="inf154">
<mml:math id="m154">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula>, and those rules with a precision below a given threshold are eliminated. Finally, this reduced list is subject to the above <italic>Curate</italic> algorithm&#x20;again.</p>
<p>
<statement content-type="algorithm" id="Algorithm_4">
<label>Algorithm 4</label>
<p>
<inline-graphic xlink:href="fmolb-08-663532-fx4.tif"/>
</p>
<p>Within the above aggregation algorithm, <inline-formula id="inf155">
<mml:math id="m155">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is determined for each rule in <inline-formula id="inf156">
<mml:math id="m156">
<mml:mrow>
<mml:msub>
<mml:mi>Q</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which is then pruned with the <italic>Curate</italic> algorithm to produce&#x20;<inline-formula id="inf157">
<mml:math id="m157">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>The algorithms described above are generalizable to multi-classification tasks but are currently limited to discretized or categorical representations of the feature space. Pseudocode for implementing each of the algorithms described above along with an implementation of the algorithms in R (<xref ref-type="bibr" rid="B14">R Core Team, 2020</xref>) can be found in the supplemental files and on github: <ext-link ext-link-type="uri" xlink:href="https://github.com/segrelab/BowSaw">https://github.com/segrelab/BowSaw</ext-link>.</p>
</statement>
</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>Results</title>
<sec id="s3-1">
<title>Application to Simulated Data</title>
<p>To test the capacity of BowSaw to recover multiple decision rules when the ground truth is known, we applied it to increasingly challenging simulated data sets. These data sets consist of binary vectors representing different samples. The phenotype associated with each sample is a function of the corresponding vector. The function consists of a set of multiple mutually distinct Boolean rules, such that if a rule is satisfied, it will cause the sample to have the phenotype with a certain probability (which we call here &#x201c;penetrance&#x201d; because of its resemblance to the genetics concept). The first dataset (IDEALIZED) we use is relatively simple and includes multiple equally prevalent rules. It is also generated under the assumption that there are no unmeasured confounders, i.e.,&#x20;that if a sample does have a phenotype, then it must be satisfying at least one of the above rules. We then apply BowSaw to a more challenging scenario (INTERMEDIATE) in which the phenotype-generating rules differ in their relative prevalence and the assumption of no unmeasured confounders is violated. Finally, in a set of data sets with complex co-varying parameters (COMPLEX), we systematically varied the underlying parameters of the simulation and examined the relationship between summary statistics of the RF performance and the ability of BowSaw to generate candidate rules containing the true phenotype-generating&#x20;rules.</p>
<p>For the IDEALIZED scenario, we simulated a data set of 100 independent and identically distributed random binary variables and 2,000 samples. We randomly defined five rules, each requiring four randomly selected variables to have specific values (e.g., all variables equal to 1) in order to assign a hypothetical phenotype with likelihood between 0.8 and 0.9. Here we present the results of this scenario with a specified random seed, but other seeds and parameters can be explored using the scripts provided in the supplemental files. Using these parameters, 497 samples were assigned the phenotype and BowSaw produced a set of 135 unique candidate rules ranging in complexity from six to fourteen variables. From these rules, we produced all sub-rules involving anywhere between two and five variables, which resulted in 50,034 unique&#x20;sub-rules. To reduce the number of sub-rules that the <italic>Curate</italic> algorithm would need to examine, we eliminated from consideration any rules that had a class precision below 80%. We selected an 80% threshold because in the cluster centered around 125 matching samples there is a small cloud of rules that are clearly segregating the phenotype more efficiently than the others (<xref ref-type="fig" rid="F2">Figure&#x20;2A</xref>). We selected the most general remaining sub-rule to initialize our list of candidate rules. This produced a final list consisting of five candidate rules that accounted for all of the samples with the phenotype and were each one of the true phenotype generating rules (<xref ref-type="fig" rid="F2">Figure&#x20;2A</xref> red points). These results demonstrate that in an ideal scenario with no measurement errors, BowSaw is indeed capable of recovering multiple true&#x20;rules.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>For both scenarios 2,000 samples were generated with 100 randomly generated binary features. <bold>(A)</bold> The generality of sub-rules (number of points that exactly satisfy the rule criteria) is plotted against their precision for the IDEALIZED scenario (Five rules that cause the phenotype and no noise). Each point represents a unique sub-rule. <italic>X</italic>-axis is the number of samples in the dataset that exactly match the pattern defined by the rule. <italic>Y</italic>-axis is the fraction of matching samples with the observed phenotype (i.e.,&#x20;precision of the rule). Each cluster of points corresponds to decreasing rule complexity from 5 variables per rule to 2 on the right-most cluster. These clusters appear because the values of each variable are produced by an identical binomial distribution. The dashed line is the precision threshold we chose in order to exclude low quality rules. Only candidate rules with precision above this threshold were considered for the curate algorithm. Red points are the causative sub-rules we defined. BowSaw correctly identified all five red points in this scenario. <bold>(B)</bold> Candidate sub-rules generated for the more challenging INTERMEDIATE scenario. We defined 5 causative rules of varying lengths in this scenario and allowed 2% of samples without a causative rule to be assigned the label. BowSaw completely recovered 4 of the causative rules (red points). The longest rule which involved 5 variables was not fully recovered by any candidate rule. Rules that were selected by the Curate algorithm because of their contribution to additional coverage but that did not contain a complete true rule are indicated by blue points.</p>
</caption>
<graphic xlink:href="fmolb-08-663532-g002.tif"/>
</fig>
<p>For the more challenging scenario (INTERMEDIATE), we generated the data set as before, except that this time we allowed the five underlying rules to vary in complexity from three to five variables. Varying the complexity of rules resulted in different prevalence among them, as rules that are more complicated are less likely to appear in the data. In this case, we had one rule of complexity five, two that required four variables, and two that used three variables. We also added background noise by randomly assigning the phenotype to 2% of samples that did not possess any of the rules. In total, 655 samples were assigned the phenotype. BowSaw produced 176 unique candidate rules involving between six and thirteen variables. From this list we generated 68,938&#x20;sub-rules and chose a precision threshold of 75% because there are two clusters at &#x223c;&#x7c;<bold>
<italic>T</italic>
</bold>&#x7c; &#x3d; 125 that begin to clearly separate in that range and the two outlier points at &#x223c;&#x7c;<bold>
<italic>T</italic>
</bold>&#x7c; &#x3d; 250 do not combine to account for all of the phenotype (<xref ref-type="fig" rid="F2">Figure&#x20;2B</xref>). Applying the <italic>Curate</italic> algorithm to the rules meeting this threshold selected 19 candidate sub-rules, the top four (when ranked by &#x7c;<bold>
<italic>T</italic>
</bold>&#x7c;) of which were true rules (red points). The remaining 15 rules were noise rules (blue points). The rule of five variables was not recovered. These results show that BowSaw is able to recover strongly associated patterns (and in this case, causal patterns) even in the presence of noise, but low prevalence rules can be masked by more highly prevalent&#x20;rules.</p>
<p>We used the same data generation method to investigate BowSaw&#x2019;s ability to produce candidate rules containing true rules when the underlying parameters change. We applied BowSaw to 20,000 simulated data sets where we randomly altered the number of features (50&#x2013;1,000), sample size (200 or 2,000 samples), complexity of the rules (2&#x2013;8 variables), number of rules (2&#x2013;8), the likelihood of each rule assigning the phenotype (0.0005&#x2013;1), and the background noise (1 &#xd7; 10<sup>&#x2212;5</sup> to 0.1). For each simulation we extracted a single candidate rule per sample with the assigned phenotype and ranked them without generating sub-rules.</p>
<p>To investigate how effectively BowSaw recovers true rules, for each simulation we calculated the fraction of true rules fully recovered, the probability of fully recovering at least one rule, the median rank of the first recovered rule when at least one is recovered, and the mean rule completeness of recovered rules. We investigated the relationship of these measurements to the ROC-AUC, PR-AUC, number of features, and sample size. These values were chosen because they are easily accessible to researchers during model building and could potentially be used to assess the likelihood of obtaining useful insights from applications of BowSaw.</p>
<p>ROC-AUC, PR-AUC, and sample size are positively correlated with full recovery of true rules, mean completeness of recovered rules, and median rank. Number of features was negatively correlated with these values. These correlations are summarized in <xref ref-type="table" rid="T1">Table&#x20;1</xref>. The probability of recovering at least one true rule gradually decreases with increasing feature space, gradually increases with increasing sample size, and forms a sigmoidal curve with both ROC-AUC and PR-AUC. Plots depicting the relationship of the four metrics with the fraction of fully recovered rules, probability of recovering at least one rule, median rank of rules, and mean rule completeness can be found in <xref ref-type="sec" rid="s9">Supplementary Figures S1&#x2013;S4</xref>.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Correlation of performance metrics and data dimensions with rule recovery.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center"/>
<th align="center">ROC-AUC</th>
<th align="center">PR-AUC</th>
<th align="center">N Features</th>
<th align="center">Sample size</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Fraction of rules recovered</td>
<td align="char" char=".">0.672</td>
<td align="char" char=".">0.585</td>
<td align="char" char=".">-0.151</td>
<td align="char" char=".">0.556</td>
</tr>
<tr>
<td align="left">Mean partial recovery all rules</td>
<td align="char" char=".">0.683</td>
<td align="char" char=".">0.581</td>
<td align="char" char=".">-0.251</td>
<td align="char" char=".">0.657</td>
</tr>
<tr>
<td align="left">Median rank of first recovered rule</td>
<td align="char" char=".">0.268</td>
<td align="char" char=".">0.195</td>
<td align="char" char=".">-0.073</td>
<td align="char" char=".">0.071</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3-2">
<title>Application to Human Microbiome Data</title>
<p>Irregular distributions of microbial taxa within the gut are often associated with serious illnesses such as Crohn&#x2019;s disease or ulcerative colitis (<xref ref-type="bibr" rid="B8">Carding et&#x20;al., 2015</xref>; <xref ref-type="bibr" rid="B27">Levy et&#x20;al., 2017</xref>). Human microbiome studies regularly use 16S rRNA amplicon sequencing methods and extensive reference databases to report on microbial taxa found in samples as operational taxonomic units (OTUs). RF classifiers are frequently built using counts of OTUs to accurately discriminate between disease and healthy patient samples (<xref ref-type="bibr" rid="B1">Ai et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B43">Vangay et&#x20;al., 2019</xref>). Despite their demonstrated effectiveness as good classifiers of Crohn&#x2019;s disease, studies that look to discover associations with disease status typically focus on individual OTUs, while specific microbial association rules found by RF are not discussed; as a result, it is uncertain how heterogeneous study cohorts are. To investigate potential rule heterogeneity in a human microbiome cohort we downloaded processed files from the Human Microbiome Project for inflammatory bowel disease (IBD) (<xref ref-type="bibr" rid="B35">Proctor et&#x20;al., 2019</xref>) which contain information on the taxonomic profiles of 982 OTUs in 178 patients&#x2013;86 of which have been diagnosed with Crohn&#x2019;s disease, 46 diagnosed with ulcerative colitis, and 46 diagnosed as non-IBD. We were specifically interested in finding rules that separate the Crohn&#x2019;s disease samples from ulcerative colitis and non-IBD, so we framed the problem as a binary classification task with Crohn&#x2019;s disease as the target phenotype.</p>
<p>Since the current implementation of BowSaw is limited to finding rules when the variables have categorical values, we first converted the OTU counts of each taxon to a simple presence/absence scheme. This resulted in nearly equivalent RF performance relative to training RF with the original continuous OTU inputs: ROC AUC of 0.856 (binary) vs 0.872 (continuous) and PR AUC of 0.853 (binary) vs 0.86 (continuous) (<xref ref-type="fig" rid="F3">Figures 3A,B</xref>). This is an important result because it allows us to think about associations just in terms of presence or absence of an OTU without sacrificing much in model performance. We next applied BowSaw to the Crohn&#x2019;s disease samples and generated 86 unique classification rules. These rules ranged in complexity from 4 OTUs to 16 OTUs (median 9 OTUs) and applied to as few as 1 sample up to 36 samples (mean 6.3, &#x2b;/-6.6, median: 4). The most broadly applicable rule involved 8&#x20;OTUs.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>
<bold>(A)</bold> Performance of the random forest classifier as measured by area under the receiver operating characteristic curve (ROC-AUC) is not strongly perturbed by simplifying OTU representation to a presence/absence scheme vs. the original continuous count. Dashed line indicates the performance of a perfectly random classifier. <bold>(B)</bold> The area under the curve of the precision recall curve is similarly not strongly affected by the new representation scheme. Dashed horizontal line is the random performance line. <bold>(C)</bold> Each point represents a unique candidate sub-rule. On the <italic>x</italic>-axis is the number of samples in the data matrix that are subject to that rule. The <italic>y</italic>-axis represents what fraction of matching samples were diagnosed as Crohn&#x2019;s disease. <bold>(D)</bold> The taxon identities of the OTUs that make up the most generally applicable of the sub-rules where all matching samples have the Crohn&#x2019;s disease label.</p>
</caption>
<graphic xlink:href="fmolb-08-663532-g003.tif"/>
</fig>
<p>We then applied the Sub-rule algorithm and visualized 56,902 resultant sub-rules ranging in complexity from 2 to 7 variables (<xref ref-type="fig" rid="F3">Figure&#x20;3C</xref>). There were 1,941&#x20;sub-rules with precision &#x3d; 1. We selected the most general of these rules (max<italic>&#x7c;</italic>
<bold>
<italic>T</italic>
</bold>
<italic>&#x7c;</italic>) to be the top candidate for the <italic>Curate</italic> algorithm and found that it considers the status of 5 OTUs and accounts for 38 of the 86 Crohn&#x2019;s disease samples (<xref ref-type="fig" rid="F3">Figure&#x20;3C</xref>); this rule was derived from the rule that considered the status of 8 OTUs and accounted for 36 Crohn&#x2019;s disease samples. We set a precision threshold of 90% and ended up with 10&#x20;sub-rules involving an average of 4 OTUs (min &#x3d; 2, max &#x3d; 7), each derived from a unique parent rule (average OTUs &#x3d; 9.6, min &#x3d; 6, max &#x3d; 16), that together account for all 86 Crohn&#x2019;s disease samples and an additional 11&#x20;non-Crohn&#x2019;s disease samples (4&#x20;non-IBD, 7 ulcerative colitis). The top five rules combine to account for 78 of 86 Crohn&#x2019;s disease samples and include 10&#x20;non-Crohn&#x2019;s disease samples (<xref ref-type="table" rid="T2">Table&#x20;2</xref>).</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Association rules identified by BowSaw that account for all Crohn&#x2019;s disease samples.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Rule</th>
<th align="center">CD samples</th>
<th align="center">Non CD samples</th>
<th align="center">New samples covered</th>
<th align="center">Taxonomy</th>
<th align="center">Presence</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="5" align="left">1</td>
<td rowspan="5" align="center">38</td>
<td rowspan="5" align="center">0</td>
<td rowspan="5" align="center">38</td>
<td align="left">
<italic>Bacteroides (genus)</italic>
</td>
<td align="center">y</td>
</tr>
<tr>
<td align="left">
<italic>Lachnoclostridium (genus)</italic>
</td>
<td align="center">y</td>
</tr>
<tr>
<td align="left">
<italic>Tyzzerella (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td align="left">
<italic>Lachnospira (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td align="left">
<italic>Lachnospiraceae UCG-001 (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td rowspan="6" align="left">2</td>
<td rowspan="6" align="center">41</td>
<td rowspan="6" align="center">4</td>
<td rowspan="6" align="center">20</td>
<td align="left">
<italic>Dialister (genus)</italic>
</td>
<td align="center">y</td>
</tr>
<tr>
<td align="left">
<italic>Christensenellaceae R7 group (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td align="left">
<italic>Collinsella (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td align="left">
<italic>Ruminococcaceae (family)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td align="left">
<italic>Finegoldia (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td align="left">
<italic>Ruminococcus (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td rowspan="3" align="left">3</td>
<td rowspan="3" align="center">9</td>
<td rowspan="3" align="center">1</td>
<td rowspan="3" align="center">9</td>
<td align="left">
<italic>Ruminococcus (genus)</italic>
</td>
<td align="center">y</td>
</tr>
<tr>
<td align="left">
<italic>Ruminococcaceae UCG-002 (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td align="left">
<italic>Lachnospiraceae (family)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td rowspan="4" align="left">4</td>
<td rowspan="4" align="center">24</td>
<td rowspan="4" align="center">2</td>
<td rowspan="4" align="center">6</td>
<td align="left">
<italic>Streptococcus (genus)</italic>
</td>
<td align="center">y</td>
</tr>
<tr>
<td align="left">
<italic>Tyzzerella (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td align="left">
<italic>Lachnospiraceae (family)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td align="left">
<italic>Hafnia obesumbacterium</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td rowspan="3" align="left">5</td>
<td rowspan="3" align="center">27</td>
<td rowspan="3" align="center">3</td>
<td rowspan="3" align="center">5</td>
<td align="left">
<italic>Lachnospiraceae UCG-008 (genus)</italic>
</td>
<td align="center">y</td>
</tr>
<tr>
<td align="left">
<italic>Ruminococcus 1 (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td align="left">
<italic>Eubacterium eligens group</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td rowspan="2" align="left">6</td>
<td rowspan="2" align="center">5</td>
<td rowspan="2" align="center">0</td>
<td rowspan="2" align="center">2</td>
<td align="left">
<italic>Ruminococcus 1 (genus)</italic>
</td>
<td align="center">y</td>
</tr>
<tr>
<td align="left">
<italic>Dorea (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td rowspan="3" align="left">7</td>
<td rowspan="3" align="center">7</td>
<td rowspan="3" align="center">0</td>
<td rowspan="3" align="center">2</td>
<td align="left">
<italic>Bacteroides (genus)</italic>
</td>
<td align="center">y</td>
</tr>
<tr>
<td align="left">
<italic>Dialister (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td align="left">
<italic>Eubacterium rectale group</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td rowspan="5" align="left">8</td>
<td rowspan="5" align="center">15</td>
<td rowspan="5" align="center">0</td>
<td rowspan="5" align="center">2</td>
<td align="left">
<italic>Lachnospiraceae NK4A136 group</italic>
</td>
<td align="center">y</td>
</tr>
<tr>
<td align="left">
<italic>Eubacterium eligens group</italic>
</td>
<td align="center">y</td>
</tr>
<tr>
<td align="left">
<italic>Tyzzerella (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td align="left">
<italic>Christensenellaceae R7 group (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td align="left">
<italic>Lachnospira (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td rowspan="4" align="left">9</td>
<td rowspan="4" align="center">3</td>
<td rowspan="4" align="center">0</td>
<td rowspan="4" align="center">1</td>
<td align="left">
<italic>Ruminococcus gnavus group</italic>
</td>
<td align="center">y</td>
</tr>
<tr>
<td align="left">
<italic>Veillonella (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td align="left">
<italic>Bacteroides (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td align="left">
<italic>Finegoldia (genus)</italic>
</td>
<td align="center">n</td>
</tr>
<tr>
<td rowspan="3" align="left">10</td>
<td rowspan="3" align="center">10</td>
<td rowspan="3" align="center">1</td>
<td rowspan="3" align="center">1</td>
<td align="left">
<italic>Parabacteroides (genus)</italic>
</td>
<td align="center">y</td>
</tr>
<tr>
<td align="left">
<italic>Eubacterium eligens group</italic>
</td>
<td align="center">y</td>
</tr>
<tr>
<td align="left">
<italic>Ruminococcaceae UCG-003 (genus)</italic>
</td>
<td align="center">n</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The top candidate rule is comprised of the presence of <italic>Bacteroides</italic> and <italic>Lachnoclostridium</italic> and the absence of three genera from the family <italic>Lachnospiraceae: Lachnospira</italic>, <italic>Tyzzerella,</italic> and <italic>Lachnospiraceae UCG-001</italic> (<xref ref-type="fig" rid="F3">Figure&#x20;3D</xref>). Detection of <italic>Bacteroides</italic> was nearly ubiquitous within the cohort; it was found in 170 of 178 total samples, but only 3 of the samples in which it was missing are diagnosed as Crohn&#x2019;s disease. For the remaining taxa we performed a t-test comparing the distribution of the taxa in Crohn&#x2019;s disease vs. ulcerative colitis and vs. healthy samples. <italic>Lachnoclostridium</italic> was frequently found in Crohn&#x2019;s disease (67/86) but not in ulcerative colitis (27/46, <italic>p</italic>&#x20;&#x3d; 0.02) and was detected at roughly the same rate in non-IBD samples (34/46, <italic>p</italic>&#x20;&#x3d; 0.616). Detection of <italic>Lachnospira</italic> was depleted in Crohn&#x2019;s disease samples (20/86) relative to ulcerative colitis (20/46, <italic>p</italic>&#x20;&#x3d; 0.022) and to non-IBD samples (31/46, <italic>p</italic>&#x20;&#x3d; 9.9 &#xd7; 10<sup>&#x2212;7</sup>). <italic>Tyzzerella</italic> was also detected at a lower rate in Crohn&#x2019;s disease (63/86) relative to ulcerative colitis (24/46, <italic>p</italic>&#x20;&#x3d; 0.019) and non-IBD (24/46, <italic>p</italic>&#x20;&#x3d; 0.019). <italic>Lachnospiraceae UCG-001</italic> was rarely detected in Crohn&#x2019;s disease (4/86) which is a lower rate than it was detected in ulcerative colitis (9/46, <italic>p</italic>&#x20;&#x3d; 0.022) and in non-IBD samples (19/46, <italic>p</italic>&#x20;&#x3d; 1.45 &#xd7; 10<sup>&#x2212;5</sup>).</p>
</sec>
<sec id="s3-3">
<title>Application to Mushroom Data</title>
<p>To further demonstrate the generalizability of our approach to non-binarized datasets we identified the mushroom data set from the <xref ref-type="bibr" rid="B42">UCI machine learning repository</xref> (UCI Machine Learning Repository). This data set contains 8,123 observations of poisonous (3,915) and edible (4,208) mushrooms. There are 22 categorical features ranging from 2 to 12 categories. The two classes are perfectly separable, and the documentation accompanying the matrix describes a set of rules that separate all edible mushrooms from poisonous samples. This rule set provides a good baseline to compare the complexity of the final rule sets obtained with BowSaw&#x20;to.</p>
<p>We applied our approach to the original matrix of 22 features with multiple categories and to a binarized transformation where we give each category its own column (117 features). In both cases we used BowSaw to extract classification rules that account either for all edible mushrooms or for all poisonous mushrooms. Since the samples are fully separable we again set <italic>F</italic>&#x20;&#x3d; 1. This setting resulted in candidate rules ranging in complexity from 2 to 9 variables. We examined all sub-rules from complexity 1 up to complexity 9 and retained only those that were entirely associated with the target class (precision &#x3d; 1) for curating a short list. In total we generated 4 different rule lists that fully separate edible from poisonous mushrooms and also differ from the data donor&#x2019;s contributed list. Each list is composed of 7 rules. The rule lists obtained from each run are described in <xref ref-type="sec" rid="s9">Supplementary Table S1</xref> along with the contributed&#x20;list.</p>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>Discussion</title>
<p>Linear models for classification such as logistic regression are often the &#x201c;go to&#x201d; approach due to their ease of implementation and interpretation of coefficients. However, many biological datasets contain non-linear interactions between features. In these situations it is not uncommon for random forests to significantly outperform logistic regression. Interpretation of random forest models for classification is not straightforward and may be complicated when there are multiple rules (combinations of variables and their specific values) associated with a phenotype of interest. Our newly developed BowSaw approach, best applied when random forest is the appropriate classifier, is an algorithmic method for identifying the rules that a trained random forest model uses to make classifications when the values are categorical in nature. By taking advantage of the structure of trees found within a random forest, BowSaw produces a set of multiple decision rules that combine to account for each sample with a given observed phenotype. When the variables are the presumed causal agents, these rules represent plausible mechanistic relationships.</p>
<p>Results on simulated data demonstrate that, when there are multiple rules associated with a single phenotype label, BowSaw is capable of faithfully identifying them. Application to data from the human microbiome project offers further evidence that BowSaw provides an efficient way of generating plausible hypotheses for high throughput metagenomics studies. In particular we identified a rule that utilizes a presence/absence pattern of five microbial taxa (present: <italic>Bacteroides, Lachnoclostridium</italic>; absent: <italic>Lachnospira, Lachnospiraceae UCG-001, Tyzzerella</italic>) that accounts for nearly half of all Crohn&#x2019;s disease samples in the cohort (38/86). This specific pattern of microbial colonization in the guts of Crohn&#x2019;s disease patients is unreported, but each taxon&#x2019;s respective enrichment or depletion status and association with disease status has been reported. If the cohort of patients in the human microbiome study is representative of all people afflicted by Crohn&#x2019;s disease, then this rule represents a significantly large subset of those suffering. Inquiries into the relationship of the taxa included in this rule with disease status may yield important insights into the mechanisms of the disease and potential therapeutic strategies for this sub-population. Of the five associated taxa, we suspect that the absence of <italic>Lachnospira, Lachnospiraceae UCG-001,</italic> and <italic>Tyzzerella</italic> is biologically meaningful. We have reason to believe so because it has been reported that the <italic>Lachnospiraceae</italic> family is generally suppressed in Crohn&#x2019;s disease (<xref ref-type="bibr" rid="B28">Loh and Blaut, 2012</xref>; <xref ref-type="bibr" rid="B20">Geirnaert et&#x20;al., 2017</xref>; <xref ref-type="bibr" rid="B31">Nagao-Kitamoto and Kamada, 2017</xref>). 
<italic>Lachnospira</italic> has been reported as depleted with respect to Crohn&#x2019;s disease several times (<xref ref-type="bibr" rid="B48">Wright et&#x20;al., 2017</xref>; <xref ref-type="bibr" rid="B46">Wang Y. et&#x20;al., 2018</xref>). The depletion of <italic>Tyzzerella</italic> has been associated with chronic intestinal inflammation and supplementation suggested as a probiotic for Crohn&#x2019;s disease (<xref ref-type="bibr" rid="B4">Berry et&#x20;al., 2018</xref>; <xref ref-type="bibr" rid="B12">Chen et&#x20;al., 2018</xref>). While the relationship of <italic>Lachnospiraceae UCG-001</italic> with Crohn&#x2019;s disease is still unclear, its depletion has been reported in mice displaying symptoms of anhedonia and it was significantly enriched in anhedonia-resilient mice (<xref ref-type="bibr" rid="B50">Yang et&#x20;al., 2019</xref>). Partly because IBD is frequently accompanied by depression, anhedonia has been suggested as an important symptom in the diagnosis of IBD (<xref ref-type="bibr" rid="B9">Carpinelli et&#x20;al., 2019</xref>). The associations of the individual OTUs defined by this rule are consistent with previously reported findings in the existing literature and describe a taxonomic profile that exclusively identifies a large sub-population of Crohn&#x2019;s disease samples within this cohort. The presence of <italic>Bacteroides</italic> does not appear to be particularly useful and in this context is probably preserved because it causes a perfect association, although high levels of some species are implicated in the pathology of Crohn&#x2019;s disease (<xref ref-type="bibr" rid="B36">Rabizadeh et&#x20;al., 2007</xref>). <italic>Lachnoclostridium</italic> is differentially distributed across the three classes. Notably it is less frequently detected in ulcerative colitis relative to Crohn&#x2019;s and non-IBD samples, which roughly resemble one another. 
Increased levels of this genus were detected in rats that showed relief of colitis symptoms after treatment with a proposed therapeutic agent (<xref ref-type="bibr" rid="B45">Wang K. et&#x20;al., 2018</xref>).</p>
<p>The current implementation of the algorithms is restricted to classification tasks with categorical predictor values. This is a challenge that can be addressed in future variants of this approach, in order to make it more generally applicable. Future work could also focus on extending these approaches to the interpretation of regression models or to consider the effect of counting stubs of higher-order interactions or co-occurring pairs on bookkeeping and rule extraction as opposed to strict parent-child relationships. We anticipate that the concept at the core of BowSaw and its different possible extensions could help uncover complex feature-phenotype maps for other types of biological datasets.</p>
</sec>
</body>
<back>
<sec id="s5">
<title>Data Availability Statement</title>
<p>Data and code presented in this study are available on GitHub (<ext-link ext-link-type="uri" xlink:href="https://github.com/segrelab/BowSaw">https://github.com/segrelab/BowSaw</ext-link>). Additional analyses are included in the article/<xref ref-type="sec" rid="s9">Supplementary Material</xref>.</p>
</sec>
<sec id="s6">
<title>Author Contributions</title>
<p>DD, DS, and MK planned the study. DD developed the algorithm, conducted the computational work and wrote a first version of the manuscript. DD, DS, and MK edited and approved the final version of the manuscript.</p>
</sec>
<sec id="s7">
<title>Funding</title>
<p>DS and DD acknowledge funding from the NIH (T32GM100842, 5R01DE024468, R01GM121950, UH2AG064704), the National Science Foundation (1457695), the Human Frontiers Science Program (RGP0020/2016), and the Boston University Interdisciplinary Biomedical Research Office. MK acknowledges funding from the NIH (R01GM131409) and the NSF DMS (1736392).</p>
</sec>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<ack>
<p>We are grateful to members of the Segr&#xe8; lab for helpful discussions and for feedback on the manuscript. DD is grateful to Nisha Rajagopal for her patience in conversations about random forests and her valuable insight.</p>
</ack>
<sec id="s9">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fmolb.2021.663532/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fmolb.2021.663532/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.PDF" id="SM1" mimetype="application/PDF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table1.XLSX" id="SM2" mimetype="application/XLSX" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ai</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>L. C.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Using Decision Tree Aggregation with Random Forest Model to Identify Gut Microbes Associated with Colorectal Cancer</article-title>. <source>Genes</source> <volume>10</volume>, <fpage>112</fpage>. <pub-id pub-id-type="doi">10.3390/genes10020112</pub-id> </citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Azmi</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Runger</surname>
<given-names>G. C.</given-names>
</name>
<name>
<surname>Berrado</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Interpretable Regularized Class Association Rules Algorithm for Classification in a Categorical Data Space</article-title>. <source>Inf. Sci.</source> <volume>483</volume>, <fpage>313</fpage>&#x2013;<lpage>331</lpage>. <pub-id pub-id-type="doi">10.1016/j.ins.2019.01.047</pub-id> </citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Basu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kumbier</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Brown</surname>
<given-names>J.&#x20;B.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Iterative Random Forests to Discover Predictive and Stable High-Order Interactions</article-title>. <source>Proc. Natl. Acad. Sci. USA</source> <volume>115</volume>, <fpage>1943</fpage>&#x2013;<lpage>1948</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1711236115</pub-id> </citation>
</ref>
<ref id="B4">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Berry</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Rahman</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kaplan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gordon</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2018</year>). <source>Probiotic and Prebiotic Compositions, and Methods of Use Thereof for Treatment and Prevention of Graft versus Host Disease</source> <publisher-name>US Patent Office</publisher-name>.</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Boulesteix</surname>
<given-names>A.-L.</given-names>
</name>
<name>
<surname>Janitza</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kruppa</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>K&#xf6;nig</surname>
<given-names>I. R.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Overview of Random forest Methodology and Practical Guidance with Emphasis on Computational Biology and Bioinformatics</article-title>. <source>Wires Data Mining Knowl Discov.</source> <volume>2</volume>, <fpage>493</fpage>&#x2013;<lpage>507</lpage>. <pub-id pub-id-type="doi">10.1002/widm.1072</pub-id> </citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Breiman</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2001</year>). <article-title>Random Forests</article-title>. <source>Mach. Learn.</source> <volume>45</volume>, <fpage>5</fpage>&#x2013;<lpage>32</lpage>. <pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id> </citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brodley</surname>
<given-names>C. E.</given-names>
</name>
<name>
<surname>Friedl</surname>
<given-names>M. A.</given-names>
</name>
</person-group> (<year>1997</year>). <article-title>Decision Tree Classification of Land Cover from Remotely Sensed Data</article-title>. <source>Remote Sens. Environ.</source> <volume>61</volume>, <fpage>399</fpage>&#x2013;<lpage>409</lpage>. </citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Carding</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Verbeke</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Vipond</surname>
<given-names>D. T.</given-names>
</name>
<name>
<surname>Corfe</surname>
<given-names>B. M.</given-names>
</name>
<name>
<surname>Owen</surname>
<given-names>L. J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Dysbiosis of the Gut Microbiota in Disease</article-title>. <source>Microb. Ecol. Health Dis.</source> <volume>26</volume>, <fpage>26191</fpage>. <pub-id pub-id-type="doi">10.3402/mehd.v26.26191</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Carpinelli</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Bucci</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Santonicola</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zingone</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Ciacci</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Iovino</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Anhedonia in Irritable Bowel Syndrome and in Inflammatory Bowel Diseases and its Relationship with Abdominal Pain</article-title>. <source>Neurogastroenterology Motil.</source> <volume>31</volume>, <fpage>e13531</fpage>. <pub-id pub-id-type="doi">10.1111/nmo.13531</pub-id> </citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Castelvecchi</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Can We Open the Black Box of AI?</article-title> <source>Nature</source> <volume>538</volume>, <fpage>20</fpage>&#x2013;<lpage>23</lpage>. <pub-id pub-id-type="doi">10.1038/538020a</pub-id> </citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cesario</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>D&#x2019;Oria</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Bove</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Privitera</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Bo&#x161;koski</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Pedicino</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Personalized Clinical Phenotyping through Systems Medicine and Artificial Intelligence</article-title>. <source>Jpm</source> <volume>11</volume>, <fpage>265</fpage>. <pub-id pub-id-type="doi">10.3390/jpm11040265</pub-id> </citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>Y.-J.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>S.-D.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.-T.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>H.-N.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Parasutterella, in Association with Irritable Bowel Syndrome and Intestinal Chronic Inflammation</article-title>. <source>J.&#x20;Gastroenterol. Hepatol.</source> <volume>33</volume>, <fpage>1844</fpage>&#x2013;<lpage>1852</lpage>. <pub-id pub-id-type="doi">10.1111/jgh.14281</pub-id> </citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Deng</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Interpreting Tree Ensembles with inTrees</article-title>. <source>Int. J.&#x20;Data Sci. Anal.</source> <volume>7</volume>, <fpage>277</fpage>&#x2013;<lpage>287</lpage>. <pub-id pub-id-type="doi">10.1007/s41060-018-0144-8</pub-id> </citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dicker</surname>
<given-names>A. J.</given-names>
</name>
<name>
<surname>Lonergan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Keir</surname>
<given-names>H. R.</given-names>
</name>
<name>
<surname>Smith</surname>
<given-names>A. H.</given-names>
</name>
<name>
<surname>Pollock</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Finch</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>The Sputum Microbiome and Clinical Outcomes in Patients with Bronchiectasis: a Prospective Observational Study</article-title>. <source>Lancet Respir. Med.</source> <comment>(2021) May 4; S2213-2600(20)30557-9</comment>. <pub-id pub-id-type="doi">10.1016/S2213-2600(20)30557-9</pub-id> Online ahead of print </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Duvallet</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gibbons</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Gurry</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Irizarry</surname>
<given-names>R. A.</given-names>
</name>
<name>
<surname>Alm</surname>
<given-names>E. J.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Meta-analysis of Gut Microbiome Studies Identifies Disease-specific and Shared Responses</article-title>. <source>Nat. Commun.</source> <volume>8</volume>, <fpage>1784</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-017-01973-8</pub-id> </citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Emily</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Mailund</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Hein</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Schauser</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Schierup</surname>
<given-names>M. H.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Using Biological Networks to Search for Interacting Loci in Genome-wide Association Studies</article-title>. <source>Eur. J.&#x20;Hum. Genet.</source> <volume>17</volume>, <fpage>1231</fpage>&#x2013;<lpage>1240</lpage>. <pub-id pub-id-type="doi">10.1038/ejhg.2009.15</pub-id> </citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Franzosa</surname>
<given-names>E. A.</given-names>
</name>
<name>
<surname>Sirota-Madi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Avila-Pacheco</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Fornelos</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Haiser</surname>
<given-names>H. J.</given-names>
</name>
<name>
<surname>Reinker</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Gut Microbiome Structure and Metabolic Activity in Inflammatory Bowel Disease</article-title>. <source>Nat. Microbiol.</source> <volume>4</volume>, <fpage>293</fpage>&#x2013;<lpage>305</lpage>. <pub-id pub-id-type="doi">10.1038/s41564-018-0306-4</pub-id> </citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Furqan</surname>
<given-names>M. S.</given-names>
</name>
<name>
<surname>Siyal</surname>
<given-names>M. Y.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Inference of Biological Networks Using Bi-directional Random Forest Granger Causality</article-title>. <source>Springerplus</source> <volume>5</volume>, <fpage>514</fpage>. <pub-id pub-id-type="doi">10.1186/s40064-016-2156-y</pub-id> </citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Geirnaert</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Calatayud</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Grootaert</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Laukens</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Devriese</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Smagghe</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Butyrate-producing Bacteria Supplemented <italic>In Vitro</italic> to Crohn&#x27;s Disease Patient Microbiota Increased Butyrate Production and Enhanced Intestinal Epithelial Barrier Integrity</article-title>. <source>Sci. Rep.</source> <volume>7</volume>, <fpage>1</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-017-11734-8</pub-id> </citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Goodswen</surname>
<given-names>S. J.</given-names>
</name>
<name>
<surname>Barratt</surname>
<given-names>J.&#x20;L. N.</given-names>
</name>
<name>
<surname>Kennedy</surname>
<given-names>P. J.</given-names>
</name>
<name>
<surname>Kaufer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Calarco</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Ellis</surname>
<given-names>J.&#x20;T.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Machine Learning and Applications in Microbiology</article-title>. <source>FEMS Microbiol. Rev.</source>, 2021 Mar 16; <fpage>fuab015</fpage>. <pub-id pub-id-type="doi">10.1093/femsre/fuab015</pub-id> </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hughes</surname>
<given-names>R. E.</given-names>
</name>
<name>
<surname>Elliott</surname>
<given-names>R. J.&#x20;R.</given-names>
</name>
<name>
<surname>Dawson</surname>
<given-names>J.&#x20;C.</given-names>
</name>
<name>
<surname>Carragher</surname>
<given-names>N. O.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>High-content Phenotypic and Pathway Profiling to advance Drug Discovery in Diseases of Unmet Need</article-title>. <source>Cel Chem. Biol.</source> <volume>28</volume>, <fpage>338</fpage>&#x2013;<lpage>355</lpage>. <pub-id pub-id-type="doi">10.1016/j.chembiol.2021.02.015</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Knudson</surname>
<given-names>A. G.</given-names>
</name>
</person-group> (<year>1971</year>). <article-title>Mutation and Cancer: Statistical Study of Retinoblastoma</article-title>. <source>Proc. Natl. Acad. Sci.</source> <volume>68</volume>, <fpage>820</fpage>&#x2013;<lpage>823</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.68.4.820</pub-id> </citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>LaPierre</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Ju</surname>
<given-names>C. J.-T.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>MetaPheno: A Critical Evaluation of Deep Learning and Machine Learning in Metagenome-Based Disease Prediction</article-title>. <source>Methods</source> <volume>166</volume>, <fpage>74</fpage>&#x2013;<lpage>82</lpage>. <pub-id pub-id-type="doi">10.1016/j.ymeth.2019.03.003</pub-id> </citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Le</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Quinn</surname>
<given-names>T. P.</given-names>
</name>
<name>
<surname>Tran</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Venkatesh</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Deep in the Bowel: Highly Interpretable Neural Encoder-Decoder Networks Predict Gut Metabolites from Gut Microbiome</article-title>. <source>BMC Genomics</source> <volume>21</volume>, <fpage>256</fpage>. <pub-id pub-id-type="doi">10.1101/686394</pub-id> </citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Leem</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Jeong</surname>
<given-names>H.-h.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wee</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Sohn</surname>
<given-names>K.-A.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Fast Detection of High-Order Epistatic Interactions in Genome-wide Association Studies Using Information Theoretic Measure</article-title>. <source>Comput. Biol. Chem.</source> <volume>50</volume>, <fpage>19</fpage>&#x2013;<lpage>28</lpage>. <pub-id pub-id-type="doi">10.1016/j.compbiolchem.2014.01.005</pub-id> </citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Levy</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kolodziejczyk</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Thaiss</surname>
<given-names>C. A.</given-names>
</name>
<name>
<surname>Elinav</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Dysbiosis and the Immune System</article-title>. <source>Nat. Rev. Immunol.</source> <volume>17</volume>, <fpage>219</fpage>&#x2013;<lpage>232</lpage>. <pub-id pub-id-type="doi">10.1038/nri.2017.7</pub-id> </citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Loh</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Blaut</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Role of Commensal Gut Bacteria in Inflammatory Bowel Diseases</article-title>. <source>Gut Microbes</source> <volume>3</volume>, <fpage>544</fpage>&#x2013;<lpage>555</lpage>. <pub-id pub-id-type="doi">10.4161/gmic.22156</pub-id> </citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Louppe</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2014</year>). <source>Understanding Random Forests</source>. <publisher-name>Cornell University Library</publisher-name>
<comment>Available at: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1407.7502">http://arxiv.org/abs/1407.7502</ext-link> (Accessed June, 2018)</comment>.</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Marcos-Zambrano</surname>
<given-names>L. J.</given-names>
</name>
<name>
<surname>Karaduzovic-Hadziabdic</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Loncar Turukalo</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Przymus</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Trajkovik</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Aasmets</surname>
<given-names>O.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Applications of Machine Learning in Human Microbiome Studies: A Review on Feature Selection, Biomarker Identification, Disease Prediction and Treatment</article-title>. <source>Front. Microbiol.</source> <volume>12</volume>, <fpage>634511</fpage>. <pub-id pub-id-type="doi">10.3389/fmicb.2021.634511</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nagao-Kitamoto</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Kamada</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Host-microbial Cross-Talk in Inflammatory Bowel Disease</article-title>. <source>Immune Netw.</source> <volume>17</volume>, <fpage>1</fpage>&#x2013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.4110/in.2017.17.1.1</pub-id> </citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nguyen</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>H. N.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Random forest Classifier Combined with Feature Selection for Breast Cancer Diagnosis and Prognostic</article-title>. <source>JBiSE</source> <volume>06</volume>, <fpage>551</fpage>&#x2013;<lpage>560</lpage>. <pub-id pub-id-type="doi">10.4236/jbise.2013.65070</pub-id> </citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nguyen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Long</surname>
<given-names>S. W.</given-names>
</name>
<name>
<surname>McDermott</surname>
<given-names>P. F.</given-names>
</name>
<name>
<surname>Olsen</surname>
<given-names>R. J.</given-names>
</name>
<name>
<surname>Olson</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Stevens</surname>
<given-names>R. L.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Using Machine Learning to Predict Antimicrobial MICs and Associated Genomic Features for Nontyphoidal <italic>Salmonella</italic></article-title>. <source>J.&#x20;Clin. Microbiol.</source> <volume>57</volume>, <fpage>e01260-18</fpage>. <pub-id pub-id-type="doi">10.1128/JCM.01260-18</pub-id> </citation>
</ref>
<ref id="B34">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Palczewska</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Palczewski</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Robinson</surname>
<given-names>R. M.</given-names>
</name>
<name>
<surname>Neagu</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2013</year>). &#x201c;<article-title>Interpreting Random forest Models Using a Feature Contribution Method</article-title>,&#x201d; in <conf-name>2013 IEEE 14th International Conference on Information Reuse &#x26; Integration (IRI)</conf-name>, <fpage>1</fpage>&#x2013;<lpage>30</lpage>. <pub-id pub-id-type="doi">10.1109/IRI.2013.6642461</pub-id> </citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Proctor</surname>
<given-names>L. M.</given-names>
</name>
<name>
<surname>Creasy</surname>
<given-names>H. H.</given-names>
</name>
<name>
<surname>Fettweis</surname>
<given-names>J.&#x20;M.</given-names>
</name>
<name>
<surname>Lloyd-Price</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Mahurkar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>The Integrative Human Microbiome Project</article-title>. <source>Nature</source> <volume>569</volume>, <fpage>641</fpage>&#x2013;<lpage>648</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-019-1238-8</pub-id> </citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rabizadeh</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Rhee</surname>
<given-names>K.-J.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Huso</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Gan</surname>
<given-names>C. M.</given-names>
</name>
<name>
<surname>Golub</surname>
<given-names>J.&#x20;E.</given-names>
</name>
<etal/>
</person-group> (<year>2007</year>). <article-title>Enterotoxigenic Bacteroides Fragilis: A Potential Instigator of Colitis</article-title>. <source>Inflamm. Bowel Dis.</source> <volume>13</volume>, <fpage>1475</fpage>&#x2013;<lpage>1483</lpage>. <pub-id pub-id-type="doi">10.1002/ibd.20265</pub-id> </citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rampelli</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Fabbrini</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Candela</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Biagi</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Brigidi</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Turroni</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>G2S: A New Deep Learning Tool for Predicting Stool Microbiome Structure from Oral Microbiome Data</article-title>. <source>Front. Genet.</source> <volume>12</volume>. <pub-id pub-id-type="doi">10.3389/fgene.2021.644516</pub-id> </citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<collab>R Core Team</collab> (<year>2020</year>). <source>R: A Language and Environment for Statistical Computing</source>. <publisher-loc>Vienna, Austria</publisher-loc>: <publisher-name>R Foundation for Statistical Computing</publisher-name> <comment>Available at <ext-link ext-link-type="uri" xlink:href="https://www.R-project.org/">https://www.R-project.org/</ext-link>
</comment>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Reading</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Bibliography</article-title>. <source>Diagnosis, Treat.</source> <volume>85</volume>, <fpage>297</fpage>&#x2013;<lpage>320</lpage>. <pub-id pub-id-type="doi">10.2307/j.ctt9m0vx3.14</pub-id> </citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Reel</surname>
<given-names>P. S.</given-names>
</name>
<name>
<surname>Reel</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Pearson</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Trucco</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Jefferson</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Using Machine Learning Approaches for Multi-Omics Data Analysis: A Review</article-title>. <source>Biotechnol. Adv.</source> <volume>49</volume>, <fpage>107739</fpage>. <pub-id pub-id-type="doi">10.1016/j.biotechadv.2021.107739</pub-id> </citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Strobl</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Boulesteix</surname>
<given-names>A.-L.</given-names>
</name>
<name>
<surname>Kneib</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Augustin</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Zeileis</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>Conditional Variable Importance for Random Forests</article-title>. <source>BMC Bioinformatics</source> <volume>9</volume>, <fpage>307</fpage>. <pub-id pub-id-type="doi">10.1186/1471-2105-9-307</pub-id> </citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Touw</surname>
<given-names>W. G.</given-names>
</name>
<name>
<surname>Bayjanov</surname>
<given-names>J.&#x20;R.</given-names>
</name>
<name>
<surname>Overmars</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Backus</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Boekhorst</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wels</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>Data Mining in the Life Sciences with Random Forest: a Walk in the Park or Lost in the Jungle?</article-title> <source>Brief. Bioinform.</source> <volume>14</volume>, <fpage>315</fpage>&#x2013;<lpage>326</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbs034</pub-id> </citation>
</ref>
<ref id="B42">
<citation citation-type="book">
<collab>UCI Machine Learning Repository</collab> (<year>2020</year>). <article-title>UCI Repository of Machine Learning Databases</article-title>. <publisher-loc>Irvine, CA</publisher-loc>: <publisher-name>University of California, Department of Information and Computer Science</publisher-name>. Available at: <ext-link ext-link-type="uri" xlink:href="http://www.ics.uci.edu/~mlearn/MLRepository.html">http://www.ics.uci.edu/~mlearn/MLRepository.html</ext-link> (<comment>Accessed May 14, 2021</comment>).</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vangay</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Hillmann</surname>
<given-names>B. M.</given-names>
</name>
<name>
<surname>Knights</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Microbiome Learning Repo (ML Repo): A Public Repository of Microbiome Regression and Classification Tasks</article-title>. <source>Gigascience</source> <volume>8</volume>. <pub-id pub-id-type="doi">10.1093/gigascience/giz042</pub-id> </citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Visscher</surname>
<given-names>P. M.</given-names>
</name>
<name>
<surname>Wray</surname>
<given-names>N. R.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Sklar</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>McCarthy</surname>
<given-names>M. I.</given-names>
</name>
<name>
<surname>Brown</surname>
<given-names>M. A.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>10&#x20;Years of GWAS Discovery: Biology, Function, and Translation</article-title>. <source>Am. J.&#x20;Hum. Genet.</source> <volume>101</volume>, <fpage>5</fpage>&#x2013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.1016/j.ajhg.2017.06.005</pub-id> </citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Protective Effects of Salvianolic Acid A against Dextran Sodium Sulfate-Induced Acute Colitis in Rats</article-title>. <source>Nutrients</source> <volume>10</volume> (<issue>6</issue>). <pub-id pub-id-type="doi">10.3390/nu10060791</pub-id> </citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ghozlane</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Xiao</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Characteristics of Faecal Microbiota in Paediatric Crohn&#x2019;s Disease and Their Dynamic Changes during Infliximab Therapy</article-title>. <source>J.&#x20;Crohn&#x2019;s Colitis</source> <volume>12</volume>, <fpage>337</fpage>&#x2013;<lpage>346</lpage>. <pub-id pub-id-type="doi">10.1093/ecco-jcc/jjx153</pub-id> </citation>
</ref>
<ref id="B47">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Welling</surname>
<given-names>S. H.</given-names>
</name>
<name>
<surname>Refsgaard</surname>
<given-names>H. H. F.</given-names>
</name>
<name>
<surname>Brockhoff</surname>
<given-names>P. B.</given-names>
</name>
<name>
<surname>Clemmensen</surname>
<given-names>L. H.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Forest Floor Visualizations of Random Forests</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1605.09196">http://arxiv.org/abs/1605.09196</ext-link> (Accessed June, 2018)</comment> </citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wright</surname>
<given-names>E. K.</given-names>
</name>
<name>
<surname>Kamm</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Wagner</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Teo</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Cruz</surname>
<given-names>P. D.</given-names>
</name>
<name>
<surname>Hamilton</surname>
<given-names>A. L.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Microbial Factors Associated with Postoperative Crohn&#x2019;s Disease Recurrence</article-title>. <source>J.&#x20;Crohn&#x2019;s Colitis</source> <volume>11</volume>, <fpage>191</fpage>&#x2013;<lpage>203</lpage>. <pub-id pub-id-type="doi">10.1093/ecco-jcc/jjw136</pub-id> </citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wright</surname>
<given-names>M. N.</given-names>
</name>
<name>
<surname>Ziegler</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>K&#xf6;nig</surname>
<given-names>I. R.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Do Little Interactions Get Lost in Dark Random Forests?</article-title> <source>BMC Bioinformatics</source> <volume>17</volume>, <fpage>145</fpage>. <pub-id pub-id-type="doi">10.1186/s12859-016-0995-8</pub-id> </citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhan</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Bi</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Key Role of Gut Microbiota in Anhedonia-like Phenotype in Rodents with Neuropathic Pain</article-title>. <source>Transl. Psychiatry</source> <volume>9</volume>, <fpage>1</fpage>. <pub-id pub-id-type="doi">10.1038/s41398-019-0379-8</pub-id> </citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Qin</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Jian</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Screening for Core Genes Related to Pathogenesis of Alzheimer&#x2019;s Disease</article-title>. <source>Front. Cell Dev. Biol.</source> <volume>9</volume>, <fpage>668738</fpage>. <pub-id pub-id-type="doi">10.3389/fcell.2021.668738</pub-id> </citation>
</ref>
</ref-list>
</back>
</article>