<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">739432</article-id>
<article-id pub-id-type="doi">10.3389/frai.2021.739432</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Artificial Intelligence</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Co-Inference of Data Mislabelings Reveals Improved Models in Genomics and Breast Cancer Diagnostics</article-title>
<alt-title alt-title-type="left-running-head">Gerber et&#x20;al.</alt-title>
<alt-title alt-title-type="right-running-head">Co-Inference of Mislabeled Data</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Gerber</surname>
<given-names>Susanne</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/638713/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Pospisil</surname>
<given-names>Lukas</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1541095/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Sys</surname>
<given-names>Stanislav</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1587100/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hewel</surname>
<given-names>Charlotte</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1265646/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Torkamani</surname>
<given-names>Ali</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/23811/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Horenko</surname>
<given-names>Illia</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Institute of Human Genetics, University Medical Center of the Johannes Gutenberg University Mainz</institution>, <addr-line>Mainz</addr-line>, <country>Germany</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Faculty of Informatics, Institute of Computational Science, Universit&#xe0; Della Svizzera Italiana</institution>, <addr-line>Lugano</addr-line>, <country>Switzerland</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Department of Integrative Structural and Computational Biology, The Scripps Research Institute</institution>, <addr-line>La Jolla</addr-line>, <addr-line>CA</addr-line>, <country>United&#x20;States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/218737/overview">Mohammad Akbari</ext-link>, Amirkabir University of Technology,&#x20;Iran</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/142900/overview">Aline Paes</ext-link>, Fluminense Federal University, Brazil</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/311671/overview">Yalu Wen</ext-link>, The University of Auckland, New&#x20;Zealand</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Susanne Gerber, <email>sugerber@uni-mainz.de</email>; Illia Horenko, <email>illia.horenko@usi.ch</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Machine Learning and Artificial Intelligence, a section of the journal Frontiers in Artificial Intelligence</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>05</day>
<month>01</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>4</volume>
<elocation-id>739432</elocation-id>
<history>
<date date-type="received">
<day>10</day>
<month>07</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>19</day>
<month>11</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2022 Gerber, Pospisil, Sys, Hewel, Torkamani and Horenko.</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>Gerber, Pospisil, Sys, Hewel, Torkamani and Horenko</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these&#x20;terms.</p>
</license>
</permissions>
<abstract>
<p>Mislabeling of cases as well as controls in case&#x2013;control studies is a frequent source of strong bias in prognostic and diagnostic tests and algorithms. Common data processing methods available to the researchers in the biomedical community do not allow for consistent and robust treatment of labeled data in situations where both the case and the control groups contain a non-negligible proportion of mislabeled data instances. This is an especially prominent issue in studies regarding late-onset conditions, where individuals who may convert to cases may populate the control group, and for screening studies that often have high false-positive/-negative rates. To address this problem, we propose a method for a simultaneous robust inference of Lasso reduced discriminative models and of latent group-specific mislabeling risks, not requiring any exactly labeled data. We apply it to a standard breast cancer imaging dataset and infer the mislabeling probabilities (being rates of false-negative and false-positive core-needle biopsies) together with a small set of simple diagnostic rules, outperforming the state-of-the-art BI-RADS diagnostics on these data. The inferred mislabeling rates for breast cancer biopsies agree with the published purely empirical studies. Applying the method to human genomic data from a healthy-ageing cohort reveals a previously unreported compact combination of single-nucleotide polymorphisms that are strongly associated with a healthy-ageing phenotype for Caucasians. It determines that 7.5% of Caucasians in the 1000 Genomes dataset (selected as a control group) carry a pattern characteristic of healthy ageing.</p>
</abstract>
<kwd-group>
<kwd>mislabeling</kwd>
<kwd>label noise</kwd>
<kwd>latent variable estimation</kwd>
<kwd>bioinformatics</kwd>
<kwd>bias</kwd>
<kwd>regression</kwd>
<kwd>machine learning</kwd>
</kwd-group>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p>The analysis of biomedical data often aims to identify a specific (small) set of characteristics or biomarkers that will allow for the most accurate and efficient discrimination between groups. For example, it is assumed that an unknown combination of characteristics offers the best possibility to distinguish a group of patients with a certain symptom or disease from all other groups. A wide variety of statistical and machine learning tools have been developed to select the optimal feature set through an analysis of labeled data (<xref ref-type="bibr" rid="B3">Bair and Tibshirani, 2004</xref>; <xref ref-type="bibr" rid="B23">Hastie et&#x20;al., 2009</xref>; <xref ref-type="bibr" rid="B28">Luo and Ren, 2014</xref>; <xref ref-type="bibr" rid="B42">Taylor and Tibshirani, 2015</xref>). In practice, however, mistakes in the assignment of labels and features during the data acquisition procedure may introduce serious bias when common methods are applied or even prohibit their use. Erroneous assignments occur in many studies for a variety of reasons: experimental errors, differences in platforms used to acquire data from different groups, differences in protocols used to post-process data, or intrinsic difficulties in distinguishing the case and control groups (<xref ref-type="bibr" rid="B26">Lam et&#x20;al., 2011</xref>; <xref ref-type="bibr" rid="B33">O&#x2019;Rawe et&#x20;al., 2013</xref>; <xref ref-type="bibr" rid="B38">Ross et&#x20;al., 2013</xref>; <xref ref-type="bibr" rid="B48">Wei&#xdf;bach et&#x20;al., 2021</xref>). This is particularly evident in the case of coronavirus data emerging from different sources where it has been shown that data variability is an important factor concerning the usability of such data for machine learning (<xref ref-type="bibr" rid="B39">S&#xe1;ez et&#x20;al., 2020</xref>). 
Furthermore, diagnoses for many diseases only reach a certain level of confidence; in some cases (e.g., Alzheimer&#x2019;s disease), a 100% diagnosis is only possible during a postmortem autopsy (<xref ref-type="bibr" rid="B21">Gomez-Nicola and Boche, 2015</xref>). This means that subjects with a negative diagnosis might nevertheless carry a latent form of a disease without showing symptoms yet. Assigning those individuals to the control group in a (bio)medical study can introduce a strong source of errors and can certainly have severe consequences for the patients, if this mislabeling cannot be identified, and correct medical treatment will be withheld.</p>
<p>Most methods that are commonly applied to deal with the problem of mislabeling are based on detection of anomalies and outliers and aim to avoid the problem by thoroughly cleaning the data during preprocessing, removing those points that appear to be mislabeled (<xref ref-type="bibr" rid="B7">Brodley and Friedl, 1999</xref>; <xref ref-type="bibr" rid="B4">Barandela and Gasca, 2000</xref>; <xref ref-type="bibr" rid="B43">Teng, 2001</xref>; <xref ref-type="bibr" rid="B25">Jiang and Zhou, 2004</xref>; <xref ref-type="bibr" rid="B14">Fr&#xe9;nay and Kab&#xe1;n, 2014</xref>; <xref ref-type="bibr" rid="B15">Fr&#xe9;nay and Verleysen, 2014</xref>). They detect outliers because these deviate significantly from a model that has already been imposed on a data subset&#x2014;which is, again, presumed to be correctly labeled (<xref ref-type="bibr" rid="B9">Chandola et&#x20;al., 2009</xref>). This cleaning, however, can be problematic: first, it may not be possible if too little is known to determine what might be mislabeled, and second, it can severely reduce the size of the available data, making statistical results less reliable (<xref ref-type="bibr" rid="B43">Teng, 2001</xref>; <xref ref-type="bibr" rid="B6">Bootkrajang and Kab&#xe1;n, 2012</xref>). 
Popular methods to analyze data based on supervised (<xref ref-type="bibr" rid="B45">Tibshirani, 1996</xref>; <xref ref-type="bibr" rid="B23">Hastie et&#x20;al., 2009</xref>; <xref ref-type="bibr" rid="B28">Luo and Ren, 2014</xref>) and semi-supervised machine learning methods (<xref ref-type="bibr" rid="B30">Moya and Hush, 1996</xref>; <xref ref-type="bibr" rid="B3">Bair and Tibshirani, 2004</xref>; <xref ref-type="bibr" rid="B36">Rodionova et&#x20;al., 2016</xref>) for labeled data analysis (e.g., generalized linear models and neural networks) are also implicitly based on an assumption that at least one of the groups to be discriminated has been labeled perfectly or at least assume a subset of perfectly labeled data (<xref ref-type="bibr" rid="B24">Hendrycks et&#x20;al., 2019</xref>). On the other hand, common <italic>unsupervised</italic> methods (such as hidden Markov models, Bayesian mixture models (<xref ref-type="bibr" rid="B17">Fr&#xfc;hwirth-Schnatter, 2006</xref>; <xref ref-type="bibr" rid="B46">Todorov et&#x20;al., 2020</xref>), and advanced clustering methods (<xref ref-type="bibr" rid="B1">Andreopoulos et&#x20;al., 2009</xref>; <xref ref-type="bibr" rid="B18">Gerber and Horenko, 2015</xref>; <xref ref-type="bibr" rid="B37">Rodrigues et&#x20;al., 2021</xref>)) ignore any prior assignments of data to labels and groups in a given dataset. Herewith, however, a lot of valuable information is&#x20;lost.</p>
<p>Also in a broader context of unsupervised data anomaly detection, two major method families like the one-class support vector machines (OCSVM, sometimes also referred to as one-class learning methods) (<xref ref-type="bibr" rid="B10">Choi, 2009</xref>; <xref ref-type="bibr" rid="B50">Zhu et&#x20;al., 2016</xref>) and isolation forests (IF, anomaly detection algorithms based on random forest ideas) (<xref ref-type="bibr" rid="B27">Liu et&#x20;al., 2008</xref>; <xref ref-type="bibr" rid="B22">Hariri et&#x20;al., 2021</xref>) rely either on the explicit knowledge of some subsets of correctly identified data anomalies that can be used for training or on knowing the exact proportion of anomalous data in the given dataset. In the latter case, providing the exact proportion of anomalous data in OCSVM and IF allows identifying the exact value of the anomaly threshold that can be used to separate normal data from anomalous data. The primary aim of this study is to provide a robust computational mislabeling inference procedure for generalized linear models (e.g., logistic regressions)&#x2014;an algorithmic procedure that relies neither on the explicit knowledge of the particular mislabeled data instances nor on the knowledge of the exact proportion of the mislabeled data in the given dataset. Instead, the introduced procedure relies on the knowledge of the upper bound for the mislabeled data proportion and deploys the tools from information theory (like Akaike information criterion) to infer the optimal logistic model and the optimal class-specific mislabeling probability matrix. We thus address the currently existing methodological gap and propose a scalable method that can realistically be applied in the analysis of biomedical data. The method permits reduced sets of discriminative features to be inferred while co-estimating the group-specific data mislabeling risks. 
We apply the method to two examples: breast cancer diagnoses based on radiographs (example 1) and an analysis of genomic features from a Wellderly cohort (example 2) (<xref ref-type="bibr" rid="B13">Erikson et&#x20;al., 2016</xref>)&#x2014;a cohort of people who live to be 80&#xa0;years or more without having experienced a serious or chronic disease. Analysis of a synthetic dataset (i.e.,&#x20;a dataset created by a generalized linear model with known parameters and known group-specific mislabeling, which mimics the breast cancer imaging data from example 1) is shown in Section 3 of the <xref ref-type="sec" rid="s10">Supplementary Material</xref>.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>Materials and Methods</title>
<p>Here, we give a brief description of the methodology. A detailed mathematical derivation and an investigation of its mathematical properties (a case of mislabeled Bernoulli trials, proofs of conditions for existence and uniqueness of solutions, monotonicity, and convergence of the numerical method) can be found in Lemmas 1&#x2013;5 in the <xref ref-type="sec" rid="s10">Supplementary Material</xref>. We consider a problem of analyzing the labeled datasets <inline-formula id="inf1">
<mml:math id="m1">
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>obs</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula> that are grouped into <italic>N</italic><sub><italic>g</italic></sub> cohorts/groups, with <italic>T</italic><sub><italic>g</italic></sub> being the number of instances, for example, the number of patients in the cohort/group&#x20;<italic>g</italic>.</p>
<p>For every data instance <inline-formula id="inf2">
<mml:math id="m2">
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula> (for every patient number <italic>t</italic> in the group <italic>g</italic>), we would like to identify a relation between a vector of features <italic>X</italic><sub><italic>t</italic>,<italic>g</italic></sub> (an <italic>n</italic>-dimensional vector containing, e.g., the genotype, the age, and some other patient-specific information) and a &#x201c;true&#x201d;&#x2014;but directly unobserved&#x2014;categorical label <italic>Y</italic><sub><italic>t</italic>,<italic>g</italic></sub>. This &#x201c;true&#x201d; label takes values in the finite set of <italic>m</italic> categories <italic>y</italic>&#x20;&#x3d; {<italic>y</italic><sub>1</sub>, <italic>y</italic><sub>2</sub>, <italic>&#x2026;</italic>, <italic>y</italic><sub><italic>m</italic></sub>} and represents, for example, a certain phenotype. A typical setting would be to compare features from a cohort of ill people to a control group, resulting in <italic>m</italic>&#x20;&#x3d; 2 representing labels like <italic>y</italic><sub>1</sub> &#x3d; &#x201c;sick&#x201d; and <italic>y</italic><sub>2</sub> &#x3d; &#x201c;healthy.&#x201d; We consider the &#x201c;true&#x201d; labels <italic>Y</italic><sub><italic>t</italic>,<italic>g</italic></sub> to be unobserved since they are not directly available. Only the observed labelings <inline-formula id="inf3">
<mml:math id="m3">
<mml:msubsup>
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>obs</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula> are available and can be mislabeled in every instance <inline-formula id="inf4">
<mml:math id="m4">
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula> with some, yet unknown, cohort-specific mislabeling probability <inline-formula id="inf5">
<mml:math id="m5">
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>obs</mml:mtext>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>. We assume that a parametric (i.e.,&#x20;dependent on a vector of parameters <italic>&#x3b1;</italic>) discriminative model establishes a conditional dependence between the particular &#x201c;true&#x201d; unobserved labeling and a particular observed feature vector <italic>X</italic><sub><italic>t</italic>,<italic>g</italic></sub>. The parametric function relating features and labels is denoted as <inline-formula id="inf6">
<mml:math id="m6">
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>. These parametric functions can, for example, be a generalized linear model (GLM, e.g., the standard logit and probit models for <italic>m</italic>&#x20;&#x3d; 2) (<xref ref-type="bibr" rid="B29">McFadden, 1974</xref>; <xref ref-type="bibr" rid="B16">Friedman et&#x20;al., 2010</xref>) or a neural network (<xref ref-type="bibr" rid="B23">Hastie et&#x20;al., 2009</xref>).</p>
<p>Let <inline-formula id="inf7">
<mml:math id="m7">
<mml:msubsup>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>obs</mml:mtext>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c7;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>obs</mml:mtext>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>, where <italic>&#x3c7;</italic> is the indicator function, taking value 1 if its argument is true and 0 otherwise. Then it can be shown that the unknown optimal model parameters <italic>&#x3b1;</italic>&#x2a; can be inferred together with the optimal mislabeling risk matrix <italic>r</italic>&#x2a; by solving the following maximization problem (see Section 1 of the <xref ref-type="sec" rid="s10">Supplementary Material</xref> for a step-by-step derivation):<disp-formula id="e1">
<mml:math id="m8">
<mml:mtable class="eqnarray">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2217;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2217;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:munder>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix" mathvariant="normal">argmax</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mi mathvariant="bold">L</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mi mathvariant="bold">L</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:munderover accentunder="false" accent="false">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munderover>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mi>log</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:munderover accentunder="false" accent="false">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:msubsup>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>obs</mml:mtext>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
<label>(1)</label>
</disp-formula>which is subject to the following constraints:<disp-formula id="e2">
<mml:math id="m9">
<mml:mtable class="eqnarray">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:munderover accentunder="false" accent="false">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mo>&#x2200;</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
<label>(2)</label>
</disp-formula>
<disp-formula id="e3">
<mml:math id="m10">
<mml:mtable class="eqnarray">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mn>0</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x2264;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2264;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mo>&#x2200;</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
<label>(3)</label>
</disp-formula>
<disp-formula id="e4">
<mml:math id="m11">
<mml:mtable class="eqnarray">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:munderover accentunder="false" accent="false">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
<label>(4)</label>
</disp-formula>where <inline-formula id="inf8">
<mml:math id="m12">
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula> are user-defined intervals for mislabeling risks (based, e.g., on some prior knowledge) and <italic>C</italic> is an a priori unknown constant that implicitly confines the number of non-zero components of the parameter vector <italic>&#x3b1;</italic>. The user-defined choice of matrices <italic>r</italic><sup>&#x2b;/&#x2212;</sup> is not entirely arbitrary: it should be made based on prior knowledge in such a way that the r-constraints (3) do not lead to an empty&#x20;set.</p>
<p>It is straightforward to verify that the particular case of problem (1&#x2013;4) with fixed <italic>r</italic> being an identity matrix is equivalent to the widely used Lasso (or <italic>l</italic><sub>1</sub>-) regularization methods introduced by <xref ref-type="bibr" rid="B45">Tibshirani (1996)</xref>, <xref ref-type="bibr" rid="B3">Bair and Tibshirani (2004)</xref>, <xref ref-type="bibr" rid="B16">Friedman et&#x20;al. (2010)</xref>, <xref ref-type="bibr" rid="B41">Simon et&#x20;al. (2011)</xref>, and <xref ref-type="bibr" rid="B42">Taylor and Tibshirani (2015)</xref>. In the context of these Lasso-regularized methods, by decreasing the constant <italic>C</italic> in (4), one reduces the number of non-zero elements in <italic>&#x3b1;</italic>, thereby reducing the number of non-zero parameters and avoiding overfitting. This becomes especially important in biomedical applications, where the number <italic>n</italic> of model parameters is large compared with the size of the available statistics&#x2014;a typical scenario when the danger of overfitting becomes imminent.</p>
<p>As proven in Section 2 of the <xref ref-type="sec" rid="s10">Supplementary Material</xref>, given the imperfectly labeled datasets <inline-formula id="inf9">
<mml:math id="m13">
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>obs</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>, the solution of this optimization problem (1&#x2013;4) results in the parameter vector <italic>&#x3b1;</italic>&#x2a; for any fixed combination of <italic>r</italic> and <italic>C</italic>. This solution will be optimal in the sense of the log-likelihood, that is, choosing this particular <italic>&#x3b1;</italic>&#x2a; will result in a maximal probability for observing the given data <inline-formula id="inf10">
<mml:math id="m14">
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>obs</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>, confined to a particular choice of <italic>r</italic> and <italic>C</italic>. Selection of the optimal parametric model class <italic>&#x3d5;</italic>, as well as selection of parameters <italic>r</italic> and <italic>C</italic>, can be approached with the standard model selection procedures of machine learning, for example, by means of the cross-validation, with the help of the information criteria or through selective inference (<xref ref-type="bibr" rid="B8">Burnham and Anderson, 2002</xref>; <xref ref-type="bibr" rid="B49">Zhang et&#x20;al., 2010</xref>; <xref ref-type="bibr" rid="B42">Taylor and Tibshirani, 2015</xref>) (see, e.g., <xref ref-type="fig" rid="F1">Figure&#x20;1A</xref>). Uncertainty of the obtained mislabeling risks <italic>r</italic>&#x2a; and model parameters <italic>&#x3b1;</italic>&#x2a; can be obtained using the common non-parametric bootstrap sampling procedure (<xref ref-type="bibr" rid="B11">Efron and Tibshirani, 1993</xref>) (see, e.g., <xref ref-type="fig" rid="F1">Figures 1C,E</xref>,&#x20;<xref ref-type="fig" rid="F2">2A</xref>).</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Application of (1&#x2013;4) to the analysis of the standard BI-RADS dataset from <ext-link ext-link-type="uri" xlink:href="http://archive.ics.uci.edu/ml/datasets/Mammographic+Mass">http://archive.ics.uci.edu/ml/datasets/Mammographic&#x2b;Mass</ext-link>: <bold>(A)</bold> Model selection by cross-validation (bootstrap-averaged values of the functional <bold>L</bold> from (1) with optimal parameters from the training sets being evaluated on the validation datasets); <bold>(B)</bold> optimal parameter vector <italic>&#x3b1;</italic>&#x2a;; <bold>(C)</bold> probability of a malignant diagnosis as a function of BI-RADS features for the two groups of patients; <bold>(D)</bold> average impact of single BI-RADS features (sensitivity of the risk to the 7 binary features of importance).</p>
</caption>
<graphic xlink:href="frai-04-739432-g001.tif"/>
</fig>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>
<bold>(A)</bold> Significant correlation between the true synthetically induced mislabeling rate of the mammography data and the error rate predicted by the co-inference method. <bold>(B)</bold> Performance of different model types on the mammography dataset with various mislabeling rates. All models, except the original one, were trained using the respective mislabeled dataset. The average prediction accuracy was calculated based on the original mammography dataset. Co-inference outperforms a linear SVC and performs nearly on par with a state-of-the-art SVC using an RBF kernel.</p>
</caption>
<graphic xlink:href="frai-04-739432-g002.tif"/>
</fig>
<p>If <italic>m</italic> and <italic>N</italic>
<sub>
<italic>g</italic>
</sub> are not too large, one can use the following numerical scheme: 1) first, the rectangular domain spanning a set of admissible values for mislabeling matrix elements in (3) and admissible values of <italic>C</italic> in (4) is sampled (e.g., by means of a uniform equidistant grid), and 2) for every particular grid point (<italic>r</italic>
<sub>
<italic>s</italic>
</sub>, <italic>C</italic>
<sub>
<italic>s</italic>
</sub>), one deploys some standard gradient-based optimization method to solve (1&#x2013;4). In the second step (2), one can use an interior-point method or sequential quadratic programming (<xref ref-type="bibr" rid="B31">Nocedal and Wright, 2006</xref>), performing constrained concave optimization of (2) subject to a constraint (4) only, with fixed values of <italic>r</italic>
<sub>
<italic>s</italic>
</sub> and <italic>C</italic>
<sub>
<italic>s</italic>
</sub>. For example, when <italic>m</italic>&#x20;&#x3d; 2 and <italic>N</italic>
<sub>
<italic>g</italic>
</sub> &#x3d; 1 (the case emerging in examples 1 and 2), there will be only two independent parameters in <italic>r</italic>. Together with the scalar dimension for the regularization constant <italic>C</italic>, this will result in a 3D grid (<italic>r</italic>
<sub>
<italic>s</italic>
</sub>,&#x20;<italic>C</italic>
<sub>
<italic>s</italic>
</sub>).</p>
<p>With respect to the model function <italic>&#x3d5;</italic>, optimization of the concave problem (1&#x2013;4) would only require the evaluation and communication of the function values and the gradients of <italic>&#x3d5;</italic> with respect to <italic>&#x3b1;</italic>. This means that a solution of the overall problem (1&#x2013;4) can be easily integrated into the common software packages for labeled data analysis. Moreover, solutions of problem (1&#x2013;4) for different particular choices of (<italic>r</italic>
<sub>
<italic>s</italic>
</sub>, <italic>C</italic>
<sub>
<italic>s</italic>
</sub>) can be found completely independent of each other, herewith allowing for a highly scalable (&#x201c;embarrassingly parallel&#x201d;) implementation on high-performance computing facilities.</p>
</sec>
<sec sec-type="results" id="s3">
<title>Results</title>
<sec id="s3-1">
<title>Example 1: Breast cancer diagnostics based on the standard BI-RADS X-ray imaging data.</title>
<p>First, we consider an analysis and feature selection problem for the standard breast cancer BI-RADS dataset based on X-ray imaging. This dataset is available for open access at the UCI Machine Learning Repository <ext-link ext-link-type="uri" xlink:href="http://archive.ics.uci.edu/ml/datasets/Mammographic+Mass">http://archive.ics.uci.edu/ml/datasets/Mammographic&#x2b;Mass</ext-link> and contains information from biopsies of 403 healthy (benign) subjects and 427 malignant breast cancer patients. For each patient, six attributes are given: 1) the BI-RADS assessment (with values from 1 to 5), 2) the patient&#x2019;s age, 3) the mass shape (with 4 subcategories), 4) the mass margin (with 5 subcategories), 5) the mass density (with 4 subcategories), and 6) the final binominal label for severity: benign &#x3d; 0 or malignant &#x3d; 1. This results in a set of 26 binary features for the shape, margin, and density and one real-valued age feature, that is, in total, we have 27 features to consider. This standard categorical dataset is widely used to assess the quality of various computer-aided diagnostic tools (CADs), with the general aim of identifying such a CAD that would use non-invasive information of age and mammographic image features for the precise diagnostics of breast cancer (<xref ref-type="bibr" rid="B12">Elder et&#x20;al., 2007</xref>; <xref ref-type="bibr" rid="B2">Ayer et&#x20;al., 2010</xref>; <xref ref-type="bibr" rid="B49">Zhang et&#x20;al., 2010</xref>).</p>
<p>The standard measure for CAD performance adopted in the medical literature is called area under curve (AUC) (<xref ref-type="bibr" rid="B35">Qin and Hotilovac, 2008</xref>). The closer the AUC value is to 1.0, the better is the performance of the respective CAD and the lower is the probability of a false-positive or false-negative diagnosis. To compute the AUC values of different CADs&#x2014;together with 95% confidence intervals of AUC, we use the methodology described in <xref ref-type="bibr" rid="B35">Qin and Hotilovac (2008)</xref>. The implementation of this method is available for open access at <ext-link ext-link-type="uri" xlink:href="https://github.com/brian-lau/MatlabAUC">https://github.com/brian-lau/MatlabAUC</ext-link>.</p>
<p>The given label &#x201c;benign&#x201d; or &#x201c;malignant&#x201d; (<italic>m</italic>&#x20;&#x3d; 2) in the dataset is obtained based on an invasive core-needle biopsy analysis of the tissue. Whereas the rate of the false-positive core-needle breast biopsy outcomes is practically zero, the rate of the false-negative biopsy findings can be quite significant. According to the literature, it can vary in a wide range between 0.005 and 0.19 (<xref ref-type="bibr" rid="B40">Shah et&#x20;al., 2003</xref>; <xref ref-type="bibr" rid="B47">Verkooijen et&#x20;al., 2004</xref>; <xref ref-type="bibr" rid="B5">Boba et&#x20;al., 2011</xref>). CADs based on artificial neuronal networks (ANNs) have been reported to have the highest AUC for these data (<xref ref-type="bibr" rid="B12">Elder et&#x20;al., 2007</xref>; <xref ref-type="bibr" rid="B2">Ayer et&#x20;al., 2010</xref>; <xref ref-type="bibr" rid="B19">Gerber and Horenko, 2017</xref>). Training such ANNs results in an AUC of 0.85 with a 95% confidence interval of <inline-formula id="inf11">
<mml:math id="m15">
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mn>0.82</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0.88</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>, whereas using the standard BI-RADS diagnostics (on the same data and computed deploying the methodology from <xref ref-type="bibr" rid="B35">Qin and Hotilovac (2008)</xref>), one obtains an AUC of 0.82 with a 95% confidence interval of <inline-formula id="inf12">
<mml:math id="m16">
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mn>0.78</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0.84</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>.</p>
<p>However, published CAD methodologies that used this standard BI-RADS dataset for training do not consider a potential risk of mislabeling due to wrong biopsy outcomes. As mentioned before, this risk may reach 0.19 (<xref ref-type="bibr" rid="B40">Shah et&#x20;al., 2003</xref>; <xref ref-type="bibr" rid="B47">Verkooijen et&#x20;al., 2004</xref>; <xref ref-type="bibr" rid="B5">Boba et&#x20;al., 2011</xref>). If there are mislabelings in the data, then the AUC of the validation set is biased and misleading.</p>
<p>To apply the suggested methodology presented here, we set the broad a priori bounds for mislabeling risks (with <italic>r</italic>
<sup>&#x2212;</sup> &#x3d; 0, <italic>r</italic>
<sup>&#x2b;</sup> &#x3d; 0.5) and tested various model classes <italic>&#x3d5;</italic>, including the naive Bayesian classifier models, linear models, probit models, and logit models. The optimal results are achieved for <italic>&#x3d5;</italic> being a logit model function. Analysis results are summarized in <xref ref-type="fig" rid="F1">Figure&#x20;1</xref>.</p>
<p>
<xref ref-type="fig" rid="F1">Figure&#x20;1A</xref> shows the negative values of <italic>L</italic> from optimal (1) with <italic>&#x3b1;</italic>&#x2217; obtained for the training sets and evaluated on the validation sets, averaged over 500 random cross-validations, when the full dataset is randomly divided into the training set (with 75% of the data) and the validation set (25% of the data). It appears that the cross-validation optimal model is characterized by the false-negative biopsy rate <italic>r</italic>&#x2217; close to zero and the optimal regularization parameter value C around eight. <xref ref-type="fig" rid="F1">Figure&#x20;1B</xref> visualizes the optimal parameter vector <italic>&#x3b1;</italic>&#x2217; with the features of interest together with their impact.</p>
<p>Furthermore, to estimate the confidence intervals for <italic>&#x3b1;</italic>&#x2217; and <italic>r</italic>&#x2217;, we deploy the common non-parametric bootstrap sampling procedure (<xref ref-type="bibr" rid="B11">Efron and Tibshirani, 1993</xref>) (see <xref ref-type="fig" rid="F1">Figure&#x20;1C</xref>). The resulting posterior distribution of inferred false-negative biopsy outcomes is shown in <xref ref-type="fig" rid="F1">Figure&#x20;1C</xref>, together with their expected posterior estimate&#x2014;being 0.6<italic>%</italic>. As can be seen from <xref ref-type="fig" rid="F1">Figures 1C,D</xref>, the obtained diagnostic model is robust and contains the binary yes/no characteristics from just three features that statistically significantly influence malignancy risk (from age, margin, and shape of the inclusion). In contrast, commonly applied diagnostic strategies include 28 characteristics from four features, including an additional intrusion density feature (with 4 categories). Age appears to be the most significant feature, almost doubling the average risk for individuals older than 80&#x20;years as compared to twenty-year-olds. Besides age, there are two binary factors that increase malignancy risk (&#x201c;margin is not circumscribed&#x201d; and &#x201c;shape is irregular&#x201d;) and four risk-decreasing binary factors (&#x201c;margin is circumscribed,&#x201d; &#x201c;shape is not irregular,&#x201d; &#x201c;margin is not spiculated,&#x201d; and &#x201c;shape is not lobular&#x201d;).</p>
<p>In a perfect situation, one would need to have a validation set with 100% correct labelings to make the comparison of different diagnostic strategies with respect to their AUC. Since such datasets are not available in the published medical literature, we follow a different way and investigate and compare the systematic bias that is imposed by the latent mislabeling risks on the AUC values of different diagnostic procedures. In order to better address the aforementioned issue of AUC with mislabelings to the bias induced by different mislabeling rates, we computed and compared the bootstrap confidence intervals of the systematic bias that is introduced by the latent mislabeling risks on the AUC values of different diagnostic procedures (see <xref ref-type="sec" rid="s10">Supplementary Figure S2</xref> from the Supplement).</p>
<p>The application of the introduced strategy described in the <italic>Methods</italic> section to these BI-RADS data revealed that it is almost perfectly labeled (inferred expected malignant mislabeling rate is almost 0% and the benign mislabeling rate is 0.6%). This result is in agreement with the range of false-negative biopsy outcomes from the clinical reports (<xref ref-type="bibr" rid="B40">Shah et&#x20;al., 2003</xref>; <xref ref-type="bibr" rid="B47">Verkooijen et&#x20;al., 2004</xref>; <xref ref-type="bibr" rid="B5">Boba et&#x20;al., 2011</xref>). The AUC values obtained with the mislabeling model (1&#x2013;4) introduced in the article are statistically significantly higher than the AUC values of the BI-RADS and of the ANN strategies (without considering potential mislabeling) published in the literature. These results indicate that the obtained classification model with mislabeling would have AUC values that are statistically significantly higher than the common BI-RADS diagnostics.</p>
<p>Furthermore, the logit model performs robustly, despite the number of mislabelings, and offers a good estimate with respect to the mislabeled data, as indicated by the calculated Pearson correlation between the estimated and true mislabeling rates (R &#x3d; 0.096, <italic>p</italic>&#x20;&#x3d; 0.002&#x2009;9) (see <xref ref-type="fig" rid="F2">Figure&#x20;2A</xref>). Additionally, we evaluated our logit model against two types of support vector classifiers (SVCs) with linear and radial basis function (RBF) kernels to compare model performance apart from the aforementioned AUC metric. For this, we trained all models with 200 bootstrap steps on synthetically mislabeled datasets based on the breast cancer data with known mislabeling rates and calculated their average model accuracy with respect to the original breast cancer data. The logit model outperforms the linear SVC and performs nearly on par with a state-of-the-art SVM with RBF kernel (see <xref ref-type="fig" rid="F2">Figure&#x20;2B</xref> and <xref ref-type="sec" rid="s10">Supplementary Material Chapter S3</xref>, <xref ref-type="sec" rid="s10">Supplementary Table S1</xref>). Compared to the SVM with RBF kernel, which performs about equally well, our method is the only one that can predict the expected mislabeling rate. This prediction&#x2014;although generally somewhat less than the true value&#x2014;shows a strong correlation between the true synthetically induced mislabeling rate of the mammography data and the error rate predicted by the co-inference method (see <xref ref-type="fig" rid="F2">Figure&#x20;2A</xref>).</p>
</sec>
<sec id="s3-2">
<title>Example 2: Wellderly Data Analysis and Extraction of Genomic Patterns of Healthy Aging</title>
<p>In the second example, our method is applied to human sequencing data in the context of genome-wide association studies (GWASs), where a population sharing a trait is compared to a &#x201c;normal&#x201d; control population. The sample consists of the Wellderly cohort, a cohort of healthy elderly (older than 80&#x20;years) individuals (<xref ref-type="bibr" rid="B13">Erikson et&#x20;al., 2016</xref>), and the Caucasian population of phase 3 from the &#x201c;1000 Genomes Project&#x201d; (1&#xa0;KG), which serves as the control (<xref ref-type="bibr" rid="B44">The 1000 Genomes Project Consortium, 2015</xref>). The analytical question would be to compare both groups and to find genetic patterns that correlate with the &#x201c;Wellderly&#x201d; phenotype.</p>
<p>GWAS may suffer a number of statistical errors, such as overfitting, <italic>p</italic>-value misinterpretation, or batch effects due to different sequencing platforms (<xref ref-type="bibr" rid="B32">Nuzzo, 2014</xref>; <xref ref-type="bibr" rid="B20">Gerber et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B34">Pfenninger et&#x20;al., 2021</xref>; <xref ref-type="bibr" rid="B48">Wei&#xdf;bach et&#x20;al., 2021</xref>). In our Wellderly control cohort, we face the additional problem of almost certain sample mislabeling between &#x201c;cases&#x201d; and controls, which is impossible to prevent. Specifically, some of the individuals from the control group may actually progress to be healthy individuals of advanced age and thus belong to the Wellderly population and not the control population. It would be impractical to wait several decades to observe the outcome.</p>
<p>For the analysis, precomputed VCF files were obtained from the 1000 Genomes Project and the Wellderly cohort. Then the cohorts were merged and filtered (minimum allele frequency filter of 1%, no missing genotypes, only SNPs that appear in both cohorts). The merged cohort was subset further to 163 individuals from each population, who were most closely related (according to the genomic distance) and whose self-reported Caucasian inheritance was above 95% in the Wellderly cohort, to counteract population stratification. Lastly, the vcf data were filtered to only contain biallelic SNPs, since the model does not yet accommodate other SNP types, and the vcf entries were recoded as 0, 1, and 2 to indicate major, minor, or mixed alleles. The final cohort was split into training and test data (25 and 75%, respectively), and the mathematical model introduced above (1&#x2013;4) was applied. A non-parametric bootstrap sampling paradigm was used for independent random separations of the cohort into training and validation groups. All 100 results from applying (1&#x2013;4) to each of these random training and validation choices were used to create the posterior probability density functions of mislabeling risks for the two data groups, as well as to compute the 95% confidence intervals for feature weights and for individual Wellderly probabilities in the groups. Optimal results appear to be achieved with the logit model function <italic>&#x3d5;</italic>.</p>
<p>First, the mislabeling probability between Wellderly and control individuals was assessed. Here, the estimated posterior probability for the perfect group labeling altogether was only 0.09, meaning that samples were mislabeled between Wellderly and control with a probability of 91<italic>%</italic>. <xref ref-type="fig" rid="F3">Figures 3A,B</xref> illustrate the expected proportion of &#x201c;mislabeled&#x201d; individuals in each cohort. According to the model, there were 7.5% control cases mistakenly classified as Wellderly and about 3.2% Wellderly individuals possibly mislabeled.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Application of (1&#x2013;4) to the analysis of filtered SNP data from (<xref ref-type="bibr" rid="B13">Erikson et&#x20;al., 2016</xref>): <bold>(A)</bold> posterior probability distribution of the inferred optimal mislabelings from the &#x201c;close-to-Wellderly European&#x201d; cohort from 1000 Genomes (basis for the control group); <bold>(B)</bold> posterior probability distribution of the inferred optimal mislabelings from the &#x201c;Wellderly Caucasian&#x201d; cohort (<xref ref-type="bibr" rid="B13">Erikson et&#x20;al., 2016</xref>) (basis for the case group); <bold>(C)</bold> estimated optimal weights <italic>&#x3b1;</italic>
<sub>
<italic>i</italic>
</sub> of the SNP patterns together with their 95% confidence intervals; <bold>(D)</bold> individual probabilities of Wellderly being due to genomic factors (together with their 95% confidence intervals), as inferred from the optimal feature weights <italic>&#x3b1;</italic> from panel <bold>(C)</bold>. Posterior distributions in <bold>(A,B)</bold> and confidence intervals in <bold>(C,D)</bold> are obtained by means of the non-parametric bootstrap sampling (<xref ref-type="bibr" rid="B11">Efron and Tibshirani, 1993</xref>) with 100 ensemble realizations.</p>
</caption>
<graphic xlink:href="frai-04-739432-g003.tif"/>
</fig>
<p>Then the feature weights <italic>&#x3b1;</italic> were recorded, which means that the SNPs were statistically relevant to the Wellderly phenotype, according to the model. Forty-five SNP features had statistically significant values of <italic>&#x3b1;</italic>. The full list of features, together with their estimated mean impacts on Wellderly probability, is provided in <xref ref-type="sec" rid="s3">Section 3</xref> of the SI. Note that this result does not imply that only these 45 features have a significant impact on the healthy ageing probability, and it does not mean that the other features (that got zero weights <italic>&#x3b1;</italic>
<sub>
<italic>i</italic>
</sub>) are completely unrelated to the Wellderly genotype. As mentioned before, deploying our model (1&#x2013;4), one can identify a unique compact pattern of features. However, the model tries to establish a consensus between finding the largest number of informative features and overfitting, which may result in some SNPs not labeled as informative simply because the model tries to correct for potential overfitting, and vice&#x20;versa.</p>
<p>The investigation of these 45 SNP features revealed rs429358, situated in the coding region of the APOE gene. rs429358 is one of two markers that define the APOE-E4 status, which is the strongest common genetic risk factor for Alzheimer&#x2019;s disease. The remainder of statistically significant SNP features are either intronic (38 SNPs) or intergenic (6 SNPs), where intergenic SNPs were often near a gene bearing a statistically significant intronic SNP. In fact, the 45 SNP features tended to cluster within a smaller number of genes (28 genes; see the table in <xref ref-type="sec" rid="s10">Supplementary Material</xref>), suggesting multiple independent genetic signals are present within these genes. Further inspection of these genes reveals dramatic enrichment of genes associated with longevity, including lipid metabolism genes: APOE, APOC3, and CETP; insulin signaling and mTOR signaling: ADCY2, AKT3, CREB5, IGF1R, INSR, PIK3CD, and RHEB; and AMPK-dependent metabolic signaling: CAMK4, PPARGC1A, PRKAA1, and PRKAG2.</p>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>Discussion</title>
<p>Although the raw datasets used in many types of biomedical data analysis are very large, making sense of them requires statistical methods able to handle data problems where the dimensionality <italic>n</italic> of their feature spaces is typically orders of magnitude larger than the number <italic>T</italic> of individuals in the groups. According to central limit theorems, the uncertainty of typical parameter estimation procedures will reduce as <italic>T</italic>
<sup>&#x2212;1/2</sup> with the growing statistics size <italic>T</italic> only if the individual data instances in the statistics can be assumed to be independent. The more dependence there is between the data instances, the slower will be the rate of this uncertainty reduction with <italic>T</italic>. In this respect, genomic SNP data pose special problems as they contain a huge fraction of dependencies between SNP pairs that are in linkage disequilibrium with each other. Another source of bias is introduced through the impact of latent/unobserved factors (e.g., in the form of stratification effects and latent variables) and various forms of mislabeling. Mislabeling can bias the results obtained by standard relation measures that dwell on an exact labeling assumption (e.g., it can bias the results of the <italic>t</italic>-test, chi-square test, Fisher&#x2019;s exact test, odds ratio, and many other methods). Common methods to overcome this problem, including outlier detection methods and HMMs, either rely on the presence of some subsets that are exactly labeled or introduce additional variables that estimate the probability of every particular data point being an outlier. This significantly increases the dimension of the parameter space and the risk of overfitting. These problems remain an issue even when the numbers of individuals sequenced by platforms such as <italic>23andme</italic> approach millions. All these issues (<italic>n</italic>&#x20;&#x226b; <italic>T</italic>, violation of the independence assumption, latent impacts, and mislabeling) lead to uncertainty in estimating parameters and make biomedical GWAS applications very challenging. These issues also limit the applicability of advanced Big Data tools such as artificial neuronal networks to problems of this type. A promising direction toward solving these problems can be found in methods based on Lasso regularization ideas first introduced by R. 
Tibshirani and coworkers (<xref ref-type="bibr" rid="B45">Tibshirani, 1996</xref>; <xref ref-type="bibr" rid="B3">Bair and Tibshirani, 2004</xref>; <xref ref-type="bibr" rid="B16">Friedman et&#x20;al., 2010</xref>; <xref ref-type="bibr" rid="B41">Simon et&#x20;al., 2011</xref>; <xref ref-type="bibr" rid="B42">Taylor and Tibshirani, 2015</xref>), through a robust and computationally efficient shrinkage of the feature space and zeroing out of less relevant feature components. As demonstrated before, introducing a measurement of the probability of group-specific mislabeling and deploying Bayesian tools permit a natural extension of these ideas to situations in which none of the data groups is presumed perfectly labeled. This is accomplished without introducing many new parameters that have to be estimated. For example, in the case of a one-data group (<italic>N</italic>
<sub>
<italic>g</italic>
</sub> &#x3d; 1) with two data labels (<italic>m</italic>&#x20;&#x3d; 2, e.g., &#x201c;Wellderly&#x201d; and &#x201c;non-Wellderly&#x201d; labels in example 2), only two additional mislabeling parameters need to be estimated. The open-source MATLAB implementation we provide here permits implementation of the algorithm in a strongly scalable way. A full analysis of the data in example 2 takes 24&#xa0;days on a single-core PC, 2&#xa0;days on a PC workstation with 12 cores, and only 5&#xa0;h on a small-scale computer cluster with a hundred nodes. Measuring the performance of such methods has a general problem due to the presence of latent impacts and mislabeling, which can bias either standard measures such as AUC, accuracy scores, and Fisher&#x2019;s exact test or linkage disequilibrium measures. In future studies, a better understanding of ever-growing sets of biomedical data requires the further development of robust and computationally scalable relation measures that can explicitly infer and take into account eventual latent effects and mislabeling.</p>
</sec>
</body>
<back>
<sec id="s5">
<title>Data Availability Statement</title>
<p>Publicly available datasets were analyzed in this study. These data can be found at the following sites. A parallel MATLAB implementation of this method is provided for open access via GitHub: <ext-link ext-link-type="uri" xlink:href="https://github.com/SusanneGerber/Mislabeling_Coinference/tree/master/Release/Mislabeling_Coinference">https://github.com/SusanneGerber/Mislabeling_Coinference/tree/master/Release/Mislabeling_Coinference</ext-link>. The breast cancer BI-RADS dataset is available for open access at the UCI Machine Learning Repository: <ext-link ext-link-type="uri" xlink:href="http://archive.ics.uci.edu/ml/datasets/Mammographic+Mass">http://archive.ics.uci.edu/ml/datasets/Mammographic&#x2b;Mass</ext-link>. Aggregate unfiltered annotated variants for healthy-ageing Caucasian individuals (Wellderly) and their allele and genotype frequencies are available via Scripps Translational Science Institute Variant Browser: <ext-link ext-link-type="uri" xlink:href="https://genomics.scripps.edu/browser">https://genomics.scripps.edu/browser</ext-link>.</p>
</sec>
<sec id="s6">
<title>Author Contributions</title>
<p>SG contributed to conceptualization, investigation, supervision, writing&#x2014;original draft, writing&#x2014;review and editing, visualization, project administration, and funding acquisition. LP assisted with methodology, software, formal analysis, and writing&#x2014;review and editing. SS helped with writing&#x2014;original draft, validation, and writing&#x2014;review and editing. CH assisted with software, validation, formal analysis, investigation, writing&#x2014;review and editing, and visualization. AT contributed to data acquisition, software, formal analysis, and writing&#x2014;review and editing. IH helped with conceptualization, methodology, software, validation, formal analysis, investigation, writing&#x2014;original draft, writing&#x2014;review and editing, visualization, supervision, and funding acquisition.</p>
</sec>
<sec id="s7">
<title>Funding</title>
<p>The work of CH was supported by the Forschungsinitiative Rheinland-Pfalz and ReALity. The work of SS was funded by the Forschungsinitiative Rheinland-Pfalz and M3odel. AT acknowledges funding by the NIH-NCATS UL1TR002550 grant. The work of IH was partly funded by the German Research Foundation (&#x201c;Mercator Fellowship&#x201d; of IH in the Collaborative Research Center 1114 Scaling Cascades in Complex Systems).</p>
</sec>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors, and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ack>
<p>SG and IH acknowledge funding from the Emergent AI Center funded by the Carl-Zeiss-Stiftung. SG and CH acknowledge funding by the Landesinitiative Rheinland-Pfalz and the Resilience, Adaptation, and Longevity (ReALity) initiative of the Johannes Gutenberg University of Mainz. SS and SG acknowledge funding by the M3odel Initiative.</p>
</ack>
<sec id="s10">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/frai.2021.739432/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/frai.2021.739432/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.PDF" id="SM1" mimetype="application/PDF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table1.XLSX" id="SM2" mimetype="application/XLSX" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Andreopoulos</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>An</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Schroeder</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>A Roadmap of Clustering Algorithms: Finding a Match for a Biomedical Application</article-title>. <source>Brief. Bioinform.</source> <volume>10</volume>, <fpage>297</fpage>&#x2013;<lpage>314</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbn058</pub-id> </citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ayer</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Ayvaci</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z. X.</given-names>
</name>
<name>
<surname>Alagoz</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Burnside</surname>
<given-names>E. S.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Computer-aided Diagnostic Models in Breast Cancer Screening</article-title>. <source>Imaging Med.</source> <volume>2</volume>, <fpage>313</fpage>&#x2013;<lpage>323</lpage>. <pub-id pub-id-type="doi">10.2217/iim.10.24</pub-id> </citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bair</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Tibshirani</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>Semi-supervised Methods to Predict Patient Survival from Gene Expression Data</article-title>. <source>PLOS Biol.</source> <volume>2</volume>. <pub-id pub-id-type="doi">10.1371/journal.pbio.0020108</pub-id> </citation>
</ref>
<ref id="B4">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Barandela</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Gasca</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2000</year>). <source>Decontamination of Training Samples for Supervised Pattern Recognition Methods</source>. <publisher-loc>Berlin Heidelberg</publisher-loc>: <publisher-name>Springer</publisher-name>, <fpage>621</fpage>&#x2013;<lpage>630</lpage>. <pub-id pub-id-type="doi">10.1007/3-540-44522-6_64</pub-id> </citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Boba</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ko&#x142;tun</surname>
<given-names>U.</given-names>
</name>
<name>
<surname>Bobek-Billewicz</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Chmielik</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Eksner</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Olejnik</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>False-negative Results of Breast Core Needle Biopsies &#x2013; Retrospective Analysis of 988 Biopsies</article-title>. <source>Polish J.&#x20;Radiol.</source> <volume>76</volume>, <fpage>25</fpage>&#x2013;<lpage>29</lpage>. </citation>
</ref>
<ref id="B6">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bootkrajang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kab&#xe1;n</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2012</year>). <source>Label-noise Robust Logistic Regression and its Applications</source>. <publisher-loc>Berlin Heidelberg</publisher-loc>: <publisher-name>Springer</publisher-name>, <fpage>143</fpage>&#x2013;<lpage>158</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-642-33460-3_15</pub-id> </citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brodley</surname>
<given-names>C. E.</given-names>
</name>
<name>
<surname>Friedl</surname>
<given-names>M. A.</given-names>
</name>
</person-group> (<year>1999</year>). <article-title>Identifying Mislabeled Training Data</article-title>. <source>J.&#x20;Artif. Intell. Res.</source> <volume>11</volume>, <fpage>131</fpage>&#x2013;<lpage>167</lpage>. <pub-id pub-id-type="doi">10.1613/jair.606</pub-id> </citation>
</ref>
<ref id="B8">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Burnham</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Anderson</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2002</year>). <source>Model Selection and Multimodel Inference: A Practical Information-Theoretic Approach</source>. <publisher-loc>Berlin Heidelberg</publisher-loc>: <publisher-name>Springer-Verlag</publisher-name>. </citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chandola</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Banerjee</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Anomaly Detection: A Survey</article-title>. <source>ACM Comput. Surv.</source> <volume>41</volume> (<issue>3</issue>), <fpage>1</fpage>&#x2013;<lpage>58</lpage>. <comment>Article 15</comment>. <pub-id pub-id-type="doi">10.1145/1541880.1541882</pub-id> </citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Choi</surname>
<given-names>Y.-S.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Least Squares One-Class Support Vector Machine</article-title>. <source>Pattern Recognit. Lett.</source> <volume>30</volume>, <fpage>1236</fpage>&#x2013;<lpage>1240</lpage>. <pub-id pub-id-type="doi">10.1016/j.patrec.2009.05.007</pub-id> </citation>
</ref>
<ref id="B11">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Efron</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Tibshirani</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>1993</year>). <source>An Introduction to the Bootstrap</source>. <publisher-loc>New York, United&#x20;States</publisher-loc>: <publisher-name>Macmillan Publishers Limited</publisher-name>. </citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Elder</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Schulz-Wendtland</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Wittenberg</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>The Prediction of Breast Cancer Biopsy Outcomes Using Two CAD Approaches that Both Emphasize an Intelligible Decision Process</article-title>. <source>Med. Phys.</source> <volume>34</volume>, <fpage>4164</fpage>&#x2013;<lpage>4172</lpage>. </citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Erikson</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Bodian</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Rueda</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Molparia</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Scott</surname>
<given-names>E. R.</given-names>
</name>
<name>
<surname>Zeeland</surname>
<given-names>A. A. S.-V.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Whole-genome Sequencing of a Healthy Aging Cohort</article-title>. <source>Cell</source> <volume>165</volume>, <fpage>1002</fpage>&#x2013;<lpage>1011</lpage>. <pub-id pub-id-type="doi">10.1016/j.cell.2016.03.022</pub-id> </citation>
</ref>
<ref id="B14">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Fr&#xe9;nay</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Kab&#xe1;n</surname>
<given-names>A.</given-names>
</name>
</person-group> &#x201c;<article-title>A Comprehensive Introduction to Label Noise</article-title>,&#x201d; in <conf-name>Proceedings of the 2014 European Symposium on Artificial Neural Networks, Computational Intelligence and Machine Learning (ESANN 2014)</conf-name>, <conf-loc>Bruges, Belgium</conf-loc>, <conf-date>April 2014</conf-date>. <comment>(i6doc.com.publ.)</comment>. </citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fr&#xe9;nay</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Verleysen</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Classification in the Presence of Label Noise: A Survey</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>25</volume>, <fpage>845</fpage>&#x2013;<lpage>869</lpage>. <pub-id pub-id-type="doi">10.1109/tnnls.2013.2292894</pub-id> </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Friedman</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hastie</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Tibshirani</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Regularization Paths for Generalized Linear Models via Coordinate Descent</article-title>. <source>J.&#x20;Stat. Softw.</source> <volume>33</volume>, <fpage>1</fpage>&#x2013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.18637/jss.v033.i01</pub-id> </citation>
</ref>
<ref id="B17">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Fr&#xfc;hwirth-Schnatter</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2006</year>). <source>Finite Mixture and Markov Switching Models</source>. <publisher-loc>Berlin Heidelberg</publisher-loc>: <publisher-name>Springer</publisher-name>. </citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gerber</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Horenko</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Improving Clustering by Imposing Network Information</article-title>. <source>Sci. Adv.</source> <volume>1</volume> (<issue>7</issue>), <fpage>e1500163</fpage>. <pub-id pub-id-type="doi">10.1126/sciadv.1500163</pub-id> </citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gerber</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Horenko</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Toward a Direct and Scalable Identification of Reduced Models for Categorical Processes</article-title>. <source>Proc. Natl. Acad. Sci.</source> <volume>114</volume>, <fpage>4863</fpage>&#x2013;<lpage>4868</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1612619114</pub-id> </citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gerber</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Pospisil</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Navandar</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Horenko</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Low-cost Scalable Discretization, Prediction, and Feature Selection for Complex Systems</article-title>. <source>Sci. Adv.</source> <volume>6</volume>, <fpage>eaaw0961</fpage>. <pub-id pub-id-type="doi">10.1126/sciadv.aaw0961</pub-id> </citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gomez-Nicola</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Boche</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Post-mortem Analysis of Neuroinflammatory Changes in Human Alzheimer&#x2019;s Disease</article-title>. <source>Alzheimer&#x2019;s Res. Ther.</source> <volume>7</volume>, <fpage>42</fpage>. <pub-id pub-id-type="doi">10.1186/s13195-015-0126-1</pub-id> </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hariri</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kind</surname>
<given-names>M. C.</given-names>
</name>
<name>
<surname>Brunner</surname>
<given-names>R. J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Extended Isolation Forest</article-title>. <source>IEEE Trans. Knowl. Data Eng.</source> <volume>33</volume>, <fpage>1479</fpage>&#x2013;<lpage>1489</lpage>. <pub-id pub-id-type="doi">10.1109/tkde.2019.2947676</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Hastie</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Tibshirani</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Friedman</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2009</year>). <source>The Elements of Statistical Learning: Data Mining, Inference and Prediction</source>. <edition>2nd edn</edition>. <publisher-loc>Berlin Heidelberg</publisher-loc>: <publisher-name>Springer</publisher-name>. </citation>
</ref>
<ref id="B24">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Hendrycks</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Mazeika</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wilson</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Gimpel</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Using Trusted Data to Train Deep Networks on Labels Corrupted by Severe Noise</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1802.05300">https://arxiv.org/abs/1802.05300</ext-link>
</comment>. </citation>
</ref>
<ref id="B25">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Jiang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>Z.-H.</given-names>
</name>
</person-group> (<year>2004</year>). <source>Editing Training Data for kNN Classifiers with Neural Network Ensemble</source>. <publisher-loc>Berlin Heidelberg</publisher-loc>: <publisher-name>Springer</publisher-name>, <fpage>356</fpage>&#x2013;<lpage>361</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-540-28647-9_60</pub-id> </citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lam</surname>
<given-names>H. Y. K.</given-names>
</name>
<name>
<surname>Clark</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Natsoulis</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>O&#x2019;Huallachain</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). <article-title>Performance Comparison of Whole-Genome Sequencing Platforms</article-title>. <source>Nat. Biotechnol.</source> <volume>30</volume>, <fpage>78</fpage>&#x2013;<lpage>82</lpage>. <pub-id pub-id-type="doi">10.1038/nbt.2065</pub-id> </citation>
</ref>
<ref id="B27">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>F. T.</given-names>
</name>
<name>
<surname>Ting</surname>
<given-names>K. M.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>Z.-H.</given-names>
</name>
</person-group> &#x201c;<article-title>Isolation Forest</article-title>,&#x201d; in <conf-name>Proceedings of the 2008 Eighth IEEE International Conference on Data Mining</conf-name>, <conf-loc>Pisa, Italy</conf-loc>, <conf-date>December 2008</conf-date>. <comment>(IEEE)</comment>. <pub-id pub-id-type="doi">10.1109/icdm.2008.17</pub-id> </citation>
</ref>
<ref id="B28">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Luo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2014</year>). <source>An Infinite Latent Generalized Linear Model</source>. <publisher-loc>Berlin Heidelberg</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>, <fpage>155</fpage>&#x2013;<lpage>166</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-08010-9_18</pub-id> </citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>McFadden</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>1974</year>). &#x201c;<article-title>Conditional Logit Analysis of Qualitative Choice Behaviour</article-title>,&#x201d; in <source>Frontiers in Econometrics</source>. Editor <person-group person-group-type="editor">
<name>
<surname>Zarembka</surname>
<given-names>P.</given-names>
</name>
</person-group> (<publisher-loc>New York</publisher-loc>: <publisher-name>Academic Press</publisher-name>), <fpage>105</fpage>&#x2013;<lpage>142</lpage>. </citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Moya</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hush</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>1996</year>). <article-title>Network Constraints and Multi-Objective Optimization for One-Class Classification</article-title>. <source>Neural Networks</source> <volume>9</volume>, <fpage>463</fpage>&#x2013;<lpage>474</lpage>. <pub-id pub-id-type="doi">10.1016/0893-6080(95)00120-4</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Nocedal</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wright</surname>
<given-names>S. J.</given-names>
</name>
</person-group> (<year>2006</year>). <source>Numerical Optimization</source>. <edition>2nd edn</edition>. <publisher-loc>New York</publisher-loc>: <publisher-name>Springer</publisher-name>. </citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nuzzo</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Scientific Method: Statistical Errors</article-title>. <source>Nature</source> <volume>506</volume>, <fpage>150</fpage>&#x2013;<lpage>152</lpage>. <pub-id pub-id-type="doi">10.1038/506150a</pub-id> </citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>O&#x2019;Rawe</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>Low Concordance of Multiple Variant-Calling Pipelines: Practical Implications for Exome and Genome Sequencing</article-title>. <source>Genome Med.</source> <volume>5</volume>, <fpage>28</fpage>. <pub-id pub-id-type="doi">10.1186/gm432</pub-id> </citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pfenninger</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Reuss</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Kiebler</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sch&#xf6;nnenbeck</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Caliendo</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gerber</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Genomic Basis of Drought Resistance in Fagus Sylvatica</article-title>. <source>eLife</source> <volume>10</volume>, <fpage>e65532</fpage>. <pub-id pub-id-type="doi">10.7554/eLife.65532</pub-id> </citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qin</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Hotilovac</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>Comparison of Non-parametric Confidence Intervals for the Area under the ROC Curve of a Continuous-Scale Diagnostic Test</article-title>. <source>Stat. Methods Med. Res.</source> <volume>17</volume>, <fpage>207</fpage>&#x2013;<lpage>221</lpage>. <pub-id pub-id-type="doi">10.1177/0962280207087173</pub-id> </citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rodionova</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Oliveri</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Pomerantsev</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Rigorous and Compliant Approaches to One-Class Classification</article-title>. <source>Chemometrics Intell. Lab. Syst.</source> <volume>159</volume>, <fpage>89</fpage>&#x2013;<lpage>96</lpage>. <pub-id pub-id-type="doi">10.1016/j.chemolab.2016.10.002</pub-id> </citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rodrigues</surname>
<given-names>D. R.</given-names>
</name>
<name>
<surname>Everschor-Sitte</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Gerber</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Horenko</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A Deeper Look into Natural Sciences with Physics-Based and Data-Driven Measures</article-title>. <source>iScience</source> <volume>24</volume>, <fpage>102171</fpage>. <pub-id pub-id-type="doi">10.1016/j.isci.2021.102171</pub-id> </citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ross</surname>
<given-names>M. G.</given-names>
</name>
<name>
<surname>Russ</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Costello</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hollinger</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lennon</surname>
<given-names>N. J.</given-names>
</name>
<name>
<surname>Hegarty</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>Characterizing and Measuring Bias in Sequence Data</article-title>. <source>Genome Biol.</source> <volume>14</volume>, <fpage>R51</fpage>. <pub-id pub-id-type="doi">10.1186/gb-2013-14-5-r51</pub-id> </citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>S&#xe1;ez</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Romero</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Conejero</surname>
<given-names>J.&#x20;A.</given-names>
</name>
<name>
<surname>Garc&#xed;a-G&#xf3;mez</surname>
<given-names>J.&#x20;M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Potential Limitations in COVID-19 Machine Learning Due to Data Source Variability: A Case Study in the nCov2019 Dataset</article-title>. <source>J.&#x20;Am. Med. Inform. Assoc.</source> <volume>28</volume>, <fpage>360</fpage>&#x2013;<lpage>364</lpage>. <pub-id pub-id-type="doi">10.1093/jamia/ocaa258</pub-id> </citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shah</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Raju</surname>
<given-names>U.</given-names>
</name>
<name>
<surname>Chitale</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Deshpande</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Gregory</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Strand</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2003</year>). <article-title>False-negative Core Needle Biopsies of the Breast</article-title>. <source>Cancer</source> <volume>97</volume>, <fpage>1824</fpage>&#x2013;<lpage>1831</lpage>. <pub-id pub-id-type="doi">10.1002/cncr.11278</pub-id> </citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Simon</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Friedman</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hastie</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Tibshirani</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Regularization Paths for Cox&#x2019;s Proportional Hazards Model via Coordinate Descent</article-title>. <source>J.&#x20;Stat. Softw.</source> <volume>39</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.18637/jss.v039.i05</pub-id> </citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Taylor</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tibshirani</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Statistical Learning and Selective Inference</article-title>. <source>Proc. Natl. Acad. Sci.</source> <volume>112</volume>, <fpage>7629</fpage>&#x2013;<lpage>7634</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1507583112</pub-id> </citation>
</ref>
<ref id="B43">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Teng</surname>
<given-names>C. M.</given-names>
</name>
</person-group> (<year>2001</year>). <article-title>A Comparison of Noise Handling Techniques</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.aaai.org/Papers/FLAIRS/2001/FLAIRS01-052.pdf">https://www.aaai.org/Papers/FLAIRS/2001/FLAIRS01-052.pdf</ext-link>
</comment>. </citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<collab>The 1000 Genomes Project Consortium</collab> (<year>2015</year>). <article-title>A Global Reference for Human Genetic Variation</article-title>. <source>Nature</source> <volume>526</volume>, <fpage>68</fpage>&#x2013;<lpage>74</lpage>. <pub-id pub-id-type="doi">10.1038/nature15393</pub-id> </citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tibshirani</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>1996</year>). <article-title>Regression Shrinkage and Selection via the Lasso</article-title>. <source>J.&#x20;R. Stat. Soc. Ser. B (Methodological)</source> <volume>58</volume>, <fpage>267</fpage>&#x2013;<lpage>288</lpage>. <pub-id pub-id-type="doi">10.1111/j.2517-6161.1996.tb02080.x</pub-id> </citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Todorov</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Searle-White</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Gerber</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Applying Univariate vs. Multivariate Statistics to Investigate Therapeutic Efficacy in (Pre)clinical Trials: A Monte Carlo Simulation Study on the Example of a Controlled Preclinical Neurotrauma Trial</article-title>. <source>PLoS One</source> <volume>15</volume>, <fpage>e0230798</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0230798</pub-id> </citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Verkooijen</surname>
<given-names>H. M.</given-names>
</name>
<name>
<surname>Hoorntje</surname>
<given-names>L. E.</given-names>
</name>
<name>
<surname>Peeters</surname>
<given-names>P. H. M.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>False-negative Core Needle Biopsies of the Breast</article-title>. <source>Cancer</source> <volume>100</volume>, <fpage>1104</fpage>&#x2013;<lpage>1105</lpage>. <pub-id pub-id-type="doi">10.1002/cncr.20077</pub-id> </citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wei&#xdf;bach</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sys</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hewel</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Todorov</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Schweiger</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Winter</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Reliability of Genomic Variants across Different Next-Generation Sequencing Platforms and Bioinformatic Processing Pipelines</article-title>. <source>BMC Genomics</source> <volume>22</volume>. <pub-id pub-id-type="doi">10.1186/s12864-020-07362-8</pub-id> </citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Tsai</surname>
<given-names>C.-L.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Regularization Parameter Selections via Generalized Information Criterion</article-title>. <source>J.&#x20;Am. Stat. Assoc.</source> <volume>105</volume>, <fpage>312</fpage>&#x2013;<lpage>323</lpage>. <pub-id pub-id-type="doi">10.1198/jasa.2009.tm08013</pub-id> </citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>A Weighted One-Class Support Vector Machine</article-title>. <source>Neurocomputing</source> <volume>189</volume>, <fpage>1</fpage>&#x2013;<lpage>10</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2015.10.097</pub-id> </citation>
</ref>
</ref-list>
</back>
</article>