<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Bioinform.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Bioinformatics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Bioinform.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2673-7647</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1644695</article-id>
<article-id pub-id-type="doi">10.3389/fbinf.2025.1644695</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>BC-predict: mining of signal biomarkers and production of models for early-stage breast cancer subtyping and prognosis</article-title>
<alt-title alt-title-type="left-running-head">Muthamilselvan et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fbinf.2025.1644695">10.3389/fbinf.2025.1644695</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Muthamilselvan</surname>
<given-names>Sangeetha</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Vaithilingam</surname>
<given-names>Natarajan</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Palaniappan</surname>
<given-names>Ashok</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/868152"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
</contrib>
</contrib-group>
<aff id="aff1">
<label>1</label>
<institution>Systems Computational Biology Lab, Department of Bioinformatics, School of Chemical and Biotechnology, SASTRA Deemed University</institution>, <city>Thanjavur</city>, <country country="IN">India</country>
</aff>
<aff id="aff2">
<label>2</label>
<institution>Lincoln City Hospital, United Lincolnshire Hospitals, National Health Service</institution>, <city>Lincoln</city>, <country country="GB">United Kingdom</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Ashok Palaniappan, <email xlink:href="apalania@scbt.sastra.edu">apalania@scbt.sastra.edu</email>
</corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2025-09-18">
<day>18</day>
<month>09</month>
<year>2025</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>5</volume>
<elocation-id>1644695</elocation-id>
<history>
<date date-type="received">
<day>10</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>12</day>
<month>08</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Muthamilselvan, Vaithilingam and Palaniappan.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Muthamilselvan, Vaithilingam and Palaniappan</copyright-holder>
<license>
<ali:license_ref start_date="2025-09-18">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Disease heterogeneity is the hallmark of breast cancer, which is the most common female malignancy. With a disturbing increase in mortality and disease burden, there remains a need for effective early-stage theragnostic and prognostic biomarkers. In this work, we improved on BrcaDx (<ext-link ext-link-type="uri" xlink:href="https://apalania.shinyapps.io/brcadx/">https://apalania.shinyapps.io/brcadx/</ext-link>) for cancer vs control screening and examined a cluster of adjoining learning problems in breast cancer heterogeneity: (i) identification of metastatic cancers; (ii) molecular subtyping (TNBC, HER2, or luminal); and (iii) histological subtyping (invasive ductal or invasive lobular).</p>
</sec>
<sec>
<title>Methods</title>
<p>We analyzed the transcriptomic profiles of breast cancer patients from public-domain databases such as the TCGA using stage-encoded problem-specific statistical models of gene expression and unveiled stage-salient and progression-significant genes. Using a consensus approach, we identified potential machine learning features, and considered six model classes for each learning problem, with hyperparameter optimization on a training dataset and evaluation on a holdout test dataset. A nested approach enabled us to identify the best model class for each learning problem.</p>
</sec>
<sec>
<title>Results</title>
<p>External validation of the best models yielded balanced accuracies of 97.42% for cancer vs normal; 88.22% for metastatic vs non-metastatic; 88.79% for ternary molecular subtyping; and an ensemble accuracy of 94.23% for histological subtyping. The model for molecular subtyping was validated on a 26-sample TNBC-only out-of-distribution cohort, yielding 25 correct predictions. We performed a late integration of multi-omics datasets by validating the feature space used in each problem with miRNA profiles, methylation profiles, and commercial breast cancer panels.</p>
</sec>
<sec>
<title>Discussion</title>
<p>Pending prospective studies, we have translated the models into BC-Predict that forks the best models developed for each problem in a unified interface and provides a complete readout for input instances of expression data, including uncertainty estimates. BC-Predict is freely available for non-commercial purposes at: <ext-link ext-link-type="uri" xlink:href="https://apalania.shinyapps.io/BC-Predict">https://apalania.shinyapps.io/BC-Predict</ext-link>.</p>
</sec>
</abstract>
<kwd-group>
<kwd>breast cancer heterogeneity</kwd>
<kwd>molecular and histological subtype</kwd>
<kwd>metastatic disease</kwd>
<kwd>machine learning</kwd>
<kwd>stage-specific differential gene expression</kwd>
<kwd>biomarker signature discovery</kwd>
<kwd>explainable AI</kwd>
<kwd>integrative multi-omics</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declare that no financial support was received for the research and/or publication of this article.</funding-statement>
</funding-group>
<counts>
<fig-count count="6"/>
<table-count count="9"/>
<equation-count count="5"/>
<ref-count count="116"/>
<page-count count="25"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Integrative Bioinformatics</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Breast cancer is the most common cancer in women, accounting for 32% of all female cancers globally and 28.2% of female cancers in India (<xref ref-type="bibr" rid="B89">Siegel et al., 2024</xref>). With about 2.3 million new cases globally in 2020 (11.7% of total), its incidence surpasses that of lung cancer. The statistics paint a grim portrait of the burden of disease: 1 in 4 cancer cases and 1 in 6 cancer deaths globally could be attributed to breast cancer, with 88% higher incidence in transitioned countries relative to transitioning countries (<xref ref-type="bibr" rid="B95">Sung et al., 2021</xref>). The risk of a person developing breast cancer depends on many factors like sex (women account for &#x3e;99.5%), age (&#x3e;80% occur in postmenopausal women), high-risk family history (up to 30% of cases), and genetic factors. The interplay between weak susceptibility alleles and the other risk factors is key to the etiology of the &#x2018;cancer phenotype&#x2019; (<xref ref-type="bibr" rid="B16">Cassidy et al., 2015</xref>; <xref ref-type="bibr" rid="B40">Hanahan, 2022</xref>). Genetic loci with predisposing mutations include: BRCA1/BRCA2 (autosomal dominant, 50%&#x2013;85% lifetime risk) (<xref ref-type="bibr" rid="B82">Risch et al., 2006</xref>), TP53 (Li-Fraumeni syndrome, 80%&#x2013;90% lifetime risk) (<xref ref-type="bibr" rid="B3">Allain, 2008</xref>), CDH1 (60% lifetime risk and primarily lobular subtype), STK11 (Peutz-Jeghers syndrome, 50% risk), PTEN (Cowden syndrome with 20%&#x2013;50% risk (<xref ref-type="bibr" rid="B59">Lindor et al., 2008</xref>); Lynch syndrome with 25% risk), PALB2 (partner and localiser of BRCA2, age-dependent risk), ATM, BRIP1, CHEK2 (all about 20% risk) and RAD51C/RAD51D (14%&#x2013;20% risk). 
The modifiable lifestyle risk factors include physical inactivity especially post-menopausal obesity (100% additional risk), smoking (24% more risk), alcohol (7% risk for every 10g/day), and combined Hormone Replacement therapy (&#x223c;20% further risk depending on length of use/stop) (<xref ref-type="bibr" rid="B65">Manyonda et al., 2022</xref>). The prevalence of the risk factors varies by country and region. The typical onset of breast cancer is 60&#x2013;70 years in western countries, but appears to be anticipated at 40&#x2013;50 years in countries like India (<xref ref-type="bibr" rid="B12">Bhattacharyya et al., 2020</xref>). Data maintained at national registries suggest that the urbanization and growth of cities, &#x2018;modernized&#x2019; food habits (e.g., high consumption of ultra-processed foods), and lifestyle changes have contributed to the increased incidence of breast cancer in urban areas, whereas betel quid and tobacco chewing habits have significantly contributed to its incidence in rural areas (<italic>P</italic> &#x3d; 0.003) (<xref ref-type="bibr" rid="B64">Malvia et al., 2017</xref>). These cancers tend to be more aggressive with poorer prognosis (higher grade/size, lymphovascular-invasion positive, triple negative, HER2 positive, node positive, and medullary/metaplastic/micro-papillary/pleomorphic sub-types). The frequent presentation of breast cancer in its advanced and less treatable stages in traditional societies could be traced partly to the inadequate social awareness and extant taboos, leading to subpar survival outcomes. Such conditions tend to compound existing gender inequalities, outdated stereotypes, and burden of disease for whole families, and call for remediation of the situation.</p>
<p>Due to the complexity associated with cancers, a composite feature space is necessary to capture the transformation of cells and subsequent disease progression. This must be balanced against the curse of dimensionality that dominates machine learning. AI models based on whole-genome or whole-exome sequencing may be impractical and uninterpretable. McKinney et al. have developed a mammogram-based AI model for breast cancer screening rivalling radiologist readings, paving the way for AI-based decision support systems (<xref ref-type="bibr" rid="B70">McKinney et al., 2020</xref>). Convolutional neural network (CNN) models have been developed for identifying breast cancer samples as well as cancer subtyping based on 7091 genes (<xref ref-type="bibr" rid="B73">Mostavi et al., 2020</xref>). CUP-AI-DX includes two models: (i) a 1D inception CNN model for classifying cancers of unknown primary based on 817 expression features; and (ii) a Random Forest model for breast cancer subtyping based on 5925 expression features (<xref ref-type="bibr" rid="B116">Zhao et al., 2020</xref>). Breast cancer subtyping models include learning on PAM50 inferred labels (<xref ref-type="bibr" rid="B10">Bastien et al., 2012</xref>) via either functional spectra of gene expression profiles (<xref ref-type="bibr" rid="B34">Gao et al., 2019</xref>) or deep convolution of RNAseq and CNV profiles (<xref ref-type="bibr" rid="B72">Mohaiminul Islam et al., 2020</xref>). Significant strides have been made towards mechanistic understanding and treatment of breast cancer, which has the largest number of FDA-approved molecular panels aimed at early-stage actionable information about the disease. 
These biomarker panels include OncotypeDx based on TAILORx and RxPONDER studies (<xref ref-type="bibr" rid="B114">Zhang et al., 2022</xref>), EndoPredict and EndoPredict Plus (<xref ref-type="bibr" rid="B4">Almstedt et al., 2020</xref>), MammaPrint (<xref ref-type="bibr" rid="B91">Soliman et al., 2020</xref>), Prosigna (based on PAM50 and OPTIMA study) (<xref ref-type="bibr" rid="B9">Baskota et al., 2021</xref>), and Breast Cancer Index (<xref ref-type="bibr" rid="B8">Bartlett et al., 2019</xref>). Decision aids like PREDICT, Nottingham Prognostic Index (NPI) and Adjuvant Online based on IHC4 (ER/PR/HER2/Ki67) or IHC4&#x2b;C (including clinical/pathological features like age, tumour size, grade and nodal status) parameters define the level of clinical risk for adjuvant chemotherapy without relying on tumour profiling tests. The translation of AI models into software-as-medical-devices holds promise for bridging health disparities (<xref ref-type="bibr" rid="B76">Muthamilselvan et al., 2023</xref>).</p>
<p>The heterogeneity of breast cancer poses formidable challenges, and individual cancer manifestations vary so much that the available biomarker panels retain validity only in limited settings, thereby leaving a large cohort indeterminate (<xref ref-type="bibr" rid="B39">G&#xfc;ler, 2017</xref>). Changes in gene expression and mutations modifying protein activities are etiological molecular events driving the cancer phenotype (<xref ref-type="bibr" rid="B14">Brierley et al., 2016</xref>). An integrated precision-medicine approach to early detection, effective therapy and favourable prognosis is necessary. Techniques from the field of machine learning could be highly effective in discerning key features in complex datasets, including gene expression datasets, and learning models that map these features to crucial clinical outcomes related to the diagnosis, prognosis, and treatment of cancers (<xref ref-type="bibr" rid="B49">Kourou et al., 2015</xref>). Unsupervised learning techniques have been used to identify subtypes in breast cancer based on gene expression (<xref ref-type="bibr" rid="B43">Horr and Buechler, 2021</xref>). The molecular subtype of breast cancer could influence the choice of adjuvant therapy (<xref ref-type="bibr" rid="B46">Johnson et al., 2021</xref>; <xref ref-type="bibr" rid="B102">Vaidya et al., 2018</xref>). Among the histological subtypes, invasive lobular carcinoma is considered indolent and demands a treatment regimen tailored to the prognostic subtype (<xref ref-type="bibr" rid="B32">Fu et al., 2017</xref>). Here we have developed a novel framework for identifying the markers of changes in gene expression profiles across the stages and subtypes of breast cancer, enabling means for differential diagnosis and personalized medicine. 
These candidate features were utilized to create models that address the multiple challenges in breast cancer heterogeneity: (i) cancer or normal screening; (ii) non-metastatic or metastatic discrimination; (iii) molecular subtyping; and (iv) histological subtyping. Together these models could also enable the prognosis of breast cancer (<xref ref-type="bibr" rid="B29">Fitzgibbons et al., 2000</xref>; <xref ref-type="bibr" rid="B81">Rakha et al., 2010</xref>). The optimal models for each problem required only a handful of features that could be quantified using experimental techniques such as qRT-PCR. All the models were integrated into BC-Predict, a web-based unified interface for harnessing the models. BC-Predict is available for academic research at: <ext-link ext-link-type="uri" xlink:href="https://apalania.shinyapps.io/BC-Predict">https://apalania.shinyapps.io/BC-Predict</ext-link>. All the Supplementary Information for this study is available at: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.6084/m9.figshare.25282906">https://doi.org/10.6084/m9.figshare.25282906</ext-link>.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2-1">
<label>2.1</label>
<title>Problems related to the characterization of breast cancer heterogeneity</title>
<p>Four problems related to the delineation of individual breast cancers with respect to the expression data of patient samples were considered:<list list-type="order">
<list-item>
<p>Is the patient sample &#x2018;cancer&#x2019; or &#x2018;normal&#x2019;?</p>
</list-item>
<list-item>
<p>If cancer: predict &#x2018;non-metastatic&#x2019; (stages I, II or III) or &#x2018;metastatic&#x2019; (stage-IV cancer).</p>
</list-item>
<list-item>
<p>If cancer: predict the molecular subtype of the cancer.</p>
</list-item>
<list-item>
<p>If cancer: predict the histological subtype of the cancer.</p>
</list-item>
</list>
</p>
<p>A generalized workflow for the problems is depicted in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>ML model development for Cancer vs. Normal binary classification. Data-driven optimization of a multi-phase workflow, including nested model selection, is shown. Hypothesis space pruning is achieved via feature selection techniques, leading to a consensus gene-signature. Six different classes of machine learning algorithms were considered, with hyperparameter optimization via k-fold cross-validation on the training dataset and model class selection on the holdout test dataset. External validation of the best model yielded a robust assessment of generalizability. Problem-specific substitutions yield workflows adapted to the other problems considered.</p>
</caption>
<graphic xlink:href="fbinf-05-1644695-g001.tif">
<alt-text content-type="machine-generated">Flowchart depicting a data analysis pipeline for BRCA. It begins with data preprocessing, involving TCGA-BRCA data and removal of minimal variance genes. The data is split into training and independent validation datasets. Feature engineering uses Linear modeling genes and a two-tier contrast to select stage-salient genes. Hypothesis space optimization is performed via feature selection by RFE and Boruta, leading to consensus features. These load onto a machine learning model, followed by k-fold cross-validation for hyperparameter tuning. The model undergoes external validation on ICGC-KR cancer and GTEx normal data sets.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2-2">
<label>2.2</label>
<title>Dataset preprocessing</title>
<p>Preprocessing was done in a manner similar to Sarathi and Palaniappan (<xref ref-type="bibr" rid="B87">Sarathi and Palaniappan, 2019</xref>). The source dataset for all problems modeled here was obtained from the TCGA. Normalised BRCA expression data was acquired using the firebrowse portal (<xref ref-type="bibr" rid="B94">Summary, 2016</xref>) (gdac.broadinstitute.org_BRCA.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0.tar.gz), and RSEM counts were obtained. The patient barcode was matched with the clinical data (gdac.broadinstitute.org_BRCA.Merge_Clinical.Level_1.2016012800.0.0.tar) to extract the patient.stage_event.pathologic_stage variable values that encode the AJCC TNM staging (<xref ref-type="bibr" rid="B35">Giuliano et al., 2018</xref>). The sub-stages were then merged to obtain the macro stage categories. <xref ref-type="table" rid="T1">Table 1</xref> shows the distribution of sample stages for the breast cancer samples according to the AJCC staging system. It is noted that early-stage BC indicates TNM stage-I or stage-II cancer. Stage-III BC (including T3N1, T4, N2-3) represents loco-regionally advanced BC, whereas T3N0 represents a borderline diagnosis between stages II and III. For the purposes of our study, stages I, II, and III were combined into the &#x2018;non-metastatic&#x2019; class.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Stage-wise distribution of TCGA breast cancer samples based on AJCC system, 2018 revision. Numeric suffix is used to indicate the size of tumor (T), number of nodes (N), and presence of metastasis (M).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">TCGA stage</th>
<th align="left">TNM classification</th>
<th colspan="2" align="left">Cases</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">1</td>
<td align="left">T1N0M0</td>
<td align="left">90</td>
<td rowspan="3" align="left">181</td>
</tr>
<tr>
<td align="left">1A</td>
<td align="left">T1aN0M0</td>
<td align="left">85</td>
</tr>
<tr>
<td align="left">1B</td>
<td align="left">T1bN0M0</td>
<td align="left">6</td>
</tr>
<tr>
<td align="left">2</td>
<td align="left">T2N0M0</td>
<td align="left">6</td>
<td rowspan="3" align="left">616</td>
</tr>
<tr>
<td align="left">2A</td>
<td align="left">T2aN0M0</td>
<td align="left">357</td>
</tr>
<tr>
<td align="left">2B</td>
<td align="left">T2b (N0/N1)M0</td>
<td align="left">253</td>
</tr>
<tr>
<td align="left">3</td>
<td align="left">T3N0M0</td>
<td align="left">2</td>
<td rowspan="4" align="left">249</td>
</tr>
<tr>
<td align="left">3A</td>
<td align="left">T3a (N1/N2)M0</td>
<td align="left">155</td>
</tr>
<tr>
<td align="left">3B</td>
<td align="left">T4(N0/N1/N2)M0</td>
<td align="left">27</td>
</tr>
<tr>
<td align="left">3C</td>
<td align="left">T (any)N3M0</td>
<td align="left">65</td>
</tr>
<tr>
<td align="left">4</td>
<td align="left">T (any)N (any)M1</td>
<td align="left">20</td>
<td align="left">20</td>
</tr>
<tr>
<td align="left">Control</td>
<td align="left">&#x2014;</td>
<td colspan="2" align="left">112</td>
</tr>
<tr>
<td align="left">X</td>
<td align="left"/>
<td colspan="2" align="left">14</td>
</tr>
<tr>
<td align="left">NA</td>
<td align="left">&#x2014;</td>
<td colspan="2" align="left">8</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The immunohistochemical (IHC) status of oestrogen receptor (ER) and progesterone receptor (PgR), human epidermal growth factor receptor 2 (HER2) oncogene, and Ki-67 (a marker of cell proliferation) are used together to subtype breast tumors into Triple-negative breast cancer (TNBC), HER2-positive, Luminal A and Luminal B (<xref ref-type="bibr" rid="B35">Giuliano et al., 2018</xref>; <xref ref-type="bibr" rid="B23">Dai et al., 2015</xref>), as shown in <xref ref-type="table" rid="T2">Table 2</xref>. Where reliable Ki-67 measurements are not available, an alternative assessment of tumor proliferation such as tumor grade could be used to distinguish between &#x2018;Luminal A&#x2019; and &#x2018;Luminal B&#x2019; (which tends to be HER2 negative). Complete ER, PgR and HER2 IHC metadata were available for 719 samples of the TCGA Breast Cancer dataset, and of these, no sample had information on the Ki-67 labeling index nor on the tumor grade, precluding precise differentiation of luminal subtypes of breast cancers into &#x2018;Luminal A&#x2019; or &#x2018;Luminal B&#x2019;. The luminal subtypes A and B were perforce lumped into one &#x2018;Luminal&#x2019; type. The 719 samples were accordingly annotated as 567 &#x2018;Luminal&#x2019; (generally Luminal A with Grade 1 or 2 and Luminal B with G3), 115 TNBC (generally Grade 3), and 37 HER2 (generally Grade 3) based on the status of ER, PgR and HER2 extracted from the clinical file (<xref ref-type="table" rid="T2">Table 2</xref>).</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Molecular taxonomy of breast cancer. Luminal A is HER2 negative, whereas Luminal B could be either HER2 positive (accounting for 30% of HER2 positive) or HER2 negative (majority of Luminal B).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">S.No.</th>
<th align="center">HER2 status</th>
<th align="center">ER status</th>
<th align="center">PgR status</th>
<th align="left">Ki-67 labelling index</th>
<th align="left">Intrinsic subtype</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="3" align="left">1</td>
<td rowspan="3" align="center">&#x2b;</td>
<td align="center">&#x2b;</td>
<td align="center">&#x2b;</td>
<td rowspan="3" align="left">Any</td>
<td rowspan="3" align="left">Luminal B (HER2 positive)</td>
</tr>
<tr>
<td align="center">&#x2b;</td>
<td align="center">&#x2013;</td>
</tr>
<tr>
<td align="center">&#x2013;</td>
<td align="center">&#x2b;</td>
</tr>
<tr>
<td align="left">2</td>
<td align="center">&#x2b;</td>
<td align="center">&#x2013;</td>
<td align="center">&#x2013;</td>
<td align="left">n/a</td>
<td align="left">HER2&#x2b;</td>
</tr>
<tr>
<td rowspan="6" align="left">3</td>
<td rowspan="6" align="center">&#x2013;</td>
<td rowspan="2" align="center">&#x2b;</td>
<td rowspan="2" align="center">&#x2b;</td>
<td align="left">Low (&#x3c;14%)</td>
<td align="left">Luminal A</td>
</tr>
<tr>
<td align="left">High</td>
<td align="left">Luminal B (HER2 negative)</td>
</tr>
<tr>
<td rowspan="2" align="center">&#x2b;</td>
<td rowspan="2" align="center">&#x2013;</td>
<td align="left">Low (&#x3c;14%)</td>
<td align="left">Luminal A</td>
</tr>
<tr>
<td align="left">High</td>
<td align="left">Luminal B (HER2 negative)</td>
</tr>
<tr>
<td rowspan="2" align="center">&#x2013;</td>
<td rowspan="2" align="center">&#x2b;</td>
<td align="left">Low (&#x3c;14%)</td>
<td align="left">Luminal A</td>
</tr>
<tr>
<td align="left">High</td>
<td align="left">Luminal B (HER2 negative)</td>
</tr>
<tr>
<td align="left">4</td>
<td align="center">&#x2013;</td>
<td align="center">&#x2013;</td>
<td align="center">&#x2013;</td>
<td align="left">n/a</td>
<td align="left">Triple negative breast cancer (TNBC)</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The two most common histological subtypes of breast cancer are infiltrating ductal carcinoma (IDC - no special type) and infiltrating lobular carcinoma (ILC) (<xref ref-type="bibr" rid="B108">Weigelt et al., 2010</xref>). ILC tends to be difficult to diagnose, with MR imaging required for determining size and multifocality including contralateral breast (mirror image), and preferential spread to gastrointestinal tract and peritoneum (<xref ref-type="bibr" rid="B109">Winchester et al., 1998</xref>). The sample histological subtype is encoded in the clinical metadata &#x2018;patient.histological_type&#x2019; with the major values being, &#x2018;infiltrating ductal carcinoma (IDC)&#x2019; and &#x2018;infiltrating lobular carcinoma (ILC)&#x2019;, and minor values including &#x2018;mixed histology&#x2019;, &#x2018;metaplastic carcinoma&#x2019;, &#x2018;mucinous carcinoma&#x2019;, &#x2018;medullary carcinoma&#x2019;, and &#x2018;other (specify)&#x2019;.</p>
<p>Genes that had minimal variation in expression across the samples (i.e., &#x3c3; &#x3c; 1) were removed. Cancer samples which were missing stage annotation details were removed. The expression dataset was subjected to variance-stabilization using <monospace>voom</monospace> function in <monospace>limma</monospace> (<xref ref-type="bibr" rid="B55">Law et al., 2014</xref>). Linear modeling was then performed. The resulting dataset was split 80:20 into a training set and a holdout testset stratified on the outcome variable of each problem. It is noted that the training dataset for Problem &#x23;2 suffered an imbalance in the distribution of the outcome classes (16 metastatic vs. 837 non-metastatic samples), which prompted the application of SMOTE correction (<xref ref-type="bibr" rid="B18">Chawla et al., 2002</xref>) (Synthetic Minority Oversampling TEchnique; with arguments: perc. over-represented &#x3d; 1,000% and perc. under-represented &#x3d; 300%). Data preprocessing and analysis was done using R (<ext-link ext-link-type="uri" xlink:href="http://www.r-project.org/">www.r-project.org</ext-link>). The annotated pre-processed final dataset is available as Supplementary File S1.</p>
</sec>
<sec id="s2-3">
<label>2.3</label>
<title>Construction of feature space</title>
<p>Feature spaces for each problem were constructed using only the training dataset. Initially the differential expression of genes across cancer stages relative to healthy samples was studied using linear modelling with limma (<xref ref-type="bibr" rid="B83">Ritchie et al., 2015</xref>):<disp-formula id="e1">
<mml:math id="m1">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>4</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>4</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
<p>Where the independent variables are indicator variables of the sample&#x2019;s stage, the intercept <inline-formula id="inf1">
<mml:math id="m2">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the baseline expression estimated from the controls, and <inline-formula id="inf2">
<mml:math id="m3">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the estimated stagewise log fold-change (lfc) coefficients relative to controls.</p>
<p>We then applied a two-level contrast protocol (<xref ref-type="bibr" rid="B76">Muthamilselvan et al., 2023</xref>), viz. level-I: stage vs. control and level-II: inter-stages contrast, to produce the following classes of features:<list list-type="order">
<list-item>
<p>Stage-salient genes obtained from all possible pairwise contrasts between the cancer stages using the following model:</p>
</list-item>
</list>
<disp-formula id="e2">
<mml:math id="m4">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>4</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>4</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>Where the controls themselves constitute one of the indicator variables (<inline-formula id="inf3">
<mml:math id="m5">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>), and the <inline-formula id="inf4">
<mml:math id="m6">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are coefficients estimated from samples of the corresponding annotation only.<list list-type="order">
<list-item>
<p>Monotonically expressed genes obtained from strictly increasing or strictly decreasing mean expression across the cancer stages.</p>
</list-item>
</list>
</p>
<p>In addition, expression contrasts specific to the problem under consideration were used, namely:<list list-type="order">
<list-item>
<p>contrast of non-metastatic vs. metastatic cancers using the following model modified from <xref ref-type="disp-formula" rid="e2">Equation 2</xref>:</p>
</list-item>
</list>
<disp-formula id="e3">
<mml:math id="m7">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
<p>Where the <inline-formula id="inf5">
<mml:math id="m8">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are coefficients estimated from samples of the corresponding annotation only.<list list-type="order">
<list-item>
<p>three-way pairwise contrasts between the molecular subtypes; viz. (i) Luminal vs. HER2&#x2b;, (ii) Luminal vs. TNBC and (iii) HER2&#x2b; vs. TNBC using the following model modified from <xref ref-type="disp-formula" rid="e2">Equation 2</xref>:</p>
</list-item>
</list>
<disp-formula id="e4">
<mml:math id="m9">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>Where the <inline-formula id="inf6">
<mml:math id="m10">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b4;</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are coefficients estimated from samples of the corresponding annotation only.<list list-type="order">
<list-item>
<p>contrast of ductal vs. lobular histologies using the following model modified from <xref ref-type="disp-formula" rid="e2">Equation 2</xref>:</p>
</list-item>
</list>
<disp-formula id="e5">
<mml:math id="m11">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>&#x3d1;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3d1;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3d1;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<p>Where the <inline-formula id="inf7">
<mml:math id="m12">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3d1;</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are coefficients estimated from samples of the corresponding annotation only.</p>
<p>The above strategies yielded problem-specific chimeric feature spaces that could span the informative dimensions in each case.</p>
</sec>
<sec id="s2-4">
<label>2.4</label>
<title>Building problem-specific classification models</title>
<p>A composite feature space comprising the top-ranked genes from the linear model, stage-salient genes, and genes from the problem-specific contrast was subjected to the consensus of two feature selection techniques: (i) Boruta, a wrapper algorithm using Random Forest to select features based on a measure of importance to the outcome variable of interest (<xref ref-type="bibr" rid="B51">Kursa and Rudnicki, 2010</xref>); and (ii) Recursive Feature Elimination (RFE), a method that uses backward selection passes to trim the space of predictor variables. The workflow of the machine learning model development in <xref ref-type="fig" rid="F1">Figure 1</xref> presented in the context of cancer v/s normal was adapted for the non-metastatic v/s metastatic, molecular subtype, and histological subtype classification problems. The training dataset with the final set of features was loaded onto models based on six different algorithms, including Random Forest (ensemble bagging classifier that builds numerous decision trees and &#x2018;bags&#x2019; the majority vote), Support Vector Machine (geometric method that finds the maximum margin separating hyperplane in high-dimensional space), k-NN (based on distance-based proximal classes), 1-layer and 2-layer Neural Networks, and XGBoost (ensemble boosting classifier that builds a sequence of classifiers iteratively &#x2018;boosted&#x2019; on challenging instances).</p>
</sec>
<sec id="s2-5">
<label>2.5</label>
<title>Nested model selection</title>
<p>Subsequent to an 80:20 train-test split, algorithm-specific hyperparameter configuration was optimized using 10-fold cross-validation on the training dataset for each of the six algorithms considered. Different algorithm classes were then compared based on their outer-fold testset performance, to identify the optimal algorithm class for each learning problem. The design of such a nested model selection prevents information leakage between model tuning and evaluation, and provides for a more reliable assessment of model generalizability to unseen cohorts than cross-validation alone. Evaluation metrics on the holdout testset as well as external datasets (described below) included balanced accuracy, F1-score, area under ROC (AUROC), Matthews correlation coefficient (MCC), and Positive Predictive Value (PPV).</p>
</sec>
<sec id="s2-6">
<label>2.6</label>
<title>Validation</title>
<p>The overall best model for each problem was validated primarily by performing inference on out-of-domain external datasets. <xref ref-type="table" rid="T3">Table 3</xref> shows the datasets used in the development and validation of the ML models for the respective classification problems. In addition, we sought to obtain concurrence for our models from multi-omic signatures, as discussed below.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Datasets used in the modelling of BRCA classification problems. In addition, GSE18549, GSE211167, and METABRIC datasets were also used for external validation in &#x2018;normal vs. cancer&#x2019;.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">S.No</th>
<th align="left">Problem</th>
<th colspan="2" align="left">Dataset used</th>
<th align="left">Sample details</th>
<th align="left">Purpose</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="4" align="left">1</td>
<td rowspan="4" align="left">Normal v/s cancer</td>
<td rowspan="2" align="left">TCGA</td>
<td align="left">Training</td>
<td align="left">90 Normal; 854 Cancer</td>
<td align="left">Model building and hyperparameter tuning</td>
</tr>
<tr>
<td align="left">Testing</td>
<td align="left">22 Normal; 212 Cancer</td>
<td align="left">Internal validation</td>
</tr>
<tr>
<td colspan="2" align="left">ICGC (BRCA-KR)</td>
<td align="left">3 Normal; 47 Cancer</td>
<td align="left">External validation</td>
</tr>
<tr>
<td colspan="2" align="left">GTEx</td>
<td align="left">218 Normal</td>
<td align="left">External validation</td>
</tr>
<tr>
<td rowspan="4" align="left">2</td>
<td rowspan="4" align="left">Non-metastatic v/s metastatic</td>
<td rowspan="2" align="left">TCGA</td>
<td align="left">SMOTE-enhanced training</td>
<td align="left">480 non-metastatic (downsampled from 837); 176 metastatic (upsampled from 16)</td>
<td align="left">Model building and hyperparameter optimization</td>
</tr>
<tr>
<td align="left">Testing</td>
<td align="left">209 non-metastatic; 4 metastatic</td>
<td align="left">Internal validation</td>
</tr>
<tr>
<td colspan="2" align="left">ICGC (BRCA-KR)</td>
<td align="left">47 non-metastatic</td>
<td align="left">External validation</td>
</tr>
<tr>
<td colspan="2" align="left">GSE18549</td>
<td align="left">14 metastatic</td>
<td align="left">External validation</td>
</tr>
<tr>
<td rowspan="4" align="left">3</td>
<td rowspan="4" align="left">Molecular Subtype</td>
<td rowspan="2" align="left">TCGA</td>
<td align="left">Training</td>
<td align="left">454 Luminal; 30 HER2; 92 TNBC</td>
<td align="left">Model building and hyperparameter optimization</td>
</tr>
<tr>
<td align="left">Testing</td>
<td align="left">113 Luminal; 7 HER2; 23 TNBC</td>
<td align="left">Internal validation</td>
</tr>
<tr>
<td colspan="2" align="left">METABRIC</td>
<td align="left">1,415 Luminal; 127 HER2; 299 TNBC</td>
<td align="left">External validation</td>
</tr>
<tr>
<td colspan="2" align="left">GSE211167</td>
<td align="left">26 TNBC</td>
<td align="left">External validation</td>
</tr>
<tr>
<td rowspan="3" align="left">4</td>
<td rowspan="3" align="left">Histological subtype: Ductal v/s Lobular</td>
<td rowspan="2" align="left">TCGA</td>
<td align="left">Training</td>
<td align="left">624 Ductal; 162 Lobular</td>
<td align="left">Model building and hyperparameter optimization</td>
</tr>
<tr>
<td align="left">Testing</td>
<td align="left">156 Ductal; 40 Lobular</td>
<td align="left">Internal validation</td>
</tr>
<tr>
<td colspan="2" align="left">The Metastatic Breast Cancer Project</td>
<td align="left">96 Ductal; 19 Lobular</td>
<td align="left">External validation</td>
</tr>
</tbody>
</table>
</table-wrap>
<sec id="s2-6-1">
<label>2.6.1</label>
<title>External validation</title>
<sec id="s2-6-1-1">
<label>2.6.1.1</label>
<title>Normal vs. cancer</title>
<p>We validated model&#x23;1 on multiple independent external breast cancer datasets:<list list-type="alpha-lower">
<list-item>
<p>BRCA-KR dataset retrieved from the ICGC DataPortal (<ext-link ext-link-type="uri" xlink:href="https://dcc.icgc.org/">https://dcc.icgc.org/</ext-link>) using &#x2018;BRCA&#x2019; as the search keyword (<xref ref-type="bibr" rid="B45">Hudson et al., 2010</xref>), containing 47 cancer samples and 3 control samples.</p>
</list-item>
<list-item>
<p>GTEx normal breast dataset (by querying for &#x2018;Breast&#x2019; in the &#x201c;GTEX_phenotype primarysite&#x201d;) (<xref ref-type="bibr" rid="B36">GTEx Consortium et al., 2013</xref>) with 218 control samples.</p>
</list-item>
<list-item>
<p>GSE18549, GSE211167, and METABRIC datasets.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s2-6-1-2">
<label>2.6.1.2</label>
<title>Non-metastatic vs. metastatic</title>
<p>We validated model&#x23;2 on two different external breast cancer datasets:<list list-type="alpha-lower">
<list-item>
<p>BRCA-KR dataset described above, with all 47 cancer samples being non-metastatic cancers.</p>
</list-item>
<list-item>
<p>GSE18549 dataset of metastatic cancers (<ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE18549">https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc&#x3d;GSE18549</ext-link>) (<xref ref-type="bibr" rid="B7">Barrett et al., 2013</xref>), with 14 samples having &#x2018;Breast&#x2019; as the primary tumor site.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s2-6-1-3">
<label>2.6.1.3</label>
<title>Molecular subtyping</title>
<p>We validated model&#x23;3 on two different external breast cancer datasets:<list list-type="alpha-lower">
<list-item>
<p>METABRIC, a landmark study of breast cancer transcriptomics, available on cBioPortal (<ext-link ext-link-type="uri" xlink:href="https://www.cbioportal.org/study/summary?id=brca_metabric">https://www.cbioportal.org/study/summary?id&#x3d;brca_metabric</ext-link>) (<xref ref-type="bibr" rid="B22">Curtis et al., 2012</xref>). Breast cancer samples in METABRIC were subtyped as Luminal, HER2, or TNBC based on the IHC status of ER, PgR and HER2 extracted from the METABRIC clinical metadata. This yielded 1,415 Luminal, 127 HER2, and 299 TNBC METABRIC samples. Since METABRIC had used microarray technology to measure gene expression, a platform-specific bias might be induced. To mitigate this bias and obtain data compatible with RNA-Seq technology, we applied the Feature Specific Quantile Normalization (FSQN) technique to the METABRIC data (<xref ref-type="bibr" rid="B31">Franks et al., 2018</xref>).</p>
</list-item>
<list-item>
<p>GEO Dataset GSE211167 (<xref ref-type="bibr" rid="B67">Martini et al., 2022</xref>), consisting of only TNBC samples from 26 patients of African ancestry. The dataset was log<sub>2</sub>-transformed prior to serving for model inference.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s2-6-1-4">
<label>2.6.1.4</label>
<title>Histological subtyping</title>
<p>We validated model&#x23;4 on an external breast cancer dataset from cBioPortal with 96 IDC and 19 ILC samples from the Metastatic Breast Cancer Project (<ext-link ext-link-type="uri" xlink:href="https://www.cbioportal.org/study/summary?id=brca_mbcproject_wagle_2017">https://www.cbioportal.org/study/summary?id&#x3d;brca_mbcproject_wagle_2017</ext-link>) (<xref ref-type="bibr" rid="B69">MBCP, 2025</xref>).</p>
</sec>
</sec>
<sec id="s2-6-2">
<label>2.6.2</label>
<title>Late integration of multi-omics data</title>
<sec id="s2-6-2-1">
<label>2.6.2.1</label>
<title>Integration of miRNA analysis</title>
<p>MiRNAs play a crucial role in the regulation of global mRNA expression in both physiological and pathological processes, including the invasion and metastasis of cancer. By exerting control over the expression of target genes, miRNAs act as oncogenes, tumor-suppressive genes, and modulators of distant metastasis in breast cancer. To identify differentially expressed (DE) miRNAs, we used the miRSeq dataset from the same TCGA BRCA cohort (<ext-link ext-link-type="uri" xlink:href="https://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/BRCA/20160128/gdac.broadinstitute.org_BRCA.Merge_mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_isoform_expression__data.Level_3.2016012800.0.0.tar.gz">gdac.broadinstitute.org_BRCA.Merge_mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_isoform_expression__data.Level_3.2016012800.0.0.tar.gz</ext-link>). Being a transcriptomics dataset, the miRSeq dataset was treated akin to the mRNASeq dataset, with cancer stage as indicator variable. DE stage-specific miRNAs were revealed upon application of the two-level contrast (stage vs. control level-I contrast and inter-stages level-II contrast). For each identified stage-salient miRNA, the target genes were predicted using multiMiR (<xref ref-type="bibr" rid="B85">Ru et al., 2014</xref>), which provides an integration of 14 miRNA-mRNA interaction databases including TargetScan (<xref ref-type="bibr" rid="B1">Ag et al., 2015</xref>), miRDB (<xref ref-type="bibr" rid="B106">Wang, 2008</xref>), miRanda (<xref ref-type="bibr" rid="B27">Enright et al., 2003</xref>), and miRTarBase (<xref ref-type="bibr" rid="B44">Huang et al., 2022</xref>). Of the predicted targets for each miRNA, the stage-salient targets were investigated for differential miRNA expression-driven genes.</p>
</sec>
<sec id="s2-6-2-2">
<label>2.6.2.2</label>
<title>Identification of differential methylation-driven genes (DMDGs)</title>
<p>Epigenetic processes such as methylation could contribute to changes in gene expression and drive pathological processes. To evaluate differentially methylated genes, we used the Level3-processed 450k methylation dataset from the same TCGA BRCA cohort (<ext-link ext-link-type="uri" xlink:href="https://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/BRCA/20160128/gdac.broadinstitute.org_BRCA.Merge_methylation__humanmethylation450__jhu_usc_edu__Level_3__within_bioassay_data_set_function__data.aux.2016012800.0.0.tar.gz">gdac.broadinstitute.org_BRCA.Merge_methylation__humanmethylation450__jhu_usc_edu__Level_3__within_bioassay_data_set_function__data.aux.2016012800.0.0.tar.gz</ext-link>). The correlation between methylation and expression of the stage-salient genes was analyzed using R MethylMix (<xref ref-type="bibr" rid="B17">Cedoz et al., 2018</xref>), with the preset threshold &#x2212;0.3 and p-value &#x3c;0.001. Differentially methylated states were identified using significance from Wilcoxon rank-sum testing (adj. p. value &#x3c;0.05) with an additional effect size filter (&#x3e;0.1). Genes passing these marker filters were designated as differential methylation-driven genes. Stage-salient differentially methylated genes were identified using the consensus of three stage-informed models, namely Averep, M-value and MethylMix as described (<xref ref-type="bibr" rid="B75">Muthamilselvan et al., 2022</xref>).</p>
</sec>
</sec>
</sec>
<sec id="s2-7">
<label>2.7</label>
<title>Development of cascade classifier</title>
<p>A prediction pipeline that integrates the predictions from all the models into one combined readout was designed. A schematic for one such cascade model is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. Based on the decision at the shown fork, the new sample may be taken forward for assessment of metastatic potential and molecular/histological subtyping. The final readout for a sample from the cascade classifier would consolidate the inference from each model; e.g., &#x2018;Metastatic triple-negative ductal cancer&#x2019;. This formed the basis for the development of BC-Predict.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Design of BC-Predict. A schematic of a cascade model for early-stage breast cancer subtyping and prognosis is presented. If the sample is predicted as &#x2018;cancer&#x2019; in the first level, it is passed through three more models in the second level that holistically characterize the cancer sample toward personalized medicine.</p>
</caption>
<graphic xlink:href="fbinf-05-1644695-g002.tif">
<alt-text content-type="machine-generated">Flowchart depicting cancer diagnosis. Starting from &#x0022;Features from new Sample&#x0022; leading to &#x0022;Cancer Vs Normal.&#x0022; If cancer, it branches into &#x0022;Lobular vs Ductal,&#x0022; &#x0022;Non-metastatic vs Metastatic,&#x0022; and &#x0022;Molecular Subtyping,&#x0022; culminating in a &#x0022;Combined Readout,&#x0022; exemplified as &#x0022;Metastatic Ductal Triple-negative breast cancer.&#x0022; A &#x0022;Normal&#x0022; branch also originates from the &#x0022;Cancer Vs Normal&#x0022; decision point.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec sec-type="results" id="s3">
<label>3</label>
<title>Results</title>
<p>The TCGA BRCA dataset consisted of 1,212 samples, each with the measurement of expression of 20,532 genes. Post data preprocessing, we obtained an annotated dataset of 1,178 samples &#xd7; 18,880 genes (Supplementary File S1). An adj. p-value cut-off of 0.05 yielded 14,838 DE genes in breast cancer samples. Tightening the significance to adj. p-value &#x3c; 1E-05 still yielded 10,167 DE genes, underscoring the persistence of genome instability in the march of cancer (<xref ref-type="bibr" rid="B40">Hanahan, 2022</xref>). A volcano plot depicting differentially expressed genes showed significant dispersion (<xref ref-type="fig" rid="F3">Figure 3a</xref>), meaning some genes were much more dysregulated than others. We performed a principal components analysis with the top ten genes from the linear modelling, and found that a clear separation between the normal and cancer samples could be obtained (<xref ref-type="fig" rid="F3">Figure 3b</xref>). This provided some basis for considering top-ranked genes from the linear modeling as candidate cancer-specific features. <xref ref-type="table" rid="T4">Table 4</xref> provides information on the top ten genes of the linear modeling, including their regulation status. Information on the top 200 such cancer-specific genes from the linear modelling is provided in Supplementary File S2. <xref ref-type="fig" rid="F4">Figure 4</xref> shows violin-plot representations of expression distribution of the top ranked genes of the linear model. Violin plots for all the top 200 genes from the linear model are provided in Supplementary File S3.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Mining of candidate biomarkers. <bold>(A)</bold> Volcano plot of statistical significance vs log-fold change of differentially expressed genes. Downregulated genes (log-fold change &#x3c;&#x2212;2) are shown as blue dots, whereas upregulated genes (log-fold change &#x3e;2) are shown as red dots. Stage-salient genes are highlighted. <bold>(B)</bold> Top two principal components of the expression matrix of the top ten genes from linear modelling. Normal samples can be seen to orient away from cancer samples. <bold>(C)</bold> UpSet plot of the stage-specific contrast analysis illustrating the shared counts of DEGs. <bold>(D)</bold> Heatmap representation of the stagewise expression of the 24 stage-salient genes, with both sample and gene dendrograms. It is seen that the gene dendrogram exhibits two main clusters, corresponding to overexpressed genes (red) and downregulated genes (blue). Euclidean distance metric was used for hierarchical clustering.</p>
</caption>
<graphic xlink:href="fbinf-05-1644695-g003.tif">
<alt-text content-type="machine-generated">(a) Volcano plot showing gene expression data with downregulated, upregulated, and significant genes highlighted. (b) PCA plot illustrating data grouping by stages with various colors. (c) Bar chart and intersection plot showing stage-specific gene overlaps. (d) Heatmap displaying gene expression patterns across different stages with hierarchical clustering.</alt-text>
</graphic>
</fig>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Top ten genes of the linear model with their stagewise mean log-fold change with respect to control. FDR-corrected significance and inferred regulation type are indicated.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Gene</th>
<th align="left">Stage1<break/>lfc (&#x3b2;<sub>1</sub>)</th>
<th align="left">Stage2<break/>lfc (&#x3b2;<sub>2</sub>)</th>
<th align="left">Stage3<break/>lfc (&#x3b2;<sub>3</sub>)</th>
<th align="left">Stage4<break/>lfc (&#x3b2;<sub>4</sub>)</th>
<th align="left">Adj.P.Val</th>
<th align="left">Regulation status</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">NEK2</td>
<td align="left">4.34</td>
<td align="left">4.83</td>
<td align="left">4.65</td>
<td align="left">4.82</td>
<td align="left">1.37E-188</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">MMP11</td>
<td align="left">5.94</td>
<td align="left">5.75</td>
<td align="left">5.96</td>
<td align="left">6.43</td>
<td align="left">3.80E-173</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">PKMYT1</td>
<td align="left">4.42</td>
<td align="left">4.83</td>
<td align="left">4.73</td>
<td align="left">4.90</td>
<td align="left">1.60E-172</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">GPAM</td>
<td align="left">&#x2212;3.57</td>
<td align="left">&#x2212;3.68</td>
<td align="left">&#x2212;3.65</td>
<td align="left">&#x2212;3.85</td>
<td align="left">9.39E-171</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">CPA1</td>
<td align="left">&#x2212;4.34</td>
<td align="left">&#x2212;4.56</td>
<td align="left">&#x2212;4.28</td>
<td align="left">&#x2212;4.21</td>
<td align="left">6.39E-170</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">COL10A1</td>
<td align="left">7.04</td>
<td align="left">6.74</td>
<td align="left">6.95</td>
<td align="left">7.22</td>
<td align="left">3.43E-169</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">MYOC</td>
<td align="left">&#x2212;6.06</td>
<td align="left">&#x2212;6.55</td>
<td align="left">&#x2212;6.34</td>
<td align="left">&#x2212;7.17</td>
<td align="left">1.06E-166</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">KIF4A</td>
<td align="left">4.05</td>
<td align="left">4.54</td>
<td align="left">4.33</td>
<td align="left">4.55</td>
<td align="left">1.61E-164</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">CA4</td>
<td align="left">&#x2212;6.63</td>
<td align="left">&#x2212;7.35</td>
<td align="left">&#x2212;6.91</td>
<td align="left">&#x2212;7.11</td>
<td align="left">2.01E-162</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">LYVE1</td>
<td align="left">&#x2212;4.76</td>
<td align="left">&#x2212;5.19</td>
<td align="left">&#x2212;4.90</td>
<td align="left">&#x2212;4.91</td>
<td align="left">5.79E-159</td>
<td align="left">Down</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Distribution of expression of the top-ranked genes in linear model sorted by sample stage, to illustrate differential expression patterns. It is seen that <bold>(a)</bold> NEK2 (rank&#x23;1), <bold>(b)</bold> MMP11 (rank&#x23;2), and <bold>(c)</bold> PKMYT1 (rank&#x23;3) in the top row are overexpressed in cancers, whereas <bold>(d)</bold> GPAM (rank&#x23;4) and <bold>(e)</bold> HSD17B13 (rank&#x23;11) in the bottom row are downregulated in cancers. A variability in expression levels of each gene across stages is also seen. The expression violins of all the top 200 genes from the linear model are presented in Supplementary File S3.</p>
</caption>
<graphic xlink:href="fbinf-05-1644695-g004.tif">
<alt-text content-type="machine-generated">Violin plots depicting the expression levels of five genes&#x2014;NEK2, MMP11, PKMYT1, GPAM, and HSD17B13&#x2014;across different cancer stages. Each plot shows data for control and cancer stages one through four, illustrating variability and distribution in expression levels for each gene.</alt-text>
</graphic>
</fig>
<p>Applying the level-I expression filters (&#x7c;lfc&#x7c; &#x3e; 2 and p-value cut-off &#x3c;0.001) yielded a total of 927 stage-specific genes (74 Stage-I, 238 Stage-II, 90 Stage-III, and 525 Stage-IV specific DEGs, visualized as an UpSet plot (<xref ref-type="bibr" rid="B56">Lex et al., 2014</xref>) in <xref ref-type="fig" rid="F3">Figure 3c</xref>). For the identification of stage-salient genes, the two contrast levels were applied with stringent criteria, and the DEGs identified from the different comparisons were collated. This yielded 2 Stage-I salient, 2 Stage-II salient, 10 Stage-III salient and 20 Stage-IV salient genes. Limiting to the top ten Stage-IV salient genes (by significance), we finally obtained 24 stage-salient genes (<xref ref-type="table" rid="T5">Table 5</xref>). A heatmap visualization of the stage-salient genes exhibited a systematic differential regulation relative to the controls (<xref ref-type="fig" rid="F3">Figure 3d</xref>). Four Stage-III genes cluster with the Stage-I genes and the Stage-II gene DEPDC1, with CST2 as an outlying branch. The remaining Stage-III and Stage-IV genes form a cluster together with the Stage-II gene COX7A1. Violin plots of expression distribution across sample phenotypes for these genes can be found in Supplementary File S4.</p>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Trends in mean expression of stage-salient genes with cancer progression. The inferred regulation status in cancer is noted.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Gene</th>
<th align="left">Stage information</th>
<th align="left">&#x3b2;<sub>0</sub>
</th>
<th align="left">&#x3b2;<sub>1</sub>
</th>
<th align="left">&#x3b2;<sub>2</sub>
</th>
<th align="left">&#x3b2;<sub>3</sub>
</th>
<th align="left">&#x3b2;<sub>4</sub>
</th>
<th align="left">Adj.P.Val (from contrast)</th>
<th align="left">Adj.P.Val (from control)</th>
<th align="left">Regulation status</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">CHRNA6</td>
<td align="left">Stage I</td>
<td align="left">&#x2212;1.67</td>
<td align="left">
<bold>3.35</bold>
</td>
<td align="left">2.85</td>
<td align="left">2.93</td>
<td align="left">2.21</td>
<td align="left">2.25E-52</td>
<td align="left">7.59E-51</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">MMP10</td>
<td align="left">Stage I</td>
<td align="left">0.04</td>
<td align="left">
<bold>3.19</bold>
</td>
<td align="left">2.76</td>
<td align="left">2.61</td>
<td align="left">1.68</td>
<td align="left">5.07E-23</td>
<td align="left">1.66E-24</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">DEPDC1</td>
<td align="left">Stage II</td>
<td align="left">2.01</td>
<td align="left">2.83</td>
<td align="left">
<bold>3.32</bold>
</td>
<td align="left">3.03</td>
<td align="left">2.43</td>
<td align="left">3.26E-92</td>
<td align="left">1.39E-89</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">COX7A1</td>
<td align="left">Stage II</td>
<td align="left">2.36</td>
<td align="left">&#x2212;2.31</td>
<td align="left">
<bold>&#x2212;2.62</bold>
</td>
<td align="left">&#x2212;2.30</td>
<td align="left">&#x2212;2.03</td>
<td align="left">3.15E-72</td>
<td align="left">4.39E-69</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">KCNK15</td>
<td align="left">Stage III</td>
<td align="left">1.99</td>
<td align="left">2.40</td>
<td align="left">1.85</td>
<td align="left">
<bold>2.59</bold>
</td>
<td align="left">1.72</td>
<td align="left">8.24E-21</td>
<td align="left">5.27E-20</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">MFSD4</td>
<td align="left">Stage III</td>
<td align="left">1.56</td>
<td align="left">&#x2212;2.06</td>
<td align="left">&#x2212;1.96</td>
<td align="left">
<bold>&#x2212;2.32</bold>
</td>
<td align="left">&#x2212;1.79</td>
<td align="left">4.51E-41</td>
<td align="left">2.88E-41</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">CDH19</td>
<td align="left">Stage III</td>
<td align="left">&#x2212;3.13</td>
<td align="left">&#x2212;2.60</td>
<td align="left">&#x2212;2.58</td>
<td align="left">
<bold>&#x2212;3.19</bold>
</td>
<td align="left">&#x2212;2.61</td>
<td align="left">3.31E-26</td>
<td align="left">1.53E-24</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">CXCL5</td>
<td align="left">Stage III</td>
<td align="left">&#x2212;2.03</td>
<td align="left">&#x2212;2.47</td>
<td align="left">&#x2212;2.17</td>
<td align="left">
<bold>&#x2212;2.87</bold>
</td>
<td align="left">&#x2212;2.83</td>
<td align="left">5.12E-24</td>
<td align="left">1.30E-22</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">AKR7A3</td>
<td align="left">Stage III</td>
<td align="left">3.26</td>
<td align="left">2.05</td>
<td align="left">1.52</td>
<td align="left">
<bold>2.33</bold>
</td>
<td align="left">2.12</td>
<td align="left">1.83E-13</td>
<td align="left">2.55E-12</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">DEGS2</td>
<td align="left">Stage III</td>
<td align="left">4.82</td>
<td align="left">2.60</td>
<td align="left">2.02</td>
<td align="left">
<bold>2.69</bold>
</td>
<td align="left">2.27</td>
<td align="left">9.30E-22</td>
<td align="left">1.68E-21</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">CST2</td>
<td align="left">Stage III</td>
<td align="left">&#x2212;0.60</td>
<td align="left">4.18</td>
<td align="left">3.57</td>
<td align="left">
<bold>4.22</bold>
</td>
<td align="left">3.52</td>
<td align="left">2.19E-48</td>
<td align="left">8.75E-52</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">LOC100124692</td>
<td align="left">Stage III</td>
<td align="left">&#x2212;2.52</td>
<td align="left">&#x2212;3.64</td>
<td align="left">&#x2212;3.60</td>
<td align="left">
<bold>&#x2212;4.13</bold>
</td>
<td align="left">&#x2212;3.83</td>
<td align="left">2.98E-46</td>
<td align="left">8.24E-48</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">GDF5</td>
<td align="left">Stage III</td>
<td align="left">&#x2212;1.26</td>
<td align="left">&#x2212;2.08</td>
<td align="left">&#x2212;2.31</td>
<td align="left">
<bold>&#x2212;2.63</bold>
</td>
<td align="left">&#x2212;2.24</td>
<td align="left">1.67E-26</td>
<td align="left">3.64E-26</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">FOXA1</td>
<td align="left">Stage III</td>
<td align="left">7.19</td>
<td align="left">2.09</td>
<td align="left">1.64</td>
<td align="left">
<bold>2.32</bold>
</td>
<td align="left">1.94</td>
<td align="left">4.81E-13</td>
<td align="left">1.30E-11</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">EGR3</td>
<td align="left">Stage IV</td>
<td align="left">4.14</td>
<td align="left">&#x2212;2.33</td>
<td align="left">&#x2212;2.71</td>
<td align="left">&#x2212;2.57</td>
<td align="left">
<bold>&#x2212;4.04</bold>
</td>
<td align="left">3.53E-18</td>
<td align="left">1.46E-44</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">FOS</td>
<td align="left">Stage IV</td>
<td align="left">7.27</td>
<td align="left">&#x2212;2.44</td>
<td align="left">&#x2212;3.07</td>
<td align="left">&#x2212;3.09</td>
<td align="left">
<bold>&#x2212;4.19</bold>
</td>
<td align="left">3.40E-21</td>
<td align="left">3.50E-62</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">FOSB</td>
<td align="left">Stage IV</td>
<td align="left">4.71</td>
<td align="left">&#x2212;3.80</td>
<td align="left">&#x2212;4.33</td>
<td align="left">&#x2212;4.30</td>
<td align="left">
<bold>&#x2212;5.66</bold>
</td>
<td align="left">9.16E-25</td>
<td align="left">4.51E-76</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">DUSP1</td>
<td align="left">Stage IV</td>
<td align="left">7.00</td>
<td align="left">&#x2212;2.13</td>
<td align="left">&#x2212;2.40</td>
<td align="left">&#x2212;2.23</td>
<td align="left">
<bold>&#x2212;3.13</bold>
</td>
<td align="left">2.51E-19</td>
<td align="left">1.81E-58</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">FREM1</td>
<td align="left">Stage IV</td>
<td align="left">0.85</td>
<td align="left">&#x2212;3.67</td>
<td align="left">&#x2212;4.13</td>
<td align="left">&#x2212;3.70</td>
<td align="left">
<bold>&#x2212;5.09</bold>
</td>
<td align="left">1.29E-23</td>
<td align="left">2.43E-77</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">EGR1</td>
<td align="left">Stage IV</td>
<td align="left">7.45</td>
<td align="left">&#x2212;2.72</td>
<td align="left">&#x2212;3.18</td>
<td align="left">&#x2212;3.11</td>
<td align="left">
<bold>&#x2212;4.00</bold>
</td>
<td align="left">3.63E-23</td>
<td align="left">2.23E-75</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">HFM1</td>
<td align="left">Stage IV</td>
<td align="left">&#x2212;3.44</td>
<td align="left">&#x2212;2.02</td>
<td align="left">&#x2212;2.24</td>
<td align="left">&#x2212;2.23</td>
<td align="left">
<bold>&#x2212;3.02</bold>
</td>
<td align="left">6.13E-18</td>
<td align="left">1.43E-52</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">ABCA10</td>
<td align="left">Stage IV</td>
<td align="left">&#x2212;0.28</td>
<td align="left">&#x2212;4.38</td>
<td align="left">&#x2212;4.80</td>
<td align="left">&#x2212;4.48</td>
<td align="left">
<bold>&#x2212;5.67</bold>
</td>
<td align="left">5.63E-33</td>
<td align="left">3.89E-115</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">KLK5</td>
<td align="left">Stage IV</td>
<td align="left">1.26</td>
<td align="left">&#x2212;3.21</td>
<td align="left">&#x2212;3.44</td>
<td align="left">&#x2212;3.44</td>
<td align="left">
<bold>&#x2212;5.45</bold>
</td>
<td align="left">6.93E-20</td>
<td align="left">2.41E-09</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">KCNA1</td>
<td align="left">Stage IV</td>
<td align="left">&#x2212;1.69</td>
<td align="left">&#x2212;2.58</td>
<td align="left">&#x2212;2.99</td>
<td align="left">&#x2212;2.81</td>
<td align="left">
<bold>&#x2212;3.93</bold>
</td>
<td align="left">3.08E-15</td>
<td align="left">1.99E-45</td>
<td align="left">Down</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate coefficients with the largest absolute values, enabling insight into stage-specific expression.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>The GO and KEGG pathway analysis was performed for the Stage salient genes to identify over-represented biological processes among these candidate features (complete results in Supplementary File S5; Supplementary File S6, respectively). Genes that were monotonically expressed with cancer progression were identified by observing the trend in mean expression with increasing cancer stage. This yielded 2,246 significantly monotonic genes (1,015 with increasing expression, and 1,231 with decreasing expression). The top 20 such genes with their inferred regulation status are shown in <xref ref-type="table" rid="T6">Table 6</xref>. A stage-specific gene is said to be contra-regulated when its mean expression is &#x201c;paradoxical&#x201d; with cancer progression. There are six patterns of &#x201c;paradoxical&#x201d; mean expression, studied in Supplementary File S7. We identified 112 stage-specific genes with such contra-regulation, including one stage-I salient gene (CHRNA6). Contra-regulated genes exhibit unstable expression with cancer progression, and their anomalous behavior might represent possible directions for experimental investigations (Supplementary File S7). Stage-specific DEGs devoid of such contra-regulation suggest a more general role as enhancers of cancer progression.</p>
<table-wrap id="T6" position="float">
<label>TABLE 6</label>
<caption>
<p>Top 20 genes with significant monotonic patterns of expression. Intercept, coefficient and adj. p-values from the ordinal model are used. Status indicates monotonic upregulation (Up) or monotonic downregulation (Down). The table is sorted by significance (adj. p-value). Adj. R<sup>2</sup> goodness-of-fit of a stage-ordinal model of expression for each gene is provided.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Gene</th>
<th align="left">Intercept</th>
<th align="left">Coefficient</th>
<th align="left">Adj.P-value</th>
<th align="left">Adj.R<sup>2</sup>
</th>
<th align="left">Status</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">FAM13A</td>
<td align="left">9.842826</td>
<td align="left">&#x2212;0.62121</td>
<td align="left">1.70E-64</td>
<td align="left">0.2255</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">GABRD</td>
<td align="left">3.697762</td>
<td align="left">0.889287</td>
<td align="left">2.27E-64</td>
<td align="left">0.2249</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">KLHL31</td>
<td align="left">6.778289</td>
<td align="left">&#x2212;0.8667</td>
<td align="left">2.33E-63</td>
<td align="left">0.2217</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">POC1A</td>
<td align="left">6.587719</td>
<td align="left">0.525973</td>
<td align="left">4.14E-63</td>
<td align="left">0.2209</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">PAFAH1B3</td>
<td align="left">8.753896</td>
<td align="left">0.602506</td>
<td align="left">1.23E-62</td>
<td align="left">0.2193</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">SORBS1</td>
<td align="left">11.50753</td>
<td align="left">&#x2212;0.83632</td>
<td align="left">5.17E-62</td>
<td align="left">0.2174</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">NIPSNAP3B</td>
<td align="left">6.082268</td>
<td align="left">&#x2212;0.70387</td>
<td align="left">1.27E-61</td>
<td align="left">0.2161</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">TMEM220</td>
<td align="left">6.96875</td>
<td align="left">&#x2212;0.67023</td>
<td align="left">7.56E-60</td>
<td align="left">0.2102</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">SPTBN1</td>
<td align="left">13.42746</td>
<td align="left">&#x2212;0.45273</td>
<td align="left">2.81E-59</td>
<td align="left">0.2083</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">SIK2</td>
<td align="left">10.23114</td>
<td align="left">&#x2212;0.52331</td>
<td align="left">2.56E-58</td>
<td align="left">0.2051</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">RECQL4</td>
<td align="left">6.916714</td>
<td align="left">0.743136</td>
<td align="left">1.59E-57</td>
<td align="left">0.2025</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">C7orf41</td>
<td align="left">10.91012</td>
<td align="left">&#x2212;0.61324</td>
<td align="left">1.81E-57</td>
<td align="left">0.2023</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">RAG1AP1</td>
<td align="left">9.736787</td>
<td align="left">0.453142</td>
<td align="left">5.56E-57</td>
<td align="left">0.2001</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">HSD17B6</td>
<td align="left">4.70826</td>
<td align="left">0.715399</td>
<td align="left">6.98E-57</td>
<td align="left">0.2004</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">SLC35A2</td>
<td align="left">9.380796</td>
<td align="left">0.311207</td>
<td align="left">7.48E-57</td>
<td align="left">0.2002</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">CCDC64</td>
<td align="left">6.871398</td>
<td align="left">0.724435</td>
<td align="left">3.72E-56</td>
<td align="left">0.1979</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">DMD</td>
<td align="left">9.497599</td>
<td align="left">&#x2212;0.92277</td>
<td align="left">2.47E-55</td>
<td align="left">0.1952</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">RUSC1</td>
<td align="left">9.565741</td>
<td align="left">0.353172</td>
<td align="left">1.24E-53</td>
<td align="left">0.1897</td>
<td align="left">Up</td>
</tr>
<tr>
<td align="left">CXCL2</td>
<td align="left">6.668874</td>
<td align="left">&#x2212;1.23033</td>
<td align="left">4.45E-53</td>
<td align="left">0.1877</td>
<td align="left">Down</td>
</tr>
<tr>
<td align="left">PRR19</td>
<td align="left">4.794229</td>
<td align="left">0.497467</td>
<td align="left">1.87E-52</td>
<td align="left">0.1857</td>
<td align="left">Up</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Having completed the mining of signal features, we proceeded to the problem of production of machine learning models. Six model classes were optimized on the train data for each problem and subsequently evaluated on the holdout test to identify the best model class for that problem (Supplementary File S8). A summary of the best overall model for each problem and its validation on the external dataset(s) is presented in <xref ref-type="table" rid="T7">Table 7</xref>.</p>
<table-wrap id="T7" position="float">
<label>TABLE 7</label>
<caption>
<p>The best model class and its performance for each of the problems of interest: (i) normal v/s cancer using ten features, (ii) metastatic v/s non-metastatic using five features, (iii) molecular subtyping using 16 features, and (iv) histological subtyping using 24 features. Nested model selection was used to identify the best model class, with subsequent validation on external datasets. In the case of histological subtype, a voting ensemble of the two models shown was used for the external validation. The RF model for molecular subtyping was externally validated on another 26 TNBC samples, yielding 25 correct predictions. MCC and AUROC values of the best model in each case are scaled to the range [0,100].</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">S.No</th>
<th rowspan="2" align="left">Model</th>
<th align="right">Train</th>
<th align="center">Test</th>
<th colspan="6" align="center">External validation</th>
</tr>
<tr>
<th colspan="2" align="center">Balanced acc. (%)</th>
<th align="right">Balanced acc. (%)</th>
<th align="right">Specificity</th>
<th align="right">Sensitivity</th>
<th align="center">Precision (PPV)</th>
<th align="center">MCC</th>
<th align="center">AUROC</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td colspan="10" align="center">Normal v/s cancer</td>
</tr>
<tr>
<td align="left">1</td>
<td align="left">NN (1 layer)</td>
<td align="right">99.82</td>
<td align="right">100</td>
<td align="right">97.42</td>
<td align="right">95.74</td>
<td align="right">99.09</td>
<td align="right">95.74</td>
<td align="right">94.84</td>
<td align="right">97.42</td>
</tr>
<tr>
<td colspan="10" align="center">Non-metastatic v/s Metastatic</td>
</tr>
<tr>
<td align="left">2</td>
<td align="left">NN (1 layer)</td>
<td align="right">99.17</td>
<td align="right">82.24</td>
<td align="right">88.22</td>
<td align="right">93.87</td>
<td align="right">78.57</td>
<td align="right">91.67</td>
<td align="right">80.87</td>
<td align="right">88.22</td>
</tr>
<tr>
<td colspan="10" align="center">Molecular subtype</td>
</tr>
<tr>
<td align="left">3</td>
<td align="left">RF</td>
<td align="right">99.99</td>
<td align="right">91.43</td>
<td align="right">88.79</td>
<td align="right">93.11</td>
<td align="right">84.46</td>
<td align="right">93.63</td>
<td align="right">84.06</td>
<td align="right">90.23</td>
</tr>
<tr>
<td colspan="10" align="center">Histological subtype</td>
</tr>
<tr>
<td align="left">4</td>
<td align="left">XGBoost</td>
<td align="right">95.13</td>
<td rowspan="2" align="right">88.74</td>
<td rowspan="2" align="right">76.92</td>
<td rowspan="2" align="right">53.85</td>
<td rowspan="2" align="right">100</td>
<td rowspan="2" align="right">93.81</td>
<td rowspan="2" align="right">71.07</td>
<td rowspan="2" align="right">76.92</td>
</tr>
<tr>
<td align="left">5</td>
<td align="left">NN (1 layer)</td>
<td align="right">96.97</td>
</tr>
</tbody>
</table>
</table-wrap>
<sec id="s3-1">
<label>3.1</label>
<title>Normal v/s cancer</title>
<p>The workflow for this learning problem is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. Stratified sampling of the TCGA BRCA dataset based on the class &#x2018;cancer&#x2019; or &#x2018;normal&#x2019; yielded a training dataset of 90 Normal and 854 Cancer samples, and a test dataset of 22 Normal and 212 Cancer samples. The 24 stage-salient genes from the contrasts shown in <xref ref-type="disp-formula" rid="e2">Equation 2</xref> (namely CHRNA6, MMP10, DEPDC1, COX7A1, KCNK15, MFSD4, CDH19, CXCL5, AKR7A3, DEGS2, CST2, LOC100124692, GDF5, FOXA1, EGR3, FOS, FOSB, DUSP1, FREM1, EGR1, HFM1, ABCA10, KLK5, KCNA1) were combined with the top 10 linear modelling genes from <xref ref-type="disp-formula" rid="e1">Equation 1</xref> (namely NEK2, MMP11, PKMYT1, GPAM, CPA1, COL10A1, MYOC, KIF4A, CA4, LYVE1) to obtain 34 base features for feature selection. Application of the RFE procedure identified ten features for model development, including two stage-salient genes (FREM1, ABCA10) and eight genes from the linear model (NEK2, MMP11, PKMYT1, GPAM, CPA1, COL10A1, CA4, LYVE1). Of the six ML models trained, four models yielded &#x3e;99% balanced accuracy on the training set. Subsequent evaluation on the holdout testset identified only one model class with 100% accuracy, namely the neural network with one hidden layer model (Supplementary File S8). The model was re-built using the full dataset and validated on external datasets: (i) BRCA-KR, yielding a balanced accuracy &#x223c;94.00%; and (ii) GTEx, yielding &#x223c;100% accuracy (all correct predictions). Together, the model yielded an overall balanced accuracy &#x223c;97.42% on external validation (<xref ref-type="table" rid="T7">Table 7</xref>). The details can be found in Supplementary File S9, along with the prediction probabilities for all instances in both external validation datasets. 
Prediction probability is a measure of the strength of evidence for the predicted class, and based on the distribution of its values, recommendations for evidence of the predicted class may be generated. It was observed that correct predictions were supported by very strong prediction probabilities (&#x3e;0.9) relative to incorrect predictions.</p>
</sec>
<sec id="s3-2">
<label>3.2</label>
<title>Non-metastatic v/s metastatic</title>
<p>The workflow for this learning problem is a variation on <xref ref-type="fig" rid="F1">Figure 1</xref>, and available in Supplementary File S10. Stratified sampling of the TCGA BRCA dataset based on the class &#x2018;non-metastatic&#x2019; or &#x2018;metastatic&#x2019; yielded a training dataset of 837 non-metastatic and 16 Metastatic samples, and a test dataset of 209 non-metastatic and 4 Metastatic samples. SMOTE balancing of the training dataset yielded a dataset with 480 non-metastatic and 176 Metastatic samples. The contrast shown in <xref ref-type="disp-formula" rid="e3">Equation 3</xref> between non-metastatic and metastatic samples in the SMOTE-balanced dataset produced two lists of genes, one sorted by log-fold change and the other by significance (adj. p-value). The consensus of the top 50 genes from the two lists identified 15 features (namely SRMS, OXT, MMP27, LOC158696, C4orf26, CECR4, ANKRD55, GALNTL6, KRTAP3-1, FAM69C, AFP, CCDC33, SLC5A5, CXorf48, RGS7), to which were added the six top genes by significance missing in the consensus (namely GIP, SSX5, LOC100101938, C9, ASZ1, COX8C). Finally, these 21 genes were pooled with the 24 stage-salient genes discussed in the Normal v/s cancer classification problem, to obtain 45 base features for feature selection. Application of the Boruta protocol yielded 14 features, while application of the RFE procedure yielded just five features. The five RFE features were a subset of the features identified by Boruta, thus we obtained five consensus features for model development, namely DEPDC1, FOSB, DUSP1, MMP27 and ABCA10. Of the six different ML models trained, three models yielded &#x3e;99% balanced accuracy on the training set. Subsequent evaluation on the holdout testset identified the neural network with one hidden layer model as the best performing model class, with 82.24% balanced accuracy (Supplementary File S8). 
The model was re-built using the full dataset and validated on the BRCA-KR and GSE18549 datasets, yielding an overall balanced accuracy &#x223c;88.22% on the external validation (<xref ref-type="table" rid="T7">Table 7</xref>). The details could be found in Supplementary File S10, which includes the prediction probabilities for all instances in the external validation. On inspection of the distribution of prediction probabilities, correct predictions were found to be supported by high values (&#x3e;0.75) relative to incorrect predictions.</p>
</sec>
<sec id="s3-3">
<label>3.3</label>
<title>Molecular subtype classification</title>
<p>The workflow for this learning problem is a variation on <xref ref-type="fig" rid="F1">Figure 1</xref>, and available in Supplementary File S11. Stratified sampling of the TCGA BRCA dataset based on the molecular subtype class (&#x2018;Luminal&#x2019; or &#x2018;TNBC&#x2019; or &#x2018;HER2&#x2019;) yielded a training dataset of 434 Luminal, 30 HER2 and 92 TNBC samples, and a test dataset of 113 Luminal, 7 HER2 and 23 TNBC samples. The three-way pairwise contrasts shown in <xref ref-type="disp-formula" rid="e4">Equation 4</xref> between the molecular subtypes; viz. (i) Luminal vs. HER2, (ii) Luminal vs. TNBC and (iii) HER2 vs. TNBC; yielded subtype-specific genes, from which the top ten genes of each subtype (by significance) were pooled together to obtain 30 base features for feature selection (namely MLPH, AGR3, CA12, TBC1D9, AGR2, TFF3, SIDT1, FZD9, BCAS1, CXorf61, ERBB2, PGAP3, STARD3, C17orf37, GRB7, PSMD3, PCSK6, PNMT, TCAP, LOC150622, GATA3, ANXA9, FLJ45983, PRR15, FOXA1, DEGS2, SLC44A4, ZMYND10, KCNK15, NAT1). Application of the Boruta protocol did not identify any redundant feature, whereas application of RFE procedure yielded 16 features. These 16 features were identified as the consensus features for model development, namely GATA3, AGR3, CA12, TBC1D9, ERBB2, MLPH, KCNK15, ANXA9, FLJ45983, GRB7, PGAP3, STARD3, SLC44A4, PCSK6, FOXA1 and BCAS1. Of the six different ML models trained, the Random forest model provided superlative performance on both the training and outerfold test sets, with balanced accuracies of &#x3e;99% and 91.43% respectively (Supplementary File S8). The model was re-built using the full dataset and was validated on the METABRIC dataset, yielding a balanced accuracy &#x223c;88.79% (<xref ref-type="table" rid="T7">Table 7</xref>). Availability of the TNBC-only dataset provided an opportunity to execute a second out-of-cohort validation, yielding correct identification of 25 TNBC samples out of the total 26 samples (96.15% accuracy). 
The details can be found in Supplementary File S11, including the prediction probabilities for all instances in the METABRIC and TNBC external validation datasets. On inspection of the distribution of prediction probabilities, correct predictions were found to be supported by high values (&#x3e;0.7) relative to incorrect predictions. We investigated the 16 features used in the RandomForest model for feature importance based on mean decrease in Gini score in R caret (<xref ref-type="bibr" rid="B50">Kuhn, 2008</xref>). The top five features contributing to the model performance were identified as GATA3, CA12, AGR3, TBC1D9, and MLPH (<xref ref-type="fig" rid="F5">Figure 5</xref>).</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Importance ranking of features used in developing the molecular subtype model. The scores are normalized with respect to the top-scoring feature, GATA3, and presented in the sorted order.</p>
</caption>
<graphic xlink:href="fbinf-05-1644695-g005.tif">
<alt-text content-type="machine-generated">Horizontal bar chart showing the importance of variables, ranked from highest to lowest. GATA3 is the most important, followed by CA12, AGR3, and others in decreasing order, ending with PCSK6. The importance scale ranges from zero to one hundred.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3-4">
<label>3.4</label>
<title>Histological subtype classification</title>
<p>Stratified sampling of the TCGA BRCA dataset based on the histological subtype (&#x2018;IDC&#x2019; or &#x2018;ILC&#x2019;) yielded a training dataset of 624 IDC and 162 ILC samples, and a test dataset of 156 IDC and 40 ILC samples. The contrast shown in <xref ref-type="disp-formula" rid="e5">Equation 5</xref> between the ductal and lobular histologies was used to detect differentially expressed genes between the two histologies, specifically applying a log-fold change threshold, &#x7c;lfc&#x7c; &#x3e;2, to binarize genes useful as features. This obtained 62 base features for feature selection. Application of the Boruta protocol yielded 58 features, while application of the RFE procedure yielded 24 features. The 24 RFE features were a subset of the Boruta features, thus we obtained 24 consensus features for model development, namely ADCY5, ALDH1L1, ANKRD43, C1orf64, C7, CAPN8, CCL14, CDH1, CIDEA, CTSG, DARC, F7, FXYD1, HPX, IGFN1, MMP1, PEBP4, PLCXD3, PROL1, SHROOM1, TFAP2B, TFF1, TNNT3, and WNK4. Of the six different ML models trained, four models yielded &#x3e;95% balanced accuracy on the training set. Subsequent evaluation on the holdout testset identified XGBoost as the best performing model class, with 84.94% balanced accuracy (Supplementary File S8). To mitigate overfitting to the larger IDC class at the expense of the ILC class, we sought to combine the XGBoost model with the 1-layer neural network model, producing a voting ensemble classifier with a slightly better 88.74% balanced accuracy on the holdout testset (<xref ref-type="table" rid="T7">Table 7</xref>). The ensemble model re-built using the full dataset was validated on the external dataset: brca_mbcproject_wagle_2017, encoding both the histological subtypes of interest (IDC and ILC) as well as other subtypes such as &#x2018;mixed histology&#x2019;, &#x2018;DCIS&#x2019; (ductal carcinoma <italic>in situ</italic>), and &#x2018;NOS&#x2019;. 
Predictions were accepted if the two models of the ensemble agreed on the predicted class. If the models disagreed on the predicted class, then the predictions were rejected as ambiguous. Such instances represent challenges to the ensemble classifier whose resolution might not be simple. Omitting the eleven such instances from the external dataset, we obtained correct predictions on all 91 IDC samples as well as seven (out of thirteen) ILC samples, yielding an ensemble accuracy &#x223c;94.23% and balanced accuracy &#x223c;76.92% (<xref ref-type="table" rid="T7">Table 7</xref>). Even with ensembling, generalization errors persisted in learning the ILC class, with an imbalance in the type-II error between the two classes. The details can be found in Supplementary File S12, including the prediction probabilities for all instances in the external validation. On inspection of the distribution of prediction probabilities, correct predictions were found to be supported by high values (&#x3e;0.7) relative to incorrect predictions. Histological subtyping from molecular features has remained a refractory learning problem, and we have made our models and code freely available for non-commercial use (<ext-link ext-link-type="uri" xlink:href="http://www.github.com/apalania/BCPredict_Histological">www.github.com/apalania/BC-Predict_Histological</ext-link>).</p>
</sec>
<sec id="s3-5">
<label>3.5</label>
<title>Validation with miRNA analysis</title>
<p>Stage-salient miRNA were identified using the two-level contrasts of the miRNA expression data, and then their targets were identified using the R multiMiR library (Supplementary File S13). Based on these results, we determined the concordance between the regulatory miRNAs and their target genes. Temporal concordance in expression exists if the salience in miRNA expression is at least as early as the salience in target gene expression. If the expression pattern of miRNA is discordant with its target gene, a paradoxical aberration with a protective function is possible. <xref ref-type="table" rid="T8">Table 8</xref> summarizes the validation of stage-salient gene expression from the angle of miRNA expression. Concordance between the mRNA and miRNA in the direction of expression as well as the temporal dimension is achieved for 13 stage-salient genes: MMP10, DEPDC1, CDH19, FOXA1, DEGS2, CST2, AKR7A3, EGR1, EGR3, FOS, FOSB, FGF2, and HCN2. The key regulatory miRNAs decoded by stage included 25 stage-salient miRNAs (Supplementary File S13), appearing to regulate most of the stage-salient genes. Stage-salient miRNA that were fully concordant with target mRNAs included hsa-miR-182-5p, hsa-miR-210-3p, hsa-miR-10b-5p, hsa-miR-200a-5p, hsa-miR-96-5p, hsa-miR-21-5p, hsa-miR-133a-3p, hsa-miR-335-5p, hsa-miR-204-5p, and hsa-miR-145-5p. Further, four of the stage-salient miRNAs regulated genes that featured in the ML models, namely hsa-miR-210-3p, hsa-miR-10b-5p, hsa-miR-200a-5p, and hsa-miR-96-5p. Only five stage-salient miRNAs displayed no overlap between their targets and stage-salient genes, and conversely, eleven stage-salient genes were predicted to be free of regulation by a stage-salient miRNA (namely COX7A1, DACT2, KCNK15, MFSD4, DSC3, KLK5, KRT15, LOC100124692, ABCA10, MAPK8IP2, and MASP1). The complete and fully detailed analysis can be found in Supplementary File S13.</p>
<table-wrap id="T8" position="float">
<label>TABLE 8</label>
<caption>
<p>Putative target stage-salient genes mapped with their regulatory stage-salient miRNA. Concordance in expression is noted if miRNA overexpression is observed with target gene downregulation or vice-versa. Evaluation of temporal concordance is useful if concordance in expression exists. If there is no concordance in expression, temporal concordance is not evaluated. Genes that display concordance with regulatory miRNA in the direction of expression as well as temporal dimension are emphasized. Target stage-salient genes that represent features used in the ML models are <italic>italicized</italic>. Upregulated miRNAs denote candidate oncomiRs, whereas downregulated miRNAs denote candidate TSmiRs.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="3" align="center">S.No</th>
<th colspan="3" align="center">Gene</th>
<th colspan="3" align="center">Regulatory miRNA</th>
</tr>
<tr>
<th rowspan="2" align="center">Name</th>
<th rowspan="2" align="center">Expression</th>
<th rowspan="2" align="center">Salience</th>
<th rowspan="2" align="center">Name</th>
<th colspan="2" align="center">Concordance</th>
</tr>
<tr>
<th align="center">Expression</th>
<th align="center">Temporal</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">1</td>
<td align="left">CHRNA6</td>
<td align="left">Up</td>
<td align="left">Stage I</td>
<td align="left">hsa-miR-452-3p</td>
<td align="left">Yes</td>
<td align="left">No</td>
</tr>
<tr>
<td rowspan="2" align="left">2</td>
<td rowspan="2" align="left">
<bold>MMP10</bold>
</td>
<td rowspan="2" align="left">
<bold>Up</bold>
</td>
<td rowspan="2" align="left">
<bold>Stage I</bold>
</td>
<td align="left">
<bold>hsa-miR-182-5p</bold>
</td>
<td align="left">
<bold>Yes</bold>
</td>
<td align="left">
<bold>Yes</bold>
</td>
</tr>
<tr>
<td align="left">hsa-miR-210-3p</td>
<td align="left">Yes</td>
<td align="left">No</td>
</tr>
<tr>
<td rowspan="5" align="left">3</td>
<td rowspan="5" align="left">
<bold>
<italic>DEPDC1</italic>
</bold>
</td>
<td rowspan="5" align="left">
<bold>
<italic>Up</italic>
</bold>
</td>
<td rowspan="5" align="left">
<bold>
<italic>Stage II</italic>
</bold>
</td>
<td align="left">
<bold>
<italic>hsa-miR-200b-3p</italic>
</bold>
</td>
<td align="left">Yes</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">
<bold>
<italic>hsa-miR-210-3p</italic>
</bold>
</td>
<td align="left">Yes</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">
<bold>
<italic>hsa-miR-10b-5p</italic>
</bold>
</td>
<td align="left">Yes</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">
<bold>
<italic>hsa-miR-200a-5p</italic>
</bold>
</td>
<td align="left">Yes</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">
<italic>hsa-miR-96-5p</italic>
</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td rowspan="3" align="left">4</td>
<td rowspan="3" align="left">
<bold>CDH19</bold>
</td>
<td rowspan="3" align="left">
<bold>Down</bold>
</td>
<td rowspan="3" align="left">
<bold>Stage III</bold>
</td>
<td align="left">
<bold>hsa-miR-10b-5p</bold>
</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">
<bold>hsa-miR-182-5p</bold>
</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-335-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td rowspan="3" align="left">5</td>
<td rowspan="3" align="left">GDF5</td>
<td rowspan="3" align="left">Down</td>
<td rowspan="3" align="left">Stage III</td>
<td align="left">hsa-miR-21-5p</td>
<td align="left">Yes</td>
<td align="left">No</td>
</tr>
<tr>
<td align="left">hsa-miR-335-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-182-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td rowspan="2" align="left">6</td>
<td rowspan="2" align="left">
<bold>FOXA1</bold>
</td>
<td rowspan="2" align="left">
<bold>Up</bold>
</td>
<td rowspan="2" align="left">
<bold>Stage III</bold>
</td>
<td align="left">
<bold>hsa-miR-200a-3p</bold>
</td>
<td align="left">Yes</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">hsa-miR-141-3p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">7</td>
<td align="left">
<bold>DEGS2</bold>
</td>
<td align="left">
<bold>Up</bold>
</td>
<td align="left">
<bold>Stage III</bold>
</td>
<td align="left">
<bold>hsa-miR-200b-3p</bold>
</td>
<td align="left">Yes</td>
<td align="left">Yes</td>
</tr>
<tr>
<td rowspan="2" align="left">8</td>
<td rowspan="2" align="left">
<bold>CST2</bold>
</td>
<td rowspan="2" align="left">
<bold>Up</bold>
</td>
<td rowspan="2" align="left">
<bold>Stage III</bold>
</td>
<td align="left">
<bold>hsa-miR-210-3p</bold>
</td>
<td align="left">Yes</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">hsa-miR-335-5p</td>
<td align="left">Yes</td>
<td align="left">No</td>
</tr>
<tr>
<td align="left">9</td>
<td align="left">
<bold>AKR7A3</bold>
</td>
<td align="left">
<bold>Up</bold>
</td>
<td align="left">
<bold>Stage III</bold>
</td>
<td align="left">
<bold>hsa-miR-210-3p</bold>
</td>
<td align="left">Yes</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">10</td>
<td align="left">CXCL5</td>
<td align="left">Down</td>
<td align="left">Stage III</td>
<td align="left">hsa-miR-10b-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td rowspan="9" align="left">11</td>
<td rowspan="9" align="left">
<bold>EGR1</bold>
</td>
<td rowspan="9" align="left">
<bold>Down</bold>
</td>
<td rowspan="9" align="left">
<bold>Stage IV</bold>
</td>
<td align="left">
<bold>hsa-miR-21-5p</bold>
</td>
<td align="left">Yes</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">
<bold>hsa-miR-183-5p</bold>
</td>
<td align="left">Yes</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">hsa-miR-204-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-133a-3p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-452-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-224-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-10b-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-210-3p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-182-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td rowspan="4" align="left">12</td>
<td rowspan="4" align="left">
<bold>EGR3</bold>
</td>
<td rowspan="4" align="left">
<bold>Down</bold>
</td>
<td rowspan="4" align="left">
<bold>Stage IV</bold>
</td>
<td align="left">
<bold>hsa-miR-183-5p</bold>
</td>
<td align="left">Yes</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">hsa-miR-335-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-10b-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-182-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td rowspan="4" align="left">13</td>
<td rowspan="4" align="left">
<bold>
<italic>FOSB</italic>
</bold>
</td>
<td rowspan="4" align="left">
<bold>
<italic>Down</italic>
</bold>
</td>
<td rowspan="4" align="left">
<bold>
<italic>Stage IV</italic>
</bold>
</td>
<td align="left">
<bold>
<italic>hsa-miR-183-5p</italic>
</bold>
</td>
<td align="left">Yes</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">
<italic>hsa-miR-224-3p</italic>
</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">
<italic>hsa-miR-224-5p</italic>
</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">
<italic>hsa-miR-200b-3p</italic>
</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td rowspan="2" align="left">14</td>
<td rowspan="2" align="left">KLK7</td>
<td rowspan="2" align="left">Down</td>
<td rowspan="2" align="left">Stage IV</td>
<td align="left">hsa-miR-335-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-182-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td rowspan="3" align="left">15</td>
<td rowspan="3" align="left">
<italic>DUSP1</italic>
</td>
<td rowspan="3" align="left">
<italic>Down</italic>
</td>
<td rowspan="3" align="left">
<italic>Stage IV</italic>
</td>
<td align="left">
<italic>hsa-miR-10b-5p</italic>
</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">
<italic>hsa-miR-200b-3p</italic>
</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-200b-3p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td rowspan="6" align="left">17</td>
<td rowspan="6" align="left">
<bold>FOS</bold>
</td>
<td rowspan="6" align="left">
<bold>Down</bold>
</td>
<td rowspan="6" align="left">
<bold>Stage IV</bold>
</td>
<td align="left">
<bold>hsa-miR-196a-5p</bold>
</td>
<td align="left">Yes</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">
<bold>hsa-miR-183-5p</bold>
</td>
<td align="left">Yes</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">hsa-miR-335-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-10b-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-139-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-182-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">18</td>
<td align="left">KCNA1</td>
<td align="left">Down</td>
<td align="left">Stage IV</td>
<td align="left">hsa-miR-210-3p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td rowspan="7" align="left">19</td>
<td rowspan="7" align="left">
<bold>FGF2</bold>
</td>
<td rowspan="7" align="left">
<bold>Down</bold>
</td>
<td rowspan="7" align="left">
<bold>Stage IV</bold>
</td>
<td align="left">
<bold>hsa-miR-196a-5p</bold>
</td>
<td align="left">Yes</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">
<bold>hsa-miR-96-5p</bold>
</td>
<td align="left">Yes</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">hsa-miR-145-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-133a-3p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-10b-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-210-3p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">hsa-miR-182-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">20</td>
<td align="left">
<bold>HCN2</bold>
</td>
<td align="left">
<bold>Up</bold>
</td>
<td align="left">
<bold>Stage IV</bold>
</td>
<td align="left">
<bold>hsa-miR-133a-3p</bold>
</td>
<td align="left">Yes</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">21</td>
<td align="left">KIT</td>
<td align="left">Down</td>
<td align="left">Stage IV</td>
<td align="left">hsa-miR-335-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">22</td>
<td align="left">
<italic>FREM1</italic>
</td>
<td align="left">
<italic>Down</italic>
</td>
<td align="left">
<italic>Stage IV</italic>
</td>
<td align="left">
<italic>hsa-miR-335-5p</italic>
</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">23</td>
<td align="left">HFM1</td>
<td align="left">Down</td>
<td align="left">Stage IV</td>
<td align="left">hsa-miR-335-5p</td>
<td align="left">No</td>
<td align="left">&#x2014;</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate gene-miRNA combinations with double concordance, in the direction of expression as well as temporal dimension.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3-6">
<label>3.6</label>
<title>Validation with methylation analysis</title>
<p>Aberrant methylation in the core/proximal promoter regions as well as enhancers could have profound regulatory effects on gene expression. We obtained a total of 22 stage-salient DMGs from the consensus of Averep, Mvalue, and MethylMix procedures: 1 stage-I salient DMG (VOPP1), 8 stage-II salient DMGs (HS3ST3B1, CPLX1, EGR1, GMDS, ITPKB, TGFB1I1, C6orf145, SHC1), 10 stage-III salient DMGs (BTLA, TNFAIP2, PHYHIPL, LYN, MAML2, C16orf62, GPRC5B, CAPN9, AIPL1, AGAP1), and 4 stage-IV salient DMGs (CNP, TSPYL5, SLC7A5, HCN2). Salient methylation of a gene is an epigenetic mechanism to tune gene expression and would precede changes in its expression. In this respect, the stage-II salient methylation of EGR1 possibly set the stage for its stage-IV salience (minimization) in expression. It is observed that the stage-IV salient hypermethylation of HCN2 was at odds with its stage-IV salient overexpression.</p>
<p>Mining the methylation patterns of all stage-salient genes for differential methylation-driven genes revealed five transcriptionally predictive genes negatively correlated with gene expression, namely AKR7A3, COX7A1, DEGS2, EGR1, and FOXA1 (<xref ref-type="fig" rid="F6">Figure 6</xref>). Four of these genes exhibited two-component mixtures of methylation distribution, indicating a probable shift in methylation levels in cancer samples relative to healthy ones. COX7A1 showed three-component mixtures of methylation distribution, indicating a reliance on methylation to achieve regulatory fine-tuning. <xref ref-type="table" rid="T9">Table 9</xref> summarizes the methylation patterns for these five genes, showing the correlation size with expression and if the correlation is concordant as well. In the epigenetic context, the methylation pattern of a gene could be deemed concordant with its expression if maximal methylation is observed <italic>ahead of</italic> minimal mRNA expression. FOXA1 mRNA expression is at odds with both its epigenetic profiles (methylation and miRNA), suggesting that epigenetic modulation was being used to restore FOXA1 aberrant expression. Concordance in methylation is observed for AKR7A3, DEGS2, EGR1, and COX7A1, providing strong support for their stage-salience. The above genes except COX7A1 were also concordantly modulated by stage-salient miRNAs. Such findings lead to a belief in the existence of concert between the different layers of omics, adding &#x2018;definiteness&#x2019; to gene expression on the path to phenotypic states. Further investigations could shed light on the emergent hypotheses in the future. The mixture decomposition of methylation patterns of the remaining stage-salient genes is provided in Supplementary File S14. It could be seen, for example, that the methylation of ABCA10 is positively correlated with its expression, escaping clear interpretation.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Mixture model of methylation densities, and scatter of expression vs methylation for the respective cluster of each stage-salient differential methylation-driven gene. <bold>(a)</bold> FOXA1 <bold>(b)</bold> AKR7A3 <bold>(c)</bold> COX7A1 <bold>(d)</bold> DEGS2 and <bold>(e)</bold> EGR1. Density plots include mixture components in orange, green, and purple, two for each of FOXA1, AKR7A3, DEGS2, and EGR1, and three for COX7A1. Bayesian Information Criterion was used for estimating the number of mixture components. Scatter plots revealed a consistent negative correlation between DNA methylation and gene expression, marked by different colors for mixture components. Visualized using MethylMix.</p>
</caption>
<graphic xlink:href="fbinf-05-1644695-g006.tif">
<alt-text content-type="machine-generated">Nine-panel grid showing DNA methylation and gene expression data for various clusters. Panels (a) to (e) display density plots and scatter plots for genes FOXA1, AKR7A3, COX7A1, DEGS2, and EGR1, respectively. Density plots include mixture components highlighted in orange, green, and purple. Scatter plots reveal negatively correlated trends between DNA methylation and gene expression, marked by different colors for mixture components.</alt-text>
</graphic>
</fig>
<table-wrap id="T9" position="float">
<label>TABLE 9</label>
<caption>
<p>Summary of the stage-salient differential methylation-driven genes. Since the methylation of each gene was assayed at a variable number of CpG probe locations, the methylation patterns at different probes for a given gene were clustered based on Pearson&#x2019;s correlation coefficient cut-off (&#x3e;0.7). Significant clusters were used to obtain values for: effect size of differential methylation across mixture components, significance of the methylation pattern, coefficient of correlation between expression and methylation, and concordance. Sign of the DM effect signifies the type of aberrant methylation (hyper/ hypo) across the mixture components.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Gene of interest</th>
<th colspan="2" align="center">CpG sites</th>
<th colspan="2" align="left">Significant cluster</th>
<th rowspan="2" align="left">DM effect size</th>
<th rowspan="2" align="center">p-value</th>
<th rowspan="2" align="left">Type of DM</th>
<th rowspan="2" align="center">Correlation with expression</th>
<th rowspan="2" align="center">Concordance</th>
</tr>
<tr>
<th align="left">Probes</th>
<th align="left">Clusters</th>
<th align="left">ID</th>
<th align="left">Probes</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">FOXA1</td>
<td align="left">18</td>
<td align="left">10</td>
<td align="left">Cluster2</td>
<td align="left">5</td>
<td align="left">0.373</td>
<td align="left">1.25E-98</td>
<td align="left">Hyper</td>
<td align="left">&#x2212;0.66</td>
<td align="left">No</td>
</tr>
<tr>
<td align="left">AKR7A3</td>
<td align="left">14</td>
<td align="left">6</td>
<td align="left">Cluster4</td>
<td align="left">1</td>
<td align="left">&#x2212;0.321</td>
<td align="left">9.89E-48</td>
<td align="left">Hypo</td>
<td align="left">&#x2212;0.49</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">COX7A1</td>
<td align="left">4</td>
<td align="left">2</td>
<td align="left">Cluster2</td>
<td align="left">3</td>
<td align="left">0.413</td>
<td align="left">3.36E-45</td>
<td align="left">Hyper</td>
<td align="left">&#x2212;0.48</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">DEGS2</td>
<td align="left">15</td>
<td align="left">13</td>
<td align="left">Cluster12</td>
<td align="left">1</td>
<td align="left">&#x2212;0.157</td>
<td align="left">1.56E-25</td>
<td align="left">Hypo</td>
<td align="left">&#x2212;0.36</td>
<td align="left">Yes</td>
</tr>
<tr>
<td align="left">EGR1</td>
<td align="left">13</td>
<td align="left">11</td>
<td align="left">Cluster4</td>
<td align="left">2</td>
<td align="left">0.185</td>
<td align="left">1.21E-23</td>
<td align="left">Hyper</td>
<td align="left">&#x2212;0.35</td>
<td align="left">Yes</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<label>4</label>
<title>Discussion</title>
<p>External validation of the models on out-of-domain cohorts suggested that they may be robust to distribution shifts in expression profiles that characterize demographic changes. In a recent study, we applied dimensionality reduction and unsupervised learning to the space of nine expression features (viz. NEK2, PKMYT1, MMP11, CPA1, COL10A1, HSD17B13, CA4, MYOC, LYVE1) and addressed the &#x2018;cancer&#x2019; vs. &#x2018;normal&#x2019; binary classification, producing BrcaDx (<ext-link ext-link-type="uri" xlink:href="https://apalania.shinyapps.io/BrcaDx">https://apalania.shinyapps.io/BrcaDx</ext-link>) (<xref ref-type="bibr" rid="B74">Muthamilselvan and Palaniappan, 2023</xref>) with a balanced accuracy of 95.52% on the BRCA-KR and GTEx. Here we have used a supervised learning approach to the same problem (<xref ref-type="fig" rid="F2">Figure 2</xref>), and derived ten features, including ABCA10, GPAM, FREM1, and the first seven features noted in the prior BrcaDx model. This has yielded a balanced accuracy of 97.42% on the same external datasets, constituting a significant improvement. Beyond the performance improvement, it is noted that BrcaDx suffers from the relative opaqueness of surrogate biomarker spaces (viz. principal components) in its implementation, which tend to obscure interpretation. Other recent advances for discriminating breast cancer from normal samples include a supervised learning model of 20 biomarkers, which was validated on only an internal test set with a balanced accuracy that does not exceed 86% (<xref ref-type="bibr" rid="B97">Taghizadeh et al., 2022</xref>). BC-Predict and BrcaDx are both reproducible and interestingly share no common biomarkers with these earlier models.</p>
<sec id="s4-1">
<label>4.1</label>
<title>Literature discussion</title>
<p>We searched PubMed (<ext-link ext-link-type="uri" xlink:href="http://www.pubmed.gov">www.pubmed.gov</ext-link>) using the keyword: &#x201c;breast cancer&#x201d; AND &#x201c;stage specific&#x201d; AND &#x201c;gene&#x201d;, and found a handful of known stage-specific genes. TIEG (or KLF10) is an anti-metastasis/tumor-suppressor gene, which inhibits invasive breast cancer by blocking EGFR transcription in the EGFR signalling pathway (<xref ref-type="bibr" rid="B105">W et al., 2012</xref>). Stage-specific expression of KLF10 in breast cancer biopsies has been published, with sustained downregulation leading to complete absence of expression in invasive subtypes (<xref ref-type="bibr" rid="B93">Subramaniam et al., 1998</xref>). Here KLF10 expression is found to be decreasing with stage relative to normal samples. &#x3b3;-Synuclein (SNCG) expression is strongly correlated with the stages of breast cancer, showing little expression in normal or benign samples and increasing expression with cancer stage, and detectable only in a subset of patients (<xref ref-type="bibr" rid="B110">Wu et al., 2003</xref>). Here we find increasing expression of SNCG in late-stage cancers, but downregulated with respect to expression in normal samples, which is a contrarian finding.</p>
<sec id="s4-1-1">
<label>4.1.1</label>
<title>Top genes from linear models</title>
<p>Players in cell cycle regulation featured among the top genes of the linear model, namely NEK2, PKMYT1, DEPDC1, KIF4A and CA4. Aberrations in cell cycle regulation facilitate sustained proliferative signalling and evasion of the growth suppressor, which are complementary hallmarks of cancers (<xref ref-type="bibr" rid="B40">Hanahan, 2022</xref>). The top 200 linear model genes were screened against the known cancer driver genes in Cancer Gene Census, yielding four hits: BUB1B, EBF1, PPARG, and RECQL4. RECQL4 is a key DNA helicase, with a vital role in the maintenance of genomic stability (<xref ref-type="bibr" rid="B21">Croteau et al., 2012</xref>). It has been found to be mutated and often upregulated in breast cancer (<xref ref-type="bibr" rid="B62">Luong et al., 2022</xref>), and its tumor-promoting activity has been observed in sporadic breast cancers with aggressive tumor behavior (<xref ref-type="bibr" rid="B5">Arora et al., 2016</xref>). Searching the top 200 MEGs against the Cancer Gene Census yielded two other hits: EGFR and QKI. EGFR is the first antitumor target to be identified, and known to be overexpressed in most of the TNBC and inflammatory breast cancers (<xref ref-type="bibr" rid="B68">Masuda et al., 2012</xref>), but associated with paradoxical function in metastatic cancer progression (<xref ref-type="bibr" rid="B2">Ali and Wendt, 2017</xref>). Significant downregulation of QKI has been noted in breast cancer relative to normal tissues, along with poor prognosis, which suggest its tumor-suppressor role (<xref ref-type="bibr" rid="B15">Cao et al., 2021</xref>). Expression of SLUG and QKI was correlated with epithelial to mesenchymal transition (EMT), and showed promise for use in breast cancer prognosis (<xref ref-type="bibr" rid="B37">Gu et al., 2019</xref>). 
Intersection of the top 200 linear model genes with the top 200 MEGs yielded 18 genes (including RECQL4), whereas intersection with the top 200 of the second linear model yielded 32 genes. We found 17 genes in common to all three sets, including FAM13A, GABRD, and SORBS1. Supplementary File S15 presents the complete results. FAM13A is a hypoxia-induced gene in non-small cell lung cancer, increasing susceptibility to BC in a population-based cohort (<xref ref-type="bibr" rid="B107">Wei et al., 2019</xref>). Genes coexpressed with GABRD in colon cancer showed an enrichment for breast cancer and HPV infection pathway (<xref ref-type="bibr" rid="B60">Liu and Fang, 2021</xref>), hinting at a possible regulatory role for the monotonic expression of GABRD. Downregulation of SORBS1 in cancer samples was associated with increased metastasis and poor survival outcomes (<xref ref-type="bibr" rid="B92">Song et al., 2017</xref>). Stage-wise distribution of expression of representative consensus genes is presented in Supplementary File S16.</p>
<p>The 34 stage-salient candidate biomarkers identified here were cross-referenced with the Human Protein Atlas (<xref ref-type="bibr" rid="B100">Uhlen et al., 2017</xref>). We found 11 genes (2 stage-III salient genes and 9 stage-IV salient genes) annotated as &#x2018;cancer related genes&#x2019;, of which two stage-IV salient markers, namely EGR3 and KRT15, were specifically noted as prognostic markers of breast cancer (Supplementary File S17).</p>
</sec>
<sec id="s4-1-2">
<label>4.1.2</label>
<title>Early-stage salient genes</title>
<p>Supplementary File S18 shows the expression distribution of early-stage salient genes in all the TCGA samples grouped by stage. Notice the curved trend in expression signifying salience of expression in an intermediate stage of cancer progression, not the terminal stage. Nicotine in tobacco exerts its action through nicotinic acetylcholine receptors, which initiate cell proliferation (<xref ref-type="bibr" rid="B90">Singh et al., 2011</xref>), according with the identification of CHRNA6 (neuronal nicotinic acetylcholine receptor) as stage-I salient here. The downregulation of CHRNA6 with cancer progression is supported by studies on nicotinic expression in non-small cell lung cancer progression, where expression of CHRNA6 was found higher in non-smokers than smokers (<xref ref-type="bibr" rid="B54">Lam et al., 2007</xref>). MMP10 is a member of the peptidase M10 family of matrix metalloproteinases, and could set the stage for cancer progression by facilitating tumor cell dissociation, augmenting migration/invasion capability, promoting endothelial cell tube formation, and inducing the expression of key angiogenic and metastatic factors (<xref ref-type="bibr" rid="B113">Zhang et al., 2014</xref>). Recently, Piskor et al. proposed that MMP10 in combination with MMP3 and CA-15 could be used as a biomarker panel for early-stage BC through a non-invasive approach (<xref ref-type="bibr" rid="B78">Pisk&#xf3;r et al., 2020</xref>). Both these results accord with maximum expression of MMP10 in the early stages of cancer, reaffirming the effectiveness of our study design in identifying stage-salient markers. DEPDC1 is a novel cell cycle gene regulating apoptosis (<xref ref-type="bibr" rid="B71">Mi et al., 2015</xref>), whose over-expression signifies cancer progression in BC and its subtypes (<xref ref-type="bibr" rid="B115">Zhao et al., 2019</xref>; <xref ref-type="bibr" rid="B52">L et al., 2019</xref>). 
Here we have pinpointed the stage-II salience of DEPDC1 over-expression. COX7A1 is involved in mitochondrial metabolism and was identified as a tumor suppressor in invasive breast carcinoma, due to aberrant promoter hypermethylation (<xref ref-type="bibr" rid="B41">He et al., 2019</xref>). The stage-II salience of COX7A1 obtained in our studies supports its further exploration as a new biomarker and therapeutic target.</p>
</sec>
<sec id="s4-1-3">
<label>4.1.3</label>
<title>Stage-III salient genes</title>
<p>Supplementary File S18 includes the expression distribution of stage-III salient genes in all the TCGA samples grouped by stage. It is known that KCNK15 is overexpressed in BC (<xref ref-type="bibr" rid="B86">S et al., 2013</xref>), specifically in Luminal A subtype, but downregulated in TNBC subtype (<xref ref-type="bibr" rid="B25">Dookeran et al., 2017</xref>). MFSD4 (major facilitator superfamily domain containing 4) has been identified as a tumor suppressor of cell motility and invasiveness (by influencing promoter methylation) and a biomarker of hepatic metastasis in gastric cancer (<xref ref-type="bibr" rid="B47">Kanda et al., 2016</xref>), correctly identified here as downregulated. CDH19 encodes a cell-cell adhesion receptor cadherin, essential to maintenance of intercellular connections, whose loss of function was observed in BC samples (<xref ref-type="bibr" rid="B99">Tervasm&#xe4;ki et al., 2014</xref>). Aligning with this result, CDH19 is seen here to be downregulated. CXCL5, a chemokine, was found to regulate bone colonization in metastatic BC via its functional target CXCR2 (<xref ref-type="bibr" rid="B80">R et al., 2019</xref>), and its downregulation here might need further review. Oncogenic expression of AKR7A3 in the late stages of BC is detrimental to the period of disease-free survival, and it is interesting to note its stage-III salient upregulation here (<xref ref-type="bibr" rid="B101">V et al., 2014</xref>). DEGS2 (delta (4)-desaturase sphingolipid 2) exhibits oncogenic expression in response to increased levels of ceramide in BC (<xref ref-type="bibr" rid="B63">Makoukji et al., 2015</xref>), which resonates with the findings here. Growth differentiation factor-5 (GDF5) regulates TGF&#x3b2;-mediated pro-angiogenic signaling (<xref ref-type="bibr" rid="B66">Margheri et al., 2012</xref>), and its significant downregulation in the late stages here might set the stage for metastatic cancer. 
Oncogenic expression of FOXA1 (Forkhead box A1) enables widespread epigenetic reprogramming in ER metastatic BC (<xref ref-type="bibr" rid="B33">Fu et al., 2019</xref>), concordant with its overexpression here. Oncogenic expression of CST2 has been documented to promote bone metastasis in breast cancer (<xref ref-type="bibr" rid="B13">Blanco et al., 2012</xref>), borne out by its upregulated stage-III salience here.</p>
</sec>
<sec id="s4-1-4">
<label>4.1.4</label>
<title>Stage-IV salient genes</title>
<p>Supplementary File S18 includes the expression distribution of stage-IV salient genes in all the TCGA samples grouped by stage. A monotonic trend of downregulation culminating in a stage-IV extremum is discernible. Suzuki et al. examined the role of EGR3 in BC and concluded that its overexpression in concert with the expression of other genes is necessary to establish invasive and metastatic BC (<xref ref-type="bibr" rid="B96">Suzuki et al., 2007</xref>), which is in contradiction to the consistent downregulation seen here. FOS and FOSB showed near-monotonic downregulation in mean expression here, which might require further examination in the context of BC subtypes (<xref ref-type="bibr" rid="B61">Lu et al., 2005</xref>; <xref ref-type="bibr" rid="B6">Bamberger et al., 1999</xref>). DUSP1 (dual specificity phosphatase 1 or MAPK phosphatase 1) is a tumor-suppressor in the MAPK pathway that mediates the dephosphorylation of ERK1/2 (<xref ref-type="bibr" rid="B19">Chen et al., 2011</xref>), and its downregulation seen here is likely to underpin sustained proliferative signalling. FREM1 has been identified as a tumor-suppressor, whose downregulation enabled metabolic shift and tumor infiltration (<xref ref-type="bibr" rid="B58">Li et al., 2020</xref>), a finding underlined by the monotonic downregulation seen here. HFM1, helicase for meiosis 1, was reported to be altered in tumors relative to control samples (<xref ref-type="bibr" rid="B98">Taylor et al., 2008</xref>), and seen to be a tumor-suppressor here. ABCA10 is a member of the active transmembrane transport family, and was recently implicated in the progression-free survival of epithelial ovarian sarcoma (<xref ref-type="bibr" rid="B88">Seborova et al., 2019</xref>), and appears to portray a tumor-suppressor role in the context of our findings. 
KLK5, a serine protease, is a known tumor-suppressor whose activation is a promising anticancer therapy via repression of the mevalonate pathway (<xref ref-type="bibr" rid="B77">Pampalakis et al., 2014</xref>). The downregulation of KCNA1 (a voltage-gated potassium channel subfamily member) has been correlated with breast cancer aggressiveness (<xref ref-type="bibr" rid="B53">Lallet-Daher et al., 2013</xref>), lending support to its stage-IV salience in our analysis. KRT15 is known as cytokeratin and has recently been shown to be closely associated with tumorigenesis. Overexpression of KRT15 (cytokeratin) was seen in colorectal and squamous cell skin cancers, but its low expression in BC (as seen here) has been significantly associated with poor prognosis (<xref ref-type="bibr" rid="B117">Zhong et al., 2021</xref>). The remaining stage-IV salient genes were found to be involved in tumor progression via processes such as inflammation, angiogenesis, and EMT.</p>
</sec>
</sec>
<sec id="s4-2">
<label>4.2</label>
<title>Improving histological subtyping</title>
<p>The distinction between IDC and ILC has previously frustrated learning algorithms. An XGBoost model with 147 clinical, histopathological, mammogram, and sonographic features has been reported with an internal testset accuracy of 0.84 on the binary classification problem (<xref ref-type="bibr" rid="B104">Vy et al., 2022</xref>). An AutoML deep-learning approach for identifying IDC samples alone from whole slide images yielded 0.85 accuracy on an independent dataset (<xref ref-type="bibr" rid="B112">Zeng and Zhang, 2020</xref>). Another study for classifying IDCs as early-stage vs. late-stage yielded an AUROC of 0.47 on the external validation (<xref ref-type="bibr" rid="B84">Roy et al., 2020</xref>). In this context, the external validation of our model yields a significant improvement on the state-of-the-art. However, the limited sensitivity to ILC samples (conversely, specificity to IDC samples) in the external dataset presents an outstanding challenge in the histological classification of breast cancer from molecular information. Some noteworthy features from this model include: (i) CDH1 (E-cadherin), whose germline mutations were strongly associated with lobular carcinoma (<xref ref-type="bibr" rid="B20">Corso et al., 2018</xref>), was found to have a specific downregulated expression signature in ILC samples; (ii) CCL14, which is known to promote angiogenesis and metastasis in breast cancer (<xref ref-type="bibr" rid="B57">Li et al., 2011</xref>), was found oncogenic in expression across both histological subtypes. Further improvements to histological subtyping models could come from:<list list-type="roman-lower">
<list-item>
<p>stacking the classifiers: the ensemble of XGBoost and neural network used herein showed that the classifiers disagree on many instances preventing a consensus classification. In such cases, improvements to the performance tradeoff could be achieved by &#x2018;weighting&#x2019; the contribution of the two constituent models to the final prediction.</p>
</list-item>
<list-item>
<p>using cross-modal features, including from early integration of multi-omics and spatial dynamics at cellular resolution.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s4-3">
<label>4.3</label>
<title>Commercial gene panels for breast cancer</title>
<p>Available genomic assays (commercial or otherwise) for prognosticating breast-cancer adjuvant chemotherapy include the following gene-signature panels:<list list-type="order">
<list-item>
<p>Prosigna (50 genes from PAM50 for intrinsic subtype classification, 8 housekeeping genes used for signal normalisation, 6 positive controls, and 8 negative controls)</p>
</list-item>
<list-item>
<p>OncotypeDX (16 cancer-related &#x2b; 5 reference gene panel),</p>
</list-item>
<list-item>
<p>EndoPredict (3 proliferation-associated genes, 5 hormone receptor-associated genes, 3 reference genes),</p>
</list-item>
<list-item>
<p>MammaPrint (70 cancer-related genes; prognostic only) (<xref ref-type="bibr" rid="B103">van de Vijver et al., 2002</xref>),</p>
</list-item>
<list-item>
<p>Breast Cancer Index (exploring benefit of extension of adjuvant hormonal therapy beyond 5 years based on an 11-gene signature),</p>
</list-item>
<list-item>
<p>HER2DX (exploring benefit of neoadjuvant systemic therapy in HER2&#x2b; BC based on a 4-gene signature) (<xref ref-type="bibr" rid="B79">Prat et al., 2022</xref>), and</p>
</list-item>
<list-item>
<p>Guardant360 (<xref ref-type="bibr" rid="B38">Guardant, 2020</xref>) and Foundation One Test (<xref ref-type="bibr" rid="B30">Foundation Medicine, 2020</xref>) (using liquid biopsies of circulating cell-free tumor DNA to profile 70&#x2b; biomarkers at progression).</p>
</list-item>
</list>
</p>
<p>Scanning the signatures in these genomic assays against the ten features used in our &#x2018;normal&#x2019; vs. &#x2018;cancer&#x2019; model yielded: two genes in common with Prosigna (FOXA1, MMP11), one gene with OncotypeDX (MMP11), one gene with HER2DX (NEK2), and one gene with Breast Cancer Index (NEK2). Scanning these signatures against the 16 features used in our molecular subtyping model yielded: four genes with Prosigna (ERBB2, FOXA1, GRB7, MLPH), four genes with HER2DX (ERBB2, GRB7, STARD3, AGR3), two with OncotypeDX (GRB7 and ERBB2), and two with Guardant360 (ERBB2, GATA3). Scanning these signatures against the 24 features used for histological subtyping yielded: one gene with Guardant360 (CDH1). Scanning these signatures against the five features used in the non-metastatic vs. metastatic model did not identify anything in common. These results indicate that the models developed in this work are novel and deserving of clinical validation. A summary of the existing gene-signature diagnostic tests (with their indications and outcomes) together with a comprehensive comparative study is provided in Supplementary File S19.</p>
</sec>
<sec id="s4-4">
<label>4.4</label>
<title>BC-predict</title>
<p>To transition the results obtained from our studies, we developed BC-Predict which serves the models developed in a cascade inference engine and provides a comprehensive characterization of the given sample (<xref ref-type="fig" rid="F2">Figure 2</xref>). The BC-Predict web-server is built on Rshiny (<xref ref-type="bibr" rid="B11">Beeley, 2016</xref>) and deployed for academic research at <ext-link ext-link-type="uri" xlink:href="https://apalania.shinyapps.io/bc-predict/">https://apalania.shinyapps.io/BC-Predict</ext-link>. All predictions are accompanied by prediction probabilities to provide confidence for the predicted class. Documentation and a video tutorial for the use of BC-Predict are also provided. BC-Predict generates a unified readout that could nominally support medical decision-making contingent on clinical validation and further refinement. An alternative modeling process that used a nested stratification structure instead of sequential stratification was also investigated but did not yield an improvement. Though the cancer vs. normal model improves on the benchmark, iterative refinement and better datasets could yield further performance improvements for all models. Below we present a systematic enumeration of the limitations of our models and suggested coping strategies:<list list-type="order">
<list-item>
<p>The metastatic model does not distinguish among the stages in pre-metastatic cancer. A refinement may be necessary to discriminate between the early-stage cancers (stages I and II) and stage-III cancers among the pre-metastatic cancers.</p>
</list-item>
<list-item>
<p>The molecular subtype model lumps &#x2018;Luminal A&#x2019; and &#x2018;Luminal B&#x2019; into the &#x2018;Luminal&#x2019; class. Both luminal A and B are HER2- and ER&#x2b;; however, the A subtype is PR&#x2b; and the most common molecular subtype comprising 50%&#x2013;60% of breast cancers whereas the B subtype accounts for 15%&#x2013;20%, mostly PR- and with low levels of Ki-67. Thus Luminal B has distinctly better prognosis than Luminal A. Increased data size and quality could afford production of better models that differentiate between these subtypes.</p>
</list-item>
<list-item>
<p>The ILC histological subtype tends to be radiologically and clinically hard to detect, manifesting more as thickening with occult mammogram rather than mass, hence research is urgent to improve the detection of this class, as discussed above.</p>
</list-item>
<list-item>
<p>The identified gene-signature panels could be enhanced with the inclusion of reference gene normalization, for more robust predictions.</p>
</list-item>
<list-item>
<p>In addition, all models would need to be fine-tuned for distribution shifts possible in different populations, though the identity of the biomarkers is likely invariant. Initiatives akin to the Indian Cancer Genome Association (<xref ref-type="bibr" rid="B24">Dixit and Sadanandam, 2021</xref>) could facilitate model monitoring and adaptation.</p>
</list-item>
</list>
</p>
<p>Gene-signature methods remain the clinical standard for both their effectiveness and utility, and works such as ours are a step forward in resolving difficult challenges. Such diagnostic models need to be clinically validated and approved for use by national regulatory bodies such as the FDA (Food and Drug Administration, USA), MHRA (Medicines and Healthcare products Regulatory Agency, UK), EU MDR (European Union Medical Device Regulation), NMPA (National Medical Products Administration, China) and CDSCO (Central Drugs Standard Control Organization, India). Models are complicated by cohort selection bias; e.g., breast cancer in Black population presents in younger patients and more difficult to treat forms (aggressive, grade-III, TNBC or HER2&#x2b;) than in Hispanic population, with poorer prognosis. Also, metastatic breast cancer is rarely synchronous (more metachronous) in developed nations as opposed to metastatic cancer on presentation in emerging nations. In addition to these variations, AI-based diagnostic modalities need to contend with the interplay of risk factors that could enable or confound the predictions: pre-menopausal vs. post-menopausal, node-positive or not, complete hormonal profile and NPI score. Clinical validation of BC-Predict would involve the synthesis and use of specific forward and reverse primers for each model feature to perform qRT-PCR on the isolated RNA of resected biopsy sample from a patient. Post-quantification (normalized counts) and log<sub>2</sub> transformation, the inference model may be served to yield a prediction. Prior to such deployment, calibration of qRT-PCR may be necessary and could involve reference genes as used in, say, NOVAprep-miR-Cervix (<xref ref-type="bibr" rid="B48">Kniazeva et al., 2023</xref>).</p>
<p>In summary, we have developed performant <italic>de novo</italic> models to characterise breast cancer heterogeneity agnostic of hypothesis. The candidate stage-salient biomarkers could play a role in the progression of breast cancer, whose varying manifestations underlie differential response to treatment regimens. Developing models from minimal feature spaces has several advantages, chief among them being sensitivity to heterogeneous individual presentation, and generalization to out-of-domain population. One example of this in the present study is the performant external validation of the Molecular Subtype model on the TNBC-only African-enriched multiethnic international cohort (25/26 samples correctly identified). It is noteworthy that TNBC is also the most common molecular subtype in the Indian subcontinent, and has frustrated drug discovery programs with few druggable targets. It may be noted that the use of a mere five features in the metastatic model mitigates the limited datasets available, and offers realistic prospects for useful generalization in clinical diagnostics. Validation analysis with miRNA strongly supported DEPDC1, FOSB and DUSP1 as potential biomarkers for metastasis. More generally, the candidate model features identified here could provide novel hypotheses for chemotherapy and immunotherapy investigations. We would like to acknowledge that the late-integration of multi-omics has not consistently provided conclusive evidence for the features used in the models, yielding possible directions for future investigations. Our study overcomes certain limitations of earlier models, namely reporting of balanced performance metrics, availability for academic research, and inclusion of external validation. The confidence returned by BC-Predict predictions could be used to safeguard against weak and uncertain evidence, addressing the hazard with AI/ML modelling (<xref ref-type="bibr" rid="B111">Yao et al., 2022</xref>). 
The clinical translation of AI/ML models would be a step forward for personalized medicine, necessitating adequate regulation to ensure the benefits of AI for all (<xref ref-type="bibr" rid="B26">El Naqa et al., 2023</xref>; <xref ref-type="bibr" rid="B42">Hickman et al., 2021</xref>). Validation and assurance of model quality could alleviate the risks of distribution drift and cohort selection bias, and pave the way for clinically effective decision support aids in precision oncology centers. The realisation of software-as-medical-devices promises to revolutionize the diagnosis, triage, and treatment of cancers.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s5">
<label>5</label>
<title>Conclusion</title>
<p>Assessment of low-risk genetic factors unmasks induced vulnerabilities, and early-stage characterization of breast cancer heterogeneity constitutes the premise for personalized and targeted precision medicine. In this work, we have developed <italic>de novo</italic> models for addressing key problems in breast cancer heterogeneity based on public-domain expression datasets. Using custom protocols to identify features of interest to each problem, we have trained, optimised and externally validated the models. Our analysis has yielded novel and stage-salient drivers of cancer progression, including two stage-I salient genes (CHRNA6, MMP10), two stage-II salient genes (DEPDC1, COXA1), ten stage-III salient genes (including AKR7A3, FOXA1, CXCL5 and GDF5) and 20 stage-IV salient genes (including FREM1 and HFM1). We have developed solutions to four problems of interest in characterizing breast cancer heterogeneity: (i) &#x2018;cancer&#x2019; vs. &#x2018;normal&#x2019; based on 10 features (2 stage salient genes and 8 top linear model genes) with balanced accuracy &#x223c;97.42% on external validation; (ii) non-metastatic vs. metastatic based on 5 features with balanced accuracy &#x223c;88.22% on external validation; (iii) molecular subtyping (namely Luminal, HER2&#x2b;, and TNBC) based on 16 features with balanced accuracy &#x223c;88.79% on external validation; and (iv) histological subtyping (IDC vs. ILC) based on 24 features with ensemble accuracy &#x223c;94.23% on external validation. We have validated our results in multiple modalities. Based on these outcomes, we have developed an inference engine BC-Predict, which serves the best models developed for each problem, upon an input instance of expression data from a patient sample. BC-Predict is available for academic and non-commercial purposes as an experimental predictive aid for characterization of breast cancer heterogeneity based on minimal expression information, and subject to refinement with new knowledge. 
In conclusion, we have identified various novel candidate biomarkers of heterogeneous breast cancers that have been embedded into one integrated and validated cascade model that could pave the path to expediting personalized differential diagnosis and early-stage cure.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The data presented in the study are deposited in the figshare repository, accession number <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.6084/m9.figshare.25282906.v2">https://doi.org/10.6084/m9.figshare.25282906.v2</ext-link>
</p>
</sec>
<sec sec-type="ethics-statement" id="s7">
<title>Ethics statement</title>
<p>Ethical approval was not required for the study involving humans in accordance with the local legislation and institutional requirements, as only de-identified / anonymous data from public-domain repositories were used in this work. Written informed consent to participate in this study was not required from the participants or the participants&#x2019; legal guardians/next of kin in accordance with the national legislation and the institutional requirements.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>SM: Data curation, Validation, Methodology, Writing &#x2013; original draft, Software, Investigation, Visualization, Formal Analysis. NV: Writing &#x2013; review and editing, Resources, Validation. AP: Funding acquisition, Validation, Writing &#x2013; review and editing, Resources, Project administration, Writing &#x2013; original draft, Supervision, Software, Methodology, Investigation, Visualization, Conceptualization, Formal Analysis.</p>
</sec>
<ack>
<title>Acknowledgements</title>
<p>We would like to thank the Management of SASTRA Deemed University for infrastructure and support. This study makes use of the TCGA dataset (generated by The Cancer Genome Atlas Consortium), METABRIC dataset (generated by the Molecular Taxonomy of Breast Cancer International Consortium), ICGC dataset (generated by International Cancer Genome Consortium), and GEO datasets. Computing in our lab is also supported by a grant from Google TPU Research Cloud (TRC).</p>
</ack>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s11">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1456636/overview">Adam Yongxin Ye</ext-link>, Boston Children&#x2019;s Hospital and Harvard Medical School, United States</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1721951/overview">Fu Gao</ext-link>, Yale University, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2958436/overview">Yao He</ext-link>, Broad Institute, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3110441/overview">Yilin Xie</ext-link>, Stanford University, United States</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Agarwal</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Bell</surname>
<given-names>G. W.</given-names>
</name>
<name>
<surname>Nam</surname>
<given-names>J.-W.</given-names>
</name>
<name>
<surname>Bartel</surname>
<given-names>D. P.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Predicting effective microRNA target sites in mammalian mRNAs</article-title>. <source>eLife</source> <volume>4</volume>, <fpage>e05005</fpage>. <pub-id pub-id-type="doi">10.7554/elife.05005</pub-id>
<pub-id pub-id-type="pmid">26267216</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ali</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Wendt</surname>
<given-names>M. K.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>The paradoxical functions of EGFR during breast cancer progression</article-title>. <source>Signal Transduct. Target. Ther.</source> <volume>2</volume>, <fpage>16042</fpage>. <pub-id pub-id-type="doi">10.1038/sigtrans.2016.42</pub-id>
<pub-id pub-id-type="pmid">28435746</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Allain</surname>
<given-names>D. C.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>Genetic counseling and testing for common Hereditary breast cancer syndromes</article-title>. <source>J. Mol. Diagn. JMD</source> <volume>10</volume>, <fpage>383</fpage>&#x2013;<lpage>395</lpage>. <pub-id pub-id-type="doi">10.2353/jmoldx.2008.070161</pub-id>
<pub-id pub-id-type="pmid">18687797</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Almstedt</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Mendoza</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Otto</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Battista</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Steetskamp</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Heimes</surname>
<given-names>A. S.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>EndoPredict&#xae; in early hormone receptor-positive, HER2-negative breast cancer</article-title>. <source>Breast Cancer Res. Treat.</source> <volume>182</volume>, <fpage>137</fpage>&#x2013;<lpage>146</lpage>. <pub-id pub-id-type="doi">10.1007/s10549-020-05688-1</pub-id>
<pub-id pub-id-type="pmid">32436145</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Arora</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Agarwal</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Abdel&#x2010;Fatah</surname>
<given-names>T. M.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Croteau</surname>
<given-names>D. L.</given-names>
</name>
<name>
<surname>Moseley</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>RECQL4 helicase has oncogenic potential in sporadic breast cancers</article-title>. <source>J. Pathol.</source> <volume>238</volume>, <fpage>495</fpage>&#x2013;<lpage>501</lpage>. <pub-id pub-id-type="doi">10.1002/path.4681</pub-id>
<pub-id pub-id-type="pmid">26690729</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bamberger</surname>
<given-names>A. M.</given-names>
</name>
<name>
<surname>Methner</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Lisboa</surname>
<given-names>B. W.</given-names>
</name>
<name>
<surname>St&#xe4;dtler</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Schulte</surname>
<given-names>H. M.</given-names>
</name>
<name>
<surname>L&#xf6;ning</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>1999</year>). <article-title>Expression pattern of the AP-1 family in breast cancer: association of fosB expression with a well-differentiated, receptor-positive tumor phenotype</article-title>. <source>Int. J. Cancer</source> <volume>84</volume>, <fpage>533</fpage>&#x2013;<lpage>538</lpage>. <pub-id pub-id-type="doi">10.1002/(sici)1097-0215(19991022)84:5&#x3c;533::aid-ijc16&#x3e;3.0.co;2-j</pub-id>
<pub-id pub-id-type="pmid">10502734</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Barrett</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Wilhite</surname>
<given-names>S. E.</given-names>
</name>
<name>
<surname>Ledoux</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Evangelista</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>I. F.</given-names>
</name>
<name>
<surname>Tomashevsky</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>NCBI GEO: archive for functional genomics data sets&#x2014;update</article-title>. <source>Nucleic Acids Res.</source> <volume>41</volume>, <fpage>D991</fpage>&#x2013;<lpage>D995</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gks1193</pub-id>
<pub-id pub-id-type="pmid">23193258</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bartlett</surname>
<given-names>J. M. S.</given-names>
</name>
<name>
<surname>Sgroi</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Treuner</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ahmed</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Piper</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Breast Cancer Index and prediction of benefit from extended endocrine therapy in breast cancer patients treated in the Adjuvant Tamoxifen-To Offer More? (aTTom) trial</article-title>. <source>Ann. Oncol. Off. J. Eur. Soc. Med. Oncol.</source> <volume>30</volume>, <fpage>1776</fpage>&#x2013;<lpage>1783</lpage>. <pub-id pub-id-type="doi">10.1093/annonc/mdz289</pub-id>
<pub-id pub-id-type="pmid">31504126</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Baskota</surname>
<given-names>S. U.</given-names>
</name>
<name>
<surname>Dabbs</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>Clark</surname>
<given-names>B. Z.</given-names>
</name>
<name>
<surname>Bhargava</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Prosigna&#xae; breast cancer assay: histopathologic correlation, development, and assessment of size, nodal status, Ki-67 (SiNK&#x2122;) index for breast cancer prognosis</article-title>. <source>Mod. Pathol. Off. J. U. S. Can. Acad. Pathol. Inc.</source> <volume>34</volume>, <fpage>70</fpage>&#x2013;<lpage>76</lpage>. <pub-id pub-id-type="doi">10.1038/s41379-020-0643-8</pub-id>
<pub-id pub-id-type="pmid">32740650</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bastien</surname>
<given-names>R. R. L.</given-names>
</name>
<name>
<surname>Rodr&#xed;guez-Lescure</surname>
<given-names>&#xc1;.</given-names>
</name>
<name>
<surname>Ebbert</surname>
<given-names>M. T.</given-names>
</name>
<name>
<surname>Prat</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Mun&#xe1;rriz</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Rowe</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2012</year>). <article-title>PAM50 breast cancer subtyping by RT-qPCR and concordance with standard clinical molecular markers</article-title>. <source>BMC Med. Genomics</source> <volume>5</volume>, <fpage>44</fpage>. <pub-id pub-id-type="doi">10.1186/1755-8794-5-44</pub-id>
<pub-id pub-id-type="pmid">23035882</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Beeley</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2016</year>). <source>Web application development with R using Shiny</source>. <publisher-loc>Birmingham, United Kingdom</publisher-loc>: <publisher-name>Packt Publishing Ltd</publisher-name>.</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bhattacharyya</surname>
<given-names>G. S.</given-names>
</name>
<name>
<surname>Doval</surname>
<given-names>D. C.</given-names>
</name>
<name>
<surname>Desai</surname>
<given-names>C. J.</given-names>
</name>
<name>
<surname>Chaturvedi</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Sharma</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Somashekhar</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Overview of breast cancer and Implications of Overtreatment of early-stage breast cancer: an Indian Perspective</article-title>. <source>JCO Glob. Oncol.</source> <volume>6</volume>, <fpage>789</fpage>&#x2013;<lpage>798</lpage>. <pub-id pub-id-type="doi">10.1200/go.20.00033</pub-id>
<pub-id pub-id-type="pmid">32511068</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Blanco</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>LeRoy</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Khan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Ale&#x10d;kovi&#x107;</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zee</surname>
<given-names>B. M.</given-names>
</name>
<name>
<surname>Garcia</surname>
<given-names>B. A.</given-names>
</name>
<etal/>
</person-group> (<year>2012</year>). <article-title>Global secretome analysis identifies novel mediators of bone metastasis</article-title>. <source>Cell Res.</source> <volume>22</volume>, <fpage>1339</fpage>&#x2013;<lpage>1355</lpage>. <pub-id pub-id-type="doi">10.1038/cr.2012.89</pub-id>
<pub-id pub-id-type="pmid">22688892</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brierley</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gospodarowicz</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>O&#x2019;Sullivan</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>The principles of cancer staging</article-title>. <source>Ecancermedicalscience</source> <volume>10</volume>, <fpage>ed61</fpage>. <pub-id pub-id-type="doi">10.3332/ecancer.2016.ed61</pub-id>
<pub-id pub-id-type="pmid">28101141</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>RNA-binding protein QKI suppresses breast cancer via RASA1/MAPK signaling pathway</article-title>. <source>Ann. Transl. Med.</source> <volume>9</volume>, <fpage>104</fpage>. <pub-id pub-id-type="doi">10.21037/atm-20-4859</pub-id>
<pub-id pub-id-type="pmid">33569406</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Cassidy</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bissett</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Spence</surname>
<given-names>R. A. J.</given-names>
</name>
<name>
<surname>Payne</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Morris-Stiff</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2015</year>). <source>Oxford Handbook of Oncology</source>. <publisher-name>Oxford University Press</publisher-name>.</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cedoz</surname>
<given-names>P.-L.</given-names>
</name>
<name>
<surname>Prunello</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Brennan</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Gevaert</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>MethylMix 2.0: an R package for identifying DNA methylation genes</article-title>. <source>Bioinformatics</source> <volume>34</volume>, <fpage>3044</fpage>&#x2013;<lpage>3046</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bty156</pub-id>
<pub-id pub-id-type="pmid">29668835</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chawla</surname>
<given-names>N. V.</given-names>
</name>
<name>
<surname>Bowyer</surname>
<given-names>K. W.</given-names>
</name>
<name>
<surname>Hall</surname>
<given-names>L. O.</given-names>
</name>
<name>
<surname>Kegelmeyer</surname>
<given-names>W. P.</given-names>
</name>
</person-group> (<year>2002</year>). <article-title>SMOTE: synthetic minority over-sampling technique</article-title>. <source>J. Artif. Intell. Res.</source> <volume>16</volume>, <fpage>321</fpage>&#x2013;<lpage>357</lpage>. <pub-id pub-id-type="doi">10.1613/jair.953</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>C.-C.</given-names>
</name>
<name>
<surname>Hardy</surname>
<given-names>D. B.</given-names>
</name>
<name>
<surname>Mendelson</surname>
<given-names>C. R.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Progesterone receptor inhibits proliferation of human breast cancer cells via induction of MAPK phosphatase 1 (MKP-1/DUSP1)</article-title>. <source>J. Biol. Chem.</source> <volume>286</volume>, <fpage>43091</fpage>&#x2013;<lpage>43102</lpage>. <pub-id pub-id-type="doi">10.1074/jbc.m111.295865</pub-id>
<pub-id pub-id-type="pmid">22020934</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Corso</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Veronesi</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Sacchini</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Galimberti</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Prognosis and outcome in CDH1-mutant lobular breast cancer</article-title>. <source>Eur. J. Cancer Prev.</source> <volume>27</volume>, <fpage>237</fpage>&#x2013;<lpage>238</lpage>. <pub-id pub-id-type="doi">10.1097/cej.0000000000000405</pub-id>
<pub-id pub-id-type="pmid">29595757</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Croteau</surname>
<given-names>D. L.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>D. K.</given-names>
</name>
<name>
<surname>Hoh Ferrarelli</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Bohr</surname>
<given-names>V. A.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>RECQL4 in genomic instability and aging</article-title>. <source>Trends Genet.</source> <volume>28</volume>, <fpage>624</fpage>&#x2013;<lpage>631</lpage>. <pub-id pub-id-type="doi">10.1016/j.tig.2012.08.003</pub-id>
<pub-id pub-id-type="pmid">22940096</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Curtis</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Shah</surname>
<given-names>S. P.</given-names>
</name>
<name>
<surname>Chin</surname>
<given-names>S. F.</given-names>
</name>
<name>
<surname>Turashvili</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Rueda</surname>
<given-names>O. M.</given-names>
</name>
<name>
<surname>Dunning</surname>
<given-names>M. J.</given-names>
</name>
<etal/>
</person-group> (<year>2012</year>). <article-title>The genomic and transcriptomic architecture of 2,000 breast tumours reveals novel subgroups</article-title>. <source>Nature</source> <volume>486</volume>, <fpage>346</fpage>&#x2013;<lpage>352</lpage>. <pub-id pub-id-type="doi">10.1038/nature10983</pub-id>
<pub-id pub-id-type="pmid">22522925</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dai</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhan</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>Breast cancer intrinsic subtype classification, clinical use and future trends</article-title>. <source>Am. J. Cancer Res.</source> <volume>5</volume>, <fpage>2929</fpage>&#x2013;<lpage>2943</lpage>.<pub-id pub-id-type="pmid">26693050</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dixit</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sadanandam</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>The 2nd conference and workshop of the Cancer Genome Atlas (TCGA) in India: towards team science for multi-omics cancer research in South Asia</article-title>. <source>Ecancermedicalscience</source> <volume>15</volume>, <fpage>ed111</fpage>. <pub-id pub-id-type="doi">10.3332/ecancer.2021.ed111</pub-id>
<pub-id pub-id-type="pmid">34221123</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dookeran</surname>
<given-names>K. A.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Stayner</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Argos</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Associations of two-pore domain potassium channels and triple negative breast cancer subtype in the Cancer Genome Atlas: systematic evaluation of gene expression and methylation</article-title>. <source>BMC Res. Notes</source> <volume>10</volume>, <fpage>475</fpage>. <pub-id pub-id-type="doi">10.1186/s13104-017-2777-4</pub-id>
<pub-id pub-id-type="pmid">28899398</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>El Naqa</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Karolak</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Folio</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Tarhini</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Rollison</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Translation of AI into oncology clinical practice</article-title>. <source>Oncogene</source> <volume>42</volume>, <fpage>3089</fpage>&#x2013;<lpage>3097</lpage>. <pub-id pub-id-type="doi">10.1038/s41388-023-02826-z</pub-id>
<pub-id pub-id-type="pmid">37684407</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Enright</surname>
<given-names>A. J.</given-names>
</name>
<name>
<surname>John</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Gaul</surname>
<given-names>U.</given-names>
</name>
<name>
<surname>Tuschl</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Sander</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Marks</surname>
<given-names>D. S.</given-names>
</name>
</person-group> (<year>2003</year>). <article-title>MicroRNA targets in Drosophila</article-title>. <source>Genome Biol.</source> <volume>5</volume>, <fpage>R1</fpage>. <pub-id pub-id-type="doi">10.1186/gb-2003-5-1-r1</pub-id>
<pub-id pub-id-type="pmid">14709173</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fitzgibbons</surname>
<given-names>P. L.</given-names>
</name>
<name>
<surname>Page</surname>
<given-names>D. L.</given-names>
</name>
<name>
<surname>Weaver</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Thor</surname>
<given-names>A. D.</given-names>
</name>
<name>
<surname>Allred</surname>
<given-names>D. C.</given-names>
</name>
<name>
<surname>Clark</surname>
<given-names>G. M.</given-names>
</name>
<etal/>
</person-group> (<year>2000</year>). <article-title>Prognostic factors in breast cancer</article-title>. <source>Arch. Pathol. Lab. Med.</source> <volume>124</volume>, <fpage>966</fpage>&#x2013;<lpage>978</lpage>. <pub-id pub-id-type="doi">10.5858/2000-124-0966-pfibc</pub-id>
<pub-id pub-id-type="pmid">10888772</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<collab>Foundation Medicine</collab> (<year>2020</year>). <article-title>FDA approves Foundation Medicine&#x2019;s FoundationOne Liquid CDx, a comprehensive pan-tumor liquid biopsy test with multiple companion diagnostic indications for patients with advanced cancer</article-title>. <source>News release. Found. Med.</source> <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.foundationmedicine.com/press-releases/fda-approves-foundation-medicine's-foundationone%C2%AEliquid-cdx,-a-comprehensive-pan-tumor-liquid-biopsy-test-with-multiple-companion-diagnostic-indications-for-patients-with-advanced-cancer">https://www.foundationmedicine.com/press-releases/fda-approves-foundation-medicine&#x27;s-foundationone%C2%AEliquid-cdx,-a-comprehensive-pan-tumor-liquid-biopsy-test-with-multiple-companion-diagnostic-indications-for-patients-with-advanced-cancer</ext-link> (Accessed February 1, 2024)</comment>.</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Franks</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Cai</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Whitfield</surname>
<given-names>M. L.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Feature specific quantile normalization enables cross-platform classification of molecular subtypes using gene expression data</article-title>. <source>Bioinformatics</source> <volume>34</volume>, <fpage>1868</fpage>&#x2013;<lpage>1874</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bty026</pub-id>
<pub-id pub-id-type="pmid">29360996</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Zuo</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Ring</surname>
<given-names>H. Z.</given-names>
</name>
<name>
<surname>Ring</surname>
<given-names>B. Z.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Molecular classification of lobular carcinoma of the breast</article-title>. <source>Sci. Rep.</source> <volume>7</volume>, <fpage>43265</fpage>. <pub-id pub-id-type="doi">10.1038/srep43265</pub-id>
<pub-id pub-id-type="pmid">28303886</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Pereira</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>De Angelis</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Veeraraghavan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Nanda</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Qin</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>FOXA1 upregulation promotes enhancer and transcriptional reprogramming in endocrine-resistant breast cancer</article-title>. <source>Proc. Natl. Acad. Sci. U. S. A.</source> <volume>116</volume>, <fpage>26823</fpage>&#x2013;<lpage>26834</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1911584116</pub-id>
<pub-id pub-id-type="pmid">31826955</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gao</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Fessler</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>DeepCC: a novel deep learning-based framework for cancer molecular subtype classification</article-title>. <source>Oncogenesis</source> <volume>8</volume>, <fpage>44</fpage>. <pub-id pub-id-type="doi">10.1038/s41389-019-0157-8</pub-id>
<pub-id pub-id-type="pmid">31420533</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Giuliano</surname>
<given-names>A. E.</given-names>
</name>
<name>
<surname>Edge</surname>
<given-names>S. B.</given-names>
</name>
<name>
<surname>Hortobagyi</surname>
<given-names>G. N.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Eighth edition of the AJCC cancer staging manual: breast cancer</article-title>. <source>Ann. Surg. Oncol.</source> <volume>25</volume>, <fpage>1783</fpage>&#x2013;<lpage>1785</lpage>. <pub-id pub-id-type="doi">10.1245/s10434-018-6486-6</pub-id>
<pub-id pub-id-type="pmid">29671136</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal">
<collab>GTEx Consortium</collab>
<person-group person-group-type="author">
<name>
<surname>Thomas</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Salvatore</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Phillips</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Lo</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Shad</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>The genotype-tissue expression (GTEx) project</article-title>. <source>Nat. Genet.</source> <volume>45</volume>, <fpage>580</fpage>&#x2013;<lpage>585</lpage>. <pub-id pub-id-type="doi">10.1038/ng.2653</pub-id>
<pub-id pub-id-type="pmid">23715323</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Prognostic value of epithelial-mesenchymal transition related genes: SLUG and QKI in breast cancer patients</article-title>. <source>Int. J. Clin. Exp. Pathol.</source> <volume>12</volume>, <fpage>2009</fpage>&#x2013;<lpage>2021</lpage>.<pub-id pub-id-type="pmid">31934023</pub-id>
</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="journal">
<collab>Guardant</collab> (<year>2020</year>). <article-title>Guardant Health Guardant360 CDx first FDA-approved liquid biopsy for comprehensive tumor mutation profiling across all solid cancers</article-title>. <source>News release. Guard. Health</source>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://investors.guardanthealth.com/press-releases/press-releases/2020/Guardant-Health-Guardant360-CDx-First-FDA-Approved-Liquid-Biopsy-for-Comprehensive-Tumor-Mutation-Profiling-Across-All-Solid-Cancers/default.aspx">https://investors.guardanthealth.com/press-releases/press-releases/2020/Guardant-Health-Guardant360-CDx-First-FDA-Approved-Liquid-Biopsy-for-Comprehensive-Tumor-Mutation-Profiling-Across-All-Solid-Cancers/default.aspx</ext-link> (Accessed February 1, 2021)</comment>.</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>G&#xfc;ler</surname>
<given-names>E. N.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Gene expression profiling in breast cancer and its effect on therapy selection in early-stage breast cancer</article-title>. <source>Eur. J. Breast Health</source> <volume>13</volume>, <fpage>168</fpage>&#x2013;<lpage>174</lpage>. <pub-id pub-id-type="doi">10.5152/ejbh.2017.3636</pub-id>
<pub-id pub-id-type="pmid">29082373</pub-id>
</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hanahan</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Hallmarks of cancer: new dimensions</article-title>. <source>Cancer Discov.</source> <volume>12</volume>, <fpage>31</fpage>&#x2013;<lpage>46</lpage>. <pub-id pub-id-type="doi">10.1158/2159-8290.cd-21-1059</pub-id>
<pub-id pub-id-type="pmid">35022204</pub-id>
</mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ni</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Comprehensive and integrative analysis identifies COX7A1 as a critical methylation-driven gene in breast invasive carcinoma</article-title>. <source>Ann. Transl. Med.</source> <volume>7</volume>, <fpage>682</fpage>. <pub-id pub-id-type="doi">10.21037/atm.2019.11.97</pub-id>
<pub-id pub-id-type="pmid">31930083</pub-id>
</mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hickman</surname>
<given-names>S. E.</given-names>
</name>
<name>
<surname>Baxter</surname>
<given-names>G. C.</given-names>
</name>
<name>
<surname>Gilbert</surname>
<given-names>F. J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Adoption of artificial intelligence in breast imaging: evaluation, ethical constraints and limitations</article-title>. <source>Br. J. Cancer</source> <volume>125</volume>, <fpage>15</fpage>&#x2013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.1038/s41416-021-01333-w</pub-id>
<pub-id pub-id-type="pmid">33772149</pub-id>
</mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Horr</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Buechler</surname>
<given-names>S. A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Breast Cancer Consensus Subtypes: a system for subtyping breast cancer tumors based on gene expression</article-title>. <source>NPJ Breast Cancer</source> <volume>7</volume>, <fpage>136</fpage>. <pub-id pub-id-type="doi">10.1038/s41523-021-00345-2</pub-id>
<pub-id pub-id-type="pmid">34642313</pub-id>
</mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>H.-Y.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>Y. C. D.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>miRTarBase update 2022: an informative resource for experimentally validated miRNA-target interactions</article-title>. <source>Nucleic Acids Res.</source> <volume>50</volume>, <fpage>D222</fpage>&#x2013;<lpage>D230</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkab1079</pub-id>
<pub-id pub-id-type="pmid">34850920</pub-id>
</mixed-citation>
</ref>
<ref id="B45">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hudson</surname>
<given-names>T. J.</given-names>
</name>
<name>
<surname>Anderson</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Artez</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Barker</surname>
<given-names>A. D.</given-names>
</name>
<name>
<surname>Bell</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Bernab&#x00E9;</surname>
<given-names>R. R.</given-names>
</name>
<etal/>
</person-group> (<year>2010</year>). <article-title>International network of cancer genome projects</article-title>. <source>Nature</source> <volume>464</volume>, <fpage>993</fpage>&#x2013;<lpage>998</lpage>. <pub-id pub-id-type="doi">10.1038/nature08987</pub-id>
<pub-id pub-id-type="pmid">20393554</pub-id>
</mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Johnson</surname>
<given-names>K. S.</given-names>
</name>
<name>
<surname>Conant</surname>
<given-names>E. F.</given-names>
</name>
<name>
<surname>Soo</surname>
<given-names>M. S.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Molecular subtypes of breast cancer: a review for breast radiologists</article-title>. <source>J. Breast Imaging</source> <volume>3</volume>, <fpage>12</fpage>&#x2013;<lpage>24</lpage>. <pub-id pub-id-type="doi">10.1093/jbi/wbaa110</pub-id>
<pub-id pub-id-type="pmid">38424845</pub-id>
</mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kanda</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Shimizu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Tanaka</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Shibata</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Iwata</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Hayashi</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Metastatic pathway-specific transcriptome analysis identifies MFSD4 as a putative tumor suppressor and biomarker for hepatic metastasis in patients with gastric cancer</article-title>. <source>Oncotarget</source> <volume>7</volume>, <fpage>13667</fpage>&#x2013;<lpage>13679</lpage>. <pub-id pub-id-type="doi">10.18632/oncotarget.7269</pub-id>
<pub-id pub-id-type="pmid">26872374</pub-id>
</mixed-citation>
</ref>
<ref id="B48">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kniazeva</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zabegina</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Shalaev</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Smirnova</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Lavrinovich</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Berlev</surname>
<given-names>I.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>NOVAprep-miR-cervix: new method for evaluation of cervical dysplasia severity based on analysis of six miRNAs</article-title>. <source>Int. J. Mol. Sci.</source> <volume>24</volume> (<issue>11</issue>), <fpage>9114</fpage>. <pub-id pub-id-type="doi">10.3390/ijms24119114</pub-id>
<pub-id pub-id-type="pmid">37298066</pub-id>
</mixed-citation>
</ref>
<ref id="B49">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kourou</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Exarchos</surname>
<given-names>T. P.</given-names>
</name>
<name>
<surname>Exarchos</surname>
<given-names>K. P.</given-names>
</name>
<name>
<surname>Karamouzis</surname>
<given-names>M. V.</given-names>
</name>
<name>
<surname>Fotiadis</surname>
<given-names>D. I.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Machine learning applications in cancer prognosis and prediction</article-title>. <source>Comput. Struct. Biotechnol. J.</source> <volume>13</volume>, <fpage>8</fpage>&#x2013;<lpage>17</lpage>. <pub-id pub-id-type="doi">10.1016/j.csbj.2014.11.005</pub-id>
<pub-id pub-id-type="pmid">25750696</pub-id>
</mixed-citation>
</ref>
<ref id="B50">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kuhn</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>Building predictive models in R using the caret package</article-title>. <source>J. Stat. Softw.</source> <volume>28</volume>, <fpage>1</fpage>&#x2013;<lpage>26</lpage>. <pub-id pub-id-type="doi">10.18637/jss.v028.i05</pub-id>
<pub-id pub-id-type="pmid">27774042</pub-id>
</mixed-citation>
</ref>
<ref id="B51">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kursa</surname>
<given-names>M. B.</given-names>
</name>
<name>
<surname>Rudnicki</surname>
<given-names>W. R.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Feature selection with the Boruta package</article-title>. <source>J. Stat. Softw.</source> <volume>36</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.18637/jss.v036.i11</pub-id>
</mixed-citation>
</ref>
<ref id="B52">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Du</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>DEPDC1, negatively regulated by miR-26b, facilitates cell proliferation via the up-regulation of FOXM1 expression in TNBC</article-title>. <source>Cancer Lett.</source> <volume>442</volume>, <fpage>242</fpage>&#x2013;<lpage>251</lpage>. <pub-id pub-id-type="doi">10.1016/j.canlet.2018.11.003</pub-id>
<pub-id pub-id-type="pmid">30419349</pub-id>
</mixed-citation>
</ref>
<ref id="B53">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lallet-Daher</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wiel</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gitenay</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Navaratnam</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Augert</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Le Calv&#xe9;</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>Potassium channel KCNA1 modulates oncogene-induced senescence and transformation</article-title>. <source>Cancer Res.</source> <volume>73</volume>, <fpage>5253</fpage>&#x2013;<lpage>5265</lpage>. <pub-id pub-id-type="doi">10.1158/0008-5472.can-12-3690</pub-id>
<pub-id pub-id-type="pmid">23774215</pub-id>
</mixed-citation>
</ref>
<ref id="B54">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lam</surname>
<given-names>D. C.-L.</given-names>
</name>
<name>
<surname>Girard</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Ramirez</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Chau</surname>
<given-names>W. S.</given-names>
</name>
<name>
<surname>Suen</surname>
<given-names>W. S.</given-names>
</name>
<name>
<surname>Sheridan</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2007</year>). <article-title>Expression of nicotinic acetylcholine receptor subunit genes in non-small-cell lung cancer reveals differences between smokers and nonsmokers</article-title>. <source>Cancer Res.</source> <volume>67</volume>, <fpage>4638</fpage>&#x2013;<lpage>4647</lpage>. <pub-id pub-id-type="doi">10.1158/0008-5472.can-06-4628</pub-id>
<pub-id pub-id-type="pmid">17510389</pub-id>
</mixed-citation>
</ref>
<ref id="B55">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Law</surname>
<given-names>C. W.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Smyth</surname>
<given-names>G. K.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>voom: precision weights unlock linear model analysis tools for RNA-seq read counts</article-title>. <source>Genome Biol.</source> <volume>15</volume>, <fpage>R29</fpage>. <pub-id pub-id-type="doi">10.1186/gb-2014-15-2-r29</pub-id>
<pub-id pub-id-type="pmid">24485249</pub-id>
</mixed-citation>
</ref>
<ref id="B56">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lex</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gehlenborg</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Strobelt</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Vuillemot</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Pfister</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>UpSet: visualization of intersecting sets</article-title>. <source>IEEE Trans. Vis. Comput. Graph.</source> <volume>20</volume>, <fpage>1983</fpage>&#x2013;<lpage>1992</lpage>. <pub-id pub-id-type="doi">10.1109/tvcg.2014.2346248</pub-id>
<pub-id pub-id-type="pmid">26356912</pub-id>
</mixed-citation>
</ref>
<ref id="B57">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Gui</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). <article-title>Binding of the JmjC demethylase JARID1B to LSD1/NuRD suppresses angiogenesis and metastasis in breast cancer cells by repressing chemokine CCL14</article-title>. <source>Cancer Res.</source> <volume>71</volume>, <fpage>6899</fpage>&#x2013;<lpage>6908</lpage>. <pub-id pub-id-type="doi">10.1158/0008-5472.can-11-1523</pub-id>
<pub-id pub-id-type="pmid">21937684</pub-id>
</mixed-citation>
</ref>
<ref id="B58">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Lv</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Cai</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Elevated expression of FREM1 in breast cancer indicates favorable prognosis and high&#x2010;level immune infiltration status</article-title>. <source>Cancer Med.</source> <volume>9</volume>, <fpage>9554</fpage>&#x2013;<lpage>9570</lpage>. <pub-id pub-id-type="doi">10.1002/cam4.3543</pub-id>
<pub-id pub-id-type="pmid">33058542</pub-id>
</mixed-citation>
</ref>
<ref id="B59">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lindor</surname>
<given-names>N. M.</given-names>
</name>
<name>
<surname>McMaster</surname>
<given-names>M. L.</given-names>
</name>
<name>
<surname>Lindor</surname>
<given-names>C. J.</given-names>
</name>
<name>
<surname>Greene</surname>
<given-names>M. H.</given-names>
</name>
</person-group>
<collab>National Cancer Institute, Division of Cancer Prevention, Community Oncology and Prevention Trials Research Group</collab> (<year>2008</year>). <article-title>Concise handbook of familial cancer susceptibility syndromes</article-title>. <source>J. Natl. Cancer Inst. Monogr.</source>, <fpage>1</fpage>&#x2013;<lpage>93</lpage>. <pub-id pub-id-type="doi">10.1093/jncimonographs/lgn001</pub-id>
<pub-id pub-id-type="pmid">18559331</pub-id>
</mixed-citation>
</ref>
<ref id="B60">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Research for expression and prognostic value of GABRD in colon cancer and coexpressed gene network construction based on data mining</article-title>. <source>Comput. Math. Methods Med.</source> <volume>2021</volume>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>. <pub-id pub-id-type="doi">10.1155/2021/5544182</pub-id>
<pub-id pub-id-type="pmid">34194536</pub-id>
</mixed-citation>
</ref>
<ref id="B61">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>DuPr&#xe9;</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Hilsenbeck</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Brown</surname>
<given-names>P. H.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>cFos is critical for MCF-7 breast cancer cell growth</article-title>. <source>Oncogene</source> <volume>24</volume>, <fpage>6516</fpage>&#x2013;<lpage>6524</lpage>. <pub-id pub-id-type="doi">10.1038/sj.onc.1208905</pub-id>
<pub-id pub-id-type="pmid">16027729</pub-id>
</mixed-citation>
</ref>
<ref id="B62">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Luong</surname>
<given-names>T. T.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Priedigkeit</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Parker</surname>
<given-names>P. S.</given-names>
</name>
<name>
<surname>B&#xf6;hm</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Rapchak</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Hrq1/RECQL4 regulation is critical for preventing aberrant recombination during DNA intrastrand crosslink repair and is upregulated in breast cancer</article-title>. <source>PLoS Genet.</source> <volume>18</volume>, <fpage>e1010122</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pgen.1010122</pub-id>
<pub-id pub-id-type="pmid">36126066</pub-id>
</mixed-citation>
</ref>
<ref id="B63">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Makoukji</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Raad</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Genadry</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>El-Sitt</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Makhoul</surname>
<given-names>N. J.</given-names>
</name>
<name>
<surname>Saad Aldin</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>Association between CLN3 (neuronal ceroid lipofuscinosis, CLN3 type) gene expression and clinical characteristics of breast cancer patients</article-title>. <source>Front. Oncol.</source> <volume>5</volume>, <fpage>215</fpage>. <pub-id pub-id-type="doi">10.3389/fonc.2015.00215</pub-id>
<pub-id pub-id-type="pmid">26528430</pub-id>
</mixed-citation>
</ref>
<ref id="B64">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Malvia</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Bagadi</surname>
<given-names>S. A.</given-names>
</name>
<name>
<surname>Dubey</surname>
<given-names>U. S.</given-names>
</name>
<name>
<surname>Saxena</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Epidemiology of breast cancer in Indian women</article-title>. <source>Asia Pac. J. Clin. Oncol.</source> <volume>13</volume>, <fpage>289</fpage>&#x2013;<lpage>295</lpage>. <pub-id pub-id-type="doi">10.1111/ajco.12661</pub-id>
<pub-id pub-id-type="pmid">28181405</pub-id>
</mixed-citation>
</ref>
<ref id="B65">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Manyonda</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Sinai Talaulikar</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Pirhadi</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ward</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Banerjee</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Onwude</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Could perimenopausal estrogen prevent breast cancer? Exploring the differential effects of estrogen-only versus combined hormone replacement therapy</article-title>. <source>J. Clin. Med. Res.</source> <volume>14</volume>, <fpage>1</fpage>&#x2013;<lpage>7</lpage>. <pub-id pub-id-type="doi">10.14740/jocmr4646</pub-id>
<pub-id pub-id-type="pmid">35211211</pub-id>
</mixed-citation>
</ref>
<ref id="B66">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Margheri</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Schiavone</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Papucci</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Magnelli</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Serrat&#xec;</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chill&#xe0;</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2012</year>). <article-title>GDF5 regulates TGF&#xdf;-dependent angiogenesis in breast carcinoma MCF-7 cells: <italic>in vitro</italic> and <italic>in vivo</italic> control by anti-TGF&#xdf; peptides</article-title>. <source>PLoS ONE</source> <volume>7</volume>, <fpage>e50342</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0050342</pub-id>
<pub-id pub-id-type="pmid">23226264</pub-id>
</mixed-citation>
</ref>
<ref id="B67">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Martini</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Delpe</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Chu</surname>
<given-names>T. R.</given-names>
</name>
<name>
<surname>Arora</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Lord</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Verma</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>African ancestry-associated gene expression profiles in triple-negative breast cancer underlie altered tumor biology and clinical outcome in women of African descent</article-title>. <source>Cancer Discov.</source> <volume>12</volume>, <fpage>2530</fpage>&#x2013;<lpage>2551</lpage>. <pub-id pub-id-type="doi">10.1158/2159-8290.cd-22-0138</pub-id>
<pub-id pub-id-type="pmid">36121736</pub-id>
</mixed-citation>
</ref>
<ref id="B68">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Masuda</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Bartholomeusz</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Doihara</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Hortobagyi</surname>
<given-names>G. N.</given-names>
</name>
<name>
<surname>Ueno</surname>
<given-names>N. T.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Role of epidermal growth factor receptor in breast cancer</article-title>. <source>Breast Cancer Res. Treat.</source> <volume>136</volume>, <fpage>331</fpage>&#x2013;<lpage>345</lpage>. <pub-id pub-id-type="doi">10.1007/s10549-012-2289-9</pub-id>
<pub-id pub-id-type="pmid">23073759</pub-id>
</mixed-citation>
</ref>
<ref id="B69">
<mixed-citation publication-type="web">
<collab>MBCP</collab> (<year>2025</year>). <article-title>The metastatic breast cancer project</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://mbcproject.org/">https://mbcproject.org/</ext-link>.</comment>
</mixed-citation>
</ref>
<ref id="B70">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>McKinney</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Sieniek</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Godbole</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Godwin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Antropova</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Ashrafian</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>International evaluation of an AI system for breast cancer screening</article-title>. <source>Nature</source> <volume>577</volume>, <fpage>89</fpage>&#x2013;<lpage>94</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-019-1799-6</pub-id>
<pub-id pub-id-type="pmid">31894144</pub-id>
</mixed-citation>
</ref>
<ref id="B71">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mi</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Bu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>DEPDC1 is a novel cell cycle related gene that regulates mitotic progression</article-title>. <source>BMB Rep.</source> <volume>48</volume>, <fpage>413</fpage>&#x2013;<lpage>418</lpage>. <pub-id pub-id-type="doi">10.5483/bmbrep.2015.48.7.036</pub-id>
<pub-id pub-id-type="pmid">25902835</pub-id>
</mixed-citation>
</ref>
<ref id="B72">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mohaiminul Islam</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ajwad</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Chi</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>An integrative deep learning framework for classifying molecular subtypes of breast cancer</article-title>. <source>Comput. Struct. Biotechnol. J.</source> <volume>18</volume>, <fpage>2185</fpage>&#x2013;<lpage>2199</lpage>. <pub-id pub-id-type="doi">10.1016/j.csbj.2020.08.005</pub-id>
<pub-id pub-id-type="pmid">32952934</pub-id>
</mixed-citation>
</ref>
<ref id="B73">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mostavi</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chiu</surname>
<given-names>Y.-C.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Convolutional neural network models for cancer type prediction based on gene expression</article-title>. <source>BMC Med. Genomics</source> <volume>13</volume>, <fpage>44</fpage>. <pub-id pub-id-type="doi">10.1186/s12920-020-0677-2</pub-id>
<pub-id pub-id-type="pmid">32241303</pub-id>
</mixed-citation>
</ref>
<ref id="B74">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Muthamilselvan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Palaniappan</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>BrcaDx: precise identification of breast cancer from expression data using a minimal set of features</article-title>. <source>Front. Bioinform.</source> <volume>3</volume>, <fpage>1103493</fpage>. <pub-id pub-id-type="doi">10.3389/fbinf.2023.1103493</pub-id>
<pub-id pub-id-type="pmid">37287543</pub-id>
</mixed-citation>
</ref>
<ref id="B75">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Muthamilselvan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Raghavendran</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Palaniappan</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Stage-differentiated ensemble modeling of DNA methylation landscapes uncovers salient biomarkers and prognostic signatures in colorectal cancer progression</article-title>. <source>PLOS ONE</source> <volume>17</volume>, <fpage>e0249151</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0249151</pub-id>
<pub-id pub-id-type="pmid">35202405</pub-id>
</mixed-citation>
</ref>
<ref id="B76">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Muthamilselvan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ramasami Sundhar Baabu</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Palaniappan</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Microfluidics for profiling miRNA biomarker panels in AI-assisted cancer diagnosis and prognosis</article-title>. <source>Technol. Cancer Res. Treat.</source> <volume>22</volume>, <fpage>15330338231185284</fpage>. <pub-id pub-id-type="doi">10.1177/15330338231185284</pub-id>
<pub-id pub-id-type="pmid">37365928</pub-id>
</mixed-citation>
</ref>
<ref id="B77">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pampalakis</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Obasuyi</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Papadodima</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Chatziioannou</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zoumpourlis</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Sotiropoulou</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>The KLK5 protease suppresses breast cancer by repressing the mevalonate pathway</article-title>. <source>Oncotarget</source> <volume>5</volume>, <fpage>2390</fpage>&#x2013;<lpage>2403</lpage>. <pub-id pub-id-type="doi">10.18632/oncotarget.1235</pub-id>
<pub-id pub-id-type="pmid">24158494</pub-id>
</mixed-citation>
</ref>
<ref id="B78">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pisk&#xf3;r</surname>
<given-names>B. M.</given-names>
</name>
<name>
<surname>Przylipiak</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>D&#x105;browska</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Sidorkiewicz</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Niczyporuk</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Szmitkowski</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Plasma level of MMP-10 may be a prognostic marker in early stages of breast cancer</article-title>. <source>J. Clin. Med.</source> <volume>9</volume>, <fpage>4122</fpage>. <pub-id pub-id-type="doi">10.3390/jcm9124122</pub-id>
<pub-id pub-id-type="pmid">33371324</pub-id>
</mixed-citation>
</ref>
<ref id="B79">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Prat</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Guarneri</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Pascual</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Bras&#xf3;-Maristany</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Sanfeliu</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Par&#xe9;</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Development and validation of the new HER2DX assay for predicting pathological response and survival outcome in early-stage HER2-positive breast cancer</article-title>. <source>EBioMedicine</source> <volume>75</volume>, <fpage>103801</fpage>. <pub-id pub-id-type="doi">10.1016/j.ebiom.2021.103801</pub-id>
<pub-id pub-id-type="pmid">34990895</pub-id>
</mixed-citation>
</ref>
<ref id="B80">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Romero-Moreno</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Curtis</surname>
<given-names>K. J.</given-names>
</name>
<name>
<surname>Coughlin</surname>
<given-names>T. R.</given-names>
</name>
<name>
<surname>Miranda-Vergara</surname>
<given-names>M. C.</given-names>
</name>
<name>
<surname>Dutta</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Natarajan</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>The CXCL5/CXCR2 axis is sufficient to promote breast cancer colonization during bone metastasis</article-title>. <source>Nat. Commun.</source> <volume>10</volume>, <fpage>4404</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-019-12108-6</pub-id>
<pub-id pub-id-type="pmid">31562303</pub-id>
</mixed-citation>
</ref>
<ref id="B81">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rakha</surname>
<given-names>E. A.</given-names>
</name>
<name>
<surname>Reis-Filho</surname>
<given-names>J. S.</given-names>
</name>
<name>
<surname>Baehner</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Dabbs</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>Decker</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Eusebi</surname>
<given-names>V.</given-names>
</name>
<etal/>
</person-group> (<year>2010</year>). <article-title>Breast cancer prognostic classification in the molecular era: the role of histological grade</article-title>. <source>Breast Cancer Res.</source> <volume>12</volume>, <fpage>207</fpage>. <pub-id pub-id-type="doi">10.1186/bcr2607</pub-id>
<pub-id pub-id-type="pmid">20804570</pub-id>
</mixed-citation>
</ref>
<ref id="B82">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Risch</surname>
<given-names>H. A.</given-names>
</name>
<name>
<surname>McLaughlin</surname>
<given-names>J. R.</given-names>
</name>
<name>
<surname>Cole</surname>
<given-names>D. E. C.</given-names>
</name>
<name>
<surname>Rosen</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Bradley</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>I.</given-names>
</name>
<etal/>
</person-group> (<year>2006</year>). <article-title>Population BRCA1 and BRCA2 mutation frequencies and cancer penetrances: a kin-cohort study in Ontario, Canada</article-title>. <source>J. Natl. Cancer Inst.</source> <volume>98</volume>, <fpage>1694</fpage>&#x2013;<lpage>1706</lpage>. <pub-id pub-id-type="doi">10.1093/jnci/djj465</pub-id>
<pub-id pub-id-type="pmid">17148771</pub-id>
</mixed-citation>
</ref>
<ref id="B83">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ritchie</surname>
<given-names>M. E.</given-names>
</name>
<name>
<surname>Phipson</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Law</surname>
<given-names>C. W.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>W.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>Limma powers differential expression analyses for RNA-sequencing and microarray studies</article-title>. <source>Nucleic Acids Res.</source> <volume>43</volume>, <fpage>e47</fpage>. <pub-id pub-id-type="doi">10.1093/nar/gkv007</pub-id>
<pub-id pub-id-type="pmid">25605792</pub-id>
</mixed-citation>
</ref>
<ref id="B84">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Roy</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Mittal</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Gupta</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Classification models for invasive ductal carcinoma progression, based on gene expression data-trained supervised machine learning</article-title>. <source>Sci. Rep.</source> <volume>10</volume>, <fpage>4113</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-020-60740-w</pub-id>
<pub-id pub-id-type="pmid">32139710</pub-id>
</mixed-citation>
</ref>
<ref id="B85">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ru</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kechris</surname>
<given-names>K. J.</given-names>
</name>
<name>
<surname>Tabakoff</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Hoffman</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Radcliffe</surname>
<given-names>R. A.</given-names>
</name>
<name>
<surname>Bowler</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>The multiMiR R package and database: integration of microRNA&#x2013;target interactions along with their disease and drug associations</article-title>. <source>Nucleic Acids Res.</source> <volume>42</volume>, <fpage>e133</fpage>. <pub-id pub-id-type="doi">10.1093/nar/gku631</pub-id>
<pub-id pub-id-type="pmid">25063298</pub-id>
</mixed-citation>
</ref>
<ref id="B87">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sarathi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Palaniappan</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Novel significant stage-specific differentially expressed genes in hepatocellular carcinoma</article-title>. <source>BMC Cancer</source> <volume>19</volume>, <fpage>663</fpage>. <pub-id pub-id-type="doi">10.1186/s12885-019-5838-3</pub-id>
<pub-id pub-id-type="pmid">31277598</pub-id>
</mixed-citation>
</ref>
<ref id="B88">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Seborova</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Vaclavikova</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Soucek</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Elsnerova</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Bartakova</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Cernaj</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Association of ABC gene profiles with time to progression and resistance in ovarian cancer revealed by bioinformatics analyses</article-title>. <source>Cancer Med.</source> <volume>8</volume>, <fpage>606</fpage>&#x2013;<lpage>616</lpage>. <pub-id pub-id-type="doi">10.1002/cam4.1964</pub-id>
<pub-id pub-id-type="pmid">30672151</pub-id>
</mixed-citation>
</ref>
<ref id="B89">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Siegel</surname>
<given-names>R. L.</given-names>
</name>
<name>
<surname>Giaquinto</surname>
<given-names>A. N.</given-names>
</name>
<name>
<surname>Jemal</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Cancer statistics, 2024</article-title>. <source>CA Cancer J. Clin.</source> <volume>74</volume>, <fpage>12</fpage>&#x2013;<lpage>49</lpage>. <pub-id pub-id-type="doi">10.3322/caac.21820</pub-id>
<pub-id pub-id-type="pmid">38230766</pub-id>
</mixed-citation>
</ref>
<ref id="B90">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Singh</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Pillai</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chellappan</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Nicotinic acetylcholine receptor signaling in tumor growth and metastasis</article-title>. <source>J. Oncol.</source> <volume>2011</volume>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>. <pub-id pub-id-type="doi">10.1155/2011/456743</pub-id>
<pub-id pub-id-type="pmid">21541211</pub-id>
</mixed-citation>
</ref>
<ref id="B91">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Soliman</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Shah</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Srkalovic</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Mahtani</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Mavromatis</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>MammaPrint guides treatment decisions in breast cancer: results of the IMPACt trial</article-title>. <source>BMC Cancer</source> <volume>20</volume>, <fpage>81</fpage>. <pub-id pub-id-type="doi">10.1186/s12885-020-6534-z</pub-id>
<pub-id pub-id-type="pmid">32005181</pub-id>
</mixed-citation>
</ref>
<ref id="B92">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Song</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>SORBS1 suppresses tumor metastasis and improves the sensitivity of cancer to chemotherapy drug</article-title>. <source>Oncotarget</source> <volume>8</volume>, <fpage>9108</fpage>&#x2013;<lpage>9122</lpage>. <pub-id pub-id-type="doi">10.18632/oncotarget.12851</pub-id>
<pub-id pub-id-type="pmid">27791200</pub-id>
</mixed-citation>
</ref>
<ref id="B93">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Subramaniam</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hefferan</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Tau</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Peus</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Pittelkow</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Jalal</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>1998</year>). <article-title>Tissue, cell type, and breast cancer stage-specific expression of a TGF-beta inducible early transcription factor gene</article-title>. <source>J. Cell. Biochem.</source> <volume>68</volume>, <fpage>226</fpage>&#x2013;<lpage>236</lpage>. <pub-id pub-id-type="doi">10.1002/(sici)1097-4644(19980201)68:2&#x3c;226::aid-jcb9&#x3e;3.0.co;2-x</pub-id>
<pub-id pub-id-type="pmid">9443078</pub-id>
</mixed-citation>
</ref>
<ref id="B94">
<mixed-citation publication-type="web">
<collab>Summary</collab> (<year>2016</year>). <article-title>Broad GDAC 2016_01_28 stddata Run</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://gdac.broadinstitute.org/runs/stddata__2016_01_28/">https://gdac.broadinstitute.org/runs/stddata__2016_01_28/</ext-link>.</comment>
</mixed-citation>
</ref>
<ref id="B95">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sung</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Ferlay</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Siegel</surname>
<given-names>R. L.</given-names>
</name>
<name>
<surname>Laversanne</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Soerjomataram</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Jemal</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Global cancer statistics 2020: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries</article-title>. <source>CA Cancer J. Clin.</source> <volume>71</volume>, <fpage>209</fpage>&#x2013;<lpage>249</lpage>. <pub-id pub-id-type="doi">10.3322/caac.21660</pub-id>
<pub-id pub-id-type="pmid">33538338</pub-id>
</mixed-citation>
</ref>
<ref id="B96">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Suzuki</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Inoue</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Miki</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Moriya</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Akahira</surname>
<given-names>J. i.</given-names>
</name>
<name>
<surname>Ishida</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2007</year>). <article-title>Early growth responsive gene 3 in human breast carcinoma: a regulator of estrogen-meditated invasion and a potent prognostic factor</article-title>. <source>Endocr. Relat. Cancer</source> <volume>14</volume>, <fpage>279</fpage>&#x2013;<lpage>292</lpage>. <pub-id pub-id-type="doi">10.1677/erc-06-0005</pub-id>
<pub-id pub-id-type="pmid">17639044</pub-id>
</mixed-citation>
</ref>
<ref id="B97">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Taghizadeh</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Heydarheydari</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Saberi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>JafarpoorNesheli</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Rezaeijo</surname>
<given-names>S. M.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Breast cancer prediction with transcriptome profiling using feature selection and machine learning methods</article-title>. <source>BMC Bioinforma.</source> <volume>23</volume>, <fpage>410</fpage>. <pub-id pub-id-type="doi">10.1186/s12859-022-04965-8</pub-id>
<pub-id pub-id-type="pmid">36183055</pub-id>
</mixed-citation>
</ref>
<ref id="B98">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Taylor</surname>
<given-names>B. S.</given-names>
</name>
<name>
<surname>Barretina</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Socci</surname>
<given-names>N. D.</given-names>
</name>
<name>
<surname>DeCarolis</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Ladanyi</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Meyerson</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2008</year>). <article-title>Functional copy-number alterations in cancer</article-title>. <source>PLoS ONE</source> <volume>3</volume>, <fpage>e3179</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0003179</pub-id>
<pub-id pub-id-type="pmid">18784837</pub-id>
</mixed-citation>
</ref>
<ref id="B99">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tervasm&#xe4;ki</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Winqvist</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Jukkola-Vuorinen</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Pylk&#xe4;s</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Recurrent CYP2C19 deletion allele is associated with triple-negative breast cancer</article-title>. <source>BMC Cancer</source> <volume>14</volume>, <fpage>902</fpage>. <pub-id pub-id-type="doi">10.1186/1471-2407-14-902</pub-id>
<pub-id pub-id-type="pmid">25466287</pub-id>
</mixed-citation>
</ref>
<ref id="B100">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Uhlen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sj&#xf6;stedt</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Fagerberg</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Bidkhori</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>A pathology atlas of the human cancer transcriptome</article-title>. <source>Science</source> <volume>357</volume>, <fpage>eaan2507</fpage>. <pub-id pub-id-type="doi">10.1126/science.aan2507</pub-id>
<pub-id pub-id-type="pmid">28818916</pub-id>
</mixed-citation>
</ref>
<ref id="B101">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hlav&#xe1;&#x10d;</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Brynychov&#xe1;</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>V&#xe1;clav&#xed;kov&#xe1;</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ehrlichov&#xe1;</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Vr&#xe1;na</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Pecha</surname>
<given-names>V.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>The role of cytochromes p450 and aldo-keto reductases in prognosis of breast carcinoma patients</article-title>. <source>Med. Baltim.</source> <volume>93</volume>, <fpage>e255</fpage>. <pub-id pub-id-type="doi">10.1097/md.0000000000000255</pub-id>
</mixed-citation>
</ref>
<ref id="B102">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vaidya</surname>
<given-names>J. S.</given-names>
</name>
<name>
<surname>Massarut</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Vaidya</surname>
<given-names>H. J.</given-names>
</name>
<name>
<surname>Alexander</surname>
<given-names>E. C.</given-names>
</name>
<name>
<surname>Richards</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Caris</surname>
<given-names>J. A.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Rethinking neoadjuvant chemotherapy for breast cancer</article-title>. <source>BMJ</source> <volume>360</volume>, <fpage>j5913</fpage>. <pub-id pub-id-type="doi">10.1136/bmj.j5913</pub-id>
<pub-id pub-id-type="pmid">29326104</pub-id>
</mixed-citation>
</ref>
<ref id="B103">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>van de Vijver</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>Y. D.</given-names>
</name>
<name>
<surname>van&#x27;t Veer</surname>
<given-names>L. J.</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Hart</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Voskuil</surname>
<given-names>D. W.</given-names>
</name>
<etal/>
</person-group> (<year>2002</year>). <article-title>A gene-expression signature as a predictor of survival in breast cancer</article-title>. <source>N. Engl. J. Med.</source> <volume>347</volume> (<issue>25</issue>), <fpage>1999</fpage>&#x2013;<lpage>2009</lpage>. <pub-id pub-id-type="doi">10.1056/nejmoa021967</pub-id>
<pub-id pub-id-type="pmid">12490681</pub-id>
</mixed-citation>
</ref>
<ref id="B104">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vy</surname>
<given-names>V. P. T.</given-names>
</name>
<name>
<surname>Yao</surname>
<given-names>M. M.-S.</given-names>
</name>
<name>
<surname>Khanh Le</surname>
<given-names>N. Q.</given-names>
</name>
<name>
<surname>Chan</surname>
<given-names>W. P.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Machine learning algorithm for distinguishing ductal carcinoma <italic>in situ</italic> from invasive breast cancer</article-title>. <source>Cancers</source> <volume>14</volume>, <fpage>2437</fpage>. <pub-id pub-id-type="doi">10.3390/cancers14102437</pub-id>
<pub-id pub-id-type="pmid">35626043</pub-id>
</mixed-citation>
</ref>
<ref id="B105">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jin</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>B. b.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J. y.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>S. m.</given-names>
</name>
<etal/>
</person-group> (<year>2012</year>). <article-title>TIEG1 inhibits breast cancer invasion and metastasis by inhibition of epidermal growth factor receptor (EGFR) transcription and the EGFR signaling pathway</article-title>. <source>Mol. Cell. Biol.</source> <volume>32</volume>, <fpage>50</fpage>&#x2013;<lpage>63</lpage>. <pub-id pub-id-type="doi">10.1128/mcb.06152-11</pub-id>
<pub-id pub-id-type="pmid">22025675</pub-id>
</mixed-citation>
</ref>
<ref id="B106">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>miRDB: a microRNA target prediction and functional annotation database with a wiki interface</article-title>. <source>RNA</source> <volume>14</volume>, <fpage>1012</fpage>&#x2013;<lpage>1017</lpage>. <pub-id pub-id-type="doi">10.1261/rna.965408</pub-id>
<pub-id pub-id-type="pmid">18426918</pub-id>
</mixed-citation>
</ref>
<ref id="B107">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wei</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Role of polymorphisms of FAM13A, PHLDB1, and CYP24A1 in breast cancer risk</article-title>. <source>Curr. Mol. Med.</source> <volume>19</volume>, <fpage>579</fpage>&#x2013;<lpage>588</lpage>. <pub-id pub-id-type="doi">10.2174/1566524019666190619125109</pub-id>
<pub-id pub-id-type="pmid">31215377</pub-id>
</mixed-citation>
</ref>
<ref id="B108">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Weigelt</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Geyer</surname>
<given-names>F. C.</given-names>
</name>
<name>
<surname>Reis-Filho</surname>
<given-names>J. S.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Histological types of breast cancer: how special are they?</article-title> <source>Mol. Oncol.</source> <volume>4</volume>, <fpage>192</fpage>&#x2013;<lpage>208</lpage>. <pub-id pub-id-type="doi">10.1016/j.molonc.2010.04.004</pub-id>
<pub-id pub-id-type="pmid">20452298</pub-id>
</mixed-citation>
</ref>
<ref id="B86">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Williams</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Bateman</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>O&#x2019;Kelly</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Altered expression of two-pore domain potassium (K2P) channels in cancer</article-title>. <source>PLoS ONE</source> <volume>8</volume>, <fpage>e74589</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0074589</pub-id>
<pub-id pub-id-type="pmid">24116006</pub-id>
</mixed-citation>
</ref>
<ref id="B109">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Winchester</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>H. R.</given-names>
<suffix>Md, Facs</suffix>
</name>
<name>
<surname>Graves</surname>
<given-names>T. A.</given-names>
<suffix>Md</suffix>
</name>
<name>
<surname>Menck</surname>
<given-names>H. R.</given-names>
<suffix>Mba</suffix>
</name>
<name>
<surname>Bland</surname>
<given-names>K. I.</given-names>
<suffix>Md, Facs</suffix>
</name>
<name>
<surname>Winchester</surname>
<given-names>D. P.</given-names>
<suffix>Md, Facs</suffix>
</name>
</person-group> (<year>1998</year>). <article-title>A comparative analysis of lobular and ductal carcinoma of the breast: presentation, treatment, and outcomes</article-title>. <source>J. Am. Coll. Surg.</source> <volume>186</volume>, <fpage>416</fpage>&#x2013;<lpage>422</lpage>. <pub-id pub-id-type="doi">10.1016/s1072-7515(98)00051-9</pub-id>
<pub-id pub-id-type="pmid">9544955</pub-id>
</mixed-citation>
</ref>
<ref id="B110">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Weng</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Tao</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Qian</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2003</year>). <article-title>Stage-specific expression of breast cancer-specific gene gamma-synuclein</article-title>. <source>Cancer Epidemiol. Biomark. Prev.</source> <volume>12</volume>, <fpage>920</fpage>&#x2013;<lpage>925</lpage>. <pub-id pub-id-type="pmid">14504205</pub-id>
</mixed-citation>
</ref>
<ref id="B111">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yao</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Tong</surname>
<given-names>C.-Y.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A framework to predict the applicability of Oncotype DX, MammaPrint, and E2F4 gene signatures for improving breast cancer prognostic prediction</article-title>. <source>Sci. Rep.</source> <volume>12</volume>, <fpage>2211</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-022-06230-7</pub-id>
<pub-id pub-id-type="pmid">35140308</pub-id>
</mixed-citation>
</ref>
<ref id="B112">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zeng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A machine learning model for detecting invasive ductal carcinoma with Google Cloud AutoML Vision</article-title>. <source>Comput. Biol. Med.</source> <volume>122</volume>, <fpage>103861</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2020.103861</pub-id>
<pub-id pub-id-type="pmid">32658738</pub-id>
</mixed-citation>
</ref>
<ref id="B113">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Miyake</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Lawton</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Goodison</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Rosser</surname>
<given-names>C. J.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Matrix metalloproteinase-10 promotes tumor progression through regulation of angiogenic and apoptotic pathways in cervical tumors</article-title>. <source>BMC Cancer</source> <volume>14</volume>, <fpage>310</fpage>. <pub-id pub-id-type="doi">10.1186/1471-2407-14-310</pub-id>
<pub-id pub-id-type="pmid">24885595</pub-id>
</mixed-citation>
</ref>
<ref id="B114">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Fitzsimmons</surname>
<given-names>K. C.</given-names>
</name>
<name>
<surname>Hurvitz</surname>
<given-names>S. A.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Oncotype DX recurrence score in premenopausal women</article-title>. <source>Ther. Adv. Med. Oncol.</source> <volume>14</volume>, <fpage>17588359221081077</fpage>. <pub-id pub-id-type="doi">10.1177/17588359221081077</pub-id>
<pub-id pub-id-type="pmid">35295864</pub-id>
</mixed-citation>
</ref>
<ref id="B115">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sui</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Gong</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>High expression of DEPDC1 promotes malignant phenotypes of breast cancer cells and predicts poor prognosis in patients with breast cancer</article-title>. <source>Front. Oncol.</source> <volume>9</volume>, <fpage>262</fpage>. <pub-id pub-id-type="doi">10.3389/fonc.2019.00262</pub-id>
<pub-id pub-id-type="pmid">31032225</pub-id>
</mixed-citation>
</ref>
<ref id="B116">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Namburi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Pattison</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Posner</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Balachander</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>CUP-AI-Dx: a tool for inferring cancer tissue of origin and molecular subtype using RNA gene-expression data and artificial intelligence</article-title>. <source>EBioMedicine</source> <volume>61</volume>, <fpage>103030</fpage>. <pub-id pub-id-type="doi">10.1016/j.ebiom.2020.103030</pub-id>
<pub-id pub-id-type="pmid">33039710</pub-id>
</mixed-citation>
</ref>
<ref id="B117">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhong</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Shu</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Low KRT15 expression is associated with poor prognosis in patients with breast invasive carcinoma</article-title>. <source>Exp. Ther. Med.</source> <volume>21</volume>, <fpage>305</fpage>. <pub-id pub-id-type="doi">10.3892/etm.2021.9736</pub-id>
<pub-id pub-id-type="pmid">33717248</pub-id>
</mixed-citation>
</ref>
</ref-list>
</back>
</article>