<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="brief-report" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Genet.</journal-id>
<journal-title>Frontiers in Genetics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Genet.</abbrev-journal-title>
<issn pub-type="epub">1664-8021</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1217860</article-id>
<article-id pub-id-type="doi">10.3389/fgene.2023.1217860</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Genetics</subject>
<subj-group>
<subject>Brief Research Report</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>AI-based multi-PRS models outperform classical single-PRS models</article-title>
<alt-title alt-title-type="left-running-head">Klau et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fgene.2023.1217860">10.3389/fgene.2023.1217860</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Klau</surname>
<given-names>Jan Henric</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2315785/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Maj</surname>
<given-names>Carlo</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/723195/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Klinkhammer</surname>
<given-names>Hannah</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2060674/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Krawitz</surname>
<given-names>Peter M.</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/537311/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Mayr</surname>
<given-names>Andreas</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2153094/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hillmer</surname>
<given-names>Axel M.</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Schumacher</surname>
<given-names>Johannes</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Heider</surname>
<given-names>Dominik</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/59798/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Department of Mathematics and Computer Science, University of Marburg</institution>, <addr-line>Marburg</addr-line>, <country>Germany</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Center for Human Genetics</institution>, University of Marburg, <addr-line>Marburg</addr-line>, <country>Germany</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Institute for Genomic Statistics and Bioinformatics</institution>, Medical Faculty, University Bonn, <addr-line>Bonn</addr-line>, <country>Germany</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Institute for Medical Biometry</institution>, Informatics and Epidemiology, Medical Faculty, University Bonn, <addr-line>Bonn</addr-line>, <country>Germany</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>Institute of Pathology</institution>, Faculty of Medicine, University of Cologne, <addr-line>Cologne</addr-line>, <country>Germany</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/104510/overview">Maria Luisa Chiusano</ext-link>, University of Naples Federico II, Italy</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1422927/overview">Ravi Madduri</ext-link>, Argonne National Laboratory (DOE), United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1793604/overview">Dominik Grimm</ext-link>, Weihenstephan-Triesdorf University of Applied Sciences, Germany</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Dominik Heider, <email>dominik.heider@uni-marburg.de</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>27</day>
<month>06</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>14</volume>
<elocation-id>1217860</elocation-id>
<history>
<date date-type="received">
<day>06</day>
<month>05</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>13</day>
<month>06</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Klau, Maj, Klinkhammer, Krawitz, Mayr, Hillmer, Schumacher and Heider.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Klau, Maj, Klinkhammer, Krawitz, Mayr, Hillmer, Schumacher and Heider</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Polygenic risk scores (PRS) calculate the risk for a specific disease based on the weighted sum of associated alleles from different genetic loci in the germline estimated by regression models. Recent advances in genetics made it possible to create polygenic predictors of complex human traits, including risks for many important complex diseases, such as cancer, diabetes, or cardiovascular diseases, typically influenced by many genetic variants, each of which has a negligible effect on overall risk. In the current study, we analyzed whether adding additional PRS from other diseases to the prediction models and replacing the regressions with machine learning models can improve overall predictive performance. Results showed that multi-PRS models outperform single-PRS models significantly on different diseases. Moreover, replacing regression models with machine learning models, i.e., deep learning, can also improve overall accuracy.</p>
</abstract>
<kwd-group>
<kwd>polygenic risk score</kwd>
<kwd>machine learning</kwd>
<kwd>deep learning</kwd>
<kwd>breast cancer</kwd>
<kwd>regression</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Computational Genomics</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Disease prevention is a crucial part of medical care. It reduces the costs for the healthcare system and reduces the number of hospitalizations and deaths (<xref ref-type="bibr" rid="B13">Kahn et al., 2008</xref>). For targeted preventive measures, it is necessary to determine the individual risks for certain diseases. In addition to age, sex, and lifestyle, genetic factors play an important role in determining the individual risk. Polygenic risk scores (PRS) are used to take multivariate genomic information into consideration and can be used for the selection of a targeted treatment in personalized medicine (<xref ref-type="bibr" rid="B17">Lambert et al., 2019</xref>; <xref ref-type="bibr" rid="B19">Lewis and Vassos, 2020</xref>; <xref ref-type="bibr" rid="B34">Schr&#xf6;der et al., 2022</xref>).</p>
<p>PRS are typically modeled as a regression task by calculating a weighted sum of all genotypes and their corresponding estimated effect size. Relevant single nucleotide polymorphisms are discovered by genome-wide association studies (GWAS). For individual risk prediction, another regression model is built based on the previously calculated PRS and other covariates, such as age, sex, and lifestyle (e.g., smoking and alcohol consumption) (<xref ref-type="bibr" rid="B9">Choi et al., 2020</xref>).</p>
<p>In recent years, machine learning (ML) has led to numerous advances in medicine (<xref ref-type="bibr" rid="B21">MacEachern and Forkert, 2021</xref>) due to the ability to train models on complex problems and being able to handle large amounts of data. These models have been used in various applications, e.g., oncology (<xref ref-type="bibr" rid="B5">Bibault et al., 2016</xref>), pathology (<xref ref-type="bibr" rid="B22">Madabhushi and Lee, 2016</xref>; <xref ref-type="bibr" rid="B11">Coudray et al., 2018</xref>), diabetes (<xref ref-type="bibr" rid="B36">Sp&#xe4;nig et al., 2019</xref>), human genetics (<xref ref-type="bibr" rid="B20">Libbrecht and Noble, 2015</xref>), and infectious diseases (<xref ref-type="bibr" rid="B30">Riemenschneider et al., 2016b</xref>; <xref ref-type="bibr" rid="B28">Ren et al., 2021</xref>) as part of a growing trend toward personalized/precision medicine.</p>
<p>In this study, we trained multiple models, i.e., ridge regression (RR), random forests (RFs), and deep neural networks (DNNs), to predict an individual&#x2019;s phenotype for the following diseases: breast cancer (BC), coronary artery disease (CAD), and type 2 diabetes (T2D). We selected those three common chronic diseases to demonstrate the usefulness of our approach for different diseases. For instance, breast cancer is diagnosed in approximately 2.3 million women yearly. Cardiovascular diseases are the leading cause of death globally. Coronary artery disease affects approximately 126 million individuals, with 7.2 million deaths each year. Diabetes affects approximately 425 million people worldwide.</p>
<p>The inclusion of additional PRS has been shown to improve the prediction of traits and diseases (<xref ref-type="bibr" rid="B15">Krapohl et al., 2017</xref>; <xref ref-type="bibr" rid="B35">Sinnott-Armstrong et al., 2021</xref>; <xref ref-type="bibr" rid="B1">Abraham et al., 2019</xref>), psychological diseases, such as schizophrenia, bipolar disorder, or depression (<xref ref-type="bibr" rid="B31">Rodriguez et al., 2022</xref>), the risk of exposure to bullying (<xref ref-type="bibr" rid="B33">Schoeler et al., 2019</xref>), and hazard ratios (<xref ref-type="bibr" rid="B24">Meisner et al., 2020</xref>). Thus, we further evaluated the inclusion of 139 additional PRS in a multi-PRS approach to the prediction of the previously mentioned diseases. The additionally used PRS do not have to be directly associated with the investigated disease (<xref ref-type="bibr" rid="B35">Sinnott-Armstrong et al., 2021</xref>). Including these PRS, even if the phenotypes appear to be unrelated, may be beneficial as similar underlying biological mechanisms may be involved.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>2 Materials and methods</title>
<p>The workflow of the current study is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. We incorporated additional PRS into the predictive models and, additionally, compared different machine learning models to the regression models that are typically used in PRS.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Workflow of the study. PRS are calculated based on the associated genetic loci (i.e., SNPs, single-nucleotide polymorphisms). Significant loci are identified via a regression model. These loci are then used to calculate the PRS based on a linear combination. Additional PRS for other diseases are incorporated into the final predictive model. During training, the models learn to distinguish between relevant and irrelevant features, including the additional PRS. Moreover, we compare the typically used ridge regression with machine learning models, namely, deep neural networks and random forests. Created with <ext-link ext-link-type="uri" xlink:href="https://www.biorender.com/">BioRender.com</ext-link>.</p>
</caption>
<graphic xlink:href="fgene-14-1217860-g001.tif"/>
</fig>
<sec id="s2-1">
<title>2.1 Data</title>
<p>This research has been conducted using the UK Biobank resource (<xref ref-type="bibr" rid="B7">Bycroft et al., 2018</xref>) under application number 81202. The UK Biobank is a large-scale cohort study covering a huge prospective sample (<italic>n</italic> &#x3e; 500,000) of the British general population, including both genotype and phenotype (health-related outcomes) data. We used the imputed UK Biobank data which include approximately 96 million variants.</p>
<p>From the available genotype data, we excluded samples that were outliers for heterozygosity (F more than three standard deviations (SD) from the mean), samples with genotype missing rates &#x3e;2%, and samples with discordant reported sex vs. genotypic sex. Variants with a minor allele frequency (MAF) &#x3c; 0.1% were removed. Variants not in the Hardy&#x2013;Weinberg equilibrium (<italic>p</italic>-value &#x3c;10<sup>&#x2212;6</sup>) were excluded.</p>
<p>In total, 139 PRS (<xref ref-type="sec" rid="s10">Supplementary Table S1</xref>) for different phenotypes, e.g., lung cancer (PGS000078), venous thromboembolism (PGS000043), and fasting glucose (PGS000305), were computed using PLINK (<xref ref-type="bibr" rid="B8">Chang et al., 2015</xref>) score function, and the corresponding effect alleles and beta coefficients were retrieved from the PGS Catalog (<ext-link ext-link-type="uri" xlink:href="https://www.pgscatalog.org/">https://www.pgscatalog.org/</ext-link>). The PRS are therefore based on a linear additive combination of effect alleles and are characterized by a normal distribution. Due to the great abundance of SNPs in the imputed UK Biobank, adequate coverage was ensured.</p>
<p>The additional 139 PRS were added as additional input features without any pre-selection to enable a data-driven approach without any subject-matter knowledge. Therefore, we included all PRS that were available in the PGS Catalog at the time we started the project. The underlying idea is that different diseases can share different pathways, e.g., inflammatory pathways, or even comorbidities. Selection of PRS according to phenotype association with the investigated disease, though more interpretable, can potentially miss relevant information. By using multiple risk scores, we were able to capture the interdependencies in a data-driven approach by machine learning models. PRS that were calculated on the same UK Biobank cohort for one of our target diseases could induce overfitting or circularity. For PRS that were calculated on the UK Biobank cohort, but for different diseases, this would only affect the control group. Therefore, these effects are, if at all, of very little impact.</p>
<p>From the phenotypic data, we derived the case/control status for three diseases, namely, BC, CAD, and T2D. BC cases were women based on self-report in an interview with a trained nurse and/or BC-related ICD-9 codes (174 or 174.9) or ICD-10 codes (C50.X) in hospitalization records. CAD cases were individuals with myocardial infarction based on self-report or hospital admission diagnosis according to ICD-9 codes of 410.X, 411.0, 412.X, or 429.79 or ICD-10 codes of I21.X, I22.X, I23.X, I24.1, or I25.2 in hospitalization records and/or with coronary artery bypass grafting (K40.1&#x2013;40.4, K41.1&#x2013;41.4, or K45.1&#x2013;45.5) or coronary angioplasty with or without stenting (K49.1&#x2013;49.2, K49.8&#x2013;49.9, K50.2, K75.1&#x2013;75.4, or K75.8&#x2013;75.9). T2D cases were samples based on self-report in an interview with a trained nurse or an ICD-10 code of E11.X in hospitalization records. For controls, all individuals without the phenotype were considered (for BC, the analysis was restricted only to women).</p>
<p>In order to limit the confounding due to the genetic background, the analysis was restricted only to individuals with White British origin (Field 21000) and with European genetic ancestry according to the principal components provided by UK Biobank (Field 22006), and among the remaining samples, to account for the residual population stratification, we considered the principal components (PCs) as computed in UK Biobank (Field 22009). The total number of individuals in the data set amounts to 429,466, while the numbers of patients for the three diseases, BC, CAD, and T2D, are 13,679, 23,033, and 24,241, respectively (<xref ref-type="table" rid="T1">Table 1</xref>).</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Number of individuals in the case and control groups.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center"/>
<th align="center">BC (female only)</th>
<th align="center">CAD</th>
<th align="center">T2D</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Cases</td>
<td align="center">13,679</td>
<td align="center">23,033</td>
<td align="center">24,241</td>
</tr>
<tr>
<td align="center">Controls</td>
<td align="center">232,424</td>
<td align="center">406,433</td>
<td align="center">405,225</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2-2">
<title>2.2 Data preparation</title>
<p>We included the following features into the model training: corresponding PRS (i.e., BC-PRS (PGS000015), CAD-PRS (PGS000013), or T2D-PRS (PGS000014), respectively), first 10&#xa0;PCs, age, sex, and the genotyping array. Categorical features such as sex and genotyping array were one-hot encoded, while all other features were normalized to values between 0 and 1. For the prediction of BC, only female individuals were included, and sex was removed as an input feature. For the multi-PRS approach, 139 additional PRS (e.g., lung cancer (PGS000078), venous thromboembolism (PGS000043), and fasting glucose (PGS000305)) were included in the data set.</p>
</sec>
<sec id="s2-3">
<title>2.3 Model development</title>
<p>The data sets were split for each individual disease into training and test sets (75:25) using a stratified approach to preserve a disease&#x2019;s prevalence within each data set. This was repeated three times with different seeds to assess the robustness of the model&#x2019;s prediction on previously unseen data sets. The training set was then used in a stratified 10-fold nested cross-validation. Due to the class imbalance in the data, the training data set was upsampled within the nested cross-validation (<xref ref-type="bibr" rid="B4">Beinecke and Heider, 2021</xref>). We compared multiple methods in our study: RR, RF, and DNN.</p>
<sec id="s2-3-1">
<title>2.3.1 Ridge regression</title>
<p>Ridge regression (RR) is a statistical method that includes a penalty parameter, rendering it more stable when input features are correlated compared to other regression models. RR is typically used in calculating PRS. For the RR, we used the scikit-learn library version 0.23.2 (<xref ref-type="bibr" rid="B26">Pedregosa et al., 2011</xref>).</p>
</sec>
<sec id="s2-3-2">
<title>2.3.2 Random forests</title>
<p>Random forests (RFs) are proven non-linear classifiers that have been shown to produce good results even in small-<italic>n</italic>-large-<italic>p</italic> scenarios in biomedical classification (<xref ref-type="bibr" rid="B29">Riemenschneider et al., 2016a</xref>; <xref ref-type="bibr" rid="B3">Anastasiou et al., 2017</xref>). They are based on multiple decision trees that are combined via a majority vote (<xref ref-type="bibr" rid="B6">Breiman, 2001</xref>). We used the implementation of the scikit-learn library version 0.23.2 (<xref ref-type="bibr" rid="B26">Pedregosa et al., 2011</xref>).</p>
</sec>
<sec id="s2-3-3">
<title>2.3.3 Deep neural networks</title>
<p>Deep neural networks (DNNs) are modeled after biological neurons and consist of multiple layers of artificial neurons. In our study, we used only deep feed-forward networks, where each of these neurons has multiple inputs via weighted connections to previous neurons and calculates an output on the sum of all inputs and with a given activation function. The first layer is called the input layer and is fed with the training features, while the last layer is called the output layer and provides the prediction of the network. These two layers are connected by several so-called hidden layers. All DNNs were implemented using the PyTorch library version 1.7.1 (<xref ref-type="bibr" rid="B25">Paszke et al., 2019</xref>).</p>
</sec>
<sec id="s2-3-4">
<title>2.3.4 Hyperparameter optimization</title>
<p>Hyperparameter optimization of all models was carried out within the nested cross-validation. For the DNNs, we evaluated different topologies, ranging from 3 to 6 layers and 2 to 512 neurons per layer. Learning rates of 1 &#xd7; 10<sup>&#x2212;5</sup>, 1 &#xd7; 10<sup>&#x2212;4</sup>, and 1 &#xd7; 10<sup>&#x2212;3</sup> were tested. The loss function used was BCELoss. RFs were optimized with regard to the number of trees (100, 250, 500, and 1,000) and the maximum depth per tree (default, 10, 25, and 50). For RR models, the number of iterations (default, 100, 500, 1,000, and 5,000) was optimized.</p>
<p>After optimizing the hyperparameters in the 10-fold nested cross-validation, models were trained on the full training set using the optimal hyperparameters and then used to predict the test set. Models were evaluated based on the area under the receiver operating characteristic curve (AUC) and accuracy on the test set averaged over three random seeds.</p>
</sec>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>3 Results</title>
<p>For the DNNs, no single best topology for all tasks was found (<xref ref-type="table" rid="T2">Table 2</xref>). The best learning rate for all DNN models was 1 &#xd7; 10<sup>&#x2212;4</sup>. The best topology for the single-PRS approach for all data sets is 16-8-4-1, while the best topology for the multi-PRS approach is 8-4-4-1 for CAD and T2D and 16-8-4-1 for BC. The rectified linear unit (ReLU) was used as an activation function after all layers, except for the output layer, where the sigmoid function was used. The models performed best after 100 epochs of training. The training of single-PRS models took approximately 8&#xa0;min, while multi-PRS trainings took approximately 10&#xa0;min, resulting in a total training time of approximately 80 and 100&#xa0;min, respectively, for a 10-fold cross-validation. Due to the smaller number of samples for BC, training times were halved for these models.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Comparison of DNN, RF, and RR on the three data sets, BC, CAD, and T2D, for single- and multi-PRS approaches. Evaluation based on AUC and accuracy according to <xref ref-type="bibr" rid="B14">Khera et al. (2018)</xref>. Values are shown as mean &#xb1; SD.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Method</th>
<th align="left">Disease</th>
<th align="left">PRS mode</th>
<th align="left">Accuracy</th>
<th align="left">AUC</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">DNN</td>
<td align="left">BC</td>
<td align="left">Single-PRS</td>
<td align="left">0.613 &#xb1; 0.021</td>
<td align="left">0.653 &#xb1; 0.004</td>
</tr>
<tr>
<td align="center">DNN</td>
<td align="left">BC</td>
<td align="left">Multi-PRS</td>
<td align="left">0.628 &#xb1; 0.024</td>
<td align="left">0.668 &#xb1; 0.001</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="left">BC</td>
<td align="left">Single-PRS</td>
<td align="left">0.592 &#xb1; 0.015</td>
<td align="left">0.626 &#xb1; 0.005</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="left">BC</td>
<td align="left">Multi-PRS</td>
<td align="left">0.609 &#xb1; 0.009</td>
<td align="left">0.648 &#xb1; 0.002</td>
</tr>
<tr>
<td align="center">RR</td>
<td align="left">BC</td>
<td align="left">Single-PRS</td>
<td align="left">0.598 &#xb1; 0.007</td>
<td align="left">0.652 &#xb1; 0.004</td>
</tr>
<tr>
<td align="center">RR</td>
<td align="left">BC</td>
<td align="left">Multi-PRS</td>
<td align="left">0.612 &#xb1; 0.011</td>
<td align="left">0.670 &#xb1; 0.002</td>
</tr>
<tr>
<td align="center">DNN</td>
<td align="left">CAD</td>
<td align="left">Single-PRS</td>
<td align="left">0.694 &#xb1; 0.009</td>
<td align="left">0.785 &#xb1; 0.002</td>
</tr>
<tr>
<td align="center">DNN</td>
<td align="left">CAD</td>
<td align="left">Multi-PRS</td>
<td align="left">0.698 &#xb1; 0.012</td>
<td align="left">0.790 &#xb1; 0.002</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="left">CAD</td>
<td align="left">Single-PRS</td>
<td align="left">0.674 &#xb1; 0.002</td>
<td align="left">0.765 &#xb1; 0.003</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="left">CAD</td>
<td align="left">Multi-PRS</td>
<td align="left">0.683 &#xb1; 0.004</td>
<td align="left">0.768 &#xb1; 0.002</td>
</tr>
<tr>
<td align="center">RR</td>
<td align="left">CAD</td>
<td align="left">Single-PRS</td>
<td align="left">0.696 &#xb1; 0.004</td>
<td align="left">0.785 &#xb1; 0.002</td>
</tr>
<tr>
<td align="center">RR</td>
<td align="left">CAD</td>
<td align="left">Multi-PRS</td>
<td align="left">0.693 &#xb1; 0.004</td>
<td align="left">0.790 &#xb1; 0.002</td>
</tr>
<tr>
<td align="center">DNN</td>
<td align="left">T2D</td>
<td align="left">Single-PRS</td>
<td align="left">0.626 &#xb1; 0.017</td>
<td align="left">0.703 &#xb1; 0.002</td>
</tr>
<tr>
<td align="center">DNN</td>
<td align="left">T2D</td>
<td align="left">Multi-PRS</td>
<td align="left">0.653 &#xb1; 0.010</td>
<td align="left">0.716 &#xb1; 0.003</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="left">T2D</td>
<td align="left">Single-PRS</td>
<td align="left">0.607 &#xb1; 0.014</td>
<td align="left">0.675 &#xb1; 0.001</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="left">T2D</td>
<td align="left">Multi-PRS</td>
<td align="left">0.610 &#xb1; 0.001</td>
<td align="left">0.686 &#xb1; 0.002</td>
</tr>
<tr>
<td align="center">RR</td>
<td align="left">T2D</td>
<td align="left">Single-PRS</td>
<td align="left">0.636 &#xb1; 0.007</td>
<td align="left">0.703 &#xb1; 0.002</td>
</tr>
<tr>
<td align="center">RR</td>
<td align="left">T2D</td>
<td align="left">Multi-PRS</td>
<td align="left">0.636 &#xb1; 0.008</td>
<td align="left">0.716 &#xb1; 0.002</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>For the RF models, the best predictions were obtained with 500 trees, while all other parameters were left at the default value. For the RR models, all parameters were left at the default value.</p>
<p>It turned out that the DNNs performed equally well or outperformed RR in all data sets, in particular for the multi-PRS approach. RF did not outperform RR in any data set, neither as single-PRS nor as multi-PRS. In fact, RF performed significantly worse for all data sets and PRS modes with approximately 2% lower AUC and accuracy values than RR and DNNs.</p>
<p>For instance, the DNNs reached an accuracy of 0.653 &#xb1; 0.010 compared to 0.636 &#xb1; 0.008 for RR for the T2D data set using the multi-PRS approach. For the BC data set, the DNN reached an accuracy of 0.628 &#xb1; 0.024 for the multi-PRS approach, while the RR reached only an accuracy of 0.612 &#xb1; 0.011. For the single-PRS, the DNN reached an accuracy of 0.613 &#xb1; 0.021 and the RR reached an accuracy of 0.598 &#xb1; 0.007. For the CAD data set, the DNN reached an accuracy of 0.698 &#xb1; 0.012 with the multi-PRS approach, while the RR reached 0.693 &#xb1; 0.004. For the single-PRS approach, there were no differences between RR and DNN. Interestingly, using the multi-PRS approach instead of the typically used single-PRS approach generally leads to higher accuracy of the resulting model, irrespective of the underlying prediction model, i.e., RF, RR, or DNN.</p>
</sec>
<sec sec-type="discussion" id="s4">
<title>4 Discussion</title>
<p>We showed that the inclusion of additional PRS improves the prediction quality of PRS models for predicting an individual&#x2019;s phenotype for BC, CAD, and T2D. The improved prediction quality by including additional PRS can be attributed to the fact that disease susceptibility can be characterized by different risk factors for which at least a partially independent underlying genetic liability exists. For instance, the risk for CAD (coronary artery disease) can be associated with high LDL-cholesterol, high body mass index, smoking, etc., which are also influenced by genetics. Therefore, more comprehensive genetic risk models can be obtained by using a multi-PRS modeling approach. Moreover, by replacing the typically used RR with DNNs, prediction performance could also be improved. DNNs are non-linear classifiers able to capture non-linearity in the underlying data. By not selecting additional PRS manually, we ensured that no information is lost and left it to the algorithms to identify important features. The effect of different PRS on the prediction is likely to be very different. Approaches from explainable AI could be used to identify the relevant PRS.</p>
<p>Although these differences are rather small, the improvement in overall accuracy implies that there are non-linear relationships in the genomics data, as expected from other studies. Improvements in accuracy of up to 1.5%&#x2013;2% are rather small, but they can have strong implications for patients. For instance, in Europe, there are approximately 355,000 BC cases per year, accounting for more than 90,000 deaths; however, incidences are increasing. Currently, one out of 11 women will develop BC in Europe. In the US, the number is even higher, with approximately 13%, and BC is the second leading cause of death among women. Using prediction models to detect high-risk patients for screening of BC can improve early detection and thus increase life expectancy. An improvement of 1.5% corresponds to more than 5,000 cases that can be detected in Europe alone. If we consider T2D, one in 11 adults has diabetes, i.e., 425 million people worldwide. In the United States of America, approximately 11% of people aged between 20 and 79 years have diabetes, while in Europe, it is approximately 6.8%. Approximately 90% of those affected have type 2 diabetes. Every 8&#xa0;seconds, a person dies as a result of diabetes. It is estimated that almost 700 million people will have diabetes in 2045. Moreover, it has been estimated that a very high number (almost half) of cases are unreported. By improving the risk prediction by 2% solely by incorporating the available data and novel AI models, approximately 7 million more cases could be identified in risk screenings.</p>
<p>From a translational point of view, better prediction performance will improve disease risk stratification. So far, multi-PRS approaches have been rarely applied, mainly due to the limited availability of large-population-based cohorts with deep-phenotyping data to train the model and to the computational challenges of dealing with high-dimensional data. With the availability of population-based cohorts (such as UK Biobank) and the parallel improvement of computational algorithms for big-data processing, the training of multi-PRS models is feasible on standard HPC infrastructure. In contrast, the final application of the models on independent test data is not computationally demanding and therefore can be run locally and potentially integrated into a clinical setting. Additional PRS can be calculated on imputed SNPs based on reference haplotypes if they were not included in the original SNP array.</p>
<p>Our study has several limitations. In particular, we focused on the genetic predictions of complex traits, including only sex and age as non-genetic factors. However, it is well known that genetic predictors explain only a relatively small proportion of the heritability of complex traits (<xref ref-type="bibr" rid="B12">Gusev et al., 2013</xref>). Therefore, in translational settings, additional non-genetic risk factors should be included in the prediction models in order to obtain an optimized risk stratification [e.g., the BOADICEA model for breast cancer (<xref ref-type="bibr" rid="B18">Lee et al., 2019</xref>)]. Since the multi-PRS model is based on multiple PRS, general limitations of PRS also apply to our model. Some SNPs associated with the diseases may be undiscovered by GWAS, and effect sizes are imprecise (<xref ref-type="bibr" rid="B19">Lewis and Vassos, 2020</xref>). Additionally, PRS suffer from a portability problem: PRS derived from one genetic ancestry perform worse in groups of different ancestry (<xref ref-type="bibr" rid="B23">Martin et al., 2019</xref>). In our work, the data set is mainly composed of samples with European genetic backgrounds. Given the different allele frequencies across populations and the limited sample size of non-European individuals, overfitting with respect to the target European population can affect the generalizability of the model. Family-based GWAS are more robust to the effects of population stratification but generally lack power in comparison to non-family-based GWAS (<xref ref-type="bibr" rid="B16">Laird and Lange, 2009</xref>). Furthermore, the interpretation of PRS can be difficult and lead to overdiagnosis, resulting in inappropriate treatment (<xref ref-type="bibr" rid="B2">Polygenic Risk Score Task Force of the International Common Disease Alliance et al., 2021</xref>).</p>
<p>In the future, we aim to incorporate not only genomics information and PRS but also other clinical data and questionnaires to further improve the risk predictions. As the number of scores in the PGS Catalog constantly grows, those new PRS can be used to update and potentially improve the multi-PRS model. Furthermore, tools other than PLINK (<xref ref-type="bibr" rid="B8">Chang et al., 2015</xref>) [e.g., LDpred2 (<xref ref-type="bibr" rid="B27">Priv&#xe9; et al., 2021</xref>), PRSice-2 (<xref ref-type="bibr" rid="B10">Choi and O&#x2019;Reilly, 2019</xref>), PRS-CSx (<xref ref-type="bibr" rid="B32">Ruan et al., 2022</xref>), or PRSMix (<xref ref-type="bibr" rid="B37">Truong et al., 2023</xref>)] can be used to calculate the input PRS.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. These data can be found at: UK Biobank.</p>
</sec>
<sec id="s6">
<title>Author contributions</title>
<p>Conceptualization: DH and JK; methodology: JK; software: JK; validation: JK; formal analysis: JK; investigation: JK; resources: DH, CM, HK, PK, AM, AH, and JS; data curation: CM; writing&#x2014;original draft preparation: JK; writing&#x2014;review and editing: all authors; visualization: DH and JK; supervision: DH; project administration: DH; funding acquisition: DH, AH, and JS. All authors contributed to the article and approved the submitted version.</p>
</sec>
<sec id="s7">
<title>Funding</title>
<p>This work was financially supported by the German Federal Ministry of Education and Research (BMBF) [031L0267A] (Deep Insight).</p>
</sec>
<ack>
<p>
<xref ref-type="fig" rid="F1">Figure 1</xref> was created using <ext-link ext-link-type="uri" xlink:href="http://BioRender.com">BioRender.com</ext-link>.</p>
</ack>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors, and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s10">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fgene.2023.1217860/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fgene.2023.1217860/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="Table1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Abraham</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Malik</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Yonova-Doing</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Salim</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Danesh</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Genomic risk score offers predictive performance comparable to clinical risk factors for ischaemic stroke</article-title>. <source>Nat. Commun.</source> <volume>10</volume>, <fpage>5819</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-019-13848-1</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<collab>Polygenic Risk Score Task Force of the International Common Disease Alliance</collab>
<person-group person-group-type="author">
<name>
<surname>Adeyemo</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Balaconis</surname>
<given-names>M. K.</given-names>
</name>
<name>
<surname>Darnes</surname>
<given-names>D. R.</given-names>
</name>
<name>
<surname>Fatumo</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Granados Moreno</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Responsible use of polygenic risk scores in the clinic: Potential benefits, risks and gaps</article-title>. <source>Nat. Med.</source> <volume>27</volume>, <fpage>1876</fpage>&#x2013;<lpage>1884</lpage>. <pub-id pub-id-type="doi">10.1038/s41591-021-01549-6</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Anastasiou</surname>
<given-names>O. E.</given-names>
</name>
<name>
<surname>K&#xe4;lsch</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hakmouni</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kucukoglu</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Heider</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Korth</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Low transferrin and high ferritin concentrations are associated with worse outcome in acute liver failure</article-title>. <source>Liver Int. Official J. Int. Assoc. Study Liver</source> <volume>37</volume>, <fpage>1032</fpage>&#x2013;<lpage>1041</lpage>. <pub-id pub-id-type="doi">10.1111/liv.13369</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Beinecke</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Heider</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Gaussian noise up-sampling is better suited than SMOTE and ADASYN for clinical decision making</article-title>. <source>BioData Min.</source> <volume>14</volume>, <fpage>49</fpage>. <pub-id pub-id-type="doi">10.1186/s13040-021-00283-6</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bibault</surname>
<given-names>J.-E.</given-names>
</name>
<name>
<surname>Giraud</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Burgun</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Big data and machine learning in radiation oncology: State of the art and future prospects</article-title>. <source>Cancer Lett.</source> <volume>382</volume>, <fpage>110</fpage>&#x2013;<lpage>117</lpage>. <pub-id pub-id-type="doi">10.1016/j.canlet.2016.05.033</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Breiman</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2001</year>). <article-title>Random forests</article-title>. <source>Mach. Learn.</source> <volume>45</volume>, <fpage>5</fpage>&#x2013;<lpage>32</lpage>. <pub-id pub-id-type="doi">10.1023/a:1010933404324</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bycroft</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Freeman</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Petkova</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Band</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Elliott</surname>
<given-names>L. T.</given-names>
</name>
<name>
<surname>Sharp</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>The UK Biobank resource with deep phenotyping and genomic data</article-title>. <source>Nature</source> <volume>562</volume>, <fpage>203</fpage>&#x2013;<lpage>209</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-018-0579-z</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chang</surname>
<given-names>C. C.</given-names>
</name>
<name>
<surname>Chow</surname>
<given-names>C. C.</given-names>
</name>
<name>
<surname>Tellier</surname>
<given-names>L. C.</given-names>
</name>
<name>
<surname>Vattikuti</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Purcell</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>J. J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Second-generation PLINK: Rising to the challenge of larger and richer datasets</article-title>. <source>GigaScience</source> <volume>4</volume>, <fpage>7</fpage>. <pub-id pub-id-type="doi">10.1186/s13742-015-0047-8</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Choi</surname>
<given-names>S. W.</given-names>
</name>
<name>
<surname>Mak</surname>
<given-names>T. S.-H.</given-names>
</name>
<name>
<surname>O&#x2019;Reilly</surname>
<given-names>P. F.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Tutorial: A guide to performing polygenic risk score analyses</article-title>. <source>Nat. Protoc.</source> <volume>15</volume>, <fpage>2759</fpage>&#x2013;<lpage>2772</lpage>. <pub-id pub-id-type="doi">10.1038/s41596-020-0353-1</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Choi</surname>
<given-names>S. W.</given-names>
</name>
<name>
<surname>O&#x2019;Reilly</surname>
<given-names>P. F.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>PRSice-2: Polygenic Risk Score software for biobank-scale data</article-title>. <source>GigaScience</source> <volume>8</volume>, <fpage>082</fpage>. <comment>giz082</comment>. <pub-id pub-id-type="doi">10.1093/gigascience/giz082</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Coudray</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Ocampo</surname>
<given-names>P. S.</given-names>
</name>
<name>
<surname>Sakellaropoulos</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Narula</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Snuderl</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Feny&#xf6;</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Classification and mutation prediction from non-small cell lung cancer histopathology images using deep learning</article-title>. <source>Nat. Med.</source> <volume>24</volume>, <fpage>1559</fpage>&#x2013;<lpage>1567</lpage>. <pub-id pub-id-type="doi">10.1038/s41591-018-0177-5</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gusev</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bhatia</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Zaitlen</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Vilhjalmsson</surname>
<given-names>B. J.</given-names>
</name>
<name>
<surname>Diogo</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Stahl</surname>
<given-names>E. A.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>Quantifying missing heritability at known GWAS loci</article-title>. <source>PLoS Genet.</source> <volume>9</volume>, <fpage>1003993</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pgen.1003993</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kahn</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Robertson</surname>
<given-names>R. M.</given-names>
</name>
<name>
<surname>Smith</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Eddy</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>The impact of prevention on reducing the burden of cardiovascular disease</article-title>. <source>Diabetes Care</source> <volume>31</volume>, <fpage>1686</fpage>&#x2013;<lpage>1696</lpage>. <pub-id pub-id-type="doi">10.2337/dc08-9022</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Khera</surname>
<given-names>A. V.</given-names>
</name>
<name>
<surname>Chaffin</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Aragam</surname>
<given-names>K. G.</given-names>
</name>
<name>
<surname>Haas</surname>
<given-names>M. E.</given-names>
</name>
<name>
<surname>Roselli</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Choi</surname>
<given-names>S. H.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Genome-wide polygenic scores for common diseases identify individuals with risk equivalent to monogenic mutations</article-title>. <source>Nat. Genet.</source> <volume>50</volume>, <fpage>1219</fpage>&#x2013;<lpage>1224</lpage>. <pub-id pub-id-type="doi">10.1038/s41588-018-0183-z</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Krapohl</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Patel</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Newhouse</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Curtis</surname>
<given-names>C. J.</given-names>
</name>
<name>
<surname>von Stumm</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dale</surname>
<given-names>P. S.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Multi-polygenic score approach to trait prediction</article-title>. <source>Mol. Psychiatry</source> <volume>23</volume>, <fpage>1368</fpage>&#x2013;<lpage>1374</lpage>. <pub-id pub-id-type="doi">10.1038/mp.2017.163</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Laird</surname>
<given-names>N. M.</given-names>
</name>
<name>
<surname>Lange</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>The role of family-based designs in genome-wide association studies</article-title>. <source>Stat. Sci.</source> <volume>24</volume>. <pub-id pub-id-type="doi">10.1214/08-STS280</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lambert</surname>
<given-names>S. A.</given-names>
</name>
<name>
<surname>Abraham</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Inouye</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Towards clinical utility of polygenic risk scores</article-title>. <source>Hum. Mol. Genet.</source> <volume>28</volume>, <fpage>R133</fpage>&#x2013;<lpage>R142</lpage>. <pub-id pub-id-type="doi">10.1093/hmg/ddz187</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Mavaddat</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Wilcox</surname>
<given-names>A. N.</given-names>
</name>
<name>
<surname>Cunningham</surname>
<given-names>A. P.</given-names>
</name>
<name>
<surname>Carver</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Hartley</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>BOADICEA: A comprehensive breast cancer risk prediction model incorporating genetic and nongenetic risk factors</article-title>. <source>Genet. Med.</source> <volume>21</volume>, <fpage>1708</fpage>&#x2013;<lpage>1718</lpage>. <pub-id pub-id-type="doi">10.1038/s41436-018-0406-9</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lewis</surname>
<given-names>C. M.</given-names>
</name>
<name>
<surname>Vassos</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Polygenic risk scores: From research tools to clinical instruments</article-title>. <source>Genome Med.</source> <volume>12</volume>, <fpage>44</fpage>. <pub-id pub-id-type="doi">10.1186/s13073-020-00742-5</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Libbrecht</surname>
<given-names>M. W.</given-names>
</name>
<name>
<surname>Noble</surname>
<given-names>W. S.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Machine learning applications in genetics and genomics</article-title>. <source>Nat. Rev. Genet.</source> <volume>16</volume>, <fpage>321</fpage>&#x2013;<lpage>332</lpage>. <pub-id pub-id-type="doi">10.1038/nrg3920</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>MacEachern</surname>
<given-names>S. J.</given-names>
</name>
<name>
<surname>Forkert</surname>
<given-names>N. D.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Machine learning for precision medicine</article-title>. <source>Genome</source> <volume>64</volume>, <fpage>416</fpage>&#x2013;<lpage>425</lpage>. <pub-id pub-id-type="doi">10.1139/gen-2020-0131</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Madabhushi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Image analysis and machine learning in digital pathology: Challenges and opportunities</article-title>. <source>Med. Image Anal.</source> <volume>33</volume>, <fpage>170</fpage>&#x2013;<lpage>175</lpage>. <pub-id pub-id-type="doi">10.1016/j.media.2016.06.037</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Martin</surname>
<given-names>A. R.</given-names>
</name>
<name>
<surname>Kanai</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kamatani</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Okada</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Neale</surname>
<given-names>B. M.</given-names>
</name>
<name>
<surname>Daly</surname>
<given-names>M. J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Clinical use of current polygenic risk scores may exacerbate health disparities</article-title>. <source>Nat. Genet.</source> <volume>51</volume>, <fpage>584</fpage>&#x2013;<lpage>591</lpage>. <pub-id pub-id-type="doi">10.1038/s41588-019-0379-x</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Meisner</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kundu</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y. D.</given-names>
</name>
<name>
<surname>Lan</surname>
<given-names>L. V.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ghandwani</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Combined utility of 25 disease and risk factor polygenic risk scores for stratifying risk of all-cause mortality</article-title>. <source>Am. J. Hum. Genet.</source> <volume>107</volume>, <fpage>418</fpage>&#x2013;<lpage>431</lpage>. <pub-id pub-id-type="doi">10.1016/j.ajhg.2020.07.002</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Paszke</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gross</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Massa</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Lerer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bradbury</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chanan</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>PyTorch: An imperative style, high-performance deep learning library</article-title>,&#x201d; in <source>Advances in neural information processing systems 32</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Wallach</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Larochelle</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Beygelzimer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>d&#x27;Alch&#xe9;-Buc</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Fox</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Garnett</surname>
<given-names>R.</given-names>
</name>
</person-group> (<publisher-loc>Red Hook, New York</publisher-loc>: <publisher-name>Curran Associates, Inc</publisher-name>), <fpage>8024</fpage>&#x2013;<lpage>8035</lpage>.</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pedregosa</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Varoquaux</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Gramfort</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Michel</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Thirion</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Grisel</surname>
<given-names>O.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). <article-title>Scikit-learn: Machine learning in Python</article-title>. <source>J. Mach. Learn. Res.</source> <volume>12</volume>, <fpage>2825</fpage>&#x2013;<lpage>2830</lpage>.</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Priv&#xe9;</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Arbel</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Vilhj&#xe1;lmsson</surname>
<given-names>B. J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>LDpred2: Better, faster, stronger</article-title>. <source>Bioinformatics</source> <volume>36</volume>, <fpage>5424</fpage>&#x2013;<lpage>5431</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btaa1029</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ren</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chakraborty</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Doijad</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Falgenhauer</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Falgenhauer</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Goesmann</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Prediction of antimicrobial resistance based on whole-genome sequencing and machine learning</article-title>. <source>Bioinformatics</source> <volume>38</volume>, <fpage>325</fpage>&#x2013;<lpage>334</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btab681</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Riemenschneider</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Cashin</surname>
<given-names>K. Y.</given-names>
</name>
<name>
<surname>Budeus</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Sierra</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Shirvani-Dastgerdi</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Bayanolhagh</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2016a</year>). <article-title>Genotypic prediction of co-receptor tropism of HIV-1 subtypes A and C</article-title>. <source>Sci. Rep.</source> <volume>6</volume>, <fpage>24883</fpage>. <pub-id pub-id-type="doi">10.1038/srep24883</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Riemenschneider</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hummel</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Heider</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2016b</year>). <article-title>SHIVA - a web application for drug resistance and tropism testing in HIV</article-title>. <source>BMC Bioinforma.</source> <volume>17</volume>, <fpage>314</fpage>. <pub-id pub-id-type="doi">10.1186/s12859-016-1179-2</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rodriguez</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Alameda</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Quattrone</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Tripoli</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Gayer-Anderson</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Spinazzola</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Use of multiple polygenic risk scores for distinguishing schizophrenia-spectrum disorder and affective psychosis categories in a first-episode sample; the EU-GEI study</article-title>. <source>Psychol. Med.</source> <volume>1</volume>, <fpage>1</fpage>&#x2013;<lpage>10</lpage>. <pub-id pub-id-type="doi">10.1017/S0033291721005456</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ruan</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>Y.-F.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>Y.-C. A.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>C.-Y.</given-names>
</name>
<name>
<surname>Lam</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Improving polygenic prediction in ancestrally diverse populations</article-title>. <source>Nat. Genet.</source> <volume>54</volume>, <fpage>573</fpage>&#x2013;<lpage>580</lpage>. <pub-id pub-id-type="doi">10.1038/s41588-022-01054-7</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Schoeler</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Choi</surname>
<given-names>S. W.</given-names>
</name>
<name>
<surname>Dudbridge</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Baldwin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Duncan</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Cecil</surname>
<given-names>C. M.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Multi&#x2013;polygenic score approach to identifying individual vulnerabilities associated with the risk of exposure to bullying</article-title>. <source>JAMA Psychiatry</source> <volume>76</volume>, <fpage>730</fpage>&#x2013;<lpage>738</lpage>. <pub-id pub-id-type="doi">10.1001/jamapsychiatry.2019.0310</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Schr&#xf6;der</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chegwidden</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Maj</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gehlen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Speller</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>B&#xf6;hmer</surname>
<given-names>A. C.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>GWAS meta-analysis of 16 790 patients with Barrett&#x2019;s oesophagus and oesophageal adenocarcinoma identifies 16 novel genetic risk loci and provides insights into disease aetiology beyond the single marker level</article-title>. <source>Gut</source> <volume>72</volume>, <fpage>612</fpage>&#x2013;<lpage>623</lpage>. <pub-id pub-id-type="doi">10.1136/gutjnl-2021-326698</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sinnott-Armstrong</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Tanigawa</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Amar</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Mars</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Benner</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Aguirre</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Genetics of 35 blood and urine biomarkers in the UK Biobank</article-title>. <source>Nat. Genet.</source> <volume>53</volume>, <fpage>185</fpage>&#x2013;<lpage>194</lpage>. <pub-id pub-id-type="doi">10.1038/s41588-020-00757-z</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sp&#xe4;nig</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Emberger-Klein</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sowa</surname>
<given-names>J.-P.</given-names>
</name>
<name>
<surname>Canbay</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Menrad</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Heider</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>The virtual doctor: An interactive clinical-decision-support system based on deep learning for non-invasive prediction of diabetes</article-title>. <source>Artif. Intell. Med.</source> <volume>100</volume>, <fpage>101706</fpage>. <pub-id pub-id-type="doi">10.1016/j.artmed.2019.101706</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Truong</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Hull</surname>
<given-names>L. E.</given-names>
</name>
<name>
<surname>Ruan</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Q. Q.</given-names>
</name>
<name>
<surname>Hornsby</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Martin</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Integrative polygenic risk score improves the prediction accuracy of complex traits and diseases</article-title>. <source>Prepr. Genet. Genomic Med.</source>, <fpage>23286110</fpage>. <pub-id pub-id-type="doi">10.1101/2023.02.21.23286110</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>