<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Public Health</journal-id>
<journal-title>Frontiers in Public Health</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Public Health</abbrev-journal-title>
<issn pub-type="epub">2296-2565</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpubh.2021.626331</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Public Health</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Machine Learning for Predicting the 3-Year Risk of Incident Diabetes in Chinese Adults</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Wu</surname> <given-names>Yang</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1134875/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Hu</surname> <given-names>Haofei</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x02020;</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Cai</surname> <given-names>Jinlin</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Chen</surname> <given-names>Runtian</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Zuo</surname> <given-names>Xin</given-names></name>
<xref ref-type="aff" rid="aff7"><sup>7</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Cheng</surname> <given-names>Heng</given-names></name>
<xref ref-type="aff" rid="aff7"><sup>7</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Yan</surname> <given-names>Dewen</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Department of Endocrinology, The First Affiliated Hospital of Shenzhen University</institution>, <addr-line>Shenzhen</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>Department of Endocrinology, Shenzhen Second People&#x00027;s Hospital</institution>, <addr-line>Shenzhen</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>Shenzhen University Health Science Center</institution>, <addr-line>Shenzhen</addr-line>, <country>China</country></aff>
<aff id="aff4"><sup>4</sup><institution>Department of Nephrology, The First Affiliated Hospital of Shenzhen University</institution>, <addr-line>Shenzhen</addr-line>, <country>China</country></aff>
<aff id="aff5"><sup>5</sup><institution>Department of Nephrology, Shenzhen Second People&#x00027;s Hospital</institution>, <addr-line>Shenzhen</addr-line>, <country>China</country></aff>
<aff id="aff6"><sup>6</sup><institution>Shantou University Medical College</institution>, <addr-line>Shantou</addr-line>, <country>China</country></aff>
<aff id="aff7"><sup>7</sup><institution>Department of Endocrinology, The Third People&#x00027;s Hospital of Shenzhen</institution>, <addr-line>Shenzhen</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Yongcheng He, Shenzhen Hengsheng Hospital, China</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Chi Chen, Guizhou University of Traditional Chinese Medicine, China; Huiqiong Zeng, Shenzhen Futian Hospital for Rheumatic Diseases, China</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Dewen Yan <email>ydewenyy&#x00040;163.com</email></corresp>
<fn fn-type="other" id="fn001"><p>This article was submitted to Clinical Diabetes, a section of the journal Frontiers in Public Health</p></fn>
<fn fn-type="other" id="fn002"><p>&#x02020;These authors have contributed equally to this work</p></fn></author-notes>
<pub-date pub-type="epub">
<day>29</day>
<month>06</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>9</volume>
<elocation-id>626331</elocation-id>
<history>
<date date-type="received">
<day>16</day>
<month>11</month>
<year>2020</year>
</date>
<date date-type="accepted">
<day>21</day>
<month>05</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2021 Wu, Hu, Cai, Chen, Zuo, Cheng and Yan.</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>Wu, Hu, Cai, Chen, Zuo, Cheng and Yan</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license> </permissions>
<abstract><p><bold>Purpose:</bold> We aimed to establish and validate a risk assessment system that combines demographic and clinical variables to predict the 3-year risk of incident diabetes in Chinese adults.</p>
<p><bold>Methods:</bold> A 3-year cohort study was performed on 15,928 Chinese adults without diabetes at baseline. All participants were randomly divided into a training set (<italic>n</italic> = 7,940) and a validation set (<italic>n</italic> = 7,988). The XGBoost method, an effective machine learning technique, was used to select the most important variables from the candidate variables. We then established a stepwise model based on the predictors chosen by the XGBoost model. The area under the receiver operating characteristic curve (AUC), decision curve and calibration analysis were used to assess discrimination, clinical use and calibration of the model, respectively. The external validation was performed on a cohort of 11,113 Japanese participants.</p>
<p><bold>Results:</bold> In the training and validation sets, 145 and 148 incident diabetes cases occurred, respectively. The XGBoost method selected the 10 most important variables from 15 candidate variables. Fasting plasma glucose (FPG), body mass index (BMI) and age were the top 3 important variables. We further established a stepwise model and a prediction nomogram. The AUCs of the stepwise model were 0.933 and 0.910 in the training and validation sets, respectively. The Hosmer-Lemeshow test showed a good fit between the predicted diabetes risk and the observed diabetes risk (<italic>P</italic> = 0.068 for the training set, <italic>P</italic> = 0.165 for the validation set). Decision curve analysis demonstrated the clinical use of the stepwise model across a wide range of threshold probabilities. There were almost no interactions between these predictors (most <italic>P</italic>-values for interaction &#x0003E;0.05). Furthermore, the AUC for the external validation set was 0.830, and the Hosmer-Lemeshow test for the external validation set showed no statistically significant difference between the predicted diabetes risk and observed diabetes risk (<italic>P</italic> = 0.824).</p>
<p><bold>Conclusion:</bold> We established and validated a risk assessment system for characterizing the 3-year risk of incident diabetes.</p></abstract>
<kwd-group>
<kwd>machine learning</kwd>
<kwd>extreme gradient boosting</kwd>
<kwd>simple stepwise model</kwd>
<kwd>incident diabetes</kwd>
<kwd>risk</kwd>
</kwd-group>
<counts>
<fig-count count="9"/>
<table-count count="3"/>
<equation-count count="0"/>
<ref-count count="66"/>
<page-count count="12"/>
<word-count count="7832"/>
</counts>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Highlights</title>
<list list-type="simple">
<list-item><p>- The eXtreme Gradient Boosting system was an effective machine learning technique.</p></list-item>
<list-item><p>- We establish a risk assessment system for characterizing the 3-year risk of diabetes.</p></list-item>
<list-item><p>- The external validation showed that our findings were well-generalized.</p></list-item>
<list-item><p>- Our findings are helpful for identifying individuals at high risk for diabetes.</p></list-item>
</list></sec>
<sec sec-type="intro" id="s2">
<title>Introduction</title>
<p>The epidemic of diabetes has become a major public health threat across the world. The International Diabetes Federation (IDF) estimated that 451 million adults were suffering from diabetes mellitus worldwide in 2017 and the figure was expected to increase to 693 million by 2045 (<xref ref-type="bibr" rid="B1">1</xref>). The prevalence of diabetes among Chinese adults increased from 9.7% in 2007 to 11.2% in 2017 (<xref ref-type="bibr" rid="B2">2</xref>). Diabetes is a debilitating chronic disease with a variety of potential microvascular and macrovascular complications, such as diabetic kidney disease, diabetic retinopathy, diabetic neuropathy, and cardiovascular and cerebrovascular disease (<xref ref-type="bibr" rid="B3">3</xref>&#x02013;<xref ref-type="bibr" rid="B7">7</xref>). Diabetes and its complications have contributed tremendously to the burden on social, financial, and health systems worldwide.</p>
<p>Although diabetes is an irreversible disease, it is largely preventable. Early screening and diagnosis are at the core of effectively preventing diabetes and delaying its progression. Several studies revealed lifestyle modification and pharmacological intervention could reduce the risk of developing diabetes (<xref ref-type="bibr" rid="B8">8</xref>, <xref ref-type="bibr" rid="B9">9</xref>). Moreover, for newly diagnosed diabetic patients, intensive lifestyle intervention, early short-term intensive insulin therapy and metabolic surgery can induce long-term glycemic remission without further antidiabetic medication (<xref ref-type="bibr" rid="B10">10</xref>&#x02013;<xref ref-type="bibr" rid="B12">12</xref>). Therefore, it is essential to identify individuals at high risk of developing diabetes for diabetes prevention programs.</p>
<p>Machine learning has increasingly been utilized to establish risk prediction models in the field of medicine (<xref ref-type="bibr" rid="B13">13</xref>&#x02013;<xref ref-type="bibr" rid="B15">15</xref>). Machine-learning algorithms can be defined as searching through a large number of candidate programs under the guidance of training experience to find a program that optimizes the performance metric (<xref ref-type="bibr" rid="B16">16</xref>). Compared with traditional statistical methods, it is mainly applied to iteratively learn the non-linear interactions from a mass of data through computer algorithms (<xref ref-type="bibr" rid="B17">17</xref>). Several studies showed that machine learning methods could describe an individual&#x00027;s characteristics and identify individuals at high risk of diabetes (<xref ref-type="bibr" rid="B18">18</xref>&#x02013;<xref ref-type="bibr" rid="B21">21</xref>). A gradient tree boosting method implemented in the eXtreme Gradient Boosting (XGBoost) system is an effective machine learning method that can assemble weak prediction models to establish a more reliable prediction model (<xref ref-type="bibr" rid="B22">22</xref>&#x02013;<xref ref-type="bibr" rid="B26">26</xref>). So far, there is no research using the XGBoost method to build diabetes risk prediction models. Therefore, we sought to use the XGBoost method to select the most important variables from candidate variables and further establish and validate a risk assessment system that combines demographic and clinical variables using real-world data from a large cohort of Chinese adults across 32 sites and 11 cities between 2010 and 2016 to predict the 3-year risk of incident diabetes in Chinese adults.</p></sec>
<sec sec-type="materials and methods" id="s3">
<title>Materials and Methods</title>
<sec>
<title>Study Design and Participants</title>
<p>The data was downloaded from the &#x0201C;DATADRYAD&#x0201D; database (<ext-link ext-link-type="uri" xlink:href="http://www.Datadryad.org">www.Datadryad.org</ext-link>), a non-profit computerized database established in China by the Rich Healthcare Group. Its data is available publicly for use. The raw data was provided by Chen et al. (<xref ref-type="bibr" rid="B27">27</xref>). The original study recruited a total of 685,277 participants &#x02265;20 years old with at least two visits from 2010 to 2016 across 32 sites and 11 cities in China.</p>
<p>The following baseline demographic and clinical variables were included: age, gender, smoking and drinking status, family history of diabetes, body mass index (BMI), systolic blood pressure (SBP), diastolic blood pressure (DBP), fasting plasma glucose (FPG), total cholesterol (TC), triglyceride (TG), low density lipoprotein cholesterol (LDL-C), high density lipoprotein cholesterol (HDL-C), serum urea nitrogen (BUN), serum creatinine (Scr), and alanine aminotransferase (ALT). The clinical outcome was incident diabetes during a 3-year follow-up. Baseline exclusion criteria in the original study were as follows: (1) no available information on weight, height and gender; (2) extreme BMI values (&#x0003C;15 or &#x0003E;55 kg/m<sup>2</sup>); (3) visit intervals &#x0003C;2 years; (4) no available fasting plasma glucose value; (5) participants diagnosed with diabetes at baseline (participants diagnosed by self-report or diagnosed by a fasting plasma glucose &#x02265;7.0 mmol/L) and participants with undefined diabetes status at follow-up. A total of 211,833 participants remained after applying exclusion criteria in the original study. In our study, we further excluded participants with incomplete records. To predict the 3-year risk of incident diabetes, we also excluded participants who were lost to follow-up during the 3-year follow-up period, and the censored data were excluded (<xref ref-type="bibr" rid="B28">28</xref>). <xref ref-type="fig" rid="F1">Figure 1</xref> depicted the participants&#x00027; selection process. Finally, a total of 15,928 subjects (10,313 male and 5,615 female) were included in the present study.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Flowchart of study participants.</p></caption>
<graphic xlink:href="fpubh-09-626331-g0001.tif"/>
</fig>
<p>The authors of the original study have waived all copyright and related ownership of the raw data. Therefore, we could use these data for secondary analysis without infringing on the authors&#x00027; rights. Furthermore, the original study was approved by the Rich Healthcare Group Review Board, and the information was retrieved retrospectively. The original study was conducted in accordance with the Declaration of Helsinki, as was this secondary research. The data are anonymous, and the requirement for informed consent was waived by the Rich Healthcare Group Review Board due to the observational nature of the study, as reported elsewhere (<xref ref-type="bibr" rid="B29">29</xref>).</p></sec>
<sec>
<title>Variable Measurement</title>
<p>In each visit to the health check center, participants were required to do a personal questionnaire on demographics, lifestyle, medical history, and family history of chronic disease. And trained staff performed the baseline examination. Weight was measured in light clothing without shoes to the nearest 0.1 kg. The height was accurate to 0.1 cm. BMI was equal to the weight divided by the square of height, which was accurate to 0.1 kg/m<sup>2</sup>. And the staff measured their blood pressure by a standard mercury sphygmomanometer. Fasting venous blood samples were taken after fasting for at least 10 h each visit. Plasma glucose levels were measured by the glucose oxidase method. The clinical measurements of FPG, TC, TG, LDL-C, HDL-C, BUN, Scr, and ALT were conducted by an autoanalyzer (Beckman 5800).</p></sec>
<sec>
<title>Definitions</title>
<p>The definitions of diabetes were fasting blood glucose &#x02265;7.00 mmol/L and/or self-reported diabetes during follow-up. Patients were censored either at the time of the diagnosis or at the last visit, whichever comes first.</p></sec>
<sec>
<title>Statistical Analysis</title>
<p>All eligible participants were randomly assigned to the training set and the validation set. There were 15 candidate baseline variables involving demographic and clinical characteristics. First, we excluded some variables with relatively significant interference based on collinearity screening. Baseline characteristics were described as means &#x000B1; standard deviations (normal distribution) or medians (quartiles) (skewed distribution) for continuous variables and as percentages or frequencies for categorical variables. We used two-sample <italic>t</italic>-tests to analyze differences between the training set and validation set for normally distributed continuous variables, Wilcoxon rank-sum tests for non-normally distributed continuous variables, and chi-square tests for categorical variables.</p>
<p>XGBoost is a scalable tree boosting system that can assemble weak prediction models to establish a more reliable prediction model (<xref ref-type="bibr" rid="B30">30</xref>). During the training process, it can generate a new decision tree through gradient boosting on the basis of the existing decision trees to better predict the results. Therefore, a risk prediction system consisting of a series of decision trees is formed after training. In the application process, the predicted risk output is the cumulative score of each decision tree, representing the probability of the predicted outcome. XGBoost provides the importance score of each variable, indicating the relative number of times the variable is used to distribute data in all trees. We ranked these variables according to the prediction contribution of each variable. The Shapley Additive exPlanations (SHAP) approach can transform the original non-linear XGBoost model into the summed effects of all variable attributions while approximating the output risk for each participant (<xref ref-type="bibr" rid="B31">31</xref>). Thus, the SHAP method was used to interpret the results of the XGBoost model. We used Shapley values to construct dependency graphs to capture the actual relationship between diabetes risk and the three variables with the most significant prediction contribution. Additionally, we summarized the specificity, sensitivity, accuracy, negative predictive value (NPV), positive predictive value (PPV), positive likelihood ratio (PLR), and negative likelihood ratio (NLR) of the XGBoost model at different predicted probabilities.</p>
<p>We further established three prediction models based on the predictors chosen by the XGBoost model. First, we applied all risk factors selected by the XGBoost method to build a full model. Second, according to the multivariable fractional polynomials (MFP) algorithm, we iteratively determined the significant variables and their functional forms by backward elimination to establish the MFP model, so as to eliminate the influence of non-linearity and interaction. Third, we conducted a backward step-down selection process based on the Akaike information criterion (AIC) to establish a stepwise model (<xref ref-type="bibr" rid="B32">32</xref>). While confirming the statistical significance of the predictor factors, the stepwise logistic regression can achieve local optimal goodness of fit. To assess the discrimination of these risk prediction models, we plotted the receiver operating characteristic (ROC) curve and calculated the area under the ROC curve (AUC) with 95% confidence intervals (CI) for the two sets. Given that a nomogram is an intuitive graphical prediction model which provides personalized risk predictions for individuals, we further constructed the nomogram of the stepwise model. The nomogram is built according to the proportional conversion of each regression coefficient to a 0- to 100-point scale in multiple logistic regression (<xref ref-type="bibr" rid="B33">33</xref>). The effect of the variable with the highest &#x003B2; coefficient (absolute value) is assigned 100 points. The points of each variable are added to obtain the total points, which can be converted into the predicted probability of incident diabetes. We used the Hosmer&#x02013;Lemeshow test to compare the predicted risk with the observed 3-year incidence across deciles of predicted diabetes risk, and we plotted the calibration bar graph of the nomogram for the probability of incident diabetes (<xref ref-type="bibr" rid="B34">34</xref>). 
Besides, we performed decision curve analysis to evaluate the clinical use of the prediction model by quantifying the net benefit at different threshold probabilities: subtracting the proportion of participants with false-positive results from the proportion of participants with true-positive results and then weighing the relative hazards of false positive and false negative results to achieve a net benefit from decision-making (<xref ref-type="bibr" rid="B35">35</xref>). And we examined the modifications and interactions between each predictor selected by the stepwise model. In addition, we used a cohort of 11,113 Japanese participants from the NAGALA (NAfd in the Gifu Area, Longitudinal Analysis) database for external validation. The data were also downloaded from the &#x0201C;DATADRYAD&#x0201D; database (<ext-link ext-link-type="uri" xlink:href="http://www.Datadryad.org">www.Datadryad.org</ext-link>), shared by Okamura et al. (<xref ref-type="bibr" rid="B36">36</xref>) from: Ectopic fat obesity presents the greatest risk for incident type 2 diabetes: a population-based longitudinal study. Dryad Digital Repository. <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/s41366-018-0076-3">https://doi.org/10.1038/s41366-018-0076-3</ext-link>. All results are reported in adherence to the TRIPOD statement (<xref ref-type="bibr" rid="B37">37</xref>).</p>
<p>All statistical analyses were performed by the statistical software package R (<ext-link ext-link-type="uri" xlink:href="http://www.R-project.org">http://www.R-project.org</ext-link>, The R Foundation) and Empower-Stats (<ext-link ext-link-type="uri" xlink:href="http://www.empowerstats.com">http://www.empowerstats.com</ext-link>, X&#x00026;Y Solutions, Inc., Boston, MA). The tests were 2-tailed, and <italic>P</italic> &#x0003C; 0.05 was taken as statistically significant.</p></sec></sec>
<sec sec-type="results" id="s4">
<title>Results</title>
<sec>
<title>Baseline Characteristics of the Study Population</title>
<p>A total of 15,928 eligible participants were included in this study. The mean age of all participants was 43.33 &#x000B1; 12.31 years old. The male/female ratio was 1.84:1. The mean BMI was 23.53 &#x000B1; 3.30 kg/m<sup>2</sup>. The mean FPG was 4.85 &#x000B1; 0.66 mmol/L. The mean HDL-C and LDL-C were 1.30 &#x000B1; 0.32 and 2.75 &#x000B1; 0.69 mmol/L, respectively. TC was excluded based on collinearity screening.</p>
<p><xref ref-type="table" rid="T1">Table 1</xref> compared the baseline characteristics of the training set (<italic>n</italic> = 7,940) and the validation set (<italic>n</italic> = 7,988). After a 3-year follow-up, 145 and 148 incident diabetes cases occurred in the training and validation sets, respectively. There were no statistically significant differences in any baseline characteristic or in the number of diabetic patients between the two sets (all <italic>P</italic> &#x0003E; 0.05).</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Baseline characteristics of the training and validation sets.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>Characteristic</bold></th>
<th valign="top" align="center"><bold>Training set</bold></th>
<th valign="top" align="center"><bold>Validation set</bold></th>
<th valign="top" align="center"><bold><italic>P</italic>-value</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Participants</td>
<td valign="top" align="center">7,940</td>
<td valign="top" align="center">7,988</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Incident diabetes</td>
<td/>
<td/>
<td valign="top" align="center">0.901</td>
</tr>
<tr>
<td valign="top" align="left">No</td>
<td valign="top" align="center">7,795 (98.17%)</td>
<td valign="top" align="center">7,840 (98.15%)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Yes</td>
<td valign="top" align="center">145 (1.83%)</td>
<td valign="top" align="center">148 (1.85%)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Age (year)</td>
<td valign="top" align="center">43.43 &#x000B1; 12.45</td>
<td valign="top" align="center">43.24 &#x000B1; 12.17</td>
<td valign="top" align="center">0.339</td>
</tr>
<tr>
<td valign="top" align="left">Gender</td>
<td/>
<td/>
<td valign="top" align="center">0.595</td>
</tr>
<tr>
<td valign="top" align="left">Male</td>
<td valign="top" align="center">5,157 (64.95%)</td>
<td valign="top" align="center">5,156 (64.55%)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Female</td>
<td valign="top" align="center">2,783 (35.05%)</td>
<td valign="top" align="center">2,832 (35.45%)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">BMI (kg/m<sup>2</sup>)</td>
<td valign="top" align="center">23.51 &#x000B1; 3.28</td>
<td valign="top" align="center">23.54 &#x000B1; 3.32</td>
<td valign="top" align="center">0.552</td>
</tr>
<tr>
<td valign="top" align="left">SBP (mmHg)</td>
<td valign="top" align="center">119.90 &#x000B1; 16.00</td>
<td valign="top" align="center">119.62 &#x000B1; 15.77</td>
<td valign="top" align="center">0.266</td>
</tr>
<tr>
<td valign="top" align="left">DBP (mmHg)</td>
<td valign="top" align="center">75.12 &#x000B1; 10.46</td>
<td valign="top" align="center">75.04 &#x000B1; 10.38</td>
<td valign="top" align="center">0.633</td>
</tr>
<tr>
<td valign="top" align="left">FPG (mmol/L)</td>
<td valign="top" align="center">4.86 &#x000B1; 0.66</td>
<td valign="top" align="center">4.84 &#x000B1; 0.66</td>
<td valign="top" align="center">0.247</td>
</tr>
<tr>
<td valign="top" align="left">TG (mmol/L)</td>
<td valign="top" align="center">1.17 (0.80&#x02013;1.77)</td>
<td valign="top" align="center">1.16 (0.80&#x02013;1.75)</td>
<td valign="top" align="center">0.287</td>
</tr>
<tr>
<td valign="top" align="left">HDL-C (mmol/L)</td>
<td valign="top" align="center">1.30 &#x000B1; 0.31</td>
<td valign="top" align="center">1.30 &#x000B1; 0.33</td>
<td valign="top" align="center">0.198</td>
</tr>
<tr>
<td valign="top" align="left">LDL-C (mmol/L)</td>
<td valign="top" align="center">2.75 &#x000B1; 0.69</td>
<td valign="top" align="center">2.75 &#x000B1; 0.69</td>
<td valign="top" align="center">0.913</td>
</tr>
<tr>
<td valign="top" align="left">ALT (U/L)</td>
<td valign="top" align="center">20.00 (14.00&#x02013;30.00)</td>
<td valign="top" align="center">20.00 (14.00&#x02013;30.30)</td>
<td valign="top" align="center">0.566</td>
</tr>
<tr>
<td valign="top" align="left">BUN (mmol/L)</td>
<td valign="top" align="center">4.66 &#x000B1; 1.17</td>
<td valign="top" align="center">4.67 &#x000B1; 1.16</td>
<td valign="top" align="center">0.880</td>
</tr>
<tr>
<td valign="top" align="left">Scr (&#x003BC;mol/L)</td>
<td valign="top" align="center">72.04 &#x000B1; 15.07</td>
<td valign="top" align="center">72.11 &#x000B1; 15.25</td>
<td valign="top" align="center">0.767</td>
</tr>
<tr>
<td valign="top" align="left">Smoking status</td>
<td/>
<td/>
<td valign="top" align="center">0.443</td>
</tr>
<tr>
<td valign="top" align="left">Ever/current</td>
<td valign="top" align="center">1,972 (24.84%)</td>
<td valign="top" align="center">2,026 (25.36%)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Never</td>
<td valign="top" align="center">5,968 (75.16%)</td>
<td valign="top" align="center">5,962 (74.64%)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Drinking status</td>
<td/>
<td/>
<td valign="top" align="center">0.624</td>
</tr>
<tr>
<td valign="top" align="left">Ever/current</td>
<td valign="top" align="center">1,544 (19.45%)</td>
<td valign="top" align="center">1,578 (19.75%)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Never</td>
<td valign="top" align="center">6,396 (80.55%)</td>
<td valign="top" align="center">6,410 (80.25%)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Family history</td>
<td/>
<td/>
<td valign="top" align="center">0.157</td>
</tr>
<tr>
<td valign="top" align="left">No</td>
<td valign="top" align="center">7,400 (93.20%)</td>
<td valign="top" align="center">7,489 (93.75%)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Yes</td>
<td valign="top" align="center">540 (6.80%)</td>
<td valign="top" align="center">499 (6.25%)</td>
<td/>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>Values are n (%), mean &#x000B1; SD, or median (quartiles)</italic>.</p>
<p><italic>BMI, Body mass index; SBP, Systolic blood pressure; DBP, Diastolic blood pressure; FPG, Fasting plasma glucose; TG, Triglyceride; HDL-C, High density lipoprotein cholesterol; LDL-C, Low density lipoprotein cholesterol; ALT, Alanine aminotransferase; BUN, Blood urea nitrogen; Scr, Serum creatinine; Family history, Family history of diabetes</italic>.</p>
</table-wrap-foot>
</table-wrap></sec>
<sec>
<title>Development of XGBoost Model</title>
<p><xref ref-type="supplementary-material" rid="SM1">Supplementary Table 1</xref> presented the variables selected by the XGBoost model and the corresponding prediction contributions. The XGBoost model incorporated FPG, BMI, age, HDL-C, ALT, BUN, SBP, LDL-C, Scr, TG, DBP, current smoking, and drinking. FPG had an importance score of 0.5125 and a relative importance of 1.0000, making it the most important variable. The importance score of BMI was 0.0708 and its relative importance was 0.1382; its prediction contribution was second only to that of FPG. The importance score of age was 0.0658, ranking third in the prediction contributions. <xref ref-type="fig" rid="F2">Figure 2</xref> showed the ranking of the variables based on contributing features. <xref ref-type="supplementary-material" rid="SM1">Supplementary Figure 1</xref> demonstrated the discriminatory ability of the XGBoost model. The AUCs of the XGBoost model in the training set and validation set were 0.977 and 0.920, respectively. Given that FPG, BMI and age were the top 3 contributing features, we further used the SHAP method to explore the actual relationship between diabetes risk and each of them (<xref ref-type="supplementary-material" rid="SM1">Supplementary Figure 2</xref>). When FPG was &#x0003C;4.6 mmol/L, the risk of incident diabetes was at a low level. However, when FPG was &#x0003E;4.6 mmol/L, the risk of developing diabetes increased rapidly as FPG increased. As BMI and age increased, the risk of diabetes gradually increased.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Shapley values-based interpretation of the model. Contributing feature importance of the variables selected by the XGBoost model.</p></caption>
<graphic xlink:href="fpubh-09-626331-g0002.tif"/>
</fig>
<p><xref ref-type="supplementary-material" rid="SM1">Supplementary Figure 3</xref> presented the result of the decision curve analysis for the XGBoost model. The results showed that if the personal threshold probability of a participant is 50% (i.e., the participant would opt for diabetes screening if the probability of incident diabetes was &#x0003E;50%), then the net benefit is 0.453 when using the model to decide whether to perform diabetes screening (i.e., oral glucose tolerance test), with added benefit compared with diabetes screening for all or none of the participants.</p>
<p>And <xref ref-type="supplementary-material" rid="SM1">Supplementary Table 2</xref> summarized the sensitivity and specificity for predicting incident diabetes at different cutoff values in the XGBoost model. The result showed that although higher cutoff values lead to higher specificity, the sensitivity rapidly dropped to a relatively low level.</p></sec>
<sec>
<title>Construction of the Stepwise Model</title>
<p>We further established three prediction models based on the predictors chosen by the XGBoost model, including the MFP model, full model and stepwise model. In the training set, AUCs of the MFP model, full model and stepwise model were 0.937, 0.934 and 0.933, respectively. In the validation set, the corresponding AUCs of those models were 0.908, 0.909 and 0.910, respectively (<xref ref-type="fig" rid="F3">Figure 3</xref>, <xref ref-type="supplementary-material" rid="SM1">Supplementary Table 3</xref>). The AUCs of the three models were relatively close. Given that the stepwise model incorporated fewer risk factors, was simpler than the MFP and full models, and could predict the 3-year diabetes risk relatively well, we chose the stepwise model as the optimal risk prediction model for incident diabetes. <xref ref-type="table" rid="T2">Table 2</xref> showed the 6 variables selected by the stepwise model, including FPG, BMI, age, HDL-C, ALT, and LDL-C. The results showed that FPG, BMI, age, HDL-C and ALT were positively associated with incident diabetes. Participants with relatively high FPG were more likely to develop diabetes [relative risk (RR): 11.2812; 95% CI: 8.0798&#x02013;16.4983]. In contrast, participants with relatively high LDL-C were less likely to develop diabetes (RR: 0.7238; 95% CI: 0.5438&#x02013;0.9229). We further drew a corresponding nomogram to provide a quantitative and simple tool for predicting the risk of diabetes by using age, BMI, FPG, HDL-C, LDL-C, and ALT (<xref ref-type="fig" rid="F4">Figure 4</xref>). Each variable in the nomogram was assigned a specific point, and the points from each variable value were summed to obtain the total points, which were used to obtain the predicted probability of diabetes. 
The algorithm of diabetes risk in the stepwise model was logit (risk of incident diabetes) = &#x02212;24.07232 &#x0002B; 0.04191 &#x000D7; age (year) &#x0002B; 0.15291 &#x000D7; BMI (kg/m<sup>2</sup>) &#x0002B; 2.45073 &#x000D7; FPG (mmol/L) &#x0002B; 1.14025 &#x000D7; HDL-C (mmol/L) &#x02212; 0.32400 &#x000D7; LDL-C (mmol/L) &#x0002B; 0.00852 &#x000D7; ALT (U/L).</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>The ROC curves of the MFP model, full model and stepwise model in the training set <bold>(A)</bold> and validation set <bold>(B)</bold>.</p></caption>
<graphic xlink:href="fpubh-09-626331-g0003.tif"/>
</fig>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Variables selected using stepwise logistic regression.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th/>
<th valign="top" align="center"><bold>Beta</bold></th>
<th valign="top" align="center"><bold>Standard error</bold></th>
<th valign="top" align="center"><bold><italic>z</italic>-value</bold></th>
<th valign="top" align="center"><bold>RR (95%CI)</bold></th>
<th valign="top" align="center"><bold><italic>P</italic>-value</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">(Intercept)</td>
<td valign="top" align="center">&#x02212;24.07232</td>
<td valign="top" align="center">1.34753</td>
<td valign="top" align="center">&#x02212;17.86405</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x02013;</td>
</tr>
<tr>
<td valign="top" align="left">FPG (mmol/L)</td>
<td valign="top" align="center">2.45073</td>
<td valign="top" align="center">0.15763</td>
<td valign="top" align="center">15.54774</td>
<td valign="top" align="center">11.2812 (8.0798&#x02013;16.4983)</td>
<td valign="top" align="center">0.0000</td>
</tr>
<tr>
<td valign="top" align="left">HDL-C (mmol/L)</td>
<td valign="top" align="center">1.14025</td>
<td valign="top" align="center">0.29593</td>
<td valign="top" align="center">3.85313</td>
<td valign="top" align="center">3.1101 (1.7651&#x02013;5.8612)</td>
<td valign="top" align="center">0.0000</td>
</tr>
<tr>
<td valign="top" align="left">BMI (kg/m<sup>2</sup>)</td>
<td valign="top" align="center">0.15291</td>
<td valign="top" align="center">0.03016</td>
<td valign="top" align="center">5.07010</td>
<td valign="top" align="center">1.1647 (1.0911&#x02013;1.2413)</td>
<td valign="top" align="center">0.0000</td>
</tr>
<tr>
<td valign="top" align="left">Age (year)</td>
<td valign="top" align="center">0.04191</td>
<td valign="top" align="center">0.00765</td>
<td valign="top" align="center">5.47752</td>
<td valign="top" align="center">1.0427 (1.0276&#x02013;1.0578)</td>
<td valign="top" align="center">0.0000</td>
</tr>
<tr>
<td valign="top" align="left">ALT (U/L)</td>
<td valign="top" align="center">0.00852</td>
<td valign="top" align="center">0.00335</td>
<td valign="top" align="center">2.53939</td>
<td valign="top" align="center">1.0085 (1.0022&#x02013;1.0146)</td>
<td valign="top" align="center">0.0060</td>
</tr>
<tr>
<td valign="top" align="left">LDL-C (mmol/L)</td>
<td valign="top" align="center">&#x02212;0.32400</td>
<td valign="top" align="center">0.14526</td>
<td valign="top" align="center">&#x02212;2.23050</td>
<td valign="top" align="center">0.7238 (0.5438&#x02013;0.9229)</td>
<td valign="top" align="center">0.0030</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>FPG, Fasting plasma glucose; HDL-C, High density lipoprotein cholesterol; BMI, Body mass index; LDL-C, Low density lipoprotein cholesterol; ALT, Alanine aminotransferase; RR, Relative risk; CI, Confidence interval</italic>.</p>
</table-wrap-foot>
</table-wrap>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>The nomogram of the stepwise model to predict the 3-year risk of incident diabetes. When predicting an individual&#x00027;s 3-year risk of diabetes, locate his/her value on each variable axis. Draw a vertical line from that value to the top Points scale to determine how many points are assigned by that variable value. Then, the points from each variable value are summed. Locate the sum on the Total Points scale and vertically project it onto the bottom axis, thus obtaining a personalized 3-year risk of diabetes.</p></caption>
<graphic xlink:href="fpubh-09-626331-g0004.tif"/>
</fig></sec>
<sec>
<title>Performance of the Stepwise Model</title>
<p>The AUCs of the stepwise model were 0.933 and 0.910 in the training and validation sets, respectively (<xref ref-type="fig" rid="F5">Figure 5</xref>). The result of bootstrap resampling validation (times = 500) confirmed that the prediction performance of the stepwise model in the training cohort was stable (AUC = 0.927) (<xref ref-type="supplementary-material" rid="SM1">Supplementary Figure 4</xref>). The calibration bar graph of the nomogram for the probability of incident diabetes demonstrated good agreement between observation and prediction in both the training and validation sets (<xref ref-type="fig" rid="F6">Figure 6</xref>). The Hosmer-Lemeshow test was non-significant (<italic>p</italic> = 0.068 for the training set, <italic>p</italic> = 0.165 for the validation set), suggesting a perfect fit between the predicted diabetes risk and the observed diabetes risk.</p>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>The ROC curves of the stepwise model in the training set and validation set.</p></caption>
<graphic xlink:href="fpubh-09-626331-g0005.tif"/>
</fig>
<fig id="F6" position="float">
<label>Figure 6</label>
<caption><p>Comparison between predicted and observed 3-year incidence of deciles of the predicted diabetes risk score in the nomogram for the training set <bold>(A)</bold> and validation set <bold>(B)</bold>.</p></caption>
<graphic xlink:href="fpubh-09-626331-g0006.tif"/>
</fig>
<p><xref ref-type="fig" rid="F7">Figure 7</xref> presented the result of decision curve analysis for the stepwise model. The decision curve demonstrated that if the threshold probability of a patient was &#x0003E;1%, using the stepwise model to predict incident diabetes was more beneficial than diabetes screening for all or none of the participants. There was a wide range of alternative threshold probabilities, which indicated that the stepwise model had significant clinical use.</p>
<fig id="F7" position="float">
<label>Figure 7</label>
<caption><p>The decision curve for the stepwise model predicting the risk of incident diabetes in the training set <bold>(A)</bold> and validation set <bold>(B)</bold>. Net benefit is shown on the y-axis. The red line represents the model; the thin gray line represents the assumption that all participants develop diabetes; the thin black line represents the assumption that none of the participants develop diabetes. The decision curve demonstrated that if the threshold probability of a patient is &#x0003E;1%, using the model to predict incident diabetes adds more benefit than diabetes screenings (i.e., oral glucose tolerance test) for all or none of the participants.</p></caption>
<graphic xlink:href="fpubh-09-626331-g0007.tif"/>
</fig></sec>
<sec>
<title>Modifications and Interactions Between Each Predictor in the Nomogram</title>
<p>We examined the modifications and interactions between each predictor selected by the stepwise model, including age, BMI, FPG, HDL-C, LDL-C, and ALT. <xref ref-type="table" rid="T3">Table 3</xref> showed that almost no interactions were observed based on our prior specification (most <italic>P</italic>-values for interaction &#x0003E;0.05), except that BMI and FPG had significant interactions (<italic>P</italic>-values for interaction = 0.017).</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Modifications and interactions between each predictor selected by the stepwise model.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>Predictor</bold></th>
<th valign="top" align="center"><bold>Modifier</bold></th>
<th valign="top" align="center"><bold>HR (95%CI)</bold></th>
<th valign="top" align="center"><bold><italic>P</italic> for interaction</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Age</td>
<td valign="top" align="center">BMI</td>
<td valign="top" align="center">0.997 (0.994, 1.001)</td>
<td valign="top" align="center">0.186</td>
</tr>
<tr>
<td valign="top" align="left">Age</td>
<td valign="top" align="center">FPG</td>
<td valign="top" align="center">0.980 (0.958, 1.002)</td>
<td valign="top" align="center">0.077</td>
</tr>
<tr>
<td valign="top" align="left">Age</td>
<td valign="top" align="center">ALT</td>
<td valign="top" align="center">1.000 (0.999, 1.000)</td>
<td valign="top" align="center">0.824</td>
</tr>
<tr>
<td valign="top" align="left">Age</td>
<td valign="top" align="center">HDL-C</td>
<td valign="top" align="center">1.015 (0.969, 1.064)</td>
<td valign="top" align="center">0.524</td>
</tr>
<tr>
<td valign="top" align="left">Age</td>
<td valign="top" align="center">LDL-C</td>
<td valign="top" align="center">0.996 (0.974, 1.018)</td>
<td valign="top" align="center">0.699</td>
</tr>
<tr>
<td valign="top" align="left">ALT</td>
<td valign="top" align="center">FPG</td>
<td valign="top" align="center">1.001 (0.991, 1.011)</td>
<td valign="top" align="center">0.902</td>
</tr>
<tr>
<td valign="top" align="left">ALT</td>
<td valign="top" align="center">BMI</td>
<td valign="top" align="center">1.000 (0.999, 1.002)</td>
<td valign="top" align="center">0.627</td>
</tr>
<tr>
<td valign="top" align="left">ALT</td>
<td valign="top" align="center">HDL-C</td>
<td valign="top" align="center">0.999 (0.979, 1.019)</td>
<td valign="top" align="center">0.896</td>
</tr>
<tr>
<td valign="top" align="left">ALT</td>
<td valign="top" align="center">LDL-C</td>
<td valign="top" align="center">0.994 (0.986, 1.002)</td>
<td valign="top" align="center">0.148</td>
</tr>
<tr>
<td valign="top" align="left">BMI</td>
<td valign="top" align="center">FPG</td>
<td valign="top" align="center">0.904 (0.832, 0.982)</td>
<td valign="top" align="center">0.017</td>
</tr>
<tr>
<td valign="top" align="left">BMI</td>
<td valign="top" align="center">HDL-C</td>
<td valign="top" align="center">0.978 (0.840, 1.139)</td>
<td valign="top" align="center">0.776</td>
</tr>
<tr>
<td valign="top" align="left">BMI</td>
<td valign="top" align="center">LDL-C</td>
<td valign="top" align="center">1.001 (0.923, 1.086)</td>
<td valign="top" align="center">0.979</td>
</tr>
<tr>
<td valign="top" align="left">FPG</td>
<td valign="top" align="center">HDL-C</td>
<td valign="top" align="center">1.903 (0.692, 5.233)</td>
<td valign="top" align="center">0.213</td>
</tr>
<tr>
<td valign="top" align="left">FPG</td>
<td valign="top" align="center">LDL-C</td>
<td valign="top" align="center">1.034 (0.643, 1.665)</td>
<td valign="top" align="center">0.889</td>
</tr>
<tr>
<td valign="top" align="left">HDL-C</td>
<td valign="top" align="center">LDL-C</td>
<td valign="top" align="center">1.268 (0.560, 2.872)</td>
<td valign="top" align="center">0.569</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>FPG, Fasting plasma glucose; HDL-C, High density lipoprotein cholesterol; BMI, Body mass index; LDL-C, Low density lipoprotein cholesterol; ALT, Alanine aminotransferase; HR, Hazard Ratio; CI, Confidence interval</italic>.</p>
</table-wrap-foot>
</table-wrap></sec>
<sec>
<title>External Validation</title>
<p>The external validation was performed on a cohort of 11,113 Japanese participants. The AUC for the external validation set was 0.830, which showed good discrimination (<xref ref-type="fig" rid="F8">Figure 8</xref>). And the Hosmer-Lemeshow test for the external validation set showed no statistically significant difference between the predicted diabetes risk and observed diabetes risk, which revealed a perfect fit between the predicted diabetes risk and the observed diabetes risk (<italic>P</italic> = 0.824) (<xref ref-type="fig" rid="F9">Figure 9</xref>). In short, the external validation indicated that the stepwise model was well-generalized.</p>
<fig id="F8" position="float">
<label>Figure 8</label>
<caption><p>The ROC curves of the external validation.</p></caption>
<graphic xlink:href="fpubh-09-626331-g0008.tif"/>
</fig>
<fig id="F9" position="float">
<label>Figure 9</label>
<caption><p>Comparison between predicted and observed 3-year incidence of deciles of a predicted diabetes risk score for the external validation set.</p></caption>
<graphic xlink:href="fpubh-09-626331-g0009.tif"/>
</fig></sec></sec>
<sec sec-type="discussion" id="s5">
<title>Discussion</title>
<p>In the present study, we established and validated a risk assessment system for characterizing the 3-year risk of incident diabetes. The XGBoost model incorporated FPG, BMI, age, HDL-C, ALT, BUN, SBP, LDL-C, Scr, TG, DBP, current smoking, and drinking, of which FPG, BMI and age shared the top three prediction contributions. We further established a stepwise model and a corresponding prediction nomogram based on the predictors chosen by the XGBoost model. The AUCs of the stepwise model were 0.933 and 0.910 in the training and validation sets, respectively. The Hosmer-Lemeshow test showed a perfect fit between the predicted diabetes risk and the observed diabetes risk (<italic>p</italic> = 0.068 for the training set, <italic>p</italic> = 0.165 for the validation set). Decision curve analysis presented the clinical use of the stepwise model and there was a wide range of alternative threshold probabilities. Furthermore, the AUC for the external validation set was 0.830, and the Hosmer-Lemeshow test for the external validation set showed no statistically significant difference between the predicted diabetes risk and observed diabetes risk (<italic>P</italic> = 0.824). Therefore, the external validation indicated that the stepwise model was well-generalized.</p>
<p>Machine learning is a collection of data analysis techniques, which aims to establish prediction models that improve with experience and it is becoming an important part of modern medical research (<xref ref-type="bibr" rid="B13">13</xref>). It holds promise to enable computers to assist humans to analyze large and complex data sets (<xref ref-type="bibr" rid="B14">14</xref>). So far, researchers have developed a variety of machine learning algorithms, including decision trees, kernel machines, neural networks, support vector machines, logistic regression, Bayesian classifiers, ensemble learning, multilayer perceptron, and so on (<xref ref-type="bibr" rid="B38">38</xref>&#x02013;<xref ref-type="bibr" rid="B45">45</xref>). Machine learning has unique advantages, including scalability and flexibility, making it applicable to various tasks, such as classification, risk stratification, diagnosis and survival predictions (<xref ref-type="bibr" rid="B46">46</xref>). Besides, it handles large multidimensional sets of time-to-event data without the need for assumptions of normality of distributions, linearity of risk prediction, and overfitting of models (<xref ref-type="bibr" rid="B47">47</xref>). As yet, machine learning techniques have been applied to a broad range of areas within diabetes, some of which are used to build risk prediction models for incident diabetes (<xref ref-type="bibr" rid="B20">20</xref>, <xref ref-type="bibr" rid="B21">21</xref>, <xref ref-type="bibr" rid="B48">48</xref>&#x02013;<xref ref-type="bibr" rid="B52">52</xref>). As a novel machine learning method, XGBoost has become well-established in the machine learning community and gained a positive reputation through numerous machine learning challenges (<xref ref-type="bibr" rid="B53">53</xref>). The XGBoost algorithm can automatically handle missing data by adding a default direction for the missing values in each tree node (<xref ref-type="bibr" rid="B54">54</xref>). 
XGBoost has higher calculating speed and accuracy based on the principle of gradient boosting (<xref ref-type="bibr" rid="B30">30</xref>). Evidence showed that XGBoost&#x00027;s performance was significantly better than that of traditional statistical methods (<xref ref-type="bibr" rid="B24">24</xref>, <xref ref-type="bibr" rid="B55">55</xref>, <xref ref-type="bibr" rid="B56">56</xref>). To our knowledge, the XGBoost method has not been applied to develop a diabetes risk prediction model. In addition, in those studies using machine learning techniques to predict the risk of diabetes, researchers mainly focused on comparing various machine learning methods but did not extend the results of machine learning to clinical applications. The AUCs of those models were between 0.580 and 0.925 (<xref ref-type="bibr" rid="B20">20</xref>, <xref ref-type="bibr" rid="B21">21</xref>, <xref ref-type="bibr" rid="B48">48</xref>&#x02013;<xref ref-type="bibr" rid="B52">52</xref>). However, this was the first study that used the XGBoost method to evaluate the importance of variables and characterize the 3-year risk of incident diabetes among Chinese adults. Notably, we ranked the variables according to the prediction contribution of each selected variable. Furthermore, we used the SHAP method to capture the actual relationship between diabetes risk and the three variables with the largest predicted contribution. Moreover, we developed a simple stepwise model and constructed a corresponding nomogram based on the XGBoost model. We also performed the Hosmer-Lemeshow test to explore the difference between the predicted diabetes risk and the observed diabetes risk. In addition, we conducted decision curve analysis to explore the clinical use of the stepwise model, and there was a wide range of alternative threshold probabilities. Moreover, we examined the modifications and interactions between each predictor selected by the stepwise model. 
Furthermore, we used a cohort of 11,113 Japanese participants as the external validation set to explore the reliability and generalizability of the stepwise model.</p>
<p>Diabetes can cause various complications, bring severe physical and psychological distress to patients, and bring a huge burden to the healthcare system. It tends to be undiagnosed due to the lack of specific symptoms. However, screening for diabetes through oral glucose tolerance test may increase the yield and economic efficiency of screening (<xref ref-type="bibr" rid="B57">57</xref>). Our results made up for this deficiency, helping to identify individuals with a high risk of developing diabetes and to avoid the costs and efforts of prevention and treatment in low-risk groups.</p>
<p>Identifying key factors has great clinical significance in the risk assessment of incident diabetes. FPG was the most important risk predictor in our study. Impaired fasting plasma glucose is one of the diagnostic criteria for diabetes. Researchers found that, compared with those with impaired fasting blood glucose, people with normal fasting blood glucose have a significantly lower risk of developing diabetes (4.0 vs. 11.3%) (<xref ref-type="bibr" rid="B58">58</xref>). BMI had the second-largest predicted contribution. The original research showed that for every 1 kg/m<sup>2</sup> increase in BMI among Chinese adults, the risk of diabetes increases by 23% (<xref ref-type="bibr" rid="B27">27</xref>). Multiple studies have demonstrated that overweight or obesity is related to the risk of diabetes (<xref ref-type="bibr" rid="B59">59</xref>, <xref ref-type="bibr" rid="B60">60</xref>). Evidence showed that obesity, dyslipidemia, abnormal hepatocellular function, and diabetes usually coexist in the same subject and have common pathological mediators (inflammation, metabolic disorders, insulin resistance and intestinal flora imbalance, etc.) (<xref ref-type="bibr" rid="B61">61</xref>&#x02013;<xref ref-type="bibr" rid="B63">63</xref>). The prevalence of diabetes markedly increases with age (<xref ref-type="bibr" rid="B64">64</xref>). The aging of pancreatic &#x003B2; cells can lead to decreased glucose sensitivity and insulin secretion defects (<xref ref-type="bibr" rid="B65">65</xref>). Therefore, the application of these risk predictors in our models is well-founded.</p>
<p>There are some strengths of our study, as follows: (1) As a large-scale multicenter study, our models can be well-applied to the Chinese population. (2) This was the first study that used the XGBoost method to characterize the 3-year risk of incident diabetes. (3) We presented the predicted contribution of each variable selected by the XGBoost model and sorted them in the form of a bar chart. (4) We developed a simple stepwise model based on the XGBoost model and constructed a corresponding nomogram to provide a personalized risk assessment tool. (5) We examined the modifications and interactions between each predictor selected by the stepwise model. (6) We used a cohort of Japanese participants as the external validation set to explore the reliability and generalizability of the stepwise model. (7) Since this was a retrospective cohort study, it could decrease the risk of selection bias and observation bias.</p>
<p>However, there are still some potential limitations. First, the variables we extracted were limited and lacked information about other diabetes risk factors, such as glycosylated hemoglobin, serum insulin and C-peptide concentration. Second, due to the original study design, we cannot distinguish the types of diabetes mellitus. Considering type 2 diabetes mellitus is the most common kind of diabetes, accounting for over 90% of diabetes cases (<xref ref-type="bibr" rid="B66">66</xref>), our findings represent type 2 diabetes mellitus. Third, the researchers did not perform a 2-h oral glucose tolerance test. Thus, our diagnostic criteria for diabetes mellitus may have missed some diabetic patients. However, it is not feasible to perform an oral glucose tolerance test on all participants in such a large-scale cohort study. Fourth, there are too many missing values of variables in the original data, and multiple imputations to replace missing values were not feasible. Therefore, we excluded participants with incomplete records and performed a complete-case analysis.</p></sec>
<sec sec-type="conclusions" id="s6">
<title>Conclusion</title>
<p>We established and validated a risk assessment system for characterizing the 3-year risk of incident diabetes, which showed outstanding performance. And FPG, BMI and age shared the top three prediction contributions. We also constructed a prediction nomogram to provide a personalized risk assessment tool for developing diabetes.</p></sec>
<sec sec-type="data-availability-statement" id="s7">
<title>Data Availability Statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="supplementary-material" rid="SM1">Supplementary Materials</xref>; further inquiries can be directed to the corresponding author.</p></sec>
<sec id="s8">
<title>Ethics Statement</title>
<p>The studies involving human participants were reviewed and approved by the Rich Healthcare Group Review Board, and the information was retrieved retrospectively. The data are anonymous, and the requirement for informed consent was waived by the Rich Healthcare Group Review Board due to the observational nature of the study, as reported elsewhere.</p></sec>
<sec id="s9">
<title>Author Contributions</title>
<p>YW and HH conceived and designed the research and drafted the manuscript. JC and RC did statistical analysis. XZ and HC took part in the discussion. DY revised the manuscript. All authors read and approved the final manuscript.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
</body>
<back>
<ack><p>The authors thank the funder and all study participants.</p>
</ack><sec sec-type="supplementary-material" id="s10">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fpubh.2021.626331/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fpubh.2021.626331/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cho</surname> <given-names>NH</given-names></name> <name><surname>Shaw</surname> <given-names>JE</given-names></name> <name><surname>Karuranga</surname> <given-names>S</given-names></name> <name><surname>Huang</surname> <given-names>Y</given-names></name> <name><surname>Da</surname> <given-names>RFJ</given-names></name> <name><surname>Ohlrogge</surname> <given-names>AW</given-names></name> <etal/></person-group>. <article-title>IDF diabetes atlas: global estimates of diabetes prevalence for 2017 and projections for 2045</article-title>. <source>Diabetes Res Clin Pract.</source> (<year>2018</year>) <volume>138</volume>:<fpage>271</fpage>&#x02013;<lpage>81</lpage>. <pub-id pub-id-type="doi">10.1016/j.diabres.2018.02.023</pub-id></citation>
</ref>
<ref id="B2">
<label>2.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>Y</given-names></name> <name><surname>Teng</surname> <given-names>D</given-names></name> <name><surname>Shi</surname> <given-names>X</given-names></name> <name><surname>Qin</surname> <given-names>G</given-names></name> <name><surname>Qin</surname> <given-names>Y</given-names></name> <name><surname>Quan</surname> <given-names>H</given-names></name> <etal/></person-group>. <article-title>Prevalence of diabetes recorded in mainland China using 2018 diagnostic criteria from the American Diabetes Association: national cross sectional study</article-title>. <source>BMJ.</source> (<year>2020</year>) <volume>369</volume>:<fpage>m997</fpage>. <pub-id pub-id-type="doi">10.1136/bmj.m997</pub-id><pub-id pub-id-type="pmid">32345662</pub-id></citation></ref>
<ref id="B3">
<label>3.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Thomas</surname> <given-names>MC</given-names></name> <name><surname>Brownlee</surname> <given-names>M</given-names></name> <name><surname>Susztak</surname> <given-names>K</given-names></name> <name><surname>Sharma</surname> <given-names>K</given-names></name> <name><surname>Jandeleit-Dahm</surname> <given-names>KA</given-names></name> <name><surname>Zoungas</surname> <given-names>S</given-names></name> <etal/></person-group>. <article-title>Diabetic kidney disease</article-title>. <source>Nat Rev Dis Primers.</source> (<year>2015</year>) <volume>1</volume>:<fpage>15018</fpage>. <pub-id pub-id-type="doi">10.1038/nrdp.2015.70</pub-id></citation></ref>
<ref id="B4">
<label>4.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cheung</surname> <given-names>N</given-names></name> <name><surname>Mitchell</surname> <given-names>P</given-names></name> <name><surname>Wong</surname> <given-names>TY</given-names></name></person-group>. <article-title>Diabetic retinopathy</article-title>. <source>Lancet.</source> (<year>2010</year>) <volume>376</volume>:<fpage>124</fpage>&#x02013;<lpage>36</lpage>. <pub-id pub-id-type="doi">10.1016/S0140-6736(09)62124-3</pub-id></citation></ref>
<ref id="B5">
<label>5.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Feldman</surname> <given-names>EL</given-names></name> <name><surname>Callaghan</surname> <given-names>BC</given-names></name> <name><surname>Pop-Busui</surname> <given-names>R</given-names></name> <name><surname>Zochodne</surname> <given-names>DW</given-names></name> <name><surname>Wright</surname> <given-names>DE</given-names></name> <name><surname>Bennett</surname> <given-names>DL</given-names></name> <etal/></person-group>. <article-title>Diabetic neuropathy</article-title>. <source>Nat Rev Dis Primers.</source> (<year>2019</year>) <volume>5</volume>:<fpage>42</fpage>. <pub-id pub-id-type="doi">10.1038/s41572-019-0092-1</pub-id></citation></ref>
<ref id="B6">
<label>6.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zelniker</surname> <given-names>TA</given-names></name> <name><surname>Wiviott</surname> <given-names>SD</given-names></name> <name><surname>Raz</surname> <given-names>I</given-names></name> <name><surname>Im</surname> <given-names>K</given-names></name> <name><surname>Goodrich</surname> <given-names>EL</given-names></name> <name><surname>Bonaca</surname> <given-names>MP</given-names></name> <etal/></person-group>. <article-title>SGLT2 inhibitors for primary and secondary prevention of cardiovascular and renal outcomes in type 2 diabetes: a systematic review and meta-analysis of cardiovascular outcome trials</article-title>. <source>Lancet.</source> (<year>2019</year>) <volume>393</volume>:<fpage>31</fpage>&#x02013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1016/S0140-6736(18)32590-X</pub-id><pub-id pub-id-type="pmid">30424892</pub-id></citation></ref>
<ref id="B7">
<label>7.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hardigan</surname> <given-names>T</given-names></name> <name><surname>Ward</surname> <given-names>R</given-names></name> <name><surname>Ergul</surname> <given-names>A</given-names></name></person-group>. <article-title>Cerebrovascular complications of diabetes: focus on cognitive dysfunction</article-title>. <source>Clin Sci.</source> (<year>2016</year>) <volume>130</volume>:<fpage>1807</fpage>&#x02013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.1042/CS20160397</pub-id><pub-id pub-id-type="pmid">27634842</pub-id></citation></ref>
<ref id="B8">
<label>8.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>Y</given-names></name> <name><surname>Wang</surname> <given-names>DD</given-names></name> <name><surname>Ley</surname> <given-names>SH</given-names></name> <name><surname>Vasanti</surname> <given-names>M</given-names></name> <name><surname>Howard</surname> <given-names>AG</given-names></name> <name><surname>He</surname> <given-names>Y</given-names></name> <etal/></person-group>. <article-title>Time trends of dietary and lifestyle factors and their potential impact on diabetes burden in China</article-title>. <source>Diabetes Care.</source> (<year>2017</year>) <volume>40</volume>:<fpage>1685</fpage>&#x02013;<lpage>94</lpage>. <pub-id pub-id-type="doi">10.2337/dc17-0571</pub-id><pub-id pub-id-type="pmid">29046327</pub-id></citation></ref>
<ref id="B9">
<label>9.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Le Roux</surname> <given-names>CW</given-names></name> <name><surname>Astrup</surname> <given-names>A</given-names></name> <name><surname>Fujioka</surname> <given-names>K</given-names></name> <name><surname>Greenway</surname> <given-names>F</given-names></name> <name><surname>Lau</surname> <given-names>D</given-names></name> <name><surname>Van Gaal</surname> <given-names>L</given-names></name> <etal/></person-group>. <article-title>3 years of liraglutide versus placebo for type 2 diabetes risk reduction and weight management in individuals with prediabetes: a randomised, double-blind trial</article-title>. <source>Lancet.</source> (<year>2017</year>) <volume>389</volume>:<fpage>1399</fpage>&#x02013;<lpage>409</lpage>. <pub-id pub-id-type="doi">10.1016/S0140-6736(17)30069-7</pub-id><pub-id pub-id-type="pmid">28237263</pub-id></citation></ref>
<ref id="B10">
<label>10.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gregg</surname> <given-names>EW</given-names></name> <name><surname>Chen</surname> <given-names>H</given-names></name> <name><surname>Wagenknecht</surname> <given-names>LE</given-names></name> <name><surname>Clark</surname> <given-names>JM</given-names></name> <name><surname>Delahanty</surname> <given-names>LM</given-names></name> <name><surname>Bantle</surname> <given-names>J</given-names></name> <etal/></person-group>. <article-title>Association of an intensive lifestyle intervention with remission of type 2 diabetes</article-title>. <source>JAMA.</source> (<year>2012</year>) <volume>308</volume>:<fpage>2489</fpage>&#x02013;<lpage>96</lpage>. <pub-id pub-id-type="doi">10.1001/jama.2012.67929</pub-id><pub-id pub-id-type="pmid">23288372</pub-id></citation></ref>
<ref id="B11">
<label>11.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Brito</surname> <given-names>JP</given-names></name> <name><surname>Montori</surname> <given-names>VM</given-names></name> <name><surname>Davis</surname> <given-names>AM</given-names></name></person-group>. <article-title>Metabolic surgery in the treatment algorithm for type 2 diabetes: a joint statement by international diabetes organizations</article-title>. <source>JAMA.</source> (<year>2017</year>) <volume>317</volume>:<fpage>635</fpage>&#x02013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1001/jama.2016.20563</pub-id><pub-id pub-id-type="pmid">28196240</pub-id></citation></ref>
<ref id="B12">
<label>12.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shi</surname> <given-names>X</given-names></name> <name><surname>Shi</surname> <given-names>Y</given-names></name> <name><surname>Chen</surname> <given-names>N</given-names></name> <name><surname>Lin</surname> <given-names>M</given-names></name> <name><surname>Su</surname> <given-names>W</given-names></name> <name><surname>Zhang</surname> <given-names>H</given-names></name> <etal/></person-group>. <article-title>Effect of exenatide after short-time intensive insulin therapy on glycaemic remission maintenance in type 2 diabetes patients: a randomized controlled trial</article-title>. <source>Sci Rep.</source> (<year>2017</year>) <volume>7</volume>:<fpage>2383</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-017-02631-1</pub-id><pub-id pub-id-type="pmid">28539618</pub-id></citation></ref>
<ref id="B13">
<label>13.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Camacho</surname> <given-names>DM</given-names></name> <name><surname>Collins</surname> <given-names>KM</given-names></name> <name><surname>Powers</surname> <given-names>RK</given-names></name> <name><surname>Costello</surname> <given-names>JC</given-names></name> <name><surname>Collins</surname> <given-names>JJ</given-names></name></person-group>. <article-title>Next-generation machine learning for biological networks</article-title>. <source>Cell.</source> (<year>2018</year>) <volume>173</volume>:<fpage>1581</fpage>&#x02013;<lpage>92</lpage>. <pub-id pub-id-type="doi">10.1016/j.cell.2018.05.015</pub-id><pub-id pub-id-type="pmid">29887378</pub-id></citation></ref>
<ref id="B14">
<label>14.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Libbrecht</surname> <given-names>MW</given-names></name> <name><surname>Noble</surname> <given-names>WS</given-names></name></person-group>. <article-title>Machine learning applications in genetics and genomics</article-title>. <source>Nat Rev Genet.</source> (<year>2015</year>) <volume>16</volume>:<fpage>321</fpage>&#x02013;<lpage>32</lpage>. <pub-id pub-id-type="doi">10.1038/nrg3920</pub-id><pub-id pub-id-type="pmid">25948244</pub-id></citation></ref>
<ref id="B15">
<label>15.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dwyer</surname> <given-names>DB</given-names></name> <name><surname>Falkai</surname> <given-names>P</given-names></name> <name><surname>Koutsouleris</surname> <given-names>N</given-names></name></person-group>. <article-title>Machine learning approaches for clinical psychology and psychiatry</article-title>. <source>Annu Rev Clin Psychol.</source> (<year>2018</year>) <volume>14</volume>:<fpage>91</fpage>&#x02013;<lpage>118</lpage>. <pub-id pub-id-type="doi">10.1146/annurev-clinpsy-032816-045037</pub-id><pub-id pub-id-type="pmid">29401044</pub-id></citation></ref>
<ref id="B16">
<label>16.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jordan</surname> <given-names>MI</given-names></name> <name><surname>Mitchell</surname> <given-names>TM</given-names></name></person-group>. <article-title>Machine learning: trends, perspectives, and prospects</article-title>. <source>Science.</source> (<year>2015</year>) <volume>349</volume>:<fpage>255</fpage>&#x02013;<lpage>60</lpage>. <pub-id pub-id-type="doi">10.1126/science.aaa8415</pub-id><pub-id pub-id-type="pmid">26185243</pub-id></citation></ref>
<ref id="B17">
<label>17.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lagani</surname> <given-names>V</given-names></name> <name><surname>Koumakis</surname> <given-names>L</given-names></name> <name><surname>Chiarugi</surname> <given-names>F</given-names></name> <name><surname>Lakasing</surname> <given-names>E</given-names></name> <name><surname>Tsamardinos</surname> <given-names>I</given-names></name></person-group>. <article-title>A systematic review of predictive risk models for diabetes complications based on large scale clinical studies</article-title>. <source>J Diabetes Complicat.</source> (<year>2013</year>) <volume>27</volume>:<fpage>407</fpage>&#x02013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1016/j.jdiacomp.2012.11.003</pub-id><pub-id pub-id-type="pmid">23273850</pub-id></citation></ref>
<ref id="B18">
<label>18.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xiong</surname> <given-names>XL</given-names></name> <name><surname>Zhang</surname> <given-names>RX</given-names></name> <name><surname>Bi</surname> <given-names>Y</given-names></name> <name><surname>Zhou</surname> <given-names>WH</given-names></name> <name><surname>Yu</surname> <given-names>Y</given-names></name> <name><surname>Zhu</surname> <given-names>DL</given-names></name></person-group>. <article-title>Machine learning models in type 2 diabetes risk prediction: results from a cross-sectional retrospective study in Chinese adults</article-title>. <source>Curr Med Sci.</source> (<year>2019</year>) <volume>39</volume>:<fpage>582</fpage>&#x02013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1007/s11596-019-2077-4</pub-id><pub-id pub-id-type="pmid">31346994</pub-id></citation></ref>
<ref id="B19">
<label>19.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zou</surname> <given-names>Q</given-names></name> <name><surname>Qu</surname> <given-names>K</given-names></name> <name><surname>Luo</surname> <given-names>Y</given-names></name> <name><surname>Yin</surname> <given-names>D</given-names></name> <name><surname>Ju</surname> <given-names>Y</given-names></name> <name><surname>Tang</surname> <given-names>H</given-names></name></person-group>. <article-title>Predicting diabetes mellitus with machine learning techniques</article-title>. <source>Front Genet.</source> (<year>2018</year>) <volume>9</volume>:<fpage>515</fpage>. <pub-id pub-id-type="doi">10.3389/fgene.2018.00515</pub-id><pub-id pub-id-type="pmid">31623111</pub-id></citation></ref>
<ref id="B20">
<label>20.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Perveen</surname> <given-names>S</given-names></name> <name><surname>Shahbaz</surname> <given-names>M</given-names></name> <name><surname>Keshavjee</surname> <given-names>K</given-names></name> <name><surname>Guergachi</surname> <given-names>A</given-names></name></person-group>. <article-title>Prognostic modeling and prevention of diabetes using machine learning technique</article-title>. <source>Sci Rep.</source> (<year>2019</year>) <volume>9</volume>:<fpage>13805</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-019-49563-6</pub-id><pub-id pub-id-type="pmid">31551457</pub-id></citation></ref>
<ref id="B21">
<label>21.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>L</given-names></name> <name><surname>Wang</surname> <given-names>Y</given-names></name> <name><surname>Niu</surname> <given-names>M</given-names></name> <name><surname>Wang</surname> <given-names>C</given-names></name> <name><surname>Wang</surname> <given-names>Z</given-names></name></person-group>. <article-title>Machine learning for characterizing risk of type 2 diabetes mellitus in a rural Chinese population: the Henan Rural Cohort Study</article-title>. <source>Sci Rep.</source> (<year>2020</year>) <volume>10</volume>:<fpage>4406</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-020-61123-x</pub-id><pub-id pub-id-type="pmid">32157171</pub-id></citation></ref>
<ref id="B22">
<label>22.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>X</given-names></name> <name><surname>Wu</surname> <given-names>S</given-names></name> <name><surname>Fang</surname> <given-names>N</given-names></name> <name><surname>Sun</surname> <given-names>X</given-names></name> <name><surname>Fan</surname> <given-names>J</given-names></name></person-group>. <article-title>Evaluation of single-cell classifiers for single-cell RNA sequencing data sets</article-title>. <source>Brief Bioinform.</source> (<year>2020</year>) <volume>21</volume>:<fpage>1581</fpage>&#x02013;<lpage>95</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbz096</pub-id><pub-id pub-id-type="pmid">31675098</pub-id></citation></ref>
<ref id="B23">
<label>23.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Polano</surname> <given-names>M</given-names></name> <name><surname>Chierici</surname> <given-names>M</given-names></name> <name><surname>Dal Bo</surname> <given-names>M</given-names></name> <name><surname>Gentilini</surname> <given-names>D</given-names></name> <name><surname>Di Cintio</surname> <given-names>F</given-names></name> <name><surname>Baboci</surname> <given-names>L</given-names></name> <etal/></person-group>. <article-title>A pan-cancer approach to predict responsiveness to immune checkpoint inhibitors by machine learning</article-title>. <source>Cancers.</source> (<year>2019</year>) <volume>11</volume>:<fpage>1562</fpage>. <pub-id pub-id-type="doi">10.3390/cancers11101562</pub-id><pub-id pub-id-type="pmid">31618839</pub-id></citation></ref>
<ref id="B24">
<label>24.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Z</given-names></name> <name><surname>Ho</surname> <given-names>KM</given-names></name> <name><surname>Hong</surname> <given-names>Y</given-names></name></person-group>. <article-title>Machine learning for the prediction of volume responsiveness in patients with oliguric acute kidney injury in critical care</article-title>. <source>Crit Care.</source> (<year>2019</year>) <volume>23</volume>:<fpage>112</fpage>. <pub-id pub-id-type="doi">10.1186/s13054-019-2411-z</pub-id><pub-id pub-id-type="pmid">30961662</pub-id></citation></ref>
<ref id="B25">
<label>25.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>K</given-names></name> <name><surname>Zuo</surname> <given-names>P</given-names></name> <name><surname>Liu</surname> <given-names>Y</given-names></name> <name><surname>Zhang</surname> <given-names>M</given-names></name> <name><surname>Zhao</surname> <given-names>X</given-names></name> <name><surname>Xie</surname> <given-names>S</given-names></name> <etal/></person-group>. <article-title>Clinical and laboratory predictors of in-hospital mortality in patients with COVID-19: a cohort study in Wuhan, China</article-title>. <source>Clin Infect Dis.</source> (<year>2020</year>) <volume>71</volume>:<fpage>2079</fpage>&#x02013;<lpage>88</lpage>. <pub-id pub-id-type="doi">10.2139/ssrn.3546115</pub-id></citation></ref>
<ref id="B26">
<label>26.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tahmassebi</surname> <given-names>A</given-names></name> <name><surname>Wengert</surname> <given-names>GJ</given-names></name> <name><surname>Helbich</surname> <given-names>TH</given-names></name> <name><surname>Bago-Horvath</surname> <given-names>Z</given-names></name> <name><surname>Alaei</surname> <given-names>S</given-names></name> <name><surname>Bartsch</surname> <given-names>R</given-names></name> <etal/></person-group>. <article-title>Impact of machine learning with multiparametric magnetic resonance imaging of the breast for early prediction of response to neoadjuvant chemotherapy and survival outcomes in breast cancer patients</article-title>. <source>Invest Radiol.</source> (<year>2019</year>) <volume>54</volume>:<fpage>110</fpage>&#x02013;<lpage>7</lpage>. <pub-id pub-id-type="doi">10.1097/RLI.0000000000000518</pub-id><pub-id pub-id-type="pmid">30358693</pub-id></citation></ref>
<ref id="B27">
<label>27.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>Y</given-names></name> <name><surname>Zhang</surname> <given-names>XP</given-names></name> <name><surname>Yuan</surname> <given-names>J</given-names></name> <name><surname>Cai</surname> <given-names>B</given-names></name> <name><surname>Wang</surname> <given-names>XL</given-names></name> <name><surname>Wu</surname> <given-names>XL</given-names></name> <etal/></person-group>. <article-title>Association of body mass index and age with incident diabetes in Chinese adults: a population-based cohort study</article-title>. <source>BMJ Open.</source> (<year>2018</year>) <volume>8</volume>:<fpage>e021768</fpage>. <pub-id pub-id-type="doi">10.1136/bmjopen-2018-021768</pub-id><pub-id pub-id-type="pmid">30269064</pub-id></citation></ref>
<ref id="B28">
<label>28.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>K</given-names></name> <name><surname>Gong</surname> <given-names>M</given-names></name> <name><surname>Xie</surname> <given-names>S</given-names></name> <name><surname>Zhang</surname> <given-names>M</given-names></name> <name><surname>Zheng</surname> <given-names>H</given-names></name> <name><surname>Zhao</surname> <given-names>X</given-names></name> <etal/></person-group>. <article-title>Nomogram prediction for the 3-year risk of type 2 diabetes in healthy mainland China residents</article-title>. <source>EPMA J.</source> (<year>2019</year>) <volume>10</volume>:<fpage>227</fpage>&#x02013;<lpage>37</lpage>. <pub-id pub-id-type="doi">10.1007/s13167-019-00181-2</pub-id><pub-id pub-id-type="pmid">31462940</pub-id></citation></ref>
<ref id="B29">
<label>29.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Geleris</surname> <given-names>J</given-names></name> <name><surname>Sun</surname> <given-names>Y</given-names></name> <name><surname>Platt</surname> <given-names>J</given-names></name> <name><surname>Zucker</surname> <given-names>J</given-names></name> <name><surname>Baldwin</surname> <given-names>M</given-names></name> <name><surname>Hripcsak</surname> <given-names>G</given-names></name> <etal/></person-group>. <article-title>Observational study of hydroxychloroquine in hospitalized patients with Covid-19</article-title>. <source>N Engl J Med.</source> (<year>2020</year>) <volume>382</volume>:<fpage>2411</fpage>&#x02013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1056/NEJMoa2012410</pub-id><pub-id pub-id-type="pmid">32379955</pub-id></citation></ref>
<ref id="B30">
<label>30.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>T</given-names></name> <name><surname>Guestrin</surname> <given-names>C</given-names></name></person-group>. <article-title>XGBoost: a scalable tree boosting system</article-title>. In: <source>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining: 2016</source>. <publisher-loc>San Francisco, CA</publisher-loc>: <publisher-name>ACM</publisher-name> (<year>2016</year>). p. <fpage>785</fpage>&#x02013;<lpage>94</lpage>.</citation></ref>
<ref id="B31">
<label>31.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Lundberg</surname> <given-names>S</given-names></name> <name><surname>Lee</surname> <given-names>S</given-names></name></person-group>. <article-title>A unified approach to interpreting model predictions</article-title>. In: <source>Advances in Neural Information Processing Systems</source>. <publisher-loc>Long Beach, CA</publisher-loc>: <publisher-name>Neural Information Processing Systems</publisher-name> (<year>2017</year>). p. <fpage>4765</fpage>&#x02013;<lpage>74</lpage>.</citation></ref>
<ref id="B32">
<label>32.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Collignon</surname> <given-names>O</given-names></name> <name><surname>Monnez</surname> <given-names>J</given-names></name></person-group>. <article-title>Clustering of the values of a response variable and simultaneous covariate selection using a stepwise algorithm</article-title>. <source>Appl Math.</source> (<year>2016</year>) <volume>7</volume>:<fpage>1639</fpage>&#x02013;<lpage>48</lpage>. <pub-id pub-id-type="doi">10.4236/am.2016.715141</pub-id></citation></ref>
<ref id="B33">
<label>33.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lei</surname> <given-names>Z</given-names></name> <name><surname>Li</surname> <given-names>J</given-names></name> <name><surname>Wu</surname> <given-names>D</given-names></name> <name><surname>Xia</surname> <given-names>Y</given-names></name> <name><surname>Wang</surname> <given-names>Q</given-names></name> <name><surname>Si</surname> <given-names>A</given-names></name> <etal/></person-group>. <article-title>Nomogram for preoperative estimation of microvascular invasion risk in hepatitis B virus-related hepatocellular carcinoma within the Milan criteria</article-title>. <source>JAMA Surg.</source> (<year>2016</year>) <volume>151</volume>:<fpage>356</fpage>&#x02013;<lpage>63</lpage>. <pub-id pub-id-type="doi">10.1001/jamasurg.2015.4257</pub-id><pub-id pub-id-type="pmid">26579636</pub-id></citation></ref>
<ref id="B34">
<label>34.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sun</surname> <given-names>F</given-names></name> <name><surname>Tao</surname> <given-names>Q</given-names></name> <name><surname>Zhan</surname> <given-names>S</given-names></name></person-group>. <article-title>An accurate risk score for estimation 5-year risk of type 2 diabetes based on a health screening population in Taiwan</article-title>. <source>Diabetes Res Clin Pract.</source> (<year>2009</year>) <volume>85</volume>:<fpage>228</fpage>&#x02013;<lpage>34</lpage>. <pub-id pub-id-type="doi">10.1016/j.diabres.2009.05.005</pub-id><pub-id pub-id-type="pmid">19500871</pub-id></citation></ref>
<ref id="B35">
<label>35.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fitzgerald</surname> <given-names>M</given-names></name> <name><surname>Saville</surname> <given-names>BR</given-names></name> <name><surname>Lewis</surname> <given-names>RJ</given-names></name></person-group>. <article-title>Decision curve analysis</article-title>. <source>JAMA.</source> (<year>2015</year>) <volume>313</volume>:<fpage>409</fpage>&#x02013;<lpage>10</lpage>. <pub-id pub-id-type="doi">10.1001/jama.2015.37</pub-id></citation></ref>
<ref id="B36">
<label>36.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Okamura</surname> <given-names>T</given-names></name> <name><surname>Hashimoto</surname> <given-names>Y</given-names></name> <name><surname>Hamaguchi</surname> <given-names>M</given-names></name> <name><surname>Obora</surname> <given-names>A</given-names></name> <name><surname>Kojima</surname> <given-names>T</given-names></name> <name><surname>Fukui</surname> <given-names>M</given-names></name></person-group>. <article-title>Ectopic fat obesity presents the greatest risk for incident type 2 diabetes: a population-based longitudinal study</article-title>. <source>Int J Obes.</source> (<year>2019</year>) <volume>43</volume>:<fpage>139</fpage>&#x02013;<lpage>48</lpage>. <pub-id pub-id-type="doi">10.1038/s41366-018-0076-3</pub-id><pub-id pub-id-type="pmid">29717276</pub-id></citation></ref>
<ref id="B37">
<label>37.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Collins</surname> <given-names>GS</given-names></name> <name><surname>Reitsma</surname> <given-names>JB</given-names></name> <name><surname>Altman</surname> <given-names>DG</given-names></name> <name><surname>Moons</surname> <given-names>KG</given-names></name></person-group>. <article-title>Transparent reporting of a multivariable prediction model for individual prognosis or diagnosis (TRIPOD): the TRIPOD statement</article-title>. <source>BMJ.</source> (<year>2015</year>) <volume>350</volume>:<fpage>g7594</fpage>. <pub-id pub-id-type="doi">10.1136/bmj.g7594</pub-id><pub-id pub-id-type="pmid">25627261</pub-id></citation></ref>
<ref id="B38">
<label>38.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kengne</surname> <given-names>AP</given-names></name> <name><surname>Beulens</surname> <given-names>JW</given-names></name> <name><surname>Peelen</surname> <given-names>LM</given-names></name> <name><surname>Moons</surname> <given-names>KG</given-names></name> <name><surname>van der Schouw</surname> <given-names>YT</given-names></name> <name><surname>Schulze</surname> <given-names>MB</given-names></name> <etal/></person-group>. <article-title>Non-invasive risk scores for prediction of type 2 diabetes (EPIC-InterAct): a validation of existing models</article-title>. <source>Lancet Diabetes Endocrinol.</source> (<year>2014</year>) <volume>2</volume>:<fpage>19</fpage>&#x02013;<lpage>29</lpage>. <pub-id pub-id-type="doi">10.1016/S2213-8587(13)70103-7</pub-id><pub-id pub-id-type="pmid">24622666</pub-id></citation></ref>
<ref id="B39">
<label>39.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kriegeskorte</surname> <given-names>N</given-names></name> <name><surname>Golan</surname> <given-names>T</given-names></name></person-group>. <article-title>Neural network models and deep learning</article-title>. <source>Curr Biol.</source> (<year>2019</year>) <volume>29</volume>:<fpage>R231</fpage>&#x02013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1016/j.cub.2019.02.034</pub-id></citation></ref>
<ref id="B40">
<label>40.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Allalou</surname> <given-names>A</given-names></name> <name><surname>Nalla</surname> <given-names>A</given-names></name> <name><surname>Prentice</surname> <given-names>KJ</given-names></name> <name><surname>Liu</surname> <given-names>Y</given-names></name> <name><surname>Zhang</surname> <given-names>M</given-names></name> <name><surname>Dai</surname> <given-names>FF</given-names></name> <etal/></person-group>. <article-title>A predictive metabolic signature for the transition from gestational diabetes mellitus to type 2 diabetes</article-title>. <source>Diabetes.</source> (<year>2016</year>) <volume>65</volume>:<fpage>2529</fpage>&#x02013;<lpage>39</lpage>. <pub-id pub-id-type="doi">10.2337/db15-1720</pub-id><pub-id pub-id-type="pmid">27338739</pub-id></citation></ref>
<ref id="B41">
<label>41.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>Y</given-names></name> <name><surname>Zhang</surname> <given-names>T</given-names></name></person-group>. <article-title>Deep neural mapping support vector machines</article-title>. <source>Neural Netw.</source> (<year>2017</year>) <volume>93</volume>:<fpage>185</fpage>&#x02013;<lpage>94</lpage>. <pub-id pub-id-type="doi">10.1016/j.neunet.2017.05.010</pub-id><pub-id pub-id-type="pmid">28646763</pub-id></citation></ref>
<ref id="B42">
<label>42.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Song</surname> <given-names>H</given-names></name> <name><surname>Thiagarajan</surname> <given-names>JJ</given-names></name> <name><surname>Sattigeri</surname> <given-names>P</given-names></name> <name><surname>Spanias</surname> <given-names>A</given-names></name></person-group>. <article-title>Optimizing kernel machines using deep learning</article-title>. <source>IEEE Trans Neural Netw Learn Syst.</source> (<year>2018</year>) <volume>29</volume>:<fpage>5528</fpage>&#x02013;<lpage>40</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2018.2804895</pub-id><pub-id pub-id-type="pmid">29993616</pub-id></citation></ref>
<ref id="B43">
<label>43.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wiggins</surname> <given-names>M</given-names></name> <name><surname>Saad</surname> <given-names>A</given-names></name> <name><surname>Litt</surname> <given-names>B</given-names></name> <name><surname>Vachtsevanos</surname> <given-names>G</given-names></name></person-group>. <article-title>Evolving a Bayesian classifier for ECG-based age classification in medical applications</article-title>. <source>Appl Soft Comput.</source> (<year>2008</year>) <volume>8</volume>:<fpage>599</fpage>&#x02013;<lpage>608</lpage>. <pub-id pub-id-type="doi">10.1016/j.asoc.2007.03.009</pub-id><pub-id pub-id-type="pmid">22010038</pub-id></citation></ref>
<ref id="B44">
<label>44.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mao</surname> <given-names>S</given-names></name> <name><surname>Lin</surname> <given-names>W</given-names></name> <name><surname>Jiao</surname> <given-names>L</given-names></name> <name><surname>Gou</surname> <given-names>S</given-names></name> <name><surname>Chen</surname> <given-names>JW</given-names></name></person-group>. <article-title>End-to-end ensemble learning by exploiting the correlation between individuals and weights</article-title>. <source>IEEE Trans Cybern.</source> (<year>2021</year>) <volume>51</volume>:<fpage>2835</fpage>&#x02013;<lpage>46</lpage>. <pub-id pub-id-type="doi">10.1109/TCYB.2019.2931071</pub-id><pub-id pub-id-type="pmid">31425063</pub-id></citation></ref>
<ref id="B45">
<label>45.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tang</surname> <given-names>J</given-names></name> <name><surname>Deng</surname> <given-names>C</given-names></name> <name><surname>Huang</surname> <given-names>GB</given-names></name></person-group>. <article-title>Extreme learning machine for multilayer perceptron</article-title>. <source>IEEE Trans Neural Netw Learn Syst.</source> (<year>2016</year>) <volume>27</volume>:<fpage>809</fpage>&#x02013;<lpage>21</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2015.2424995</pub-id><pub-id pub-id-type="pmid">25966483</pub-id></citation></ref>
<ref id="B46">
<label>46.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ngiam</surname> <given-names>KY</given-names></name> <name><surname>Khor</surname> <given-names>IW</given-names></name></person-group>. <article-title>Big data and machine learning algorithms for health-care delivery</article-title>. <source>Lancet Oncol.</source> (<year>2019</year>) <volume>20</volume>:<fpage>e262</fpage>&#x02013;<lpage>73</lpage>. <pub-id pub-id-type="doi">10.1016/S1470-2045(19)30149-4</pub-id><pub-id pub-id-type="pmid">31044724</pub-id></citation></ref>
<ref id="B47">
<label>47.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cahn</surname> <given-names>A</given-names></name> <name><surname>Shoshan</surname> <given-names>A</given-names></name> <name><surname>Sagiv</surname> <given-names>T</given-names></name> <name><surname>Yesharim</surname> <given-names>R</given-names></name> <name><surname>Goshen</surname> <given-names>R</given-names></name> <name><surname>Shalev</surname> <given-names>V</given-names></name> <etal/></person-group>. <article-title>Prediction of progression from pre-diabetes to diabetes: development and validation of a machine learning model</article-title>. <source>Diabetes Metab Res Rev.</source> (<year>2020</year>) <volume>36</volume>:<fpage>e3252</fpage>. <pub-id pub-id-type="doi">10.1002/dmrr.3252</pub-id><pub-id pub-id-type="pmid">31943669</pub-id></citation></ref>
<ref id="B48">
<label>48.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Farran</surname> <given-names>B</given-names></name> <name><surname>Channanath</surname> <given-names>AM</given-names></name> <name><surname>Behbehani</surname> <given-names>K</given-names></name> <name><surname>Thanaraj</surname> <given-names>TA</given-names></name></person-group>. <article-title>Predictive models to assess risk of type 2 diabetes, hypertension and comorbidity: machine-learning algorithms and validation using national health data from Kuwait&#x02013;a cohort study</article-title>. <source>BMJ Open.</source> (<year>2013</year>) <volume>3</volume>:<fpage>e002457</fpage>. <pub-id pub-id-type="doi">10.1136/bmjopen-2012-002457</pub-id></citation></ref>
<ref id="B49">
<label>49.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cho</surname> <given-names>SB</given-names></name> <name><surname>Kim</surname> <given-names>SC</given-names></name> <name><surname>Chung</surname> <given-names>MG</given-names></name></person-group>. <article-title>Identification of novel population clusters with different susceptibilities to type 2 diabetes and their impact on the prediction of diabetes</article-title>. <source>Sci Rep.</source> (<year>2019</year>) <volume>9</volume>:<fpage>3329</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-019-40058-y</pub-id><pub-id pub-id-type="pmid">30833619</pub-id></citation></ref>
<ref id="B50">
<label>50.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lee</surname> <given-names>BJ</given-names></name> <name><surname>Kim</surname> <given-names>JY</given-names></name></person-group>. <article-title>Identification of type 2 diabetes risk factors using phenotypes consisting of anthropometry and triglycerides based on machine learning</article-title>. <source>IEEE J Biomed Health Inform.</source> (<year>2016</year>) <volume>20</volume>:<fpage>39</fpage>&#x02013;<lpage>46</lpage>. <pub-id pub-id-type="doi">10.1109/JBHI.2015.2396520</pub-id><pub-id pub-id-type="pmid">25675467</pub-id></citation></ref>
<ref id="B51">
<label>51.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Alghamdi</surname> <given-names>M</given-names></name> <name><surname>Al-Mallah</surname> <given-names>M</given-names></name> <name><surname>Keteyian</surname> <given-names>S</given-names></name> <name><surname>Brawner</surname> <given-names>C</given-names></name> <name><surname>Ehrman</surname> <given-names>J</given-names></name> <name><surname>Sakr</surname> <given-names>S</given-names></name></person-group>. <article-title>Predicting diabetes mellitus using SMOTE and ensemble machine learning approach: the Henry Ford ExercIse Testing (FIT) project</article-title>. <source>PLoS ONE.</source> (<year>2017</year>) <volume>12</volume>:<fpage>e0179805</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0179805</pub-id><pub-id pub-id-type="pmid">28738059</pub-id></citation></ref>
<ref id="B52">
<label>52.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ji</surname> <given-names>X</given-names></name> <name><surname>Tong</surname> <given-names>W</given-names></name> <name><surname>Liu</surname> <given-names>Z</given-names></name> <name><surname>Shi</surname> <given-names>T</given-names></name></person-group>. <article-title>Five-feature model for developing the classifier for synergistic vs. antagonistic drug combinations built by XGBoost</article-title>. <source>Front Genet.</source> (<year>2019</year>) <volume>10</volume>:<fpage>600</fpage>. <pub-id pub-id-type="doi">10.3389/fgene.2019.00600</pub-id><pub-id pub-id-type="pmid">31338106</pub-id></citation></ref>
<ref id="B53">
<label>53.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>T</given-names></name> <name><surname>Li</surname> <given-names>X</given-names></name> <name><surname>Li</surname> <given-names>Y</given-names></name> <name><surname>Xia</surname> <given-names>E</given-names></name> <name><surname>Qin</surname> <given-names>Y</given-names></name> <name><surname>Liang</surname> <given-names>S</given-names></name> <etal/></person-group>. <article-title>Prediction and risk stratification of kidney outcomes in IgA nephropathy</article-title>. <source>Am J Kidney Dis.</source> (<year>2019</year>) <volume>74</volume>:<fpage>300</fpage>&#x02013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1053/j.ajkd.2019.02.016</pub-id><pub-id pub-id-type="pmid">31031086</pub-id></citation></ref>
<ref id="B54">
<label>54.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xiao</surname> <given-names>J</given-names></name> <name><surname>Ding</surname> <given-names>R</given-names></name> <name><surname>Xu</surname> <given-names>X</given-names></name> <name><surname>Guan</surname> <given-names>H</given-names></name> <name><surname>Feng</surname> <given-names>X</given-names></name> <name><surname>Sun</surname> <given-names>T</given-names></name> <etal/></person-group>. <article-title>Comparison and development of machine learning tools in the prediction of chronic kidney disease progression</article-title>. <source>J Transl Med.</source> (<year>2019</year>) <volume>17</volume>:<fpage>119</fpage>. <pub-id pub-id-type="doi">10.1186/s12967-019-1860-0</pub-id><pub-id pub-id-type="pmid">30971285</pub-id></citation></ref>
<ref id="B55">
<label>55.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Khemasuwan</surname> <given-names>D</given-names></name> <name><surname>Sorensen</surname> <given-names>J</given-names></name> <name><surname>Griffin</surname> <given-names>DC</given-names></name></person-group>. <article-title>Predictive variables for failure in administration of intrapleural tissue plasminogen activator/deoxyribonuclease in patients with complicated parapneumonic effusions/empyema</article-title>. <source>Chest.</source> (<year>2018</year>) <volume>154</volume>:<fpage>550</fpage>&#x02013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1016/j.chest.2018.01.037</pub-id><pub-id pub-id-type="pmid">29425674</pub-id></citation></ref>
<ref id="B56">
<label>56.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Selph</surname> <given-names>S</given-names></name> <name><surname>Dana</surname> <given-names>T</given-names></name> <name><surname>Blazina</surname> <given-names>I</given-names></name> <name><surname>Bougatsos</surname> <given-names>C</given-names></name> <name><surname>Patel</surname> <given-names>H</given-names></name> <name><surname>Chou</surname> <given-names>R</given-names></name></person-group>. <article-title>Screening for type 2 diabetes mellitus: a systematic review for the U.S. Preventive Services Task Force</article-title>. <source>Ann Intern Med.</source> (<year>2015</year>) <volume>162</volume>:<fpage>765</fpage>&#x02013;<lpage>76</lpage>. <pub-id pub-id-type="doi">10.7326/M14-2221</pub-id><pub-id pub-id-type="pmid">25973510</pub-id></citation></ref>
<ref id="B57">
<label>57.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nichols</surname> <given-names>GA</given-names></name> <name><surname>Hillier</surname> <given-names>TA</given-names></name> <name><surname>Brown</surname> <given-names>JB</given-names></name></person-group>. <article-title>Progression from newly acquired impaired fasting glusose to type 2 diabetes</article-title>. <source>Diabetes Care.</source> (<year>2007</year>) <volume>30</volume>:<fpage>228</fpage>&#x02013;<lpage>33</lpage>. <pub-id pub-id-type="doi">10.2337/dc06-1392</pub-id><pub-id pub-id-type="pmid">17259486</pub-id></citation></ref>
<ref id="B58">
<label>58.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Arnlov</surname> <given-names>J</given-names></name> <name><surname>Sundstrom</surname> <given-names>J</given-names></name> <name><surname>Ingelsson</surname> <given-names>E</given-names></name> <name><surname>Lind</surname> <given-names>L</given-names></name></person-group>. <article-title>Impact of BMI and the metabolic syndrome on the risk of diabetes in middle-aged men</article-title>. <source>Diabetes Care.</source> (<year>2011</year>) <volume>34</volume>:<fpage>61</fpage>&#x02013;<lpage>5</lpage>. <pub-id pub-id-type="doi">10.2337/dc10-0955</pub-id><pub-id pub-id-type="pmid">20852030</pub-id></citation></ref>
<ref id="B59">
<label>59.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tirosh</surname> <given-names>A</given-names></name> <name><surname>Shai</surname> <given-names>I</given-names></name> <name><surname>Afek</surname> <given-names>A</given-names></name> <name><surname>Dubnov-Raz</surname> <given-names>G</given-names></name> <name><surname>Ayalon</surname> <given-names>N</given-names></name> <name><surname>Gordon</surname> <given-names>B</given-names></name> <etal/></person-group>. <article-title>Adolescent BMI trajectory and risk of diabetes versus coronary disease</article-title>. <source>N Engl J Med.</source> (<year>2011</year>) <volume>364</volume>:<fpage>1315</fpage>&#x02013;<lpage>25</lpage>. <pub-id pub-id-type="doi">10.1056/NEJMoa1006992</pub-id><pub-id pub-id-type="pmid">21470009</pub-id></citation></ref>
<ref id="B60">
<label>60.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Vozarova</surname> <given-names>B</given-names></name> <name><surname>Stefan</surname> <given-names>N</given-names></name> <name><surname>Lindsay</surname> <given-names>RS</given-names></name> <name><surname>Saremi</surname> <given-names>A</given-names></name> <name><surname>Pratley</surname> <given-names>RE</given-names></name> <name><surname>Bogardus</surname> <given-names>C</given-names></name> <etal/></person-group>. <article-title>High alanine aminotransferase is associated with decreased hepatic insulin sensitivity and predicts the development of type 2 diabetes</article-title>. <source>Diabetes.</source> (<year>2002</year>) <volume>51</volume>:<fpage>1889</fpage>&#x02013;<lpage>95</lpage>. <pub-id pub-id-type="doi">10.2337/diabetes.51.6.1889</pub-id><pub-id pub-id-type="pmid">12031978</pub-id></citation></ref>
<ref id="B61">
<label>61.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Meijnikman</surname> <given-names>AS</given-names></name> <name><surname>Gerdes</surname> <given-names>VE</given-names></name> <name><surname>Nieuwdorp</surname> <given-names>M</given-names></name> <name><surname>Herrema</surname> <given-names>H</given-names></name></person-group>. <article-title>Evaluating causality of gut microbiota in obesity and diabetes in humans</article-title>. <source>Endocr Rev.</source> (<year>2018</year>) <volume>39</volume>:<fpage>133</fpage>&#x02013;<lpage>53</lpage>. <pub-id pub-id-type="doi">10.1210/er.2017-00192</pub-id><pub-id pub-id-type="pmid">29309555</pub-id></citation></ref>
<ref id="B62">
<label>62.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>L</given-names></name> <name><surname>Parhofer</surname> <given-names>KG</given-names></name></person-group>. <article-title>Diabetic dyslipidemia</article-title>. <source>Metabolism.</source> (<year>2014</year>) <volume>63</volume>:<fpage>1469</fpage>&#x02013;<lpage>79</lpage>. <pub-id pub-id-type="doi">10.1016/j.metabol.2014.08.010</pub-id></citation></ref>
<ref id="B63">
<label>63.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Qiao</surname> <given-names>Q</given-names></name> <name><surname>Hu</surname> <given-names>G</given-names></name> <name><surname>Tuomilehto</surname> <given-names>J</given-names></name> <name><surname>Nakagami</surname> <given-names>T</given-names></name> <name><surname>Balkau</surname> <given-names>B</given-names></name> <name><surname>Borch-Johnsen</surname> <given-names>K</given-names></name> <etal/></person-group>. <article-title>Age- and sex-specific prevalence of diabetes and impaired glucose regulation in 11 Asian cohorts</article-title>. <source>Diabetes Care.</source> (<year>2003</year>) <volume>26</volume>:<fpage>1770</fpage>&#x02013;<lpage>80</lpage>. <pub-id pub-id-type="doi">10.2337/diacare.26.6.1770</pub-id><pub-id pub-id-type="pmid">12766108</pub-id></citation></ref>
<ref id="B64">
<label>64.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chang</surname> <given-names>AM</given-names></name> <name><surname>Halter</surname> <given-names>JB</given-names></name></person-group>. <article-title>Aging and insulin secretion</article-title>. <source>Am J Physiol Endocrinol Metab.</source> (<year>2003</year>) <volume>284</volume>:<fpage>E7</fpage>&#x02013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.1152/ajpendo.00366.2002</pub-id></citation></ref>
<ref id="B65">
<label>65.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zheng</surname> <given-names>Y</given-names></name> <name><surname>Ley</surname> <given-names>SH</given-names></name> <name><surname>Hu</surname> <given-names>FB</given-names></name></person-group>. <article-title>Global aetiology and epidemiology of type 2 diabetes mellitus and its complications</article-title>. <source>Nat Rev Endocrinol.</source> (<year>2018</year>) <volume>14</volume>:<fpage>88</fpage>&#x02013;<lpage>98</lpage>. <pub-id pub-id-type="doi">10.1038/nrendo.2017.151</pub-id><pub-id pub-id-type="pmid">29219149</pub-id></citation></ref>
<ref id="B66">
<label>66.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Segar</surname> <given-names>MW</given-names></name> <name><surname>Vaduganathan</surname> <given-names>M</given-names></name> <name><surname>Patel</surname> <given-names>KV</given-names></name> <name><surname>McGuire</surname> <given-names>DK</given-names></name> <name><surname>Butler</surname> <given-names>J</given-names></name> <name><surname>Fonarow</surname> <given-names>GC</given-names></name> <etal/></person-group>. <article-title>Machine learning to predict the risk of incident heart failure hospitalization among patients with diabetes: the WATCH-DM risk score</article-title>. <source>Diabetes Care.</source> (<year>2019</year>) <volume>42</volume>:<fpage>2298</fpage>&#x02013;<lpage>306</lpage>. <pub-id pub-id-type="doi">10.2337/dc19-0587</pub-id><pub-id pub-id-type="pmid">31959648</pub-id></citation></ref>
</ref-list>
<glossary>
<def-list>
<title>Abbreviations</title>
<def-item><term>BMI</term>
<def><p>Body mass index</p></def></def-item>
<def-item><term>SBP</term>
<def><p>Systolic blood pressure</p></def></def-item>
<def-item><term>DBP</term>
<def><p>Diastolic blood pressure</p></def></def-item>
<def-item><term>FPG</term>
<def><p>Fasting plasma glucose</p></def></def-item>
<def-item><term>TC</term>
<def><p>Total cholesterol</p></def></def-item>
<def-item><term>TG</term>
<def><p>Triglyceride</p></def></def-item>
<def-item><term>HDL-C</term>
<def><p>High-density lipoprotein cholesterol</p></def></def-item>
<def-item><term>LDL-C</term>
<def><p>Low-density lipoprotein cholesterol</p></def></def-item>
<def-item><term>ALT</term>
<def><p>Alanine aminotransferase</p></def></def-item>
<def-item><term>BUN</term>
<def><p>Blood urea nitrogen</p></def></def-item>
<def-item><term>Scr</term>
<def><p>Serum creatinine</p></def></def-item>
<def-item><term>Family history</term>
<def><p>Family history of diabetes</p></def></def-item>
<def-item><term>XGBoost</term>
<def><p>eXtreme Gradient Boosting</p></def></def-item>
<def-item><term>SHAP</term>
<def><p>SHapley Additive exPlanations</p></def></def-item>
<def-item><term>SD</term>
<def><p>Standardized difference</p></def></def-item>
<def-item><term>RR</term>
<def><p>Relative risk</p></def></def-item>
<def-item><term>CI</term>
<def><p>Confidence intervals</p></def></def-item>
<def-item><term>PPV</term>
<def><p>Positive predictive value</p></def></def-item>
<def-item><term>NPV</term>
<def><p>Negative predictive value</p></def></def-item>
<def-item><term>PLR</term>
<def><p>Positive likelihood ratio</p></def></def-item>
<def-item><term>NLR</term>
<def><p>Negative likelihood ratio</p></def></def-item>
<def-item><term>DOR</term>
<def><p>Diagnostic odds ratio</p></def></def-item>
<def-item><term>ROC</term>
<def><p>Receiver operating characteristic</p></def></def-item>
<def-item><term>AUC</term>
<def><p>Area under curve.</p></def></def-item>
</def-list>
</glossary>
<fn-group>
<fn fn-type="financial-disclosure"><p><bold>Funding.</bold> This study was supported in part by the Discipline Construction Ability Enhancement Project of the Shenzhen Municipal Health Commission (SZXJ2017031).</p>
</fn>
</fn-group>
</back>
</article> 