<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Public Health</journal-id>
<journal-title-group>
<journal-title>Frontiers in Public Health</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Public Health</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-2565</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpubh.2026.1747762</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Evaluating the sampling effect of propensity score matching for reducing selection bias in medical data</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Roh</surname> <given-names>Minji</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<uri xlink:href="https://loop.frontiersin.org/people/3365823"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Yum</surname> <given-names>Sujin</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<uri xlink:href="https://loop.frontiersin.org/people/1944676"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Joo</surname> <given-names>Gihun</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<uri xlink:href="https://loop.frontiersin.org/people/2339857"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Jang</surname> <given-names>Jae-Won</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/1666520"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Im</surname> <given-names>Hyeonseung</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<uri xlink:href="https://loop.frontiersin.org/people/1744354"/>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Interdisciplinary Graduate Program in Medical Bigdata Convergence, Kangwon National University</institution>, <city>Chuncheon</city>, <country country="KR">Republic of Korea</country></aff>
<aff id="aff2"><label>2</label><institution>Kangwon National University</institution>, <city>Chuncheon</city>, <country country="KR">Republic of Korea</country></aff>
<aff id="aff3"><label>3</label><institution>Department of Neurology, Kangwon National University Hospital</institution>, <city>Chuncheon</city>, <country country="KR">Republic of Korea</country></aff>
<aff id="aff4"><label>4</label><institution>Department of Computer Science and Engineering, Kangwon National University</institution>, <city>Chuncheon</city>, <country country="KR">Republic of Korea</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Gihun Joo, <email xlink:href="mailto:joo9327@kangwon.ac.kr">joo9327@kangwon.ac.kr</email>; Hyeonseung Im, <email xlink:href="mailto:hsim@kangwon.ac.kr">hsim@kangwon.ac.kr</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-10">
<day>10</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>14</volume>
<elocation-id>1747762</elocation-id>
<history>
<date date-type="received">
<day>17</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>25</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>16</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Roh, Yum, Joo, Jang and Im.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Roh, Yum, Joo, Jang and Im</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-10">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Background</title>
<p>In real-world medical data, selection bias can significantly impact the performance of machine learning models, potentially leading to distorted outcomes. However, research aimed at mitigating selection bias remains relatively limited.</p></sec>
<sec>
<title>Methods</title>
<p>In this study, we evaluate the effectiveness of Propensity Score Matching (PSM) in reducing selection bias and assessing its impact on classification performance in imbalanced medical data. Specifically, we apply PSM alongside five undersampling, three oversampling, and three hybrid sampling techniques to three medical datasets: rapidly progressive dementia prediction (ADNI, <italic>n</italic> = 628, events = 51), hypothyroidism prediction (UCI, <italic>n</italic> = 3,772, events = 3,481), and cardiovascular disease prediction (Kaggle, <italic>n</italic> = 253,680, events = 23,893), each exhibiting varying degrees of demographic selection bias. We train and compare six classification models to assess the impact of each resampling technique on model performance. The magnitude of selection bias is quantified using the standardized mean difference (SMD), while model performance is assessed using the Area Under the Receiver Operating Characteristic Curve (AUROC), the Area Under the Precision-Recall Curve (AUPRC), accuracy, precision, recall, F1-score, specificity, calibration curves, Brier score, and decision curve analysis.</p></sec>
<sec>
<title>Results</title>
<p>The results indicate that PSM reduces SMD within the dataset, maintains stable classification performance, and enhances the internal validity of the model under conditions of limited or moderate demographic imbalance.</p></sec>
<sec>
<title>Conclusion</title>
<p>These advantages suggest its potential for improving model reliability and facilitating better generalization to external datasets in real-world medical applications. However, in datasets with extreme selection bias or when overly restrictive matching is applied, PSM can degrade model performance, underscoring the importance of choosing strategies that account for dataset characteristics.</p></sec></abstract>
<kwd-group>
<kwd>imbalanced data</kwd>
<kwd>machine learning</kwd>
<kwd>medical data analysis</kwd>
<kwd>propensity score matching</kwd>
<kwd>selection bias</kwd>
</kwd-group>
<funding-group>
 <funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This research was supported by a grant of Patient-Centered Clinical Research Coordinating Center (PACEN) funded by the Ministry of Health &#x00026; Welfare, Republic of Korea (grant number: RS-2025-02304176) and Basic Science Research Program through the National Research Foundation of Korea (NRF) funded by the Ministry of Education (RS-2024-00463967 and No. 25411243). This research was also supported by the Regional Innovation System &#x00026; Education (RISE) program through the Gangwon RISE Center, funded by the Ministry of Education (MOE) and the Gangwon State (G.S.), Republic of Korea (2025-RISE-10-002).</funding-statement>
</funding-group>
<counts>
<fig-count count="7"/>
<table-count count="4"/>
<equation-count count="2"/>
<ref-count count="37"/>
<page-count count="16"/>
<word-count count="8785"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Digital Public Health</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>With the rapid advancement of technology and data collection, the utilization of real-world data (RWD) has become increasingly important. Although traditional medical research has favored study designs based on randomized clinical trials (RCTs) due to their high reliability, this approach is costly and limited to specific populations, making it challenging to fully represent the general patient population (<xref ref-type="bibr" rid="B1">1</xref>, <xref ref-type="bibr" rid="B2">2</xref>). In contrast, RWD is naturally collected from healthcare institutions or individuals, enabling the inclusion of a broader patient population and offering potential applications across various domains. Consequently, the adoption of RWD in medical research has been steadily increasing (<xref ref-type="bibr" rid="B3">3</xref>&#x02013;<xref ref-type="bibr" rid="B7">7</xref>). However, the lack of controlled data collection procedures introduces multiple forms of bias that may compromise the reliability and generalizability of machine learning (ML)-based predictions.</p>
<p>In the literature, the term selection bias has been used in multiple, conceptually distinct contexts. In causal inference, selection bias typically refers to bias arising from non-random treatment assignment, conditioning on post-intervention variables, or missing outcomes, and is discussed in the context of estimating causal effects using observational data (<xref ref-type="bibr" rid="B8">8</xref>). In contrast, in machine learning, particularly for healthcare applications, selection bias is commonly used to describe a mismatch between the study population and the target population, often resulting from non-representative sampling, inclusion and exclusion criteria, or data availability constraints (<xref ref-type="bibr" rid="B9">9</xref>). These two uses of the term arise from different problem settings and lead to different methodological solutions.</p>
<p>In this study, we focus exclusively on the latter setting: selection bias as a sample representativeness and imbalance problem in supervised ML prediction, rather than as a causal inference problem. Specifically, we consider scenarios in which certain demographic or clinical subpopulations are overrepresented or underrepresented in the available training data. Such imbalances can induce systematic differences in covariate distributions between outcome-defined groups, causing ML models to learn distorted decision boundaries and resulting in degraded performance when deployed in real-world clinical settings (<xref ref-type="bibr" rid="B10">10</xref>, <xref ref-type="bibr" rid="B11">11</xref>).</p>
<p>One of the major challenges in machine learning is class imbalance, which has led to the development of various methodologies to address the issue (<xref ref-type="bibr" rid="B12">12</xref>, <xref ref-type="bibr" rid="B13">13</xref>). Among the most prominent solutions are data-level and algorithm-level approaches. Data-level methods adjust class ratios by augmenting or removing data through resampling techniques. In contrast, algorithm-level methods include recognition-based learning and cost-sensitive learning approaches. Among these, data resampling techniques are widely utilized in the data preprocessing stage as they help mitigate problems caused by class imbalance, improve model performance, and offer a simple yet intuitive solution. Additionally, they are independent of specific learning algorithms and can be applied flexibly (<xref ref-type="bibr" rid="B7">7</xref>, <xref ref-type="bibr" rid="B14">14</xref>). Due to these advantages, resampling techniques remain an active area of research. However, because these methods primarily focus on adjusting sample counts, they may not effectively address underlying distortions in covariate distributions within the dataset.</p>
<p>Propensity Score Matching (PSM) offers an alternative perspective by explicitly targeting covariate balance between groups. Traditionally applied in observational studies to address confounding, PSM has increasingly been adopted in ML contexts as a covariate-aware preprocessing strategy (<xref ref-type="bibr" rid="B15">15</xref>, <xref ref-type="bibr" rid="B16">16</xref>). When used for outcome-defined group matching, PSM can be interpreted as a structured resampling approach that prioritizes covariate overlap rather than class prevalence alone. However, it remains unclear under which data conditions and matching ratios PSM can reduce selection bias without compromising predictive discrimination, calibration, or clinical utility in ML-based prediction models.</p>
<p>Therefore, the objective of this study is to investigate the role of PSM as a data-level resampling strategy for mitigating sample selection bias in supervised ML prediction. In particular, we examine how the strength of matching interacts with dataset-specific demographic imbalance to influence downstream model behavior. To achieve this, we quantitatively evaluate the effectiveness of PSM as a data-level resampling technique and examine its impact on data imbalance and classification model performance. This work presents a substantially extended version of our earlier study (<xref ref-type="bibr" rid="B17">17</xref>), offering broader experimental evaluation and deeper analytical insights to support medical researchers in assessing the appropriateness of PSM for evaluating prediction models applied to diverse patient populations, particularly under limited demographic selection bias.</p></sec>
<sec sec-type="materials|methods" id="s2">
<label>2</label>
<title>Materials and methods</title>
<sec>
<label>2.1</label>
<title>Datasets</title>
<p>This study investigates the effects of sample selection bias and resampling techniques across heterogeneous real-world data settings. Rather than focusing on disease-specific prediction, three datasets were intentionally selected to represent distinct data sources and selection mechanisms commonly encountered in medical AI research. A summary of each dataset is provided in <xref ref-type="table" rid="T1">Table 1</xref> and detailed characteristics are reported in <xref ref-type="supplementary-material" rid="SM1">Supplementary material S1</xref>.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Description of datasets.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>No</bold></th>
<th valign="top" align="center"><bold>Dataset</bold></th>
<th valign="top" align="center"><bold>Prevalence rate</bold></th>
<th valign="top" align="center"><bold>Total number</bold></th>
<th valign="top" align="center"><bold>Features</bold></th>
<th valign="top" align="center"><bold>Demographic variables</bold></th>
<th valign="top" align="center"><bold>Healthy <italic>N</italic> (%)</bold></th>
<th valign="top" align="center"><bold>Patients <italic>N</italic> (%)</bold></th>
<th valign="top" align="center"><bold>Imbalance ratio (IR)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">1</td>
<td valign="top" align="center">RPD</td>
<td valign="top" align="center">9&#x02013;30 (<xref ref-type="bibr" rid="B32">32</xref>, <xref ref-type="bibr" rid="B33">33</xref>)</td>
<td valign="top" align="center">628</td>
<td valign="top" align="center">10</td>
<td valign="top" align="center">Age, sex, education</td>
<td valign="top" align="center">577 (91.88)</td>
<td valign="top" align="center">51 (8.12)</td>
<td valign="top" align="center">11.31</td>
</tr>
<tr>
<td valign="top" align="left">2</td>
<td valign="top" align="center">HypoT</td>
<td valign="top" align="center">0.3&#x02013;15 (<xref ref-type="bibr" rid="B23">23</xref>, <xref ref-type="bibr" rid="B34">34</xref>)</td>
<td valign="top" align="center">3,772</td>
<td valign="top" align="center">3</td>
<td valign="top" align="center">Age, sex<sup>&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">291 (7.71)</td>
<td valign="top" align="center">3,481 (92.29)</td>
<td valign="top" align="center">11.96</td>
</tr>
<tr>
<td valign="top" align="left">3</td>
<td valign="top" align="center">CVD</td>
<td valign="top" align="center">4&#x02013;49 (<xref ref-type="bibr" rid="B35">35</xref>&#x02013;<xref ref-type="bibr" rid="B37">37</xref>)</td>
<td valign="top" align="center">253,680</td>
<td valign="top" align="center">20</td>
<td valign="top" align="center">Age<sup>&#x0002A;&#x0002A;&#x0002A;</sup>, sex<sup>&#x0002A;&#x0002A;&#x0002A;</sup>, education<sup>&#x0002A;&#x0002A;&#x0002A;</sup>, income<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">229,787 (90.58)</td>
<td valign="top" align="center">23,893 (9.42)</td>
<td valign="top" align="center">9.62</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p><sup>&#x0002A;&#x0002A;</sup><italic>p</italic> &#x0003C; 0.01.</p>
<p><sup>&#x0002A;&#x0002A;&#x0002A;</sup><italic>p</italic> &#x0003C; 0.001.</p>
</table-wrap-foot>
</table-wrap>
<p>The RPD dataset was obtained from the Alzheimer&#x00027;s Disease Neuroimaging Initiative (ADNI) database (adni.loni.usc.edu), a research-oriented cohort launched in 2003 as a public&#x02013;private partnership led by Michael W. Weiner, MD. ADNI aims to assess the progression of mild cognitive impairment (MCI) and early Alzheimer&#x00027;s disease (AD) using MRI, PET, biomarkers, and clinical evaluations, and has since become a widely used resource in neuroimaging and dementia-related research (<xref ref-type="bibr" rid="B18">18</xref>&#x02013;<xref ref-type="bibr" rid="B20">20</xref>). As a registry-based research cohort, ADNI is characterized by voluntary participation and structured follow-up, which may introduce selection mechanisms related to sociodemographic factors and healthcare access. Within this dataset, rapidly progressive dementia (RPD), an acute-onset neurodegenerative condition requiring early intervention (<xref ref-type="bibr" rid="B21">21</xref>), was identified using Group-Based Trajectory Analysis based on longitudinal trajectories of Clinical Dementia Rating-Sum of Boxes (CDR-SB) scores. Missing values in the RPD dataset were handled using a K-nearest neighbors (KNN) imputation approach to ensure consistency across resampling experiments.</p>
<p>The HypoT dataset was derived from the Hypothyroidism data within the Thyroid Disease Databases in the UCI Machine Learning Repository (<xref ref-type="bibr" rid="B22">22</xref>). This dataset represents routine clinical data, where sample inclusion is primarily driven by healthcare-seeking behavior and diagnostic testing during clinical encounters. Hypothyroidism (HypoT) is a condition characterized by reduced thyroid hormone production, leading to impaired physiological function (<xref ref-type="bibr" rid="B23">23</xref>). Owing to the nature of routine care data, this dataset reflects institution and practice-dependent selection patterns. Missing values were addressed using a KNN imputation.</p>
<p>The CVD dataset was obtained from a refined version of the 2015 Behavioral Risk Factor Surveillance System (BRFSS), specifically the Heart Disease Health Indicators dataset available on Kaggle (<xref ref-type="bibr" rid="B24">24</xref>). BRFSS is a large-scale, population-based health survey conducted via standardized telephone interviews, and the Kaggle-distributed dataset excludes records with missing values. In this dataset, cardiovascular disease (CVD) status was defined by self-reported diagnoses of coronary heart disease (CHD) or myocardial infarction (MI). Compared to the other datasets, the CVD dataset offers broader population coverage and exhibits relatively weaker selection mechanisms, while still retaining the substantial class imbalance typical of large-scale epidemiological data.</p>
<p>Across all datasets, development and evaluation were conducted using internal resampling procedures to enable fair comparison of resampling strategies under consistent selection mechanisms. None of the datasets was considered fully representative of the general population; rather, each dataset was treated as representative of its respective data-generating process and care context. This design enables systematic evaluation of resampling techniques under varying degrees and forms of sample selection bias.</p></sec>
<sec>
<label>2.2</label>
<title>Data resampling techniques</title>
<p>In this study, five undersampling techniques are employed: Random Under Sampling (RUS), Tomek Links (TL), One-Sided Selection (OSS), Edited Nearest Neighbour (ENN), and Neighbourhood Cleaning Rule (NCR). For oversampling, we utilize Random Over Sampling (ROS), Synthetic Minority Over-sampling Technique (SMOTE), and Adaptive Synthetic Sampling (ADASYN). Additionally, SMOTE-Tomek Links (SMOTE-TL), SMOTE-ENN, and Over-sampling using Propensity Scores (OUPS) (<xref ref-type="bibr" rid="B25">25</xref>) are implemented as hybrid sampling techniques.</p>
<p>Undersampling reduces the number of majority class instances to balance the class distribution, thereby lowering computational costs. However, this approach carries the risk of discarding potentially informative samples. In contrast, oversampling increases the number of minority class instances, eliminating the possibility of information loss but potentially introducing artificial instances that may distort model outcomes. Hybrid sampling combines both techniques, enhancing the model&#x00027;s generalizability. However, it also increases the risk of overfitting (<xref ref-type="bibr" rid="B26">26</xref>) and is typically applicable only to binary classification tasks.</p></sec>
<sec>
<label>2.3</label>
<title>Propensity score matching</title>
<sec>
<label>2.3.1</label>
<title>Propensity score modeling and matching procedure</title>
<p>In this study, PSM was employed not as a causal inference framework, but as a covariate balance-oriented resampling strategy to reduce systematic differences in observed covariates between outcome-defined groups before training prediction models. This usage is conceptually closer to case-control matching commonly adopted in medical prediction studies than to treatment-effect estimation.</p>
<p>Let <italic>T</italic> denote a binary group indicator defined by the outcome variable and let <italic>X</italic> denote a vector of observed covariates. The propensity score is defined as the conditional probability of belonging to the outcome-positive group given the covariates,</p>
<disp-formula id="EQ1"><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>e</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo class="qopname">Pr</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>T</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x02223;</mml:mo><mml:mi>X</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(1)</label></disp-formula>
<p>where <italic>T</italic> &#x0003D; 1 indicates individuals with the target condition and <italic>T</italic> &#x0003D; 0 indicates those without the condition. Importantly, in our setting, <italic>T</italic> does not represent an intervention or treatment assignment, and the goal of PSM is not to estimate causal effects but to construct covariate-balanced subsets of cases and controls.</p>
<p>The definition of <italic>T</italic> and the covariates included in the propensity-score model differed across datasets, reflecting data availability and clinical relevance. In the RPD dataset, <italic>T</italic> &#x0003D; 1 indicated individuals with the target outcome, and the propensity score model included age, sex, and education. In the HypoT dataset, <italic>T</italic> &#x0003D; 1 indicated outcome-negative individuals, and age and sex were included as covariates. In the CVD dataset, <italic>T</italic> &#x0003D; 1 indicated outcome-positive individuals, and the model included age, sex, education, and income.</p>
<p>For all datasets, propensity scores were estimated using logistic regression with main effects only, without interaction terms or non-linear transformations:</p>
<disp-formula id="E2"><mml:math id="M2"><mml:mrow><mml:mtext class="textrm" mathvariant="normal">logit</mml:mtext><mml:mrow><mml:mo stretchy="false">{</mml:mo><mml:mrow><mml:mi>e</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">}</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B2;</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:msub><mml:mrow><mml:mi>&#x003B2;</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula>
<p>This specification was chosen for consistency across datasets and to reflect common practice in medical PSM applications.</p>
<p>Matching was conducted using the <monospace>MatchIt</monospace> package in R with nearest neighbor matching without replacement. The logit of the propensity score was used as the distance metric (<monospace>distance = "logit"</monospace>). To examine the effect of different matching stringencies, two matching ratios were applied: 1:1 and 1:4. No caliper restriction was imposed, and all other hyperparameters were left at their default values in <monospace>MatchIt</monospace>. The same matching procedure and hyperparameter settings were applied consistently across all datasets.</p>
<p>After matching, the resulting subsets were used as inputs for subsequent machine-learning classifiers. Covariate balance before and after matching was evaluated using standardized mean differences (SMDs) for each covariate, as well as summary measures such as the mean and maximum absolute SMD.</p></sec>
<sec>
<label>2.3.2</label>
<title>Conceptual role of PSM as a resampling strategy</title>
<p>Conceptually, PSM modifies the training data in a fundamentally different manner from standard class-imbalance techniques. While under-sampling and over-sampling methods primarily manipulate class prevalence to address imbalance, they typically operate without explicitly considering the joint distribution of observed covariates. In contrast, PSM directly targets covariate balance by restricting the analysis to regions of sufficient overlap in the propensity score space between outcome-defined groups.</p>
<p>As a consequence, PSM may alter both covariate distributions and class prevalence indirectly, with the final sample composition determined by the availability of suitable matches rather than by a predefined class ratio. In this study, we therefore interpret PSM not as an outcome-driven balancing technique, but as a structured, covariate-aware resampling strategy that complements conventional resampling methods commonly used in supervised classification.</p></sec></sec>
<sec>
<label>2.4</label>
<title>Classification models</title>
<p>To compare classification performance, we employ common machine learning classification models, including Logistic Regression, Random Forest (RF), eXtreme Gradient Boosting (XGBoost), Light Gradient Boosting Model (LGBM), Na&#x000EF;ve Bayes (NB) classifier, and the Soft Voting ensemble model. Detailed hyperparameter configurations for each classification model are provided in <xref ref-type="supplementary-material" rid="SM1">Supplementary material S2</xref>.</p>
<list list-type="bullet">
<list-item><p>Linear model: Logistic Regression</p></list-item>
<list-item><p>Tree-based models: RF, XGBoost, LGBM</p></list-item>
<list-item><p>Probability-based model: Na&#x000EF;ve Bayes</p></list-item>
<list-item><p>Voting-based model: Soft Voting</p></list-item>
</list></sec>
<sec>
<label>2.5</label>
<title>Evaluation metrics</title>
<p>Model performance was assessed through distributional balance measures and predictive metrics at both fixed and varying decision thresholds.</p>
<p>To assess distributional balance between groups, we calculated the Standardized Mean Difference (SMD) and the Imbalance Ratio (IR). The SMD is calculated as the difference in covariate-specific means between two groups, divided by the pooled standard deviation of the corresponding covariate (<xref ref-type="bibr" rid="B27">27</xref>). Since it is a standardized measure of mean differences for each covariate, it can be used to assess covariate imbalance between groups, which is commonly reported in studies addressing selection bias (<xref ref-type="bibr" rid="B28">28</xref>). A value below 0.2 is considered a small effect size, whereas a value of 0.8 or higher is regarded as a large effect size (<xref ref-type="bibr" rid="B29">29</xref>). The IR was computed as the ratio of the resampled size of the original majority class to that of the original minority class. Thus, IR = 1 indicates perfect balance; IR &#x0003E;1 indicates the baseline-majority class remains larger; and IR &#x0003C; 1 indicates a class-size reversal where the baseline-minority class becomes larger.</p>
<p>Predictive performance was evaluated using both threshold-dependent and threshold-independent metrics. Threshold-dependent evaluation was conducted using accuracy, precision, recall, F1-score, and specificity, which quantify classification performance at a standard probability threshold of 0.5. These metrics provide complementary perspectives on model behavior, particularly in imbalanced classification settings.</p>
<p>Discrimination and calibration were assessed using threshold-independent metrics and visual tools. AUROC summarizes sensitivity-specificity tradeoffs across all thresholds through ROC curves. AUPRC, visualized through precision-recall curves, is particularly informative for imbalanced datasets. The Brier score quantifies probabilistic accuracy by measuring mean squared error between predicted probabilities and observed outcomes. Calibration plots were used to assess the agreement between predicted probabilities and observed event rates. Decision curve analysis (DCA) evaluated clinical utility by quantifying net benefit across varying risk thresholds. Detailed descriptions of all evaluation metrics are provided in <xref ref-type="supplementary-material" rid="SM1">Supplementary material S3</xref>.</p>
<p>The experiments were conducted using Python 3.10.12, including numpy 1.25.2, pandas 2.0.3, scikit-learn 1.2.2, imbalanced-learn 0.10.1, xgboost 2.0.3, lightgbm 4.1.0, and smote-variants 0.7.3, as well as R 4.3.3 with MatchIt 4.5.5.</p>
<p>This study has ethical approval, consent to participate, and consent for publication, and was approved by the Institutional Review Board of Kangwon National University (IRB No. KWNUIRB-2023-10-001).</p></sec></sec>
<sec sec-type="results" id="s3">
<label>3</label>
<title>Results</title>
<p>In this study, we applied various resampling techniques and PSM (1:4 and 1:1) to the RPD, HypoT, and CVD datasets and compared the SMD of the datasets as well as the classification performance of machine learning models (LR, RF, LGBM, XGBoost, NB, and the Soft Voting ensemble model). The datasets were min-max normalized, and five-fold cross-validation was employed.</p>
<p>To account for the stochasticity introduced by resampling procedures and cross-validation, all experiments were repeated using 10 different random seeds. Preliminary analyses indicated that model performance metrics stabilized within this range, with only marginal variance reduction observed beyond 10 repetitions. Given the modest performance differences among methods and the computational cost associated with additional runs, 10 repetitions were considered sufficient to provide robust and comparable estimates. All resampling techniques were applied without hyperparameter tuning. For PSM, in addition to 1:1 matching, we also employed 1:4 matching to prevent excessive information loss and mitigate the effects of incomplete matching.</p>
<p>For each dataset, a representative model was selected based on the mean AUPRC aggregated across all resampling strategies using out-of-fold predictions from 10 repetitions of five-fold cross-validation. This selection criterion was pre-specified to reflect the primary discrimination objective under class imbalance and to identify a stable reference model for subsequent analyses. Using the selected model, we further examined the effects of different resampling and PSM strategies on model behavior, rather than claiming optimal performance. The specific models selected according to this criterion are summarized in <xref ref-type="supplementary-material" rid="SM1">Supplementary material S4</xref>.</p>
<p>When evaluating classification performance, additional experiments were conducted under the same conditions, excluding demographic variables, to assess the degree of information loss introduced by PSM.</p>
<sec>
<label>3.1</label>
<title>Effect of resampling techniques on selection bias reduction</title>
<p>To compare the impact of different resampling techniques on selection bias in demographic variables, we compute the SMD and summarize the results in <xref ref-type="table" rid="T2">Table 2</xref>.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Comparison of the standardized mean difference (SMD) by datasets, variables, and resampling techniques.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Category</bold></th>
<th valign="top" align="center"><bold>Technique</bold></th>
<th valign="top" align="center" colspan="3"><bold>RPD</bold></th>
<th valign="top" align="center" colspan="2"><bold>HypoT</bold></th>
<th valign="top" align="center" colspan="4"><bold>CVD</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td/>
<td/>
<td valign="top" align="center"><bold>Age</bold></td>
<td valign="top" align="center"><bold>Sex</bold></td>
<td valign="top" align="center"><bold>Education</bold></td>
<td valign="top" align="center"><bold>Age</bold></td>
<td valign="top" align="center"><bold>Sex</bold></td>
<td valign="top" align="center"><bold>Age</bold></td>
<td valign="top" align="center"><bold>Sex</bold></td>
<td valign="top" align="center"><bold>Education</bold></td>
<td valign="top" align="center"><bold>Income</bold></td>
</tr>
<tr>
<td valign="top" align="left">Baseline</td>
<td/>
<td valign="top" align="center">0.10 &#x000B1; 0.07</td>
<td valign="top" align="center">0.08 &#x000B1; 0.05</td>
<td valign="top" align="center">0.14 &#x000B1; 0.09</td>
<td valign="top" align="center">0.03 &#x000B1; 0.02</td>
<td valign="top" align="center">0.20 &#x000B1; 0.03</td>
<td valign="top" align="center">0.87 &#x000B1; 0.00</td>
<td valign="top" align="center">0.30 &#x000B1; 0.00</td>
<td valign="top" align="center">0.33 &#x000B1; 0.00</td>
<td valign="top" align="center">0.47 &#x000B1; 0.00</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="5">Undersampling</td>
<td valign="top" align="left">RUS</td>
<td valign="top" align="center">0.20 &#x000B1; 0.14</td>
<td valign="top" align="center">0.14 &#x000B1; 0.11</td>
<td valign="top" align="center">0.17 &#x000B1; 0.12</td>
<td valign="top" align="center">0.05 &#x000B1; 0.03</td>
<td valign="top" align="center">0.19 &#x000B1; 0.07</td>
<td valign="top" align="center">0.87 &#x000B1; 0.01</td>
<td valign="top" align="center">0.30 &#x000B1; 0.01</td>
<td valign="top" align="center">0.33 &#x000B1; 0.01</td>
<td valign="top" align="center">0.47 &#x000B1; 0.01</td>
</tr>
 <tr>
<td valign="top" align="left">TL</td>
<td valign="top" align="center">0.12 &#x000B1; 0.07</td>
<td valign="top" align="center">0.09 &#x000B1; 0.06</td>
<td valign="top" align="center">0.13 &#x000B1; 0.09</td>
<td valign="top" align="center">0.03 &#x000B1; 0.02</td>
<td valign="top" align="center">0.20 &#x000B1; 0.03</td>
<td valign="top" align="center">0.89 &#x000B1; 0.00</td>
<td valign="top" align="center">0.30 &#x000B1; 0.00</td>
<td valign="top" align="center">0.34 &#x000B1; 0.00</td>
<td valign="top" align="center">0.49 &#x000B1; 0.00</td>
</tr>
 <tr>
<td valign="top" align="left">OSS</td>
<td valign="top" align="center">0.11 &#x000B1; 0.07</td>
<td valign="top" align="center">0.08 &#x000B1; 0.06</td>
<td valign="top" align="center">0.13 &#x000B1; 0.09</td>
<td valign="top" align="center">0.10 &#x000B1; 0.06</td>
<td valign="top" align="center">0.20 &#x000B1; 0.03</td>
<td valign="top" align="center">0.89 &#x000B1; 0.00</td>
<td valign="top" align="center">0.30 &#x000B1; 0.00</td>
<td valign="top" align="center">0.34 &#x000B1; 0.00</td>
<td valign="top" align="center">0.49 &#x000B1; 0.00</td>
</tr>
 <tr>
<td valign="top" align="left">ENN</td>
<td valign="top" align="center">0.15 &#x000B1; 0.07</td>
<td valign="top" align="center">0.12 &#x000B1; 0.07</td>
<td valign="top" align="center">0.16 &#x000B1; 0.09</td>
<td valign="top" align="center">0.03 &#x000B1; 0.02</td>
<td valign="top" align="center">0.20 &#x000B1; 0.03</td>
<td valign="top" align="center">1.05 &#x000B1; 0.00</td>
<td valign="top" align="center">0.33 &#x000B1; 0.00</td>
<td valign="top" align="center">0.40 &#x000B1; 0.00</td>
<td valign="top" align="center">0.57 &#x000B1; 0.00</td>
</tr>
 <tr>
<td valign="top" align="left">NCR</td>
<td valign="top" align="center">0.15 &#x000B1; 0.07</td>
<td valign="top" align="center">0.12 &#x000B1; 0.07</td>
<td valign="top" align="center">0.16 &#x000B1; 0.09</td>
<td valign="top" align="center">0.03 &#x000B1; 0.02</td>
<td valign="top" align="center">0.20 &#x000B1; 0.03</td>
<td valign="top" align="center">1.01 &#x000B1; 0.00</td>
<td valign="top" align="center">0.31 &#x000B1; 0.00</td>
<td valign="top" align="center">0.38 &#x000B1; 0.00</td>
<td valign="top" align="center">0.55 &#x000B1; 0.00</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="3">Oversampling</td>
<td valign="top" align="left">ROS</td>
<td valign="top" align="center">0.10 &#x000B1; 0.07</td>
<td valign="top" align="center">0.08 &#x000B1; 0.05</td>
<td valign="top" align="center">0.12 &#x000B1; 0.09</td>
<td valign="top" align="center">0.03 &#x000B1; 0.02</td>
<td valign="top" align="center">0.19 &#x000B1; 0.04</td>
<td valign="top" align="center">0.87 &#x000B1; 0.00</td>
<td valign="top" align="center">0.30 &#x000B1; 0.00</td>
<td valign="top" align="center">0.33 &#x000B1; 0.00</td>
<td valign="top" align="center">0.47 &#x000B1; 0.00</td>
</tr>
 <tr>
<td valign="top" align="left">SMOTE</td>
<td valign="top" align="center">0.11 &#x000B1; 0.08</td>
<td valign="top" align="center">0.13 &#x000B1; 0.08</td>
<td valign="top" align="center">0.19 &#x000B1; 0.11</td>
<td valign="top" align="center">0.03 &#x000B1; 0.02</td>
<td valign="top" align="center">0.20 &#x000B1; 0.04</td>
<td valign="top" align="center">0.89 &#x000B1; 0.00</td>
<td valign="top" align="center">0.32 &#x000B1; 0.00</td>
<td valign="top" align="center">0.32 &#x000B1; 0.00</td>
<td valign="top" align="center">0.47 &#x000B1; 0.01</td>
</tr>
 <tr>
<td valign="top" align="left">ADASYN</td>
<td valign="top" align="center">0.09 &#x000B1; 0.06</td>
<td valign="top" align="center">0.13 &#x000B1; 0.07</td>
<td valign="top" align="center">0.19 &#x000B1; 0.12</td>
<td valign="top" align="center">0.09 &#x000B1; 0.04</td>
<td valign="top" align="center">0.23 &#x000B1; 0.04</td>
<td valign="top" align="center">0.84 &#x000B1; 0.00</td>
<td valign="top" align="center">0.30 &#x000B1; 0.00</td>
<td valign="top" align="center">0.30 &#x000B1; 0.00</td>
<td valign="top" align="center">0.44 &#x000B1; 0.00</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="3">Hybrid sampling</td>
<td valign="top" align="left">SMOTE-TL</td>
<td valign="top" align="center">0.12 &#x000B1; 0.08</td>
<td valign="top" align="center">0.14 &#x000B1; 0.08</td>
<td valign="top" align="center">0.19 &#x000B1; 0.11</td>
<td valign="top" align="center">0.03 &#x000B1; 0.02</td>
<td valign="top" align="center">0.21 &#x000B1; 0.04</td>
<td valign="top" align="center">0.89 &#x000B1; 0.00</td>
<td valign="top" align="center">0.32 &#x000B1; 0.00</td>
<td valign="top" align="center">0.32 &#x000B1; 0.00</td>
<td valign="top" align="center">0.47 &#x000B1; 0.00</td>
</tr>
 <tr>
<td valign="top" align="left">SMOTE-ENN</td>
<td valign="top" align="center">0.21 &#x000B1; 0.10</td>
<td valign="top" align="center">0.24 &#x000B1; 0.11</td>
<td valign="top" align="center">0.23 &#x000B1; 0.12</td>
<td valign="top" align="center">0.03 &#x000B1; 0.02</td>
<td valign="top" align="center">0.21 &#x000B1; 0.04</td>
<td valign="top" align="center">1.22 &#x000B1; 0.01</td>
<td valign="top" align="center">0.35 &#x000B1; 0.00</td>
<td valign="top" align="center">0.48 &#x000B1; 0.00</td>
<td valign="top" align="center">0.68 &#x000B1; 0.01</td>
</tr>
 <tr>
<td valign="top" align="left">OUPS</td>
<td valign="top" align="center">0.18 &#x000B1; 0.09</td>
<td valign="top" align="center">0.06 &#x000B1; 0.05</td>
<td valign="top" align="center">0.15 &#x000B1; 0.09</td>
<td valign="top" align="center">0.06 &#x000B1; 0.04</td>
<td valign="top" align="center">0.26 &#x000B1; 0.04</td>
<td valign="top" align="center">0.89 &#x000B1; 0.01</td>
<td valign="top" align="center">0.28 &#x000B1; 0.01</td>
<td valign="top" align="center">0.39 &#x000B1; 0.01</td>
<td valign="top" align="center">0.53 &#x000B1; 0.01</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="2">PSM</td>
<td valign="top" align="left">PSM1:4</td>
<td valign="top" align="center">0.04 &#x000B1; 0.03</td>
<td valign="top" align="center">0.05 &#x000B1; 0.04</td>
<td valign="top" align="center">0.03 &#x000B1; 0.03</td>
<td valign="top" align="center">0.01 &#x000B1; 0.00</td>
<td valign="top" align="center">0.00 &#x000B1; 0.00</td>
<td valign="top" align="center">0.07 &#x000B1; 0.00</td>
<td valign="top" align="center">0.10 &#x000B1; 0.00</td>
<td valign="top" align="center">0.06 &#x000B1; 0.00</td>
<td valign="top" align="center">0.05 &#x000B1; 0.00</td>
</tr>
<tr>
<td valign="top" align="left">PSM1:1</td>
<td valign="top" align="center">0.08 &#x000B1; 0.06</td>
<td valign="top" align="center">0.11 &#x000B1; 0.10</td>
<td valign="top" align="center">0.06 &#x000B1; 0.06</td>
<td valign="top" align="center">0.00 &#x000B1; 0.00</td>
<td valign="top" align="center">0.00 &#x000B1; 0.00</td>
<td valign="top" align="center">0.00 &#x000B1; 0.00</td>
<td valign="top" align="center">0.00 &#x000B1; 0.00</td>
<td valign="top" align="center">0.00 &#x000B1; 0.00</td>
<td valign="top" align="center">0.00 &#x000B1; 0.00</td>
</tr></tbody>
</table>
</table-wrap>
<p>Among the undersampling techniques, TL and OSS produce results similar to the baseline. However, RUS, ENN, and NCR generally tend to increase selection bias. Notably, for the Age variable in the CVD dataset, the SMD increases substantially by 0.18 (from 0.87 to 1.05) when ENN is applied.</p>
<p>Among the oversampling techniques, ROS has minimal impact on selection bias, whereas SMOTE-based techniques (SMOTE, ADASYN, SMOTE-TL, and SMOTE-ENN) exhibit a general trend of increasing the SMD.</p>
<p>In contrast, PSM (1:1 and 1:4) successfully reduces the SMD across all variables to well below the 0.2 small-effect threshold and demonstrates a stable bias correction effect, as confirmed by standard deviation comparisons.</p></sec>
<sec>
<label>3.2</label>
<title>Results of resampling for class balancing</title>
<p><xref ref-type="fig" rid="F1">Figure 1</xref> presents the IR observed after applying each resampling technique and PSM, where the results are shown for all datasets. <xref ref-type="fig" rid="F2">Figure 2</xref> provides the t-SNE visualization of the datasets after resampling and PSM. Detailed numerical results are provided in <xref ref-type="supplementary-material" rid="SM1">Supplementary material S5</xref>.</p>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>Imbalance ratio (IR) distribution by resampling techniques. IR is defined as the ratio of the larger to the smaller class size, with IR = 1 indicating a balanced class distribution.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-14-1747762-g0001.tif">
<alt-text content-type="machine-generated">Boxplot comparing imbalance ratios for various resampling techniques, including Baseline, RUS, TL, OSS, ENN, NCR, ROS, SMOTE, ADASYN, SMOTE-TL, SMOTE-ENN, OUPS, PSM:1:1, and PSM:1:4. The Baseline has the highest ratio around twelve, while techniques such as SMOTE show ratios close to one, indicating near-balanced class distributions.</alt-text>
</graphic>
</fig>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>t-SNE visualization of different resampling techniques. Due to differences in sample sizes, RPD and HypoT were plotted using an alpha of 0.4 and a size of 20. For CVD, an alpha of 0.1 and a size of 1 were generally used, but for oversampling and hybrid sampling within CVD, which involved larger sample sizes, a smaller alpha of 0.05 and a size of 0.1 were used to avoid overplotting.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-14-1747762-g0002.tif">
<alt-text content-type="machine-generated">A grid of scatter plots showing data distributions for three scenarios: RPD, HypoT, and CVD. Each column represents a scenario, and each row represents a different sampling technique including Baseline, Undersampling, Oversampling, Hybrid Sampling, and PSM. Pink dots represent the minority class, and blue dots represent the majority class. The distribution and density of dots vary across techniques, showcasing different sampling methods effects on class distribution.</alt-text>
</graphic>
</fig>
<p>Randomized resampling techniques (RUS and ROS) and SMOTE-based techniques (SMOTE, SMOTE-TL, and OUPS) achieve an IR of 1, ensuring an equal distribution of majority and minority class instances without requiring hyperparameter tuning. In contrast, certain undersampling techniques, such as TL, ENN, and NCR are less effective in mitigating class imbalance. OSS exhibits high variability, with IR values ranging from a minimum of 1.15 to a maximum of 11.91. We note that undersampling techniques can significantly reduce the dataset size when applied to highly imbalanced data. For example, for RUS, the majority class is reduced to match the minority class at a 1:1 ratio, reducing the final sample size to 82 (RPD), 466 (HypoT), and 38,230 (CVD), compared to 502, 3,017, and 202,944 in the baseline, respectively. Among the hybrid sampling techniques, SMOTE-ENN (IR 0.91 &#x000B1; 0.15) is generally effective in reducing class imbalance. However, it is the only method where the majority class becomes smaller than the minority class in certain cases. This behavior can be attributed to the hybrid nature of SMOTE-ENN, which simultaneously applies minority oversampling and majority undersampling, occasionally leading to an overcorrection of class imbalance relative to the baseline distribution.</p></sec>
<sec>
<label>3.3</label>
<title>Classification performance by datasets and resampling techniques</title>
<p><xref ref-type="fig" rid="F3">Figure 3</xref> illustrates the classification performance across datasets and resampling techniques, comparing models trained with and without demographic variables. <xref ref-type="fig" rid="F4">Figures 4</xref>&#x02013;<xref ref-type="fig" rid="F6">6</xref> further present these results separately for each dataset.</p>
<p>In the RPD dataset, the difference in AUROC when demographic variables are included versus excluded remains relatively small, with a maximum deviation of 0.0126 in the mean AUROC and 0.0284 in the mean AUPRC. Additionally, the trend in performance variation due to resampling remains consistent regardless of the presence of demographic variables (<xref ref-type="fig" rid="F3">Figure 3</xref>).</p>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>Classification performance by resampling techniques with and without demographic variables for each dataset. As PSM inherently incorporates demographic variables into its matching process, results for the &#x0201C;excluding demographics&#x0201D; setting are not reported for PSM.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-14-1747762-g0003.tif">
<alt-text content-type="machine-generated">Five panels depicting classification performance under different resampling techniques on the HypoT dataset. Panel (a) shows the ROC curve comparing true positive and false positive rates. Panel (b) shows the precision&#x02013;recall curve. Panel (c) presents a radar chart summarizing multiple performance metrics, including recall, precision, accuracy, AUROC, AUPRC, F1 score, specificity, and Brier score across different resampling techniques. Panel (d) shows the calibration curve plotting the fraction of positives against predicted probability. Panel (e) displays decision curve analysis, showing net benefit versus threshold probability. A legend distinguishes baseline and resampling models.</alt-text>
</graphic>
</fig>
<p><xref ref-type="fig" rid="F4">Figure 4</xref> indicates that, under models including demographic variables, the relative performance ordering across resampling techniques is largely preserved. PSM1:4 achieves the highest discriminative performance in both ROC and PR curve analyses, followed by PSM1:1. Furthermore, PSM1:4 demonstrates consistently strong performance across the ROC curve (<xref ref-type="fig" rid="F4">Figure 4a</xref>), PR curve (<xref ref-type="fig" rid="F4">Figure 4b</xref>), and aggregated performance metrics (<xref ref-type="fig" rid="F4">Figure 4c</xref>). PSM1:1 shows the second-best discriminative performance in terms of ROC Curve and PR Curve (<xref ref-type="fig" rid="F4">Figures 4a</xref>, <xref ref-type="fig" rid="F4">b</xref>), but exhibits relatively lower values in several aggregated performance metrics compared with several non-PSM resampling strategies (<xref ref-type="fig" rid="F4">Figure 4c</xref>). However, calibration and decision curve analyses reveal pronounced deviations and unstable net benefit for PSM1:1, whereas PSM1:4 remains well calibrated (<xref ref-type="fig" rid="F4">Figure 4d</xref>) and provides stable clinical net benefit across a wide range of threshold probabilities (<xref ref-type="fig" rid="F4">Figure 4e</xref>).</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>Comprehensive performance comparison of resampling techniques on the RPD dataset, evaluated in terms of discrimination, calibration, and clinical utility. Panels show <bold>(a)</bold> ROC curves, <bold>(b)</bold> PR curves, <bold>(c)</bold> aggregated classification metrics, <bold>(d)</bold> calibration curves, and <bold>(e)</bold> decision curve analysis (DCA).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-14-1747762-g0004.tif">
<alt-text content-type="machine-generated">Graphs comparing AUROC and AUPRC for different conditions. Six panels show variations, with (a) AUROC and (b) AUPRC for RPD, (c) AUROC and (d) AUPRC for HypoT, and (e) AUROC and (f) AUPRC for CVD. Blue lines represent including demographic variables, while red lines exclude them. AUROC and AUPRC values vary across conditions.</alt-text>
</graphic>
</fig>
<p>In the HypoT dataset, classification performance improves across all techniques when demographic variables are removed. However, when demographic variables are included, AUROC and AUPRC already exhibit high baseline values, limiting the potential for further gains from resampling. The largest improvements are observed for PSM1:4 and PSM1:1, though these remain modest (<xref ref-type="fig" rid="F3">Figure 3</xref>). <xref ref-type="fig" rid="F5">Figure 5</xref> further illustrates this ceiling effect. Both ROC and PR curves (<xref ref-type="fig" rid="F5">Figures 5a</xref>, <xref ref-type="fig" rid="F5">b</xref>) show that all resampling techniques cluster tightly near the upper-left and upper-right regions, respectively, indicating near-optimal discrimination performance. Differences among methods are minimal, and the baseline model also maintains strong performance. In terms of aggregated performance metrics, PSM-based approaches achieve performance levels comparable to other resampling techniques. Notably, specificity under PSM is lower than that of some undersampling methods but remains higher than that achieved by oversampling approaches (<xref ref-type="fig" rid="F5">Figure 5c</xref>). Calibration curves (<xref ref-type="fig" rid="F5">Figure 5d</xref>) demonstrate that most techniques closely follow the ideal diagonal. Decision curve analysis (<xref ref-type="fig" rid="F5">Figure 5e</xref>) shows uniformly high net benefit across a wide range of thresholds, suggesting that resampling provides limited additional clinical utility in this high-performing scenario.</p>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>Comprehensive performance comparison of resampling techniques on the HypoT dataset, evaluated in terms of discrimination, calibration, and clinical utility. Panels show <bold>(a)</bold> ROC curves, <bold>(b)</bold> PR curves, <bold>(c)</bold> aggregated classification metrics, <bold>(d)</bold> calibration curves, and <bold>(e)</bold> decision curve analysis (DCA).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-14-1747762-g0005.tif">
<alt-text content-type="machine-generated">Five-panel chart displaying various performance metrics of different models. (a) ROC Curve shows true versus false positive rates. (b) AUPR Curve plots precision and recall. (c) Radar chart illustrates metrics like recall, precision, and accuracy. (d) Calibration curve depicts fraction of positives against mean predicted probability. (e) DCA Curve shows net benefit versus threshold probability. Legend includes models like Baseline, RUS, TL, SMOTE, and others, each represented by a distinct color.</alt-text>
</graphic>
</fig>
<p>For the CVD dataset, classification performance is generally higher when demographic variables are included. Under this setting, most non-PSM resampling techniques achieve comparable or improved performance, whereas PSM1:4 and PSM1:1 exhibit a notable decline and remain inferior to other approaches even when demographic variables are excluded (<xref ref-type="fig" rid="F3">Figure 3</xref>). <xref ref-type="fig" rid="F6">Figure 6</xref> further illustrates these differences across resampling techniques, including demographic variables. The ROC curves (<xref ref-type="fig" rid="F6">Figure 6a</xref>) indicate that all resampling techniques outperform the random baseline, with non-PSM resampling techniques achieving higher true positive rates across a wide range of false positive rates. In contrast, PSM-based approaches show heterogeneous behavior, with PSM1:1 consistently underperforming and PSM1:4 showing relatively stable discrimination only in intermediate false positive rate regions. These patterns are more pronounced in the PR curves (<xref ref-type="fig" rid="F6">Figure 6b</xref>), where non-PSM resampling techniques generally maintain higher precision across recall levels. The aggregated performance metrics indicate substantial trade-offs across resampling strategies (<xref ref-type="fig" rid="F6">Figure 6c</xref>). PSM1:4 exhibits a pattern similar to undersampling techniques, whereas PSM1:1 shows performance trends more closely aligned with hybrid sampling methods, reflecting differing balances among discrimination, calibration, and clinical utility. Calibration and decision curve analyses (<xref ref-type="fig" rid="F6">Figures 6d</xref>, <xref ref-type="fig" rid="F6">e</xref>) further highlight substantial differences in probability estimation and clinical utility, with PSM1:4 remaining relatively well calibrated and clinically stable, whereas PSM1:1 provides limited net benefit.</p>
<fig position="float" id="F6">
<label>Figure 6</label>
<caption><p>Comprehensive performance comparison of resampling techniques on the CVD dataset, evaluated in terms of discrimination, calibration, and clinical utility. Panels show <bold>(a)</bold> ROC curves, <bold>(b)</bold> PR curves, <bold>(c)</bold> aggregated classification metrics, <bold>(d)</bold> calibration curves, and <bold>(e)</bold> decision curve analysis (DCA).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-14-1747762-g0006.tif">
<alt-text content-type="machine-generated">Five panels depicting classification performance under different resampling techniques on the CVD dataset. Panel (a) shows the ROC curve comparing true positive and false positive rates. Panel (b) shows the precision&#x02013;recall curve. Panel (c) presents a radar chart summarizing multiple performance metrics, including recall, precision, accuracy, AUROC, AUPRC, F1 score, specificity, and Brier score across different resampling techniques. Panel (d) shows the calibration curve plotting the fraction of positives against predicted probability. Panel (e) displays decision curve analysis, showing net benefit versus threshold probability. A legend distinguishes baseline and resampling models.</alt-text>
</graphic>
</fig>
<p>Detailed numerical results for all experimental settings, as well as comprehensive performance comparisons excluding demographic variables for each dataset, are provided in <xref ref-type="supplementary-material" rid="SM1">Supplementary material S6</xref>.</p></sec>
<sec>
<label>3.4</label>
<title>Comparison of classification performance based on variable distribution</title>
<p>In the HypoT dataset, among the two demographic variables, Age does not exhibit statistically significant differences between classes, whereas Sex does. To examine the effect of each variable, PSM is performed under four conditions: (1) matching using Age only; (2) matching using Sex only; (3) matching using Age while excluding Sex; and (4) matching using Sex while excluding Age. The distributions of these matched variables are presented in <xref ref-type="table" rid="T3">Table 3</xref>, and the classification performance under each condition is shown in <xref ref-type="fig" rid="F7">Figure 7</xref>.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Comparison of 4:1 matched patients and controls based on matching variables in the HypoT dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Matching variables</bold></th>
<th valign="top" align="left"><bold>Post-matching variables</bold></th>
<th valign="top" align="center"><bold>Patients <italic>N</italic> (%)</bold></th>
<th valign="top" align="center"><bold>Controls <italic>N</italic> (%)</bold></th>
<th valign="top" align="center"><bold><italic>p</italic>-value</bold></th>
<th valign="top" align="center"><bold>SMD</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" rowspan="6">Baseline</td>
<td valign="top" align="left"><italic>N</italic></td>
<td valign="top" align="center">3,481</td>
<td valign="top" align="center">291</td>
<td/>
<td/>
</tr>
 <tr>
<td valign="top" align="left">Age</td>
<td valign="top" align="center">51.76 (20.17)</td>
<td valign="top" align="center">51.52 (19.11)</td>
<td valign="top" align="center">0.842</td>
<td valign="top" align="center">0.012</td>
</tr>
 <tr>
<td valign="top" align="left">Sex</td>
<td valign="top" align="center" colspan="4"></td>
</tr>
 <tr>
<td valign="top" align="left">Male</td>
<td valign="top" align="center">1,077 (30.9)</td>
<td valign="top" align="center">65 (22.3)</td>
<td valign="top" align="center">0.003<sup>&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.196</td>
</tr>
 <tr>
<td valign="top" align="left">Female</td>
<td valign="top" align="center">2,404 (69.1)</td>
<td valign="top" align="center">226 (77.7)</td>
<td/>
<td/>
</tr>
 <tr>
<td valign="top" align="left">TSH</td>
<td valign="top" align="center">2.17 (10.21)</td>
<td valign="top" align="center">39.23 (74.38)</td>
<td valign="top" align="center">&#x0003C; 0.001<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.698</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="6">Sex, age</td>
<td valign="top" align="left"><italic>N</italic></td>
<td valign="top" align="center">932</td>
<td valign="top" align="center">233</td>
<td/>
<td/>
</tr>
 <tr>
<td valign="top" align="left">Age</td>
<td valign="top" align="center">51.65 (18.55)</td>
<td valign="top" align="center">51.59 (18.77)</td>
<td valign="top" align="center">0.967</td>
<td valign="top" align="center">0.003</td>
</tr>
 <tr>
<td valign="top" align="left">Sex</td>
<td valign="top" align="center" colspan="4"></td>
</tr>
 <tr>
<td valign="top" align="left">Male</td>
<td valign="top" align="center">220 (23.6)</td>
<td valign="top" align="center">55 (23.6)</td>
<td valign="top" align="center">1.000</td>
<td valign="top" align="center">&#x0003C; 0.001</td>
</tr>
 <tr>
<td valign="top" align="left">Female</td>
<td valign="top" align="center">712 (76.4)</td>
<td valign="top" align="center">178 (76.4)</td>
<td/>
<td/>
</tr>
 <tr>
<td valign="top" align="left">TSH</td>
<td valign="top" align="center">2.09 (6.97)</td>
<td valign="top" align="center">37.62 (68.69)</td>
<td valign="top" align="center">&#x0003C; 0.001<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.728</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="6">Sex</td>
<td valign="top" align="left"><italic>N</italic></td>
<td valign="top" align="center">932</td>
<td valign="top" align="center">233</td>
<td/>
<td/>
</tr>
 <tr>
<td valign="top" align="left">Age</td>
<td valign="top" align="center">51.94 (18.83)</td>
<td valign="top" align="center">52.29 (18.65)</td>
<td valign="top" align="center">0.799</td>
<td valign="top" align="center">0.019</td>
</tr>
 <tr>
<td valign="top" align="left">Sex</td>
<td valign="top" align="center" colspan="4"></td>
</tr>
 <tr>
<td valign="top" align="left">Male</td>
<td valign="top" align="center">224 (24.0)</td>
<td valign="top" align="center">56 (24.0)</td>
<td valign="top" align="center">1.000</td>
<td valign="top" align="center">&#x0003C; 0.001</td>
</tr>
 <tr>
<td valign="top" align="left">Female</td>
<td valign="top" align="center">708 (76.0)</td>
<td valign="top" align="center">177 (76.0)</td>
<td/>
<td/>
</tr>
 <tr>
<td valign="top" align="left">TSH</td>
<td valign="top" align="center">2.30 (13.69)</td>
<td valign="top" align="center">38.63 (75.33)</td>
<td valign="top" align="center">&#x0003C; 0.001<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.671</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="6">Age</td>
<td valign="top" align="left"><italic>N</italic></td>
<td valign="top" align="center">932</td>
<td valign="top" align="center">233</td>
<td/>
<td/>
</tr>
 <tr>
<td valign="top" align="left">Age</td>
<td valign="top" align="center">52.35 (18.46)</td>
<td valign="top" align="center">52.29 (18.65)</td>
<td valign="top" align="center">0.967</td>
<td valign="top" align="center">0.003</td>
</tr>
 <tr>
<td valign="top" align="left">Sex</td>
<td valign="top" align="center" colspan="4"></td>
</tr>
 <tr>
<td valign="top" align="left">Male</td>
<td valign="top" align="center">289 (31.0)</td>
<td valign="top" align="center">56 (24.0)</td>
<td valign="top" align="center">0.045<sup>&#x0002A;</sup></td>
<td valign="top" align="center">0.157</td>
</tr>
 <tr>
<td valign="top" align="left">Female</td>
<td valign="top" align="center">643 (69.0)</td>
<td valign="top" align="center">177 (76.0)</td>
<td/>
<td/>
</tr>
<tr>
<td valign="top" align="left">TSH</td>
<td valign="top" align="center">2.02 (4.92)</td>
<td valign="top" align="center">38.63 (75.33)</td>
<td valign="top" align="center">&#x0003C; 0.001<sup>&#x0002A;&#x0002A;&#x0002A;</sup></td>
<td valign="top" align="center">0.686</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>TSH, thyroid-stimulating hormone.</p>
<p><sup>&#x0002A;</sup><italic>p</italic> &#x0003C; 0.05.</p>
<p><sup>&#x0002A;&#x0002A;</sup><italic>p</italic> &#x0003C; 0.01.</p>
<p><sup>&#x0002A;&#x0002A;&#x0002A;</sup><italic>p</italic> &#x0003C; 0.001.</p>
</table-wrap-foot>
</table-wrap>
<fig position="float" id="F7">
<label>Figure 7</label>
<caption><p>Classification performance by resampling techniques based on variable distribution. Panels show <bold>(a)</bold> AUROC and <bold>(b)</bold> AUPRC across resampling methods when matching is performed using sex, age, or both variables. Because propensity score matching (PSM) explicitly relies on the selected demographic covariates during matching, the &#x0201C;no matching while excluding Sex and Age&#x0201D; setting does not apply to PSM and is therefore not reported, unlike the other resampling techniques. Instead, PSM results are presented separately according to the variables included in the matching process, enabling direct comparison of variable-specific matching effects.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-14-1747762-g0007.tif">
<alt-text content-type="machine-generated">Two line graphs compare AUROC and AUPRC values using resampling techniques and variable matching types. Both graphs use the same legend, depicting methods like matching with sex and age, with lines clustered closely around high performance values. The AUROC graph ranges from 0.70 to 1.00, and the AUPRC graph ranges from 0.20 to 1.00. Resampling methods include RUS, TL, OSS, ENN, NCR, and others. The graphs show minimal variation across techniques, with a notable drop at PSM variations.</alt-text>
</graphic>
</fig>
<p>The results in <xref ref-type="table" rid="T3">Table 3</xref> indicate that selection bias is effectively mitigated when both variables are used for matching, as well as when Sex alone is used. However, when matching is performed using Age alone, Sex-based selection bias is not fully corrected.</p>
<p>As shown in <xref ref-type="fig" rid="F7">Figure 7</xref>, AUROC and AUPRC remain largely stable across most resampling techniques regardless of the matching strategy, except for PSM-based approaches. When all demographic variables are excluded, classification performance is generally highest, whereas both PSM1:4 and PSM1:1 exhibit a noticeable decline. Moreover, removing Sex while performing matching using Age consistently results in lower performance than removing Age and matching using Sex, highlighting the greater predictive contribution of Sex.</p>
<p>Within the PSM-based methods, PSM1:4 demonstrates relatively stable performance across matching conditions, while PSM1:1 shows more pronounced degradation. Supplementary analyses of calibration curves and decision curve analysis (<xref ref-type="supplementary-material" rid="SM1">Supplementary material S6</xref>; <xref ref-type="supplementary-material" rid="SM1">Supplementary Figures S4</xref>&#x02013;<xref ref-type="supplementary-material" rid="SM1">S7</xref>) further support these findings, showing that PSM1:4 maintains more stable calibration and achieves higher net benefit, whereas PSM1:1 exhibits poorer calibration and substantially reduced net benefit. Overall, these results indicate that PSM1:4 provides a more consistent balance between discrimination, calibration, and clinical utility in the HypoT dataset.</p></sec></sec>
<sec sec-type="discussion" id="s4">
<label>4</label>
<title>Discussion</title>
<p>In this study, PSM is applied as a data-level resampling technique to correct selection bias and evaluate its impact on the performance of machine learning classification models.</p>
<sec>
<label>4.1</label>
<title>Advantages of propensity score matching in mitigating selection bias</title>
<p>The experimental results indicate that PSM reduces selection bias more effectively than other resampling techniques. Across all datasets, PSM lowers the SMD of demographic variables to values close to zero and yields the smallest standard deviation.</p>
<p>Traditional resampling techniques primarily focus on class balancing. They adjust the overall data distribution but do not consider the distribution of other variables. In contrast, PSM matches samples while considering both specific variables and class labels, making it a more effective method for mitigating selection bias.</p>
<p>A statistical analysis using chi-square tests and mean comparisons shows that variables with statistically significant differences in the original dataset exhibit higher <italic>p</italic>-values after PSM. This suggests that PSM effectively reduces selection bias.</p>
<p>PSM allows researchers to explicitly specify variables for matching, leading to a more stable reduction of bias in targeted variables. Consequently, PSM enhances the internal validity of the dataset, making it a valuable tool for bias correction.</p></sec>
<sec>
<label>4.2</label>
<title>Classification performance of PSM</title>
<p>Our results demonstrate that the effectiveness of PSM is highly dependent on the underlying distributional characteristics of the dataset. In the RPD dataset, where none of the demographic variables exhibit statistically significant differences between classes, PSM consistently achieves competitive or superior classification performance compared with other resampling techniques. Across discrimination, calibration, and decision curve analyses (<xref ref-type="fig" rid="F4">Figure 4</xref>), PSM-based approaches, particularly PSM1:4, show stable performance, indicating that matching can mitigate residual selection bias without substantially distorting informative feature distributions. The superior performance of PSM1:4 relative to PSM1:1 suggests that a higher matching ratio provides additional training samples while preserving sufficient covariate balance.</p>
<p>In the HypoT dataset, where only a subset of demographic variables exhibits moderate imbalance, classification performance remains largely comparable, with only a slight decline observed. Although PSM1:4 and PSM1:1 achieve classification performance comparable to that obtained after removing demographic variables, <xref ref-type="fig" rid="F5">Figure 5</xref> reveals that overall discrimination is near saturation. ROC and PR curves cluster tightly across all resampling strategies, and both calibration and decision curve analyses indicate uniformly high probability reliability and clinical net benefit. These findings suggest that, in datasets with intrinsically high separability, resampling, including PSM, provides limited additional benefit beyond the baseline model.</p>
<p>Overall, the impact of PSM varies by dataset and depends on both the degree of demographic imbalance and the matching ratio. In datasets with weak or moderate demographic&#x02013;outcome associations, such as RPD and HypoT, differences between matching ratios are more evident in calibration and clinical utility than in discrimination. In these settings, PSM1:4 maintains stable discrimination, calibration, and net benefit, whereas PSM1:1 shows consistent degradation across evaluation metrics, indicating that overly restrictive matching can compromise model reliability even when discrimination is near saturation.</p>
<p>In contrast, in the CVD dataset, where all variables&#x02014;including demographic ones&#x02014;exhibit statistically significant distributional differences between classes (<italic>p</italic>-value &#x0003C; 0.001) with large effect sizes, applying PSM leads to a consistent decline in classification performance. As illustrated in <xref ref-type="fig" rid="F6">Figure 6</xref>, this degradation is evident not only in AUROC and AUPRC but also in calibration behavior and clinical net benefit. In particular, PSM1:1 shows inferior performance across multiple evaluation dimensions, while PSM1:4 exhibits only partial stability. These results indicate that, when predictors themselves encode strong disease-related signals, enforcing covariate balance through matching can attenuate informative heterogeneity and reduce model discriminability.</p>
<p>These findings highlight the importance of conducting preliminary data analysis and strategically selecting preprocessing techniques when developing machine learning models using imbalanced medical datasets collected outside of RCTs. Notably, PSM demonstrates the ability to address both selection bias and class imbalance depending on the distributional characteristics of the dataset. As such, PSM presents a practical and lightweight preprocessing strategy for AI-driven clinical decision-making, offering improvements in data consistency and model performance without requiring major modifications to model architectures.</p>
<p>It has been previously argued that sampling-based imbalance correction techniques may adversely affect probability calibration by altering the underlying data distribution. Such concerns emphasize that improvements in discrimination metrics do not necessarily translate into reliable probabilistic predictions. In this study, we explicitly address this issue by evaluating calibration curves, Brier scores and decision curve analysis across all resampling strategies. Our results demonstrate that the impact of resampling on calibration is highly dataset dependent. While certain oversampling methods exhibit noticeable calibration distortion, particularly in the CVD dataset, PSM-based approaches, especially PSM1:4, maintain relatively stable calibration in datasets with limited or moderate covariate imbalance. These findings suggest that the calibration-related risks associated with sampling techniques are not uniform and should be assessed empirically rather than assumed <italic>a priori</italic>.</p></sec>
<sec>
<label>4.3</label>
<title>Impact of selection bias on classification performance</title>
<p>Our findings indicate that reductions in selection bias do not uniformly translate into improved classification performance. Although bias mitigation is essential for internal validity, aggressive matching may attenuate informative heterogeneity, particularly in datasets with minimal baseline demographic imbalance. This trade-off is evident in the contrasting behavior of PSM1:4 and PSM1:1, with the latter showing greater degradation in discrimination, calibration, and clinical net benefit despite stronger covariate balance.</p>
<p>To quantitatively assess the impact of selection bias on the classification performance of machine learning models, we analyze the correlation between the degree of selection bias after applying each resampling technique and the corresponding classification performance. The results indicate a negative correlation between the mean SMD and classification performance in the RPD and HypoT datasets, whereas a positive correlation is observed in the CVD dataset (<xref ref-type="table" rid="T4">Table 4</xref>). The absolute values of all correlation coefficients are at least 0.650, indicating a strong correlation. In particular, the CVD dataset exhibits very high correlations of 0.902 and 0.897, suggesting a significant relationship between selection bias and classification performance. These findings confirm that selection bias within a dataset influences the learning process of classification models.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Correlation between selection bias and classification performance.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="center"><bold>SMD-AUROC</bold></th>
<th valign="top" align="center"><bold>SMD-AUPRC</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">RPD</td>
<td valign="top" align="center">&#x02013;0.694</td>
<td valign="top" align="center">&#x02013;0.815</td>
</tr>
<tr>
<td valign="top" align="left">HypoT</td>
<td valign="top" align="center">&#x02013;0.846</td>
<td valign="top" align="center">&#x02013;0.650</td>
</tr>
<tr>
<td valign="top" align="left">CVD</td>
<td valign="top" align="center">0.902</td>
<td valign="top" align="center">0.897</td>
</tr></tbody>
</table>
</table-wrap>
<p>This study also reconfirms the issue reported in previous research, where selection bias leads to the underestimation of certain group characteristics (<xref ref-type="bibr" rid="B30">30</xref>). In the HypoT dataset, the only remaining variable after excluding demographic variables is TSH, which is a direct biomarker of hypothyroidism. As a result, the classification performance remains high using this variable alone. The <italic>p</italic>-value of TSH is below 0.001, indicating a statistically significant difference in distribution between classes.</p>
<p>However, in this experiment, AUROC and AUPRC increase across all resampling techniques when demographic variables are removed. According to previous studies, hypothyroidism is also known to be influenced by age and sex. Thus, the observed decline in performance when sex and age are included suggests that the model fails to accurately capture the patterns in which these demographic factors influence class differentiation. This trend is further illustrated in <xref ref-type="fig" rid="F7">Figure 7</xref>, where an increase in the availability of sex and age information results in a decline in classification performance.</p>
<p>Such a phenomenon deteriorates the internal validity of the model and increases the risk of underdiagnosis or overdiagnosis for certain groups. In contrast, PSM demonstrates classification performance comparable to that of resampling techniques where demographic variables are removed, despite incorporating these variables.</p>
<p>These findings indicate that PSM mitigates the impact of dataset bias, enhances classification performance, and strengthens internal validity, thereby improving the reliability of results and facilitating better generalization to external datasets. Although the effect may vary depending on the characteristics of the dataset, PSM offers a data-driven approach to bias adjustment that balances class distributions and demographic variables without introducing significant model complexity. Therefore, it can be a useful option in practical applications where selection bias is a concern, such as when working with real-world data.</p></sec>
<sec>
<label>4.4</label>
<title>Limitations and future directions</title>
<p>This study quantitatively analyzes the relationship between selection bias and model performance. Moreover, unlike prior studies focusing primarily on statistical balance or treatment effect estimation, our work highlights how PSM can influence downstream machine learning classification performance when applied to observational medical datasets. However, it has certain limitations. First, all experiments were conducted on publicly available datasets, which may not fully reflect the complexity of clinical data from diverse institutions. In particular, the datasets used in this study were not considered to be fully representative of the general population, but rather were interpreted as reflecting their respective data-generating processes and clinical care contexts. Second, whereas our study primarily focuses on selection bias, further validation is needed across datasets of varying sizes. Additionally, we compare only 1:1 and 1:4 matching ratios in PSM. Future research should investigate the effect of using dataset-specific optimal matching ratios. Such efforts would lead to more tailored and effective strategies for selection bias adjustment across diverse real-world medical datasets. Lastly, while our study emphasizes statistical bias reduction, additional evaluation of clinical relevance, such as impact on diagnostic decision-making or subgroup fairness, would enhance the practical significance of the findings and help support their generalizability and utility in real-world healthcare settings.</p></sec></sec>
<sec id="s5">
<label>5</label>
<title>Conclusions</title>
<p>This study analyzes the impact of PSM as a method for correcting selection bias and assessing its influence on classification performance in machine learning&#x02013;based models. To achieve this, we compare PSM with various resampling techniques across multiple medical datasets (RPD, HypoT, and CVD) and quantitatively assess the effect of selection bias on data imbalance and model performance.</p>
<p>The results demonstrate that PSM effectively reduces the SMD and maintains stable classification performance when applied to datasets containing demographic variables with low selection bias. Additionally, PSM contributes to enhancing the internal validity of models. However, when applied to datasets with highly biased variables, or when overly restrictive matching is employed, PSM results in a decline in model performance. This finding highlights the necessity of carefully considering dataset characteristics and matching intensity when employing PSM.</p>
<p>These findings emphasize the importance of selecting appropriate resampling techniques when utilizing machine learning in real-world medical research settings. Although this study does not explore the influence of the size of the dataset or the determination of optimal matching ratios, future work may address these limitations by incorporating a broader range of medical datasets and systematically evaluating alternative matching configurations.</p></sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>Restrictions apply to the availability of RPD data. Data were obtained from the Alzheimer&#x00027;s Disease Neuroimaging Initiative (ADNI) and are available at <ext-link ext-link-type="uri" xlink:href="http://adni.loni.usc.edu">adni.loni.usc.edu</ext-link> with the permission of ADNI. The original HypoT data presented in the study are openly available in UCI Machine Learning Repository at <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.24432/C5D010">https://doi.org/10.24432/C5D010</ext-link>. The original CVD data presented in the study are openly available in Kaggle at <ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/datasets/soumyodippal000/heart-disease-health-indicators">https://www.kaggle.com/datasets/soumyodippal000/heart-disease-health-indicators</ext-link>. This work conforms to the TRIPOD-AI guidelines (<xref ref-type="bibr" rid="B31">31</xref>), with the completed checklist included in the <xref ref-type="supplementary-material" rid="SM1">Supplementary material S7</xref>. The code used for this study is publicly available at <ext-link ext-link-type="uri" xlink:href="https://github.com/knu-plml/psm-medical-ml-bias">https://github.com/knu-plml/psm-medical-ml-bias</ext-link>.</p>
</sec>
<sec sec-type="ethics-statement" id="s7">
<title>Ethics statement</title>
<p>The studies involving humans were approved by Institutional Review Board of Kangwon National University (KWNUIRB-2023-10-001), Kangwon National University, Chuncheon, South Korea. The studies were conducted in accordance with the local legislation and institutional requirements. Written informed consent for participation was not required from the participants or the participants&#x00027; legal guardians/next of kin in accordance with the national legislation and institutional requirements because only de-identified, publicly available ADNI data were used, and all participants had previously provided consent as part of the original ADNI study.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>MR: Conceptualization, Visualization, Methodology, Data curation, Formal analysis, Writing &#x02013; review &#x00026; editing, Writing &#x02013; original draft. SY: Conceptualization, Writing &#x02013; review &#x00026; editing, Data curation, Methodology. GJ: Writing &#x02013; review &#x00026; editing, Supervision, Formal analysis. J-WJ: Data curation, Writing &#x02013; review &#x00026; editing. HI: Supervision, Writing &#x02013; review &#x00026; editing, Funding acquisition, Methodology, Project administration.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s10">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was used in the creation of this manuscript. During the preparation of this work the author(s) used ChatGPT in order to enhance readability, clarify expressions, and assist with English translation. After using this tool, the author(s) reviewed and edited the content as needed and take full responsibility for the content of the published article.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec><sec sec-type="supplementary-material" id="s12">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fpubh.2026.1747762/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fpubh.2026.1747762/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.docx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lim</surname> <given-names>YMF</given-names></name> <name><surname>Molnar</surname> <given-names>M</given-names></name> <name><surname>Vaartjes</surname> <given-names>I</given-names></name> <name><surname>Savarese</surname> <given-names>G</given-names></name> <name><surname>Eijkemans</surname> <given-names>MJC</given-names></name> <name><surname>Uijl</surname> <given-names>A</given-names></name> <etal/></person-group>. <article-title>Generalizability of randomized controlled trials in heart failure with reduced ejection fraction</article-title>. <source>Eur Heart J Qual Care Clin Outcomes</source>. (<year>2022</year>) <volume>8</volume>:<fpage>761</fpage>&#x02013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.1093/ehjqcco/qcab070</pub-id><pub-id pub-id-type="pmid">34596659</pub-id></mixed-citation>
</ref>
<ref id="B2">
<label>2.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ong</surname> <given-names>SW</given-names></name> <name><surname>Tong</surname> <given-names>SY</given-names></name> <name><surname>Daneman</surname> <given-names>N</given-names></name></person-group>. <article-title>Are we enrolling the right patients? A scoping review of external validity and generalizability of clinical trials in bloodstream infections</article-title>. <source>Clin Microbiol Infect</source>. (<year>2023</year>) <volume>29</volume>:<fpage>1393</fpage>&#x02013;<lpage>401</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cmi.2023.08.019</pub-id><pub-id pub-id-type="pmid">37633330</pub-id></mixed-citation>
</ref>
<ref id="B3">
<label>3.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hong</surname> <given-names>JC</given-names></name></person-group>. <article-title>Strategies to turn real-world data into real-world knowledge</article-title>. <source>JAMA Netw Open</source>. (<year>2021</year>) <volume>4</volume>:<fpage>e2128045</fpage>. doi: <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2021.28045</pub-id><pub-id pub-id-type="pmid">34618043</pub-id></mixed-citation>
</ref>
<ref id="B4">
<label>4.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Concato</surname> <given-names>J</given-names></name> <name><surname>Corrigan-Curay</surname> <given-names>J</given-names></name></person-group>. <article-title>Real-world evidence&#x02014;where are we now?</article-title> <source>N Engl J Med</source>. (<year>2022</year>) <volume>386</volume>:<fpage>1680</fpage>&#x02013;<lpage>2</lpage>. doi: <pub-id pub-id-type="doi">10.1056/NEJMp2200089</pub-id></mixed-citation>
</ref>
<ref id="B5">
<label>5.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Knevel</surname> <given-names>R</given-names></name> <name><surname>Liao</surname> <given-names>KP</given-names></name></person-group>. <article-title>From real-world electronic health record data to real-world results using artificial intelligence</article-title>. <source>Ann Rheum Dis</source>. (<year>2023</year>) <volume>82</volume>:<fpage>306</fpage>&#x02013;<lpage>11</lpage>. doi: <pub-id pub-id-type="doi">10.1136/ard-2022-222626</pub-id><pub-id pub-id-type="pmid">36150748</pub-id></mixed-citation>
</ref>
<ref id="B6">
<label>6.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>R</given-names></name> <name><surname>Vora</surname> <given-names>B</given-names></name> <name><surname>Menon</surname> <given-names>S</given-names></name> <name><surname>Younis</surname> <given-names>I</given-names></name> <name><surname>Dwivedi</surname> <given-names>G</given-names></name> <name><surname>Meng</surname> <given-names>Z</given-names></name> <etal/></person-group>. <article-title>Clinical pharmacology applications of real-world data and real-world evidence in drug development and approval-an industry perspective</article-title>. <source>Clin Pharmacol Ther</source>. (<year>2023</year>) <volume>114</volume>:<fpage>751</fpage>&#x02013;<lpage>67</lpage>. doi: <pub-id pub-id-type="doi">10.1002/cpt.2988</pub-id><pub-id pub-id-type="pmid">37393555</pub-id></mixed-citation>
</ref>
<ref id="B7">
<label>7.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Penberthy</surname> <given-names>LT</given-names></name> <name><surname>Rivera</surname> <given-names>DR</given-names></name> <name><surname>Lund</surname> <given-names>JL</given-names></name> <name><surname>Bruno</surname> <given-names>MA</given-names></name> <name><surname>Meyer</surname> <given-names>AM</given-names></name></person-group>. <article-title>An overview of real-world data sources for oncology and considerations for research</article-title>. <source>CA Cancer J Clin</source>. (<year>2022</year>) <volume>72</volume>:<fpage>287</fpage>&#x02013;<lpage>300</lpage>. doi: <pub-id pub-id-type="doi">10.3322/caac.21714</pub-id><pub-id pub-id-type="pmid">34964981</pub-id></mixed-citation>
</ref>
<ref id="B8">
<label>8.</label>
<mixed-citation publication-type="web"><person-group person-group-type="author"><name><surname>Hern&#x000E1;n</surname> <given-names>MA</given-names></name> <name><surname>Robins</surname> <given-names>JM</given-names></name></person-group>. <source>Causal Inference: What If</source>. Boca Raton, FL: Chapman &#x00026; Hall/CRC (<year>2024</year>). Available online at: <ext-link ext-link-type="uri" xlink:href="https://content.sph.harvard.edu/wwwhsph/sites/1268/2024/01/hernanrobins_WhatIf_2jan24.pdf">https://content.sph.harvard.edu/wwwhsph/sites/1268/2024/01/hernanrobins_WhatIf_2jan24.pdf</ext-link> (Accessed December 20, 2025).</mixed-citation>
</ref>
<ref id="B9">
<label>9.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chauhan</surname> <given-names>VK</given-names></name> <name><surname>Clifton</surname> <given-names>L</given-names></name> <name><surname>Sala&#x000FC;n</surname> <given-names>A</given-names></name> <name><surname>Lu</surname> <given-names>HY</given-names></name> <name><surname>Branson</surname> <given-names>K</given-names></name> <name><surname>Schwab</surname> <given-names>P</given-names></name> <etal/></person-group>. <article-title>Sample selection bias in machine learning for healthcare</article-title>. <source>ACM Trans Comput Healthc</source>. (<year>2025</year>) <volume>6</volume>:<fpage>1</fpage>&#x02013;<lpage>24</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3761822</pub-id></mixed-citation>
</ref>
<ref id="B10">
<label>10.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>JL</given-names></name> <name><surname>Tang</surname> <given-names>XW</given-names></name> <name><surname>Qiu</surname> <given-names>JN</given-names></name></person-group>. <article-title>Analysis of the influences of sampling bias and class imbalance on performances of probabilistic liquefaction models</article-title>. <source>Int J Geomech</source>. (<year>2017</year>) <volume>17</volume>:<fpage>04016134</fpage>. doi: <pub-id pub-id-type="doi">10.1061/(ASCE)GM.1943-5622.0000808</pub-id></mixed-citation>
</ref>
<ref id="B11">
<label>11.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Vuttipittayamongkol</surname> <given-names>P</given-names></name> <name><surname>Elyan</surname> <given-names>E</given-names></name></person-group>. <article-title>Neighbourhood-based undersampling approach for handling imbalanced and overlapped data</article-title>. <source>Inf Sci</source>. (<year>2020</year>) <volume>509</volume>:<fpage>47</fpage>&#x02013;<lpage>70</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ins.2019.08.062</pub-id></mixed-citation>
</ref>
<ref id="B12">
<label>12.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tasci</surname> <given-names>E</given-names></name> <name><surname>Zhuge</surname> <given-names>Y</given-names></name> <name><surname>Camphausen</surname> <given-names>K</given-names></name> <name><surname>Krauze</surname> <given-names>AV</given-names></name></person-group>. <article-title>Bias and class imbalance in oncologic data&#x02014;towards inclusive and transferrable AI in large scale oncology data sets</article-title>. <source>Cancers</source>. (<year>2022</year>) <volume>14</volume>:<fpage>2897</fpage>. doi: <pub-id pub-id-type="doi">10.3390/cancers14122897</pub-id><pub-id pub-id-type="pmid">35740563</pub-id></mixed-citation>
</ref>
<ref id="B13">
<label>13.</label>
<mixed-citation publication-type="web"><person-group person-group-type="author"><name><surname>Nguyen</surname> <given-names>GH</given-names></name> <name><surname>Bouzerdoum</surname> <given-names>A</given-names></name> <name><surname>Phung</surname> <given-names>SL</given-names></name></person-group>. <article-title>Learning pattern classification tasks with imbalanced data sets</article-title>. In: <person-group person-group-type="editor"><name><surname>Yin</surname> <given-names>P</given-names></name></person-group>, editor. <source>Pattern Recognition</source>. <publisher-loc>Vukovar</publisher-loc>: <publisher-name>In-Teh</publisher-name> (<year>2009</year>). p. <fpage>193</fpage>&#x02013;<lpage>208</lpage>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://ro.uow.edu.au/infopapers/792">https://ro.uow.edu.au/infopapers/792</ext-link> (Accessed November 10, 2025).</mixed-citation>
</ref>
<ref id="B14">
<label>14.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Khushi</surname> <given-names>M</given-names></name> <name><surname>Shaukat</surname> <given-names>K</given-names></name> <name><surname>Alam</surname> <given-names>TM</given-names></name> <name><surname>Hameed</surname> <given-names>IA</given-names></name> <name><surname>Uddin</surname> <given-names>S</given-names></name> <name><surname>Luo</surname> <given-names>S</given-names></name> <etal/></person-group>. <article-title>A comparative performance analysis of data resampling methods on imbalance medical data</article-title>. <source>IEEE Access</source>. (<year>2021</year>) <volume>9</volume>:<fpage>109960</fpage>&#x02013;<lpage>75</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2021.3102399</pub-id></mixed-citation>
</ref>
<ref id="B15">
<label>15.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rosenbaum</surname> <given-names>PR</given-names></name> <name><surname>Rubin</surname> <given-names>DB</given-names></name></person-group>. <article-title>The central role of the propensity score in observational studies for causal effects</article-title>. <source>Biometrika</source>. (<year>1983</year>) <volume>70</volume>:<fpage>41</fpage>&#x02013;<lpage>55</lpage>. doi: <pub-id pub-id-type="doi">10.1093/biomet/70.1.41</pub-id></mixed-citation>
</ref>
<ref id="B16">
<label>16.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Benedetto</surname> <given-names>U</given-names></name> <name><surname>Head</surname> <given-names>SJ</given-names></name> <name><surname>Angelini</surname> <given-names>GD</given-names></name> <name><surname>Blackstone</surname> <given-names>EH</given-names></name></person-group>. <article-title>Statistical primer: propensity score matching and its alternatives</article-title>. <source>Eur J Cardiothorac Surg</source>. (<year>2018</year>) <volume>53</volume>:<fpage>1112</fpage>&#x02013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.1093/ejcts/ezy167</pub-id><pub-id pub-id-type="pmid">29684154</pub-id></mixed-citation>
</ref>
<ref id="B17">
<label>17.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Roh</surname> <given-names>M</given-names></name> <name><surname>Joo</surname> <given-names>G</given-names></name> <name><surname>Im</surname> <given-names>H</given-names></name> <name><surname>Jang</surname> <given-names>J</given-names></name></person-group>. <article-title>Using propensity score matching to resolve the class imbalance problem in medical data</article-title>. In: <source>Proceedings of the Korean Institute of Information Scientists and Engineers (KIISE)</source>. <publisher-loc>Seoul</publisher-loc>: <publisher-name>Korean Neurological Association</publisher-name> (<year>2023</year>). p. <fpage>131</fpage>&#x02013;<lpage>3</lpage>.</mixed-citation>
</ref>
<ref id="B18">
<label>18.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Park</surname> <given-names>C</given-names></name> <name><surname>Joo</surname> <given-names>G</given-names></name> <name><surname>Roh</surname> <given-names>M</given-names></name> <name><surname>Shin</surname> <given-names>S</given-names></name> <name><surname>Yum</surname> <given-names>S</given-names></name> <name><surname>Yeo</surname> <given-names>NY</given-names></name> <etal/></person-group>. <article-title>Predicting the progression of mild cognitive impairment to Alzheimer&#x00027;s dementia using recurrent neural networks with a series of neuropsychological tests</article-title>. <source>J Clin Neurol</source>. (<year>2024</year>) <volume>20</volume>:<fpage>478</fpage>&#x02013;<lpage>86</lpage>. doi: <pub-id pub-id-type="doi">10.3988/jcn.2023.0289</pub-id><pub-id pub-id-type="pmid">39227330</pub-id></mixed-citation>
</ref>
<ref id="B19">
<label>19.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ye</surname> <given-names>J</given-names></name> <name><surname>Zeng</surname> <given-names>A</given-names></name> <name><surname>Pan</surname> <given-names>D</given-names></name> <name><surname>Zhang</surname> <given-names>Y</given-names></name> <name><surname>Zhao</surname> <given-names>J</given-names></name> <name><surname>Chen</surname> <given-names>Q</given-names></name> <etal/></person-group>. <article-title>MAD-former: a traceable interpretability model for Alzheimer&#x00027;s disease recognition based on multi-patch attention</article-title>. <source>IEEE J Biomed Health Inform</source>. (<year>2024</year>) <volume>28</volume>:<fpage>3637</fpage>&#x02013;<lpage>48</lpage>. doi: <pub-id pub-id-type="doi">10.1109/JBHI.2024.3368500</pub-id><pub-id pub-id-type="pmid">38442047</pub-id></mixed-citation>
</ref>
<ref id="B20">
<label>20.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Atitallah</surname> <given-names>SB</given-names></name> <name><surname>Driss</surname> <given-names>M</given-names></name> <name><surname>Boulila</surname> <given-names>W</given-names></name> <name><surname>Koubaa</surname> <given-names>A</given-names></name></person-group>. <article-title>Enhancing early Alzheimer&#x00027;s disease detection through big data and ensemble few-shot learning</article-title>. <source>IEEE J Biomed Health Inform</source>. (<year>2024</year>) <volume>29</volume>:<fpage>6451</fpage>&#x02013;<lpage>62</lpage>. doi: <pub-id pub-id-type="doi">10.1109/JBHI.2024.3473541</pub-id><pub-id pub-id-type="pmid">39356607</pub-id></mixed-citation>
</ref>
<ref id="B21">
<label>21.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hermann</surname> <given-names>P</given-names></name> <name><surname>Zerr</surname> <given-names>I</given-names></name></person-group>. <article-title>Rapidly progressive dementias&#x02014;aetiologies, diagnosis and management</article-title>. <source>Nat Rev Neurol</source>. (<year>2022</year>) <volume>18</volume>:<fpage>363</fpage>&#x02013;<lpage>76</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41582-022-00659-0</pub-id></mixed-citation>
</ref>
<ref id="B22">
<label>22.</label>
<mixed-citation publication-type="web"><person-group person-group-type="author"><name><surname>Quinlan</surname> <given-names>R</given-names></name></person-group>. <article-title>Data from: Thyroid disease</article-title>. <source>UCI Machine Learning Repository</source>. (<year>1986</year>). Available online at: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.24432/C5D010">https://doi.org/10.24432/C5D010</ext-link></mixed-citation>
</ref>
<ref id="B23">
<label>23.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wilson</surname> <given-names>SA</given-names></name> <name><surname>Stem</surname> <given-names>LA</given-names></name> <name><surname>Bruehlman</surname> <given-names>RD</given-names></name></person-group>. <article-title>Hypothyroidism: diagnosis and treatment</article-title>. <source>Am Fam Physician</source>. (<year>2021</year>) <volume>103</volume>:<fpage>605</fpage>&#x02013;<lpage>13</lpage>.</mixed-citation>
</ref>
<ref id="B24">
<label>24.</label>
<mixed-citation publication-type="web"><person-group person-group-type="author"><name><surname>Teboul</surname> <given-names>A</given-names></name></person-group>. <article-title>Data from: Heart disease health indicators</article-title>. <source>Kaggle</source>. (<year>2022</year>). Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/datasets/soumyodippal000/heart-disease-health-indicators">https://www.kaggle.com/datasets/soumyodippal000/heart-disease-health-indicators</ext-link> (Accessed November 10, 2025).</mixed-citation>
</ref>
<ref id="B25">
<label>25.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Rivera</surname> <given-names>WA</given-names></name> <name><surname>Goel</surname> <given-names>A</given-names></name> <name><surname>Kincaid</surname> <given-names>JP</given-names></name></person-group>. <article-title>OUPS: a combined approach using SMOTE and propensity score matching</article-title>. In: <source>2014 13th IEEE International Conference on Machine Learning and Applications</source>. <publisher-loc>Detroit, MI</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2014</year>). p. <fpage>424</fpage>&#x02013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ICMLA.2014.106</pub-id></mixed-citation>
</ref>
<ref id="B26">
<label>26.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Seiffert</surname> <given-names>C</given-names></name> <name><surname>Khoshgoftaar</surname> <given-names>TM</given-names></name> <name><surname>Van Hulse</surname> <given-names>J</given-names></name></person-group>. <article-title>Hybrid sampling for imbalanced data</article-title>. <source>Integr Comput Aided Eng</source>. (<year>2009</year>) <volume>16</volume>:<fpage>193</fpage>&#x02013;<lpage>210</lpage>. doi: <pub-id pub-id-type="doi">10.3233/ICA-2009-0314</pub-id></mixed-citation>
</ref>
<ref id="B27">
<label>27.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Andrade</surname> <given-names>C</given-names></name></person-group>. <article-title>Mean difference, standardized mean difference (SMD), and their use in meta-analysis: as simple as it gets</article-title>. <source>J Clin Psychiatry</source>. (<year>2020</year>) <volume>81</volume>:<fpage>11349</fpage>. doi: <pub-id pub-id-type="doi">10.4088/JCP.20f13681</pub-id><pub-id pub-id-type="pmid">32965803</pub-id></mixed-citation>
</ref>
<ref id="B28">
<label>28.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Austin</surname> <given-names>PC</given-names></name></person-group>. <article-title>An introduction to propensity score methods for reducing the effects of confounding in observational studies</article-title>. <source>Multivar Behav Res</source>. (<year>2011</year>) <volume>46</volume>:<fpage>399</fpage>&#x02013;<lpage>424</lpage>. doi: <pub-id pub-id-type="doi">10.1080/00273171.2011.568786</pub-id><pub-id pub-id-type="pmid">21818162</pub-id></mixed-citation>
</ref>
<ref id="B29">
<label>29.</label>
<mixed-citation publication-type="web"><person-group person-group-type="author"><name><surname>Cohen</surname> <given-names>J</given-names></name></person-group>. <article-title>Chapter 10</article-title>. In: <source>Some Issues in Power Analysis</source>. 2nd ed. New York, NY: Routledge (<year>1988</year>). p. <fpage>531</fpage>&#x02013;<lpage>42</lpage>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.routledge.com/9780805802832">https://www.routledge.com/9780805802832</ext-link> (Accessed December 20, 2025).</mixed-citation>
</ref>
<ref id="B30">
<label>30.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Holmberg</surname> <given-names>MJ</given-names></name> <name><surname>Andersen</surname> <given-names>LW</given-names></name></person-group>. <article-title>Collider bias</article-title>. <source>JAMA</source>. (<year>2022</year>) <volume>327</volume>:<fpage>1282</fpage>&#x02013;<lpage>3</lpage>. doi: <pub-id pub-id-type="doi">10.1001/jama.2022.1820</pub-id></mixed-citation>
</ref>
<ref id="B31">
<label>31.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Collins</surname> <given-names>GS</given-names></name> <name><surname>Moons</surname> <given-names>KG</given-names></name> <name><surname>Dhiman</surname> <given-names>P</given-names></name> <name><surname>Riley</surname> <given-names>RD</given-names></name> <name><surname>Beam</surname> <given-names>AL</given-names></name> <name><surname>Van Calster</surname> <given-names>B</given-names></name> <etal/></person-group>. <article-title>TRIPOD&#x0002B; AI statement: updated guidance for reporting clinical prediction models that use regression or machine learning methods</article-title>. <source>BMJ</source>. (<year>2024</year>) <volume>385</volume>:<fpage>q902</fpage>. doi: <pub-id pub-id-type="doi">10.1136/bmj.q902</pub-id><pub-id pub-id-type="pmid">38636956</pub-id></mixed-citation>
</ref>
<ref id="B32">
<label>32.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Schmidt</surname> <given-names>C</given-names></name> <name><surname>Wolff</surname> <given-names>M</given-names></name> <name><surname>Weitz</surname> <given-names>M</given-names></name> <name><surname>Bartlau</surname> <given-names>T</given-names></name> <name><surname>Korth</surname> <given-names>C</given-names></name> <name><surname>Zerr</surname> <given-names>I</given-names></name></person-group>. <article-title>Rapidly progressive Alzheimer disease</article-title>. <source>Arch Neurol</source>. (<year>2011</year>) <volume>68</volume>:<fpage>1124</fpage>&#x02013;<lpage>30</lpage>. doi: <pub-id pub-id-type="doi">10.1001/archneurol.2011.189</pub-id></mixed-citation>
</ref>
<ref id="B33">
<label>33.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Han</surname> <given-names>S</given-names></name> <name><surname>Lee</surname> <given-names>JY</given-names></name> <name><surname>Cho</surname> <given-names>Si</given-names></name> <name><surname>Oh</surname> <given-names>DJ</given-names></name> <name><surname>Yoon</surname> <given-names>DH</given-names></name></person-group>. <article-title>Risk factors for various cognitive function decline trajectories in adults over 40 years of age: a retrospective cohort study</article-title>. <source>Psychiatry Investig</source>. (<year>2023</year>) <volume>20</volume>:<fpage>293</fpage>. doi: <pub-id pub-id-type="doi">10.30773/pi.2022.0188</pub-id><pub-id pub-id-type="pmid">37005386</pub-id></mixed-citation>
</ref>
<ref id="B34">
<label>34.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wyne</surname> <given-names>KL</given-names></name> <name><surname>Nair</surname> <given-names>L</given-names></name> <name><surname>Schneiderman</surname> <given-names>CP</given-names></name> <name><surname>Pinsky</surname> <given-names>B</given-names></name> <name><surname>Antunez Flores</surname> <given-names>O</given-names></name> <name><surname>Guo</surname> <given-names>D</given-names></name> <etal/></person-group>. <article-title>Hypothyroidism prevalence in the United States: a retrospective study combining national health and nutrition examination survey and claims data, 2009&#x02013;2019</article-title>. <source>J Endocr Soc</source>. (<year>2023</year>) <volume>7</volume>:<fpage>bvac172</fpage>. doi: <pub-id pub-id-type="doi">10.1210/jendso/bvac172</pub-id><pub-id pub-id-type="pmid">36466005</pub-id></mixed-citation>
</ref>
<ref id="B35">
<label>35.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Townsend</surname> <given-names>N</given-names></name> <name><surname>Kazakiewicz</surname> <given-names>D</given-names></name> <name><surname>Lucy Wright</surname> <given-names>F</given-names></name> <name><surname>Timmis</surname> <given-names>A</given-names></name> <name><surname>Huculeci</surname> <given-names>R</given-names></name> <name><surname>Torbica</surname> <given-names>A</given-names></name> <etal/></person-group>. <article-title>Epidemiology of cardiovascular disease in Europe</article-title>. <source>Nat Rev Cardiol</source>. (<year>2022</year>) <volume>19</volume>:<fpage>133</fpage>&#x02013;<lpage>43</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41569-021-00607-3</pub-id><pub-id pub-id-type="pmid">34497402</pub-id></mixed-citation>
</ref>
<ref id="B36">
<label>36.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lee</surname> <given-names>YTH</given-names></name> <name><surname>Fang</surname> <given-names>J</given-names></name> <name><surname>Schieb</surname> <given-names>L</given-names></name> <name><surname>Park</surname> <given-names>S</given-names></name> <name><surname>Casper</surname> <given-names>M</given-names></name> <name><surname>Gillespie</surname> <given-names>C</given-names></name></person-group>. <article-title>Prevalence and trends of coronary heart disease in the United States, 2011 to 2018</article-title>. <source>JAMA Cardiol</source>. (<year>2022</year>) <volume>7</volume>:<fpage>459</fpage>&#x02013;<lpage>62</lpage>. doi: <pub-id pub-id-type="doi">10.1001/jamacardio.2021.5613</pub-id><pub-id pub-id-type="pmid">35044425</pub-id></mixed-citation>
</ref>
<ref id="B37">
<label>37.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>D</given-names></name></person-group>. <article-title>Epidemiological features of cardiovascular disease in Asia</article-title>. <source>JACC Asia</source>. (<year>2021</year>) <volume>1</volume>:<fpage>1</fpage>&#x02013;<lpage>13</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jacasi.2021.04.007</pub-id><pub-id pub-id-type="pmid">36338365</pub-id></mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/823248/overview">Toshiyo Tamura</ext-link>, Waseda University, Japan</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2106413/overview">Vinod Kumar Chauhan</ext-link>, University of Strathclyde, United Kingdom</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2720328/overview">Giovanna Nicora</ext-link>, University of Pavia, Italy</p>
</fn>
</fn-group>
</back>
</article>