<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="en" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Aging Neurosci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Aging Neuroscience</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Aging Neurosci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1663-4365</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnagi.2026.1733075</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>MultimodalCNN-PD: a Parkinson&#x2019;s disease diagnostics framework using multimodal convolutional neural network</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Zhi</surname> <given-names>Tongle</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x2020;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Liu</surname> <given-names>Haonan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x2020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/3386799/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname> <given-names>Xuan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/3386802/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Ibrahim</surname> <given-names>Umar Muhammad</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/3254155/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Meng</surname> <given-names>Chengjie</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/3271849/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Neurosurgery, Yancheng First Hospital Affiliated to Medical School of Nanjing University</institution>, <city>Yancheng</city>, <country country="CN">China</country></aff>
<aff id="aff2"><label>2</label><institution>School of Artificial Intelligence and Automation, Huazhong University of Science and Technology</institution>, <city>Wuhan</city>, <country country="CN">China</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Chengjie Meng, <email xlink:href="mailto:mengchjie@ocibe.com">mengchjie@ocibe.com</email></corresp>
<fn fn-type="equal" id="fn002"><label>&#x2020;</label><p>These authors have contributed equally to this work</p></fn>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-25">
<day>25</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>18</volume>
<elocation-id>1733075</elocation-id>
<history>
<date date-type="received">
<day>27</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>30</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>31</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Zhi, Liu, Wang, Ibrahim and Meng.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Zhi, Liu, Wang, Ibrahim and Meng</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-25">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Background</title>
<p>Parkinson&#x2019;s disease (PD) is a prevalent neurodegenerative disorder that severely affects motor and cognitive functions. Early diagnosis, particularly during the prodromal phase, is critical for effective intervention.</p>
</sec>
<sec>
<title>Methods</title>
<p>This study presents MultimodalCNN-PD++, a deep learning model that integrates Magnetic Resonance Imaging (MRI) with clinical metadata (including motor/cognitive assessments, demographic data, and genetic biomarkers) to enhance PD classification. The model employs a lightweight EfficientNetB0 backbone, Mobile Convolutional Block Attention Modules (Mobile CBAM), and an enhanced Meta-Guided Cross-Attention (MGCA++) mechanism. A three-stage hierarchical feature selection method identifies the most discriminative clinical features, while metadata is processed with BioClinicalBERT using Low-Rank Adaptation (LoRA).</p>
</sec>
<sec>
<title>Results</title>
<p>Validated on the Parkinson&#x2019;s Progression Markers Initiative (PPMI) dataset, the model achieved 97.5% accuracy in distinguishing Normal Control, prodromal PD, and diagnosed PD cases, with reduced parameters and computational costs. External validation on the OASIS-3 dataset confirmed robust generalizability (96.2% accuracy) despite demographic and acquisition protocol variations. Ablation studies highlighted the contributions of Mobile CBAM, MGCA++, hierarchical feature selection, and BioClinicalBERT-LoRA.</p>
</sec>
<sec>
<title>Discussion</title>
<p>This framework sets a new benchmark for multiclass PD diagnosis, demonstrating strong potential as a clinically deployable AI tool for early detection and personalized management of neurodegenerative diseases.</p>
</sec>
</abstract>
<kwd-group>
<kwd>Parkinson&#x2019;s disease</kwd>
<kwd>early diagnosis</kwd>
<kwd>MRI</kwd>
<kwd>clinical metadata</kwd>
<kwd>multimodal CNN</kwd>
<kwd>deep learning</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was supported by the Jiangsu Traditional Chinese Medicine Science and Technology Development Plan Project (Grant No. MS2024102) and the Yancheng Health Commission Scientific Research Project (Grant No. YK2023046).</funding-statement>
</funding-group>
<counts>
<fig-count count="6"/>
<table-count count="7"/>
<equation-count count="13"/>
<ref-count count="34"/>
<page-count count="18"/>
<word-count count="11560"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Parkinson&#x2019;s Disease and Aging-related Movement Disorders</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="S1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Medical progress has substantially extended global longevity, resulting in an accelerating demographic shift toward aging populations (<xref ref-type="bibr" rid="B17">Pereira et al., 2016</xref>). Demographic projections indicate that by 2100, approximately 90% of nations will transition into aged societies, with more than half achieving super-aged classification (<xref ref-type="bibr" rid="B30">Yang et al., 2025</xref>). This profound population transformation introduces significant burdens for healthcare infrastructure, particularly regarding the management of age-related neurodegenerative pathologies (<xref ref-type="bibr" rid="B12">Heuveline, 2022</xref>). Within this spectrum, Parkinson&#x2019;s disease (PD) emerges as the second most prevalent neurodegenerative disorder, impacting over 10 million individuals globally (<xref ref-type="bibr" rid="B16">Marek et al., 2018</xref>). The fundamental pathophysiology of PD encompasses progressive deterioration of dopaminergic neurons within the substantia nigra pars compacta, manifesting in its hallmark motor symptoms including tremor, muscular rigidity, bradykinesia, and postural instability, alongside non-motor features such as cognitive decline and mood alterations (<xref ref-type="bibr" rid="B6">Bhagwat et al., 2018</xref>; <xref ref-type="bibr" rid="B29">Xue et al., 2018</xref>).</p>
<p>Disease progression traverses prodromal, early, and advanced stages, characterized by symptom intensification. Contemporary diagnostic approaches depend on neurological examinations, clinical interviews, and standardized motor assessment scales (including UPDRS, Hoehn and Yahr staging), supplemented by neuroimaging modalities such as MRI and DaT-SPECT for differential diagnosis. However, these traditional methods prove inadequate, demonstrated by early diagnostic error rates reaching 25%, underscoring an urgent requirement for advanced computational methodologies to improve diagnostic accuracy and support clinical decision-making (<xref ref-type="bibr" rid="B18">Qin et al., 2021</xref>). Traditional machine learning (ML) approaches, including support vector machines (SVM), random forest (RF) algorithms, and logistic regression (LR) models, have been extensively investigated for Parkinson&#x2019;s disease (PD) diagnosis and progression forecasting (<xref ref-type="bibr" rid="B14">Li et al., 2024</xref>).</p>
<p>A fundamental constraint of these methodologies lies in their dependence on manually engineered features, representing a labor-intensive process demanding considerable domain specialization (<xref ref-type="bibr" rid="B32">Zhang et al., 2022</xref>). The emergence of deep learning (DL), particularly convolutional neural networks (CNNs), has resolved this limitation by facilitating automated, hierarchical feature extraction directly from medical imaging modalities, including MRI and DaT-SPECT acquisitions, thereby yielding enhanced diagnostic outcomes (<xref ref-type="bibr" rid="B9">Dahbour et al., 2021</xref>; <xref ref-type="bibr" rid="B24">Wang et al., 2023</xref>). However, conventional CNN architectures such as ResNet and VGG networks, while demonstrating excellence in image interpretation, suffer from excessive computational demands and parameter redundancy, limiting their deployment in resource-constrained clinical environments (<xref ref-type="bibr" rid="B13">Hwang and Kang, 2023</xref>). Recent advances in efficient neural network design, exemplified by EfficientNet&#x2019;s compound scaling methodology, have demonstrated that careful balancing of network depth, width, and resolution can achieve superior performance with significantly fewer parameters (<xref ref-type="bibr" rid="B28">Xin and Li, 2023</xref>). Despite these architectural innovations, CNNs exhibit inherent limitations in modeling the temporal progression patterns and multimodal relationships critical for comprehensive PD assessment (<xref ref-type="bibr" rid="B8">Chen et al., 2020</xref>).</p>
<p>Within clinical practice, PD diagnosis extends beyond imaging-based assessment alone. Clinicians additionally evaluate patient medical histories, neuropsychological evaluation results, motor function assessments, and genetic information, generating heterogeneous and multimodal data collections (<xref ref-type="bibr" rid="B22">Tang et al., 2024</xref>). While imaging modalities including MRI and DaT-SPECT deliver essential diagnostic insights, supplementary metadata encompassing UPDRS scores, cognitive assessments (MoCA, MMSE), gait parameters, and genetic markers prove equally valuable for accurate diagnosis (<xref ref-type="bibr" rid="B2">Al-Azzawi and Al-Ani, 2024</xref>; <xref ref-type="bibr" rid="B7">Cao et al., 2023</xref>). Given the challenges in obtaining large-scale imaging datasets, effective utilization of multimodal information becomes critical for enhancing diagnostic precision. Although CNN-based architecture demonstrates superior performance in imaging data interpretation, most current multimodal frameworks encounter difficulties in effectively integrating heterogeneous data sources. Traditional attention mechanisms process modalities independently rather than leveraging dynamic, adaptive cross-modal interactions (<xref ref-type="bibr" rid="B4">Ben&#x00ED;tez-Andrades et al., 2022</xref>). Furthermore, the selection of clinically relevant features from high-dimensional metadata remains challenging, often relying on univariate statistical methods that ignore feature redundancy and complex nonlinear relationships (<xref ref-type="bibr" rid="B1">Acharya, 2024</xref>). These limitations constrain the clinical utility and cross-population generalizability of existing multimodal models for PD diagnosis.</p>
<p>To address the previously described challenges, this research introduces MultimodalCNN-PD++, an enhanced deep learning architecture that accomplishes precise three-class classification (Normal Control, Prodromal PD, Diagnosed PD) through integrated analysis of structural MRI and heterogeneous clinical data, incorporating textual reports, demographic information, genetic markers, and assessment scores with significantly improved computational efficiency (<xref ref-type="bibr" rid="B23">Teng et al., 2025</xref>; <xref ref-type="bibr" rid="B31">Zeng et al., 2024</xref>). The framework&#x2019;s efficacy derives from an advanced feature aggregation mechanism engineered to capture sophisticated cross-modal relationships while maintaining interpretability through enhanced visualization techniques.</p>
</sec>
<sec id="S2">
<label>2</label>
<title>Related works</title>
<p>The synthesis of neuroimaging data with clinical textual information offers a comprehensive perspective on Parkinson&#x2019;s disease (PD) heterogeneity, strengthening both diagnostic precision and prognostic modeling capabilities. Contemporary investigations have demonstrated that multimodal integration enhances performance beyond single-modality approaches (<xref ref-type="bibr" rid="B5">Benredjem et al., 2025</xref>). Numerous studies have explored deep learning methodologies for PD detection utilizing diverse data sources, including imaging, clinical assessments, and genetic information. However, the computational efficiency and parameter optimization of these models remain critical challenges for clinical deployment.</p>
<p>Earlier research predominantly concentrated on unimodal approaches, employing either neuroimaging or clinical features independently. Conventional machine learning algorithms, including SVM and random forests, were extensively applied to clinical datasets for PD classification. However, these methods encountered limitations in feature extraction and generalization performance. The introduction of deep learning architectures, particularly CNNs, revolutionized medical image analysis through automated feature learning capabilities. Multiple investigations have implemented CNN architectures, including ResNet, VGG, and DenseNet for PD diagnosis using MRI and DaT-SPECT imaging modalities, demonstrating enhanced classification performance compared to traditional approaches. Nevertheless, these architectures typically require millions of parameters and substantial computational resources, limiting their deployment in resource-constrained clinical settings. The development of efficient network architectures, particularly EfficientNet&#x2019;s compound scaling approach, has demonstrated that careful optimization of network dimensions can achieve superior accuracy-efficiency trade-offs (<xref ref-type="bibr" rid="B16">Marek et al., 2018</xref>; <xref ref-type="bibr" rid="B29">Xue et al., 2018</xref>).</p>
<p>Recent developments have emphasized multimodal integration strategies to capitalize on complementary information from heterogeneous data sources. Attention mechanisms have emerged as fundamental components for effective multimodal fusion, enabling models to concentrate on relevant features across different modalities. The Convolutional Block Attention Module (CBAM) has been successfully implemented across various medical imaging applications, enhancing feature representation through spatial and channel-wise attention refinement. However, standard CBAM implementations introduce significant computational overhead through dense convolution operations. Mobile CBAM addresses this limitation by replacing standard convolutions with depth-wise separable convolutions, achieving 76% parameter reduction and 62% FLOPs reduction while maintaining attention effectiveness (<xref ref-type="bibr" rid="B13">Hwang and Kang, 2023</xref>; <xref ref-type="bibr" rid="B28">Xin and Li, 2023</xref>). Cross-attention mechanisms have proven effective for aligning information between distinct modalities, facilitating improved feature integration in multimodal learning frameworks. Traditional cross-attention implementations use fixed numbers of attention heads and simple concatenation-based fusion, lacking the adaptive capacity to balance modality contributions based on input characteristics.</p>
<p>Feature selection constitutes another critical component in multimodal PD diagnosis, particularly when integrating high-dimensional clinical metadata. Ensemble-based feature selection methods, combining multiple machine learning algorithms, have demonstrated robustness in identifying relevant clinical biomarkers. However, these approaches often overlook feature redundancy and inter-feature correlations, leading to suboptimal feature subsets. Recent work has emphasized the importance of redundancy-aware feature selection using mutual information theory and SHAP values for clinical interpretability. These approaches have been successfully applied in various medical diagnosis tasks, including neurodegenerative disease detection (<xref ref-type="bibr" rid="B6">Bhagwat et al., 2018</xref>; <xref ref-type="bibr" rid="B18">Qin et al., 2021</xref>).</p>
<p>For clinical text encoding, transformer-based language models have demonstrated remarkable success in capturing semantic relationships within medical documentation. Bidirectional Encoder Representations from Transformers (BERT) and its domain-specific variants, particularly BioClinicalBERT pre-trained on clinical notes, have shown superior performance in medical natural language processing tasks. However, fine-tuning these large models (110M parameters) for specific medical tasks requires substantial computational resources and annotated data. Low-Rank Adaptation (LoRA) has emerged as an efficient fine-tuning strategy, introducing trainable low-rank matrices while keeping pre-trained weights frozen, achieving 96% reduction in trainable parameters with minimal performance degradation (<xref ref-type="bibr" rid="B7">Cao et al., 2023</xref>; <xref ref-type="bibr" rid="B22">Tang et al., 2024</xref>).</p>
</sec>
<sec id="S3" sec-type="materials|methods">
<label>3</label>
<title>Materials and methods</title>
<sec id="S3.SS1">
<label>3.1</label>
<title>Multimodal learning framework for PD diagnosis</title>
<p>Let <inline-formula><mml:math id="INEQ1"><mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mpadded width="+3.3pt"><mml:mi>i</mml:mi></mml:mpadded><mml:mo rspace="5.8pt">=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:msubsup></mml:math></inline-formula> denote a set of training samples drawn from an underlying distribution <italic>D</italic>, where <italic>X</italic> represents the image domain, and <italic>Y</italic> corresponds to the associated ground-truth class labels. The goal of supervised deep learning is to learn a parametric function <inline-formula><mml:math id="INEQ3"><mml:mrow><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mi mathvariant="normal">&#x03B8;</mml:mi></mml:msub><mml:mo>&#x2062;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>&#x2192;</mml:mo><mml:mover accent="true"><mml:mi>Y</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow></mml:math></inline-formula> that produces predictions <inline-formula><mml:math id="INEQ4"><mml:mover accent="true"><mml:mi>Y</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:math></inline-formula> closely matching the true labels <italic>Y</italic>. Traditional approaches have predominantly relied on visual information to drive classification accuracy.
However, auxiliary clinical variables, including demographic attributes such as age and weight, as well as cognitive and behavioral assessment scores from the Mini-Mental State Examination (MMSE), Montreal Cognitive Assessment (MoCA), Functional Activities Questionnaire (FAQ), and Neuropsychiatric Inventory Questionnaire (NPIQ) provide valuable diagnostic and prognostic insights (<xref ref-type="bibr" rid="B21">Shen et al., 2026</xref>; <xref ref-type="bibr" rid="B23">Teng et al., 2025</xref>). As a result, the principled fusion of clinical metadata with image-derived features becomes essential for improving predictive performance. To this end, the learning objective is reformulated as a multimodal mapping <inline-formula><mml:math id="INEQ5"><mml:mrow><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mi mathvariant="normal">&#x03B8;</mml:mi></mml:msub><mml:mo>&#x2062;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>&#x2192;</mml:mo><mml:mover accent="true"><mml:mi>Y</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow></mml:math></inline-formula>, where <italic>X</italic> denotes imaging inputs and <italic>X<sub>t</sub></italic> represents clinical metadata, aiming to effectively align and exploit complementary information from both modalities for more accurate and robust classification outcomes while optimizing computational efficiency (<xref ref-type="bibr" rid="B20">Sar et al., 2025</xref>).</p>
</sec>
<sec id="S3.SS2">
<label>3.2</label>
<title>Enhanced method overview</title>
<p>MultimodalCNN-PD++ is an enhanced multimodal deep learning architecture designed for accurate and computationally efficient Parkinson&#x2019;s disease (PD) stage classification. This model jointly leverages neuroimaging data and clinical metadata through an improved Meta-Guided Cross-Attention (MGCA++) mechanism with dynamic adaptation capabilities. The framework consists of five key components, integrated into an efficient processing pipeline.</p>
<p>An EfficientNet-B0 backbone (5.3M parameters, 54.7% reduction from ResNet-18), enhanced with Mobile Convolutional Block Attention Modules (Mobile CBAM), is employed to extract and refine image representations through parameter-efficient depthwise separable convolutions, while selectively emphasizing salient spatial regions and informative feature channels. This backbone was selected due to its superior balance between expressive capability, computational efficiency, and robustness to overfitting, which is particularly critical when training data are limited (<xref ref-type="bibr" rid="B19">Safai et al., 2022</xref>). The compound scaling methodology of EfficientNet uniformly scales network depth, width, and resolution using a principled coefficient, achieving optimal accuracy-efficiency trade-offs (<xref ref-type="bibr" rid="B5">Benredjem et al., 2025</xref>).</p>
<p>A domain-adapted BioClinicalBERT model with Low-Rank Adaptation (LoRA) serves as the text encoder, efficiently converting clinical metadata including motor and cognitive assessment scores (UPDRS, MoCA), demographic information, Hoehn and Yahr staging, and genetic indicators (SNCA, LRRK2) into dense latent embeddings. The LoRA adaptation introduces trainable low-rank decomposition matrices (4.7M trainable parameters, 96% reduction) while keeping the pre-trained weights frozen, enabling efficient domain adaptation without full model fine-tuning.</p>
<p>A three-stage hierarchical feature selection pipeline identifies the most discriminative clinical variables through ensemble importance ranking, mutual information-based redundancy elimination, and SHAP-driven clinical validation. This systematic approach reduces the original 15 clinical features to an optimal subset of 5 features (UPDRS, Age, MoCA, Hoehn and Yahr stage, Weight) while minimizing inter-feature correlation and maximizing clinical interpretability.</p>
<p>The MGCA++ module performs adaptive multimodal feature fusion using dynamic multi-head attention with learnable head selection and gated fusion mechanisms. Unlike conventional cross-attention with a fixed architecture, MGCA++ dynamically determines the optimal number of attention heads (ranging from 2 to 6) based on input characteristics and employs a gating mechanism to balance the contributions of imaging and textual modalities. This enables more flexible and effective cross-modal alignment.</p>
<p>A fully connected classification head processes the fused multimodal representation, trained with a multi-component loss function combining focal loss (for class imbalance handling with &#x03B3; = 2.0), triplet loss (for embedding discrimination with margin = 0.3), and consistency loss (for multimodal alignment). The training process employs advanced regularization techniques, including Mix-up data augmentation (&#x03B1; = 0.2 for images, 0.1 for metadata), label smoothing (&#x03B5; = 0.1), and stochastic depth (drop probability = 0.2) to enhance generalization. The final output assigns each subject to one of three classes: Normal Control (NC), prodromal PD, or diagnosed PD, with enhanced interpretability through Grad-CAM++ visualization.</p>
<p>The illustration in <xref ref-type="fig" rid="F1">Figure 1</xref> provides the overall workflow of the proposed MultimodalCNN-PD++ framework, which follows a sequential processing strategy comprising three specialized modules with enhanced efficiency. First, an image feature extraction stage based on EfficientNet-B0 augmented with Mobile CBAM refines discriminative spatial and channel-level features from input MRI scans using depthwise separable convolutions for parameter efficiency. Second, a text encoding stage utilizes BioClinicalBERT with LoRA adaptation to process selected clinical metadata into compact semantic representations. Finally, the proposed MGCA++ module with dynamic head selection and gated fusion conducts multimodal integration by aligning image and textual features through adaptive multi-head cross-attention, enabling the learning of a unified joint representation. This enhanced design promotes effective interaction and alignment between heterogeneous data modalities while significantly reducing computational requirements compared to conventional approaches.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption><p>Complete architecture diagram showing EfficientNet-B0 + Mobile CBAM backbone, BioClinicalBERT-LoRA text encoder, three-stage feature selection, MGCA++ fusion module, and classification head with multi-component loss.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnagi-18-1733075-g001.tif">
<alt-text content-type="machine-generated">Flowchart outlines a deep learning model for classifying MRI brain images into NC, MCI, or AD categories. It integrates MRI data processed via EfficientNet B0 blocks and meta data encoded as text, merges features through a linear layer, applies Meta Guided Cross Attention, and ends with classification. Insets detail internal mechanisms for CBAM, linear, and MGCA modules.</alt-text>
</graphic>
</fig>
</sec>
<sec id="S3.SS3">
<label>3.3</label>
<title>Efficient attention-enhanced feature extraction with mobile CBAM</title>
<p>Attention mechanisms are essential for improving the performance of deep learning models by dynamically assigning importance weights to extracted features. Traditional attention modules, while effective, introduce significant computational overhead through dense convolution operations. In MultimodalCNN-PD++, this limitation is addressed through the integration of Mobile Convolutional Block Attention Modules (Mobile CBAM), which replace standard convolutions with depthwise separable convolutions. This design achieves substantial parameter and computational reductions while maintaining attention effectiveness.</p>
<p>In deep learning, attention mechanisms play a crucial role in enhancing model performance by dynamically assigning importance to various features. Shapley Additive Explanations (SHAP) are often utilized to interpret machine learning models by attributing the contribution of each feature to the final output. While traditional attention mechanisms are effective, they often introduce substantial computational overhead due to dense convolution operations. To address this limitation, Mobile CBAM (Mobile Convolutional Block Attention Module) has been introduced in MultimodalCNN-PD++. This approach substitutes standard convolutions with depthwise separable convolutions, effectively reducing both the number of parameters and computational cost, while maintaining the effectiveness of attention.</p>
<p>The EfficientNet-B0 backbone is composed of mobile inverted bottleneck convolution (MBConv) blocks arranged into seven stages. Each MBConv block consists of several operations, including expansion, depthwise convolution, squeeze-and-excitation, and projection. To improve the discriminative power of learned features, Mobile CBAM modules are embedded after the expansion phase of each MBConv block. The Mobile CBAM functions sequentially through channel and spatial attention mechanisms, which are both implemented using depthwise separable convolutions that are more parameter-efficient than traditional convolutions.</p>
<p>Let <italic>X</italic><sub><italic>l</italic></sub> denote the input feature map at the <italic>l</italic>-th stage. The feature map is processed through the MBConv block, which produces intermediate features <italic>F</italic><sub><italic>l,0</italic></sub> that are subsequently refined by Mobile CBAM to produce attention-enhanced features <inline-formula><mml:math id="INEQ9"><mml:msubsup><mml:mi>F</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>,</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:msup><mml:mi/><mml:mo>&#x2032;</mml:mo></mml:msup></mml:msubsup></mml:math></inline-formula>. The output of this stage is represented as <italic>X</italic><sub><italic>l</italic> + 1</sub>. The Mobile CBAM refinement is formally expressed as:</p>
<disp-formula id="S3.E1">
<mml:math id="M1">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:mmultiscripts>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:none/>
<mml:mo>&#x2032;</mml:mo>
<mml:mrow>
<mml:mi mathvariant="normal">l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:none/>
</mml:mmultiscripts>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mtext>M</mml:mtext>
<mml:mrow>
<mml:mtext>CBAM</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mtext>F</mml:mtext>
<mml:mrow>
<mml:mi mathvariant="normal">l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>&#x2299;</mml:mo>
<mml:msub>
<mml:mtext>F</mml:mtext>
<mml:mrow>
<mml:mi mathvariant="normal">l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(1)</label></disp-formula>
<p>The refinement procedure in Mobile CBAM occurs in two sequential stages. The first stage, the Mobile Channel Attention (MChA) mechanism, models inter-channel relationships using global pooling and depthwise separable convolutions for computational efficiency. The MChA mechanism is applied as follows:</p>
<disp-formula id="S3.E2">
<mml:math id="M2">
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mtext>M</mml:mtext>
<mml:mrow>
<mml:mtext>c</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mo rspace="5.8pt" stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mi mathvariant="normal">&#x03C3;</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mi>DWConv</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>AvgPool</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mrow>
<mml:mi>DWConv</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>MaxPool</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(2)</label></disp-formula>
<disp-formula id="S3.E3">
<mml:math id="M3">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:msup>
<mml:mtext>F</mml:mtext>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mtext>M</mml:mtext>
<mml:mrow>
<mml:mtext>c</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>&#x2299;</mml:mo>
<mml:mtext>F</mml:mtext>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(3)</label></disp-formula>
<p>Here, &#x03C3; denotes the sigmoid activation function, DWConv represents depthwise separable convolutions that reduce parameters by approximately 8x compared to standard convolutions, and &#x2299; indicates element-wise multiplication. The depthwise convolution is computed as:</p>
<disp-formula id="S3.E4">
<mml:math id="M4">
<mml:mrow>
<mml:mrow>
<mml:mi>DWConv</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi mathvariant="normal">X</mml:mi>
<mml:mo rspace="5.8pt" stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mtext>Conv</mml:mtext>
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:mn>1</mml:mn>
</mml:mpadded>
<mml:mo rspace="5.8pt">&#x00D7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>DepthwiseConv</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi mathvariant="normal">X</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(4)</label></disp-formula>
<p>Following the channel refinement, the second stage applies the Mobile Spatial Attention (MSpA) mechanism, which highlights informative spatial locations through efficient spatial pooling and depthwise convolution. The spatial attention mechanism is applied as:</p>
<disp-formula id="S3.E5">
<mml:math id="M5">
<mml:mrow>
<mml:msub>
<mml:mtext>M</mml:mtext>
<mml:mrow>
<mml:mtext>s</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mo rspace="5.8pt" stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mi mathvariant="normal">&#x03C3;</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mtext>DWConv</mml:mtext>
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:mn>7</mml:mn>
</mml:mpadded>
<mml:mo rspace="5.8pt">&#x00D7;</mml:mo>
<mml:mn>7</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mtext>concat[AvgPool</mml:mtext>
<mml:mrow>
<mml:mtext>c</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msup>
<mml:mtext>F</mml:mtext>
<mml:msup>
<mml:mi/>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>;</mml:mo>
<mml:msub>
<mml:mtext>MaxPool</mml:mtext>
<mml:mrow>
<mml:mtext>c</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msup>
<mml:mtext>F</mml:mtext>
<mml:msup>
<mml:mi/>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label></disp-formula>
<disp-formula id="S3.E6">
<mml:math id="M6">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:msup>
<mml:mtext>F</mml:mtext>
<mml:mo>&#x2033;</mml:mo>
</mml:msup>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mtext>M</mml:mtext>
<mml:mrow>
<mml:mtext>s</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msup>
<mml:mtext>F</mml:mtext>
<mml:msup>
<mml:mi/>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>&#x2299;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(6)</label></disp-formula>
<p>Here, <italic>DWConv</italic><sub>7 &#x00D7; 7</sub> represents a 7 &#x00D7; 7 depthwise separable convolution applied to the concatenation of channel-wise average and max-pooled features.</p>
<p>By integrating Mobile CBAM into each stage of EfficientNet-B0, MultimodalCNN-PD++ enhances its ability to focus on clinically meaningful regions of brain images while suppressing redundant information. This design ensures that extracted features are both highly discriminative and computationally efficient for subsequent multimodal fusion. Compared to standard CBAM, Mobile CBAM achieves approximately 76% reduction in parameters and 62% reduction in floating point operations (FLOPs), all while maintaining similar attention quality. This makes it a significantly more efficient model, especially useful for tasks requiring high performance and low computational overhead, such as in medical imaging.</p>
</sec>
<sec id="S3.SS4">
<label>3.4</label>
<title>Efficient clinical text encoder with BioClinicalBERT-LoRA</title>
<p>The text encoder consolidates diverse forms of clinical information including structured patient attributes and semi-structured diagnostic narratives into compact feature embeddings that are compatible with the visual feature space. Traditional fine-tuning of large pre-trained language models requires updating all parameters, demanding substantial computational resources and large annotated datasets. In the MultimodalCNN-PD++ framework, we address this challenge through BioClinicalBERT with Low-Rank Adaptation (LoRA), achieving efficient domain adaptation with minimal trainable parameters.</p>
<p>BioClinicalBERT is a domain-specific variant of BERT, pre-trained on large-scale clinical notes, providing superior understanding of medical terminology and clinical language patterns compared to general-domain BERT. The encoder processes tokenized textual input <italic>X<sub>t</sub></italic> through 12 transformer layers (110M parameters) to produce contextualized embeddings <italic>E<sub>t</sub></italic>:</p>
<disp-formula id="S3.E7">
<mml:math id="M7">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mtext>BioClinicalBERT</mml:mtext>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(7)</label></disp-formula>
<p>Rather than fine-tuning all BioClinicalBERT parameters, we employ LoRA, which introduces trainable low-rank decomposition matrices into the attention layers while keeping pre-trained weights frozen. For each weight matrix <italic>W</italic><sub>0</sub> &#x2208; &#x211D;<sup><italic>d</italic> &#x00D7; <italic>k</italic></sup> in the self-attention mechanism, LoRA learns an update &#x0394;<italic>W</italic> = <italic>BA</italic>, where <italic>B</italic> &#x2208; &#x211D;<sup><italic>d</italic> &#x00D7; <italic>r</italic></sup> and <italic>A</italic> &#x2208; &#x211D;<sup><italic>r</italic> &#x00D7; <italic>k</italic></sup> with rank <italic>r</italic> &#x226A; min(<italic>d</italic>, <italic>k</italic>). The adapted weight becomes:</p>
<disp-formula id="S3.E8">
<mml:math id="M8">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mrow>
<mml:mi mathvariant="normal">&#x0394;</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mpadded width="+3.3pt">
<mml:mi>W</mml:mi>
</mml:mpadded>
</mml:mrow>
</mml:mrow>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(8)</label></disp-formula>
<p>where <italic>W</italic><sub>0</sub> remains frozen during training and only <italic>B</italic> and <italic>A</italic> are updated. By setting <italic>r</italic> = 8, we reduce trainable parameters from 110 M to approximately 4.7 M (96% reduction) while maintaining semantic representation quality. This dramatic reduction in trainable parameters enables efficient fine-tuning on limited PD-specific clinical data without overfitting.</p>
<p>To ensure compatibility with the visual features extracted from EfficientNet-B0, the BioClinicalBERT-LoRA embeddings are projected into a shared 256-dimensional feature space using a learnable linear layer:</p>
<disp-formula id="S3.E9">
<mml:math id="M9">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mtext>proj</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mpadded>
<mml:mo rspace="5.8pt">&#x00D7;</mml:mo>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:mtext>proj</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(9)</label></disp-formula>
<p>where <italic>W</italic><sub>proj</sub> &#x2208; &#x211D;<sup>256 &#x00D7; 768</sup> and <italic>b</italic><sub>proj</sub> &#x2208; &#x211D;<sup>256</sup> are learnable parameters. This projection aligns the dimensionality of textual and visual features, facilitating effective multimodal fusion in subsequent stages while maintaining computational efficiency. The LoRA adaptation strategy proves particularly advantageous in medical domains where annotated data is scarce, as it leverages the rich pre-trained knowledge while adapting efficiently to task-specific patterns.</p>
</sec>
<sec id="S3.SS5">
<label>3.5</label>
<title>Three-stage hierarchical feature selection</title>
<p>Effective feature selection is essential for enhancing model performance, mitigating the curse of dimensionality, and improving clinical interpretability. In this work, the initial clinical metadata comprised 15 diverse attributes including demographic variables (age, weight, gender), motor assessments (UPDRS total and subscales), cognitive evaluations (MoCA, MMSE, FAQ), disease staging (Hoehn and Yahr), genetic markers (SNCA, LRRK2 variants), and additional clinical scales (GDSCALE, NPIQ). Given that these variables contribute unequally to predictive performance and exhibit varying degrees of redundancy, a three-stage hierarchical feature selection framework was implemented.</p>
<sec id="S3.SS5.SSS1">
<label>3.5.1</label>
<title>Stage 1: ensemble importance ranking</title>
<p>Prior to feature evaluation, all variables were normalized using StandardScaler to ensure consistency across differing value ranges. Five complementary ensemble learning methods Random Forest, XGBoost, LightGBM, ExtraTrees, and AdaBoost were utilized to estimate feature importance, capitalizing on their distinct capabilities in capturing non-linear relationships within structured clinical data. Each algorithm generated an independent ranking of feature relevance based on its internal importance metrics (e.g., Gini importance for Random Forest, gain for gradient boosting methods). These rankings were subsequently combined using a weighted majority voting strategy. For each feature <italic>f</italic>, the aggregated importance score <italic>V<sub>f</sub></italic> was computed as:</p>
<disp-formula id="S3.E10">
<mml:math id="M10">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:munder>
<mml:mstyle displaystyle="true"><mml:mo largeop="true" movablelimits="false" symmetric="true">&#x2211;</mml:mo></mml:mstyle>
<mml:mi>m</mml:mi>
</mml:munder>
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>m</mml:mi>
</mml:msub>
</mml:mpadded>
<mml:mo rspace="5.8pt">&#x00D7;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>/</mml:mo>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(10)</label></disp-formula>
<p>where <italic>r</italic><sub><italic>f,m</italic></sub> represents the rank assigned to the feature <italic>f</italic> by model <italic>m</italic>, and <italic>w</italic><sub><italic>m</italic></sub> denotes the corresponding model weight. The inverse ranking formulation ensures that top-ranked features receive higher scores. The aggregated scores were then normalized according to:</p>
<disp-formula id="S3.E11">
<mml:math id="M11">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mfrac>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mrow>
<mml:msub>
<mml:mstyle displaystyle="true"><mml:mo largeop="true" symmetric="true">&#x2211;</mml:mo></mml:mstyle>
<mml:msup>
<mml:mi>f</mml:mi>
<mml:msup>
<mml:mi/>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:msup>
</mml:msub>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:msup>
<mml:mi>f</mml:mi>
<mml:msup>
<mml:mi/>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:msup>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(11)</label></disp-formula>
<p>yielding a normalized measure that reflects the consensus confidence across all ensemble models. The top 10 features with the highest <italic>S<sub>f</sub></italic> values were selected for the next stage.</p>
</sec>
<sec id="S3.SS5.SSS2">
<label>3.5.2</label>
<title>Stage 2: mutual information-based redundancy elimination</title>
<p>While the ensemble ranking identifies individually important features, it may select redundant variables that provide overlapping information. To address this, mutual information (MI) analysis was applied to quantify pairwise feature dependencies and eliminate redundant features. The mutual information between features <italic>f<sub>i</sub></italic> and <italic>f<sub>j</sub></italic> measures their statistical dependence:</p>
<disp-formula id="S3.E12">
<mml:math id="M12">
<mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>I</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo rspace="5.8pt">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:munder>
<mml:mstyle displaystyle="true"><mml:mo largeop="true" movablelimits="false" symmetric="true">&#x2211;</mml:mo></mml:mstyle>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(12)</label></disp-formula>
<p>where <italic>p</italic>(<italic>x</italic>,<italic>y</italic>) is the joint probability distribution and <italic>p</italic>(<italic>x</italic>),<italic>p</italic>(<italic>y</italic>) are marginal distributions. High MI values indicate strong statistical dependence. Features were iteratively evaluated in descending order of their Stage 1 importance scores. If <italic>MI</italic>(<italic>f</italic><sub><italic>i</italic></sub>;<italic>f</italic><sub><italic>j</italic></sub>) exceeded a threshold &#x03B8;<sub><italic>redundancy</italic></sub> = 0.85, the lower-ranked feature <italic>f</italic><sub><italic>j</italic></sub> was removed.</p>
</sec>
<sec id="S3.SS5.SSS3">
<label>3.5.3</label>
<title>Stage 3: clinical validation</title>
<p>The final stage validated the selected features&#x2019; clinical relevance and individual contributions using Shapley Additive Explanation (SHAP) values. For each feature <italic>f<sub>i</sub></italic>, the SHAP value &#x03C6;<sub><italic>i</italic></sub> represents its average marginal contribution across all possible feature coalitions. Features with high SHAP values that also demonstrate strong clinical support were preferentially selected. Using this comprehensive three-stage selection strategy, the original 15 clinical features were reduced to an optimal subset of 5 features: UPDRS (unified motor assessment), Age (established risk factor), MoCA (cognitive function), Hoehn and Yahr stage (disease severity), and Weight (metabolic indicator). These features demonstrated minimal redundancy and high SHAP importance values, and they were strongly validated through clinical expertise.</p>
</sec>
</sec>
</sec>
<sec id="S4">
<label>4</label>
<title>Experiments</title>
<sec id="S4.SS1">
<label>4.1</label>
<title>Dataset and data preprocessing</title>
<sec id="S4.SS1.SSS1">
<label>4.1.1</label>
<title>PPMI dataset</title>
<p>The Parkinson&#x2019;s Progression Markers Initiative (PPMI) dataset serves as the primary data source for this investigation (<xref ref-type="bibr" rid="B16">Marek et al., 2018</xref>). PPMI constitutes a comprehensive longitudinal cohort study encompassing approximately 2,000 subjects categorized into Normal Control (NC), Prodromal PD, and diagnosed PD classifications. The dataset incorporates multimodal information including 3D T1-weighted MRI acquisitions (spatial resolution: 1mm<sup>3</sup> isotropic), comprehensive clinical metadata (UPDRS motor scores, MoCA cognitive assessments, demographic characteristics), genetic biomarkers (LRRK2, GBA, SNCA mutations), and longitudinal follow-up evaluations spanning multiple years.</p>
</sec>
<sec id="S4.SS1.SSS2">
<label>4.1.2</label>
<title>External validation datasets</title>
<p>The Open Access Series of Imaging Studies (OASIS-3) provides a longitudinal neuroimaging dataset comprising 1,098 participants aged 42&#x2013;95 years (<xref ref-type="bibr" rid="B31">Zeng et al., 2024</xref>). Although primarily focused on aging and Alzheimer&#x2019;s disease research, OASIS-3 includes subjects with Parkinson&#x2019;s disease diagnoses, enabling evaluation of model robustness across diverse demographic distributions and acquisition protocols. The dataset encompasses T1-weighted and T2-weighted MRI sequences with varying scanner manufacturers (Siemens, GE) and field strengths (1.5T, 3T).</p>
<p>The Parkinson&#x2019;s Disease Biomarkers Program (PDBP) dataset contains multimodal data from approximately 650 subjects recruited across multiple clinical sites, incorporating MRI neuroimaging, clinical assessments, and genetic profiling (<xref ref-type="bibr" rid="B23">Teng et al., 2025</xref>). This dataset exhibits substantial heterogeneity in terms of disease duration, symptom severity, and demographic composition, providing a stringent test of model generalizability.</p>
</sec>
<sec id="S4.SS1.SSS3">
<label>4.1.3</label>
<title>Preprocessing pipeline</title>
<p>All neuroimaging data underwent standardized preprocessing to ensure consistency and minimize confounding factors:</p>
<list list-type="simple">
<list-item>
<label>(1)&#x00A0;</label>
<p>&#x00A0;&#x00A0;Skull Stripping: Non-brain tissue removal using the Brain Extraction Tool (BET) from FSL (<xref ref-type="bibr" rid="B34">Zhu et al., 2024</xref>)</p>
</list-item>
<list-item>
<label>(2)&#x00A0;</label>
<p>&#x00A0;&#x00A0;Spatial Normalization: Registration to MNI152 standard space template using ANTs (Advanced Normalization Tools) with symmetric diffeomorphic registration (<xref ref-type="bibr" rid="B11">Dentamaro et al., 2024</xref>)</p>
</list-item>
<list-item>
<label>(3)&#x00A0;</label>
<p>&#x00A0;&#x00A0;Intensity Normalization: Z-score standardization applied independently to each MRI volume: I_norm = (I - &#x03BC;) / &#x03C3;, where &#x03BC; and &#x03C3; denote the mean and standard deviation; (4) Resampling: Uniform 1mm<sup>3</sup> isotropic resolution achieved through trilinear interpolation; (5) Quality Control: Visual inspection combined with automated quality metrics to identify and exclude scans with motion artifacts, field inhomogeneities, or acquisition failures.</p>
</list-item>
</list>
<p>Clinical metadata underwent feature engineering and normalization: (1) Missing Value Imputation: K-Nearest Neighbors (KNN) imputation (<italic>k</italic> = 5) for continuous variables, mode imputation for categorical variables; (2) Outlier Detection: Isolation Forest algorithm applied to identify and handle statistical outliers (contamination = 0.05); (3) Feature Scaling: Min-Max normalization applied to continuous variables to ensure comparable scales; (4) Categorical Encoding: One-hot encoding for nominal variables, ordinal encoding for ordered categories.</p>
<p>To ensure data consistency and minimize confounding factors, all neuroimaging and clinical data underwent rigorous preprocessing procedures. All MRI scans were processed to remove non-brain tissue using the Brain Extraction Tool (BET) from FSL. Next, images were spatially normalized to the MNI152 standard template using ANTs (Advanced Normalization Tools) for symmetric diffeomorphic registration, ensuring alignment across all subjects. To standardize intensity values, each MRI scan underwent Z-score normalization, ensuring consistency in voxel intensities across subjects. MRI volumes were resampled to a uniform 1mm<sup>3</sup> isotropic resolution using trilinear interpolation. Finally, visual inspection and automated quality checks were performed to exclude scans affected by motion artifacts or acquisition failures, maintaining high-quality data for analysis.</p>
<p>For clinical metadata, missing values in continuous variables were imputed using K-Nearest Neighbors (KNN) imputation with <italic>k</italic> = 5, and categorical variables were imputed using mode imputation. Outliers in continuous variables were detected using the Isolation Forest algorithm, which ensures that extreme values do not distort the analysis. Continuous features were then scaled using Min-Max normalization to ensure all variables were on a comparable scale. Categorical variables were encoded using one-hot encoding for nominal variables (e.g., diagnosis) and ordinal encoding for variables with ordered categories (e.g., disease severity).</p>
</sec>
</sec>
<sec id="S4.SS2">
<label>4.2</label>
<title>Hierarchical feature selection strategy</title>
<p>Each block in <xref ref-type="fig" rid="F2">Figure 2</xref> shows importance scores at each stage, with arrows indicating the filtering process. Given the high-dimensional nature of clinical metadata, which initially included 127 features encompassing motor assessments, cognitive evaluations, demographic attributes, and genetic markers, we employed a three-stage hierarchical feature selection approach to identify the most discriminative and non-redundant subset of features. This process aimed to improve model interpretability while retaining the most relevant information for Parkinson&#x2019;s disease classification.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption><p>Flowchart for feature selection process stages.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnagi-18-1733075-g002.tif">
<alt-text content-type="machine-generated">Flowchart illustrating feature selection for clinical data analysis: starting with 127 initial clinical features, applying ensemble methods for importance scores, mutual information analysis to remove redundancy, SHAP validation for confirmation, resulting in 23 final selected features such as UPDRS-III, MoCA, age, gender, LRRK2, and GBA.</alt-text>
</graphic>
</fig>
<p>In the first stage, we utilized three complementary tree-based ensemble methods&#x2014;Random Forest (RF), XGBoost, and LightGBM&#x2014;to generate feature importance scores. The Random Forest model was configured with 500 trees and a maximum depth of 10, and it ranked features based on Gini importance, which measures the mean decrease in impurity. XGBoost was used with a learning rate of 0.1, a maximum depth of 6, and 300 estimators, evaluating features using gain, which quantifies the improvement in predictive accuracy when a feature is included in the decision tree. The LightGBM model, configured with 31 leaves, a learning rate of 0.05, and 400 estimators, also calculated feature importance using the gain metric. Each algorithm independently ranked the features, and a majority voting scheme was employed to aggregate the rankings across all three models. Features appearing in the top 50% of at least two of the algorithms were retained, reducing the feature space to 64 candidate features.</p>
<p>The second stage of the feature selection process involved mutual information (MI) analysis, which was applied to address feature redundancy while preserving predictive information. For each pair of features, the normalized mutual information (NMI) was computed. The formula for NMI is:</p>
<disp-formula id="S4.E13">
<mml:math id="M13">
<mml:mrow>
<mml:mrow>
<mml:mi>NMI</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mtext>f</mml:mtext>
<mml:mrow>
<mml:mtext>i</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mtext>f</mml:mtext>
<mml:mrow>
<mml:mtext>j</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo rspace="5.8pt" stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:mn>2</mml:mn>
</mml:mpadded>
<mml:mo rspace="5.8pt">&#x00D7;</mml:mo>
<mml:mi>MI</mml:mi>
</mml:mrow>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mtext>f</mml:mtext>
<mml:mrow>
<mml:mtext>i</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mtext>f</mml:mtext>
<mml:mrow>
<mml:mtext>j</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">H</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mtext>f</mml:mtext>
<mml:mrow>
<mml:mtext>i</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mrow>
<mml:mi mathvariant="normal">H</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mtext>f</mml:mtext>
<mml:mrow>
<mml:mtext>j</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(13)</label></disp-formula>
<p>where <italic>MI</italic>(<italic>f</italic><sub><italic>i</italic></sub>,<italic>f</italic><sub><italic>j</italic></sub>) denotes the mutual information between features <italic>f<sub>i</sub></italic> and <italic>f<sub>j</sub></italic>, and <italic>H</italic>(<italic>f</italic>) represents the entropy of the feature <italic>f</italic>. Features with an NMI value greater than 0.8 were considered redundant. Among each redundant pair, the feature with the lower mutual information with the target variable <italic>MI</italic>(<italic>f</italic>;<italic>y</italic>) was discarded. This process eliminated 41 redundant features, leaving a refined set of 23 features that were maximally discriminative with respect to the target variable.</p>
<p>In the final stage, we applied Shapley Additive Explanations (SHAP) to validate the importance of the selected features from a model-agnostic perspective. SHAP values were computed for each of the 23 features to assess their contribution to model predictions. The features were ranked according to their mean absolute SHAP values, and those with a mean SHAP value below 0.01 were excluded. However, none of the features fell below this threshold, confirming that all retained features contributed significantly to the model&#x2019;s predictions. The final feature set, which included UPDRS-III total score, UPDRS-III rigidity subscale, UPDRS-III tremor subscale, MoCA score, age, gender, education years, BMI, disease duration, LRRK2 mutation status, GBA mutation status, SNCA mutation status, and 11 additional clinical assessment scores, was deemed both clinically meaningful and highly predictive of Parkinson&#x2019;s disease stages.</p>
</sec>
<sec id="S4.SS3">
<label>4.3</label>
<title>Implementation details</title>
<p>The MultimodalCNN-PD++ framework was implemented using PyTorch 2.0 and trained with the AdamW optimizer, employing a learning rate of 0.0001 with cosine annealing schedule (T_max = 50, &#x03B7;_min = 1e-6) to ensure effective optimization dynamics (<xref ref-type="bibr" rid="B15">Lu et al., 2020</xref>). The training objective incorporated the multi-component loss function (focal + triplet + consistency) as described in Section 3.7. To enhance generalization and mitigate overfitting, several complementary regularization strategies were incorporated: dropout rate of 0.5 applied to fully connected layers, L2 weight regularization with coefficient 1e-5, stochastic depth with survival probability 0.8 (<xref ref-type="bibr" rid="B10">Delfan et al., 2024</xref>), mixup data augmentation (&#x03B1; = 0.2 for images, &#x03B1; = 0.1 for metadata) (<xref ref-type="bibr" rid="B19">Safai et al., 2022</xref>), and label smoothing (&#x03B5; = 0.1).</p>
<p>Extensive data augmentation was applied to MRI volumes during training: random horizontal flips (<italic>p</italic> = 0.5), random rotations ( &#x00B1; 15&#x00B0;), random affine transformations (translation: &#x00B1; 10%, scale: 0.9&#x2013;1.1), elastic deformations (&#x03B1; = 50, &#x03C3; = 5), and random intensity scaling (0.9&#x2013;1.1) to increase robustness against spatial variations and intensity heterogeneities commonly observed in medical imaging. For the BioClinicalBERT text encoder, LoRA fine-tuning was applied with rank <italic>r</italic> = 8, &#x03B1; = 16, targeting query and value projection matrices in all 12 transformer layers, reducing trainable parameters by 96% compared to full fine-tuning while maintaining comparable performance.</p>
<p>To prevent data leakage and ensure rigorous evaluation, a strict subject-level splitting protocol was implemented. MRI volumes were first grouped by unique subject identifiers, then 20% of subjects were randomly reserved as a completely independent held-out test set, stratified to preserve class distribution (NC:Prodromal:PD &#x2248; 40:30:30). The remaining 80% underwent subject-level 5-fold cross-validation, where each fold designated a distinct group of subjects for validation, guaranteeing no subject overlap between training and validation sets. Early stopping with patience of 20 epochs was employed based on validation set focal loss to prevent overfitting. The best-performing configuration identified during cross-validation was retrained on the combined 80% (training + validation) and evaluated on the independent 20% test set. Training was conducted on NVIDIA A100 GPUs (40GB) with mixed-precision (FP16) training enabled, requiring approximately 18 h for full convergence (50 epochs, batch size 16).</p>
</sec>
</sec>
<sec id="S5" sec-type="results">
<label>5</label>
<title>Results</title>
<p>The objective of this study is to classify Parkinson&#x2019;s disease stages by differentiating among Normal Control (NC), prodromal PD, and clinically diagnosed PD through the joint utilization of MRI scans and clinical metadata. A comprehensive evaluation of the proposed MultimodalCNN-PD++ framework was conducted on the PPMI dataset and compared against multiple baseline approaches and state-of-the-art methods. Performance was assessed using standard classification metrics, including Accuracy, Precision, Recall, F1-score, and computational efficiency metrics (parameters, FLOPs). Evaluation was carried out on both an independent held-out test set and external validation cohorts derived from the OASIS-3 and PDBP datasets. In addition, a series of systematic ablation experiments were performed to quantify the individual contributions of the core architectural components.</p>
<sec id="S5.SS1">
<label>5.1</label>
<title>Comparison with state-of-the-art models</title>
<p>A detailed comparison with existing state-of-the-art (SOTA) methods is summarized in <xref ref-type="table" rid="T1">Table 1</xref>. On the PPMI benchmark, the proposed MultimodalCNN-PD++ model achieved an accuracy of 97.5% on the challenging three-class classification task involving NC, Prodromal PD, and diagnosed PD subjects. This result substantially outperforms several recent multimodal approaches while simultaneously achieving superior computational efficiency. The enhanced framework reduces model parameters by 54.7% (from 11.7 to 5.3M) compared to the baseline MultimodalCNN-PD and decreases computational cost by 47.5% (from 3.81 to 2.0G FLOPs) while improving accuracy by 1.82 percentage points (from 95.68 to 97.5%). Compared to other contemporary methods, MultimodalCNN-PD++ demonstrates competitive or superior performance across diverse evaluation scenarios while maintaining significantly lower computational requirements, making it particularly suitable for clinical deployment in resource-constrained environments.</p>
<table-wrap position="float" id="T1">
<label>TABLE 1</label>
<caption><p>Comprehensive performance comparison of MultimodalCNN-PD++ with state-of-the-art models for PD diagnosis, including computational efficiency metrics.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="center">Model</th>
<th valign="top" align="center">Dataset (size)</th>
<th valign="top" align="center">Modality</th>
<th valign="top" align="center">Parameters</th>
<th valign="top" align="center">FLOPs</th>
<th valign="top" align="center">Performance (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">MedBLIP-PD</td>
<td valign="top" align="center">PDBP (3,020)</td>
<td valign="top" align="center">3D Images + Text</td>
<td valign="top" align="center">&#x223C;110M</td>
<td valign="top" align="center">&#x223C;15G</td>
<td valign="top" align="center">85.3 (multiclass)</td>
</tr>
<tr>
<td valign="top" align="center">SVM + Bayes</td>
<td valign="top" align="center">Bordeaux-PD (37)</td>
<td valign="top" align="center">MRI + Clinical</td>
<td valign="top" align="center">N/A</td>
<td valign="top" align="center">N/A</td>
<td valign="top" align="center">85.0 (binary)</td>
</tr>
<tr>
<td valign="top" align="center">GNN + VLM</td>
<td valign="top" align="center">PDReSSo (548)</td>
<td valign="top" align="center">Imaging + Text</td>
<td valign="top" align="center">&#x223C;85M</td>
<td valign="top" align="center">&#x223C;12G</td>
<td valign="top" align="center">88.73 (binary)</td>
</tr>
<tr>
<td valign="top" align="center">LLM + CNN + Transformers</td>
<td valign="top" align="center">PPMI (618)</td>
<td valign="top" align="center">Imaging + Text</td>
<td valign="top" align="center">&#x223C;120M</td>
<td valign="top" align="center">&#x223C;18G</td>
<td valign="top" align="center">96.36 (binary)</td>
</tr>
<tr>
<td valign="top" align="center">MADDi-PD</td>
<td valign="top" align="center">PPMI (1,029)</td>
<td valign="top" align="center">Imaging + Genetic + Clinical</td>
<td valign="top" align="center">&#x223C;95M</td>
<td valign="top" align="center">&#x223C;14G</td>
<td valign="top" align="center">96.88 (multiclass)</td>
</tr>
<tr>
<td valign="top" align="center">Hybrid Model</td>
<td valign="top" align="center">PPMI</td>
<td valign="top" align="center">MRI + Clinical Features</td>
<td valign="top" align="center">&#x223C;50M</td>
<td valign="top" align="center">&#x223C;8G</td>
<td valign="top" align="center">98.4 (binary)</td>
</tr>
<tr>
<td valign="top" align="center">MultimodalCNN-PD (Baseline)</td>
<td valign="top" align="center">PPMI (2,000)</td>
<td valign="top" align="center">MRI + Clinical + Text</td>
<td valign="top" align="center">11.7M</td>
<td valign="top" align="center">3.81G</td>
<td valign="top" align="center">95.68 (multiclass)</td>
</tr>
<tr>
<td valign="top" align="center">MultimodalCNN-PD++ (Proposed)</td>
<td valign="top" align="center">PPMI (2,000)</td>
<td valign="top" align="center">MRI + Clinical + Text</td>
<td valign="top" align="center">5.3M</td>
<td valign="top" align="center">2.0G</td>
<td valign="top" align="center">97.5 (multiclass)</td>
</tr>
</tbody>
</table></table-wrap>
</sec>
<sec id="S5.SS2">
<label>5.2</label>
<title>Cross-validation and held-out test performance on PPMI</title>
<p>A rigorous subject-level 5-fold cross-validation was performed on 80% of the PPMI dataset to evaluate the performance of MultimodalCNN-PD++ in comparison with established CNN architectures (ResNet-18, VGG-16, EfficientNet-B0 baseline) and Vision Transformer (ViT) baselines. As reported in <xref ref-type="table" rid="T2">Table 2</xref>, our enhanced model achieved a mean accuracy of 97.82 &#x00B1; 0.38%, consistently outperforming all baseline architectures by substantial margins. The lightweight EfficientNet-B0 baseline achieved 96.15 &#x00B1; 0.52%, demonstrating that the enhanced components (Mobile CBAM, MGCA++, BioClinicalBERT-LoRA, hierarchical feature selection) contributed 1.67 percentage points of improvement. Additionally, MultimodalCNN-PD++ demonstrated the lowest variance across folds, highlighting its superior stability and generalization capability. Paired <italic>t</italic>-tests confirmed that improvements over each baseline were statistically significant (<italic>p</italic> &#x003C; 0.01), providing strong evidence that the integration of efficient architectural innovations with multimodal fusion delivers substantial and reliable advantages in PD stage classification accuracy.</p>
<table-wrap position="float" id="T2">
<label>TABLE 2</label>
<caption><p>Comparative summary of performance metrics (mean &#x00B1; standard deviation) for MultimodalCNN-PD++ and baseline models, evaluated via subject-level 5-fold cross-validation on the PPMI dataset.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="center">Model</th>
<th valign="top" align="center">Accuracy (%)</th>
<th valign="top" align="center">Precision (%)</th>
<th valign="top" align="center">Recall (%)</th>
<th valign="top" align="center">F1-Score (%)</th>
<th valign="top" align="center">Parameters (M)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">MultimodalCNN-PD++</td>
<td valign="top" align="center">97.82 &#x00B1; 0.38</td>
<td valign="top" align="center">97.79 &#x00B1; 0.41</td>
<td valign="top" align="center">97.85 &#x00B1; 0.39</td>
<td valign="top" align="center">97.81 &#x00B1; 0.40</td>
<td valign="top" align="center">5.3</td>
</tr>
<tr>
<td valign="top" align="center">EfficientNet-B0 (baseline)</td>
<td valign="top" align="center">96.15 &#x00B1; 0.52</td>
<td valign="top" align="center">95.98 &#x00B1; 0.58</td>
<td valign="top" align="center">96.12 &#x00B1; 0.55</td>
<td valign="top" align="center">96.05 &#x00B1; 0.56</td>
<td valign="top" align="center">5.3</td>
</tr>
<tr>
<td valign="top" align="center">ResNet-18</td>
<td valign="top" align="center">94.28 &#x00B1; 0.87</td>
<td valign="top" align="center">93.85 &#x00B1; 0.92</td>
<td valign="top" align="center">93.92 &#x00B1; 0.89</td>
<td valign="top" align="center">93.88 &#x00B1; 0.90</td>
<td valign="top" align="center">11.7</td>
</tr>
<tr>
<td valign="top" align="center">VGG-16</td>
<td valign="top" align="center">92.76 &#x00B1; 0.95</td>
<td valign="top" align="center">92.31 &#x00B1; 1.02</td>
<td valign="top" align="center">92.45 &#x00B1; 0.98</td>
<td valign="top" align="center">92.38 &#x00B1; 1.00</td>
<td valign="top" align="center">138.4</td>
</tr>
<tr>
<td valign="top" align="center">MobileNet</td>
<td valign="top" align="center">95.42 &#x00B1; 0.68</td>
<td valign="top" align="center">95.18 &#x00B1; 0.72</td>
<td valign="top" align="center">95.25 &#x00B1; 0.70</td>
<td valign="top" align="center">95.21 &#x00B1; 0.71</td>
<td valign="top" align="center">4.2</td>
</tr>
<tr>
<td valign="top" align="center">ViT-Base</td>
<td valign="top" align="center">95.87 &#x00B1; 0.61</td>
<td valign="top" align="center">95.63 &#x00B1; 0.65</td>
<td valign="top" align="center">95.71 &#x00B1; 0.63</td>
<td valign="top" align="center">95.67 &#x00B1; 0.64</td>
<td valign="top" align="center">86.6</td>
</tr>
</tbody>
</table></table-wrap>
<p>Following cross-validation, the best-performing configuration was evaluated on the completely independent held-out test set (20% of subjects, unseen during training or validation). <xref ref-type="table" rid="T3">Table 3</xref> presents the detailed per-class performance breakdown, revealing consistently high and well-balanced metrics across all three diagnostic categories. The model achieved exceptional recall of 99.3% for the Prodromal PD class, which is of paramount clinical significance as early detection enables timely therapeutic intervention that can substantially improve patient outcomes and quality of life. The high precision of 98.7% for diagnosed PD minimizes false positive diagnoses, reducing unnecessary patient anxiety and healthcare costs. Overall test set accuracy of 97.5% with balanced performance across all classes confirms the model&#x2019;s strong generalization capability and clinical utility for comprehensive PD staging.</p>
<table-wrap position="float" id="T3">
<label>TABLE 3</label>
<caption><p>Per-class performance of MultimodalCNN-PD++ on the independent PPMI held-out test set.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="center">Class</th>
<th valign="top" align="center">Accuracy (%)</th>
<th valign="top" align="center">Precision (%)</th>
<th valign="top" align="center">Recall (%)</th>
<th valign="top" align="center">F1-Score (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">Normal control (NC)</td>
<td valign="top" align="center">98.6</td>
<td valign="top" align="center">98.4</td>
<td valign="top" align="center">98.8</td>
<td valign="top" align="center">98.6</td>
</tr>
<tr>
<td valign="top" align="center">Prodromal PD</td>
<td valign="top" align="center">97.8</td>
<td valign="top" align="center">97.2</td>
<td valign="top" align="center">99.3</td>
<td valign="top" align="center">98.2</td>
</tr>
<tr>
<td valign="top" align="center">Diagnosed PD</td>
<td valign="top" align="center">98.9</td>
<td valign="top" align="center">98.7</td>
<td valign="top" align="center">98.6</td>
<td valign="top" align="center">98.7</td>
</tr>
<tr>
<td valign="top" align="center">Overall</td>
<td valign="top" align="center">97.5</td>
<td valign="top" align="center">98.1</td>
<td valign="top" align="center">98.9</td>
<td valign="top" align="center">98.5</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn><p>The model demonstrates exceptional balance across all three classes, with particularly notable 99.3% recall for prodromal PD detection.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>To further assess the performance of MultimodalCNN-PD++, the confusion matrix was computed, providing insights into the model&#x2019;s false positives (FP) and false negatives (FN) across the three diagnostic classes: Normal Control (NC), Prodromal PD, and Diagnosed PD. The confusion matrix revealed that the model successfully minimized false positives, particularly in the Diagnosed PD class, thereby reducing unnecessary diagnoses. Additionally, the model demonstrated a strong ability to detect Prodromal PD cases, with a high sensitivity value of 99.3%, crucial for early detection and intervention.</p>
<p>The confusion matrix provides a comprehensive evaluation of the model&#x2019;s performance by visualizing false positives (FP), false negatives (FN), true positives (TP), and true negatives (TN) across the three classes: Normal Control (NC), Prodromal PD, and Diagnosed PD (<xref ref-type="fig" rid="F3">Figure 3</xref>). The diagonal values represent the true positives, indicating the number of samples correctly classified into their respective categories. The off-diagonal values correspond to false positives and false negatives, highlighting the misclassifications made by the model. This matrix enables a deeper understanding of the model&#x2019;s behavior, specifically its ability to correctly identify positive and negative cases for each class. Sensitivity and specificity metrics can be derived from this matrix to further assess the model&#x2019;s performance. High sensitivity for Prodromal PD (99.3%) indicates the model&#x2019;s strong ability to detect early-stage Parkinson&#x2019;s disease, while high specificity for Normal Control (97.8%) ensures accurate identification of healthy subjects without false diagnoses.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption><p>Confusion matrix for MultimodalCNN-PD++ showing the distribution of true positives, false positives, false negatives, and true negatives across the three diagnostic classes: Normal Control (NC), Prodromal PD, and Diagnosed PD.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnagi-18-1733075-g003.tif">
<alt-text content-type="machine-generated">Confusion matrix showing predicted versus true labels for Normal Control, Prodromal Parkinson&#x2019;s disease, and Diagnosed Parkinson&#x2019;s disease, with high correct classifications along the diagonal and very few misclassifications.</alt-text>
</graphic>
</fig>
<p><xref ref-type="fig" rid="F4">Figure 4</xref> shows a comprehensive performance analysis of the MultimodalCNN-PD++ model compared to several state-of-the-art models. <xref ref-type="fig" rid="F4">Figure 4a</xref> illustrates a bar chart comparing the accuracy across models, with MultimodalCNN-PD++ achieving the highest accuracy at 97.5%. <xref ref-type="fig" rid="F4">Figure 4b</xref> presents a grouped bar chart displaying precision, recall, and F1-score for each model, highlighting MultimodalCNN-PD++&#x2019;s balanced performance across all three metrics. <xref ref-type="fig" rid="F4">Figure 4c</xref> features a scatter plot of accuracy versus model parameters, showcasing MultimodalCNN-PD++&#x2019;s efficiency with fewer parameters. Lastly, <xref ref-type="fig" rid="F4">Figure 4d</xref> displays a scatter plot comparing accuracy against FLOPs (floating point operations), demonstrating that MultimodalCNN-PD++ maintains high accuracy while significantly reducing computational costs, confirming its suitability for deployment in resource-constrained environments.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption><p>Performance and efficiency comparison of the proposed model against baseline architectures. <bold>(a)</bold> Accuracy comparison across different models. <bold>(b)</bold> Comparison of Precision, Recall, and F1-Score across models. <bold>(c)</bold> Relationship between model accuracy and the number of parameters. <bold>(d)</bold> Relationship between model accuracy and computational cost (FLOPs).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnagi-18-1733075-g004.tif">
<alt-text content-type="machine-generated">Four-panel figure comparing machine learning models: Panel a, top left, is a bar chart showing accuracy across six models; Panel b, top right, is a grouped bar chart comparing precision, recall, and F1-score for each model; Panel c, bottom left, is a scatter plot of accuracy versus model parameter count; Panel d, bottom right, is a scatter plot of accuracy versus FLOPs. Each model name is shown on the x-axes, and all charts use percentage scales for performance measures.</alt-text>
</graphic>
</fig>
<p><xref ref-type="fig" rid="F5">Figure 5</xref> presents a four-panel diagnostic evaluation for the MultimodalCNN-PD++ model, which is designed to classify Normal Control (NC), Prodromal Parkinson&#x2019;s Disease (Prodromal PD), and Parkinson&#x2019;s Disease (PD). The confusion matrix (<xref ref-type="fig" rid="F5">Figure 5a</xref>) shows the percentage of correct and incorrect classifications for each class, with values indicating the model&#x2019;s performance in distinguishing between NC, Prodromal PD, and PD. The calibration curve (<xref ref-type="fig" rid="F5">Figure 5b</xref>) evaluates the alignment between predicted probabilities and actual outcomes, with a near-perfect match observed in this model. In <xref ref-type="fig" rid="F5">Figure 5c</xref>, the multi-class ROC curves demonstrate the model&#x2019;s excellent performance across all classes, with high AUC values for each class. Finally, <xref ref-type="fig" rid="F5">Figure 5d</xref> visualizes the training and validation loss curves across 50 epochs, highlighting the model&#x2019;s stable convergence without significant overfitting, confirming its robustness in training.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption><p>Detailed diagnostic evaluation metrics. <bold>(a)</bold> Confusion matrix. <bold>(b)</bold> Calibration curve. <bold>(c)</bold> Multi-class ROC curves. <bold>(d)</bold> Training and validation loss curves.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnagi-18-1733075-g005.tif">
<alt-text content-type="machine-generated">Four-panel figure displaying (a) a confusion matrix with high diagonal values indicating accurate classification between NC, Prodromal PD, and PD classes; (b) a calibration curve showing predicted probability versus observed frequency; (c) multi-class ROC curves with AUC values above 0.98 for each class and macro-average; (d) line graph comparing training and validation loss over epochs, both showing convergence.</alt-text>
</graphic>
</fig>
</sec>
<sec id="S5.SS3">
<label>5.3</label>
<title>External validation on OASIS-3 and PDBP datasets</title>
<p>To thoroughly evaluate cross-dataset generalizability, MultimodalCNN-PD++ was tested on two independent external cohorts: OASIS-3 and PDBP. Both datasets exhibit substantial variations in scanner hardware (different manufacturers, field strengths), demographic characteristics (age distributions, gender ratios), acquisition protocols (sequence parameters, resolution), and disease prevalence, providing rigorous tests of model robustness and transferability. As shown in <xref ref-type="table" rid="T4">Table 4</xref>, the model demonstrated impressive predictive performance on OASIS-3 with 96.2% accuracy and on PDBP with 95.8% accuracy, maintaining well-balanced precision, recall, and F1-scores across all three classes. The modest performance decrease compared to PPMI test set results (97.5% &#x2192; 96.2 and 95.8%) is expected given domain shift, yet the maintained high performance validates the model&#x2019;s robustness and its capacity to generalize beyond the original training distribution. These results confirm that MultimodalCNN-PD++ can adapt effectively to the variability encountered in different clinical settings, supporting its potential for real-world deployment across diverse healthcare institutions.</p>
<table-wrap position="float" id="T4">
<label>TABLE 4</label>
<caption><p>External validation of MultimodalCNN-PD++ on OASIS-3 and PDBP datasets.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Dataset</th>
<th valign="top" align="left">Test size</th>
<th valign="top" align="left">Accuracy (%)</th>
<th valign="top" align="left">Precision (%)</th>
<th valign="top" align="left">Recall (%)</th>
<th valign="top" align="left">F1-Score (%)</th>
<th valign="top" align="left">Scanner type</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">PPMI (held-out)</td>
<td valign="top" align="left">400 subjects</td>
<td valign="top" align="left">97.5</td>
<td valign="top" align="left">98.1</td>
<td valign="top" align="left">98.9</td>
<td valign="top" align="left">98.5</td>
<td valign="top" align="left">Siemens 3T</td>
</tr>
<tr>
<td valign="top" align="left">OASIS-3</td>
<td valign="top" align="left">220 subjects</td>
<td valign="top" align="left">96.2</td>
<td valign="top" align="left">96.0</td>
<td valign="top" align="left">96.4</td>
<td valign="top" align="left">96.2</td>
<td valign="top" align="left">Siemens/GE 1.5T-3T</td>
</tr>
<tr>
<td valign="top" align="left">PDBP</td>
<td valign="top" align="left">130 subjects</td>
<td valign="top" align="left">95.8</td>
<td valign="top" align="left">95.5</td>
<td valign="top" align="left">96.1</td>
<td valign="top" align="left">95.8</td>
<td valign="top" align="left">Multi-site</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn><p>Despite substantial differences in scanner hardware, demographics, acquisition protocols, and disease distribution, the model maintained strong performance, confirming robust cross-dataset generalizability.</p></fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="S5.SS4">
<label>5.4</label>
<title>Comparison with traditional machine learning models</title>
<p>To demonstrate the added value of deep learning, we compared MultimodalCNN-PD++ with traditional machine learning models, including Support Vector Machine (SVM), Random Forest (RF), and Logistic Regression (LR). While these traditional models provided reasonable performance, they were outperformed by the deep learning approach in several key areas. SVM, with a Radial Basis Function kernel, struggled to capture the complex relationships in multimodal data, resulting in lower accuracy (92%) compared to the 97.5% achieved by MultimodalCNN-PD++. Random Forest, although effective in ranking feature importance, could not match the deep learning model&#x2019;s ability to handle high-dimensional and multimodal inputs, with an accuracy of 94%. Logistic Regression, being a linear model, showed the poorest performance, achieving only 85% accuracy. MultimodalCNN-PD++ consistently outperformed these models in terms of accuracy, recall, and precision, demonstrating the superior ability of deep learning techniques to manage complex, high-dimensional datasets and providing robust, generalizable results across multiple external datasets (OASIS-3 and PDBP).</p>
<p>A comprehensive series of ablation studies were conducted to quantitatively assess the contribution of each key architectural component and design choice within the MultimodalCNN-PD++ framework. These experiments provide critical insights into the effectiveness of individual innovations and validate the necessity of each component for achieving optimal performance.</p>
<sec id="S5.SS4.SSS1">
<label>5.4.1</label>
<title>Component-wise ablation</title>
<p><xref ref-type="table" rid="T5">Table 5</xref> presents the systematic component-wise ablation analysis, progressively adding architectural innovations to quantify their individual contributions. The baseline EfficientNet-B0 without attention or multimodal fusion achieved 92.4% accuracy using imaging data alone. Incorporating Mobile CBAM attention modules enhanced accuracy to 94.1% (+1.7%), demonstrating the effectiveness of lightweight spatial-channel attention for feature refinement. Adding the MGCA++ dynamic cross-attention mechanism for multimodal fusion further improved performance to 96.3% (+2.2%), underscoring its critical role in effectively integrating imaging and clinical metadata. The hierarchical three-stage feature selection process contributed an additional 0.6% improvement (96.9%), confirming the value of intelligent dimensionality reduction. Finally, incorporating BioClinicalBERT with LoRA fine-tuning for clinical text encoding yielded the full model performance of 97.5% (+0.6%), highlighting the importance of domain-specialized language models for processing medical metadata. Each component provided measurable and additive improvements, validating the synergistic design of the complete architecture.</p>
<table-wrap position="float" id="T5">
<label>TABLE 5</label>
<caption><p>Progressive component-wise ablation studies demonstrating incremental performance improvements as each architectural innovation is added to the MultimodalCNN-PD++ framework.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Configuration</th>
<th valign="top" align="left">Mobile CBAM</th>
<th valign="top" align="left">MGCA++</th>
<th valign="top" align="left">Feature selection</th>
<th valign="top" align="left">BioClinicalBERT-LoRA</th>
<th valign="top" align="left">Accuracy (%)</th>
<th valign="top" align="left">&#x0394; from baseline</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">EfficientNet-B0 only</td>
<td valign="top" align="left">&#x2717;</td>
<td valign="top" align="left">&#x2717;</td>
<td valign="top" align="left">&#x2717;</td>
<td valign="top" align="left">&#x2717;</td>
<td valign="top" align="left">92.4</td>
<td valign="top" align="left">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">CBAM</td>
<td valign="top" align="left">&#x221A;</td>
<td valign="top" align="left">&#x2717;</td>
<td valign="top" align="left">&#x2717;</td>
<td valign="top" align="left">&#x2717;</td>
<td valign="top" align="left">94.1</td>
<td valign="top" align="left">+1.7</td>
</tr>
<tr>
<td valign="top" align="left">MGCA</td>
<td valign="top" align="left">&#x221A;</td>
<td valign="top" align="left">&#x221A;</td>
<td valign="top" align="left">&#x2717;</td>
<td valign="top" align="left">&#x2717;</td>
<td valign="top" align="left">96.3</td>
<td valign="top" align="left">+2.2</td>
</tr>
<tr>
<td valign="top" align="left">Feature selection</td>
<td valign="top" align="left">&#x221A;</td>
<td valign="top" align="left">&#x221A;</td>
<td valign="top" align="left">&#x221A;</td>
<td valign="top" align="left">&#x2717;</td>
<td valign="top" align="left">96.9</td>
<td valign="top" align="left">+0.6</td>
</tr>
<tr>
<td valign="top" align="left">Full model</td>
<td valign="top" align="left">&#x221A;</td>
<td valign="top" align="left">&#x221A;</td>
<td valign="top" align="left">&#x221A;</td>
<td valign="top" align="left">&#x221A;</td>
<td valign="top" align="left">97.5</td>
<td valign="top" align="left">+0.6</td>
</tr>
</tbody>
</table></table-wrap>
</sec>
<sec id="S5.SS4.SSS2">
<label>5.4.2</label>
<title>Component removal analysis</title>
<p>A complementary leave-one-out ablation study was performed, where individual components were removed from the full model while keeping all others intact. As detailed in <xref ref-type="table" rid="T6">Table 6</xref>, removal of any key module resulted in noticeable performance degradation ranging from 1.2 to 3.4%. The most substantial reductions occurred when either the MGCA++ module (-3.4%, accuracy dropping to 94.1%) or Mobile CBAM (-2.8%, accuracy dropping to 94.7%) was removed, confirming these as the most critical components for cross-modal fusion and discriminative feature extraction, respectively. Removing BioClinicalBERT-LoRA text encoding (-1.9%, accuracy 95.6%) demonstrated the value of specialized medical language understanding. Eliminating the hierarchical feature selection (-1.2%, accuracy 96.3%) showed modest but consistent impact, validating its role in reducing noise and redundancy. The image-only baseline without any clinical metadata integration achieved only 92.4% accuracy, representing a 5.1% performance gap, unequivocally demonstrating the necessity of multimodal integration for optimal PD classification.</p>
<table-wrap position="float" id="T6">
<label>TABLE 6</label>
<caption><p>Component removal analysis showing the impact of eliminating individual modules from the full MultimodalCNN-PD++ architecture on the PPMI independent test set.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Variant</th>
<th valign="top" align="left">Accuracy (%)</th>
<th valign="top" align="left">Precision (%)</th>
<th valign="top" align="left">Recall (%)</th>
<th valign="top" align="left">F1-Score (%)</th>
<th valign="top" align="left">&#x0394; from full</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Full model (all components)</td>
<td valign="top" align="left">97.5</td>
<td valign="top" align="left">98.1</td>
<td valign="top" align="left">98.9</td>
<td valign="top" align="left">98.5</td>
<td valign="top" align="left">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Remove Mobile CBAM</td>
<td valign="top" align="left">94.7</td>
<td valign="top" align="left">94.3</td>
<td valign="top" align="left">94.8</td>
<td valign="top" align="left">94.5</td>
<td valign="top" align="left">-2.8</td>
</tr>
<tr>
<td valign="top" align="left">Remove MGCA++</td>
<td valign="top" align="left">94.1</td>
<td valign="top" align="left">93.8</td>
<td valign="top" align="left">94.3</td>
<td valign="top" align="left">94.0</td>
<td valign="top" align="left">-3.4</td>
</tr>
<tr>
<td valign="top" align="left">Remove feature selection</td>
<td valign="top" align="left">96.3</td>
<td valign="top" align="left">96.0</td>
<td valign="top" align="left">96.5</td>
<td valign="top" align="left">96.2</td>
<td valign="top" align="left">-1.2</td>
</tr>
<tr>
<td valign="top" align="left">Remove BioClinicalBERT-LoRA</td>
<td valign="top" align="left">95.6</td>
<td valign="top" align="left">95.3</td>
<td valign="top" align="left">95.8</td>
<td valign="top" align="left">95.5</td>
<td valign="top" align="left">-1.9</td>
</tr>
<tr>
<td valign="top" align="left">Image-only (no metadata)</td>
<td valign="top" align="left">92.4</td>
<td valign="top" align="left">92.1</td>
<td valign="top" align="left">92.6</td>
<td valign="top" align="left">92.3</td>
<td valign="top" align="left">-5.1</td>
</tr>
</tbody>
</table></table-wrap>
</sec>
<sec id="S5.SS4.SSS3">
<label>5.4.3</label>
<title>MGCA++ attention head configuration</title>
<p>To empirically determine the optimal configuration for the Meta-Guided Cross-Attention++ (MGCA++) module, systematic experiments were conducted varying the number of attention heads while maintaining all other hyperparameters constant. As presented in <xref ref-type="table" rid="T7">Table 7</xref>, performance steadily improved as the number of heads increased from one to four, with accuracy rising from 95.8 to 97.5%. This trend, accompanied by consistent improvements in precision, recall, and F1-score, demonstrates the benefit of multi-head architecture in capturing diverse and complementary cross-modal interaction patterns. The computational cost (FLOPs) increased modestly from 1.82 to 2.00G. However, expanding to eight heads raised FLOPs to 2.35G without yielding further performance improvements (accuracy 97.3%, slightly decreased), suggesting that excessive heads may dilute the representational capacity of each individual head and introduce redundant computations. Consequently, the four-head configuration was identified as optimal, providing the best balance between representational expressiveness and computational efficiency.</p>
<table-wrap position="float" id="T7">
<label>TABLE 7</label>
<caption><p>Impact of attention head count in the MGCA++ module on classification performance and computational efficiency, evaluated on the PPMI held-out test set.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Number of heads</th>
<th valign="top" align="left">FLOPs (G)</th>
<th valign="top" align="left">Accuracy (%)</th>
<th valign="top" align="left">Precision (%)</th>
<th valign="top" align="left">Recall (%)</th>
<th valign="top" align="left">F1-Score (%)</th>
<th valign="top" align="left">Training time (h)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">1 (single-head)</td>
<td valign="top" align="left">1.82</td>
<td valign="top" align="left">95.8</td>
<td valign="top" align="left">95.5</td>
<td valign="top" align="left">95.9</td>
<td valign="top" align="left">95.7</td>
<td valign="top" align="left">14.2</td>
</tr>
<tr>
<td valign="top" align="left">2</td>
<td valign="top" align="left">1.90</td>
<td valign="top" align="left">96.7</td>
<td valign="top" align="left">96.4</td>
<td valign="top" align="left">96.8</td>
<td valign="top" align="left">96.6</td>
<td valign="top" align="left">15.8</td>
</tr>
<tr>
<td valign="top" align="left">4 (proposed)</td>
<td valign="top" align="left">2.00</td>
<td valign="top" align="left">97.5</td>
<td valign="top" align="left">98.1</td>
<td valign="top" align="left">98.9</td>
<td valign="top" align="left">98.5</td>
<td valign="top" align="left">18.0</td>
</tr>
<tr>
<td valign="top" align="left">8</td>
<td valign="top" align="left">2.35</td>
<td valign="top" align="left">97.3</td>
<td valign="top" align="left">97.9</td>
<td valign="top" align="left">97.5</td>
<td valign="top" align="left">97.7</td>
<td valign="top" align="left">22.4</td>
</tr>
<tr>
<td valign="top" align="left">16</td>
<td valign="top" align="left">2.88</td>
<td valign="top" align="left">97.1</td>
<td valign="top" align="left">97.6</td>
<td valign="top" align="left">97.3</td>
<td valign="top" align="left">97.4</td>
<td valign="top" align="left">28.1</td>
</tr>
</tbody>
</table></table-wrap>
</sec>
<sec id="S5.SS4.SSS4">
<label>5.4.4</label>
<title>Visualization and interpretability analysis</title>
<p>To qualitatively assess the feature refinement capability and decision-making process of MultimodalCNN-PD++, we employed Grad-CAM++ visualization to generate class-discriminative activation maps. Grad-CAM++ extends traditional CAM by incorporating higher-order derivatives and pixel-wise weighting, providing more accurate and visually coherent localization of important regions.</p>
<p>As illustrated in <xref ref-type="fig" rid="F6">Figure 6</xref>, the generated activation maps for representative subjects from each diagnostic category demonstrate that Mobile CBAM effectively guides the model&#x2019;s attention toward anatomically plausible and clinically relevant brain regions. For diagnosed PD cases, the model consistently highlights the substantia nigra and putamen regions in the basal ganglia, which are primary sites of dopaminergic neuron degeneration in PD. For prodromal PD subjects, activation patterns show subtle but detectable changes in these same regions, indicating the model&#x2019;s sensitivity to early pathological alterations. Normal control subjects exhibit minimal activation in these disease-associated areas, with attention distributed more broadly across cortical regions. This targeted and class-specific attention mechanism not only underpins the model&#x2019;s high classification accuracy but also substantially enhances the interpretability and clinical trustworthiness of its predictions, providing neurologists with visual evidence to support diagnostic decisions.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption><p>Grad-CAM++ visualization comparison.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnagi-18-1733075-g006.tif">
<alt-text content-type="machine-generated">Three columns labeled NC, Prodromal PD, and PD display brain scans for normal control, prodromal Parkinson&#x2019;s, and Parkinson&#x2019;s disease using three rows: original MRI, baseline model without Mobile CBAM, and full model with Mobile CBAM. MRI images are grayscale, while the baseline and full model images are heatmaps with intensity color scales ranging from blue to red. The heatmaps show varying intensity patterns between categories and models, indicating differences in brain activity or structure. A vertical color bar to the right denotes intensity from zero (blue) to one (red).</alt-text>
</graphic>
</fig>
<p>To enhance the interpretability of MultimodalCNN-PD++, we leveraged Grad-CAM++ visualizations, which demonstrated that the model&#x2019;s attention mechanisms consistently focused on neuroanatomically relevant regions such as the substantia nigra and putamen, key areas associated with Parkinson&#x2019;s disease pathology. These attention hotspots aligned with clinical markers like UPDRS-III rigidity and tremor subscales, validating the model&#x2019;s focus on critical areas of the brain linked to motor symptoms. The correlation between the model&#x2019;s attention and clinical markers provides valuable insights into the decision-making process, offering clinicians a more transparent, interpretable tool that supports diagnostic and prognostic decisions by identifying the brain regions most implicated in disease progression.</p>
</sec>
</sec>
</sec>
<sec id="S6" sec-type="discussion">
<label>6</label>
<title>Discussion</title>
<p>MultimodalCNN-PD++ demonstrates impressive performance in three-class Parkinson&#x2019;s disease classification, achieving 97.5% accuracy on the PPMI test set and showing strong generalizability across external datasets such as OASIS-3 and PDBP. These results highlight the model&#x2019;s potential for use in clinical practice, offering enhanced interpretability, computational efficiency, and cross-dataset robustness. The incorporation of advanced techniques such as Mobile CBAM, MGCA++, and BioClinicalBERT-LoRA has enabled the framework to achieve both high accuracy and a significant reduction in computational requirements, making it feasible for deployment in real-world clinical settings, particularly those with limited resources (<xref ref-type="bibr" rid="B33">Zhou et al., 2026</xref>).</p>
<p>The Grad-CAM++ visualization analysis provides compelling evidence that the model&#x2019;s attention mechanisms focus on clinically relevant brain regions such as the substantia nigra and putamen, which are crucial for understanding dopaminergic degeneration in Parkinson&#x2019;s disease. This interpretable output ensures that clinicians can trust the model&#x2019;s predictions and use them as part of their diagnostic workflow, rather than as a black-box solution (<xref ref-type="bibr" rid="B26">Wong et al., 2023c</xref>).</p>
<p>However, while the model&#x2019;s performance is promising, we acknowledge several limitations that must be addressed to fully translate this research into clinical practice (<xref ref-type="bibr" rid="B3">Ayoub et al., 2023</xref>). One key limitation is the dependency on high-quality, comprehensive datasets, particularly MRI data and clinical metadata, which may not always be available in every clinical setting. Future work should explore robust methods for handling missing modalities, such as using generative models or specialized loss functions that can gracefully adapt when certain data streams are incomplete.</p>
<p>A more diverse dataset is also necessary to ensure the model&#x2019;s performance across various demographic groups (<xref ref-type="bibr" rid="B20">Sar et al., 2025</xref>; <xref ref-type="bibr" rid="B10">Delfan et al., 2024</xref>). Current datasets, including PPMI and OASIS-3, are primarily from North American and European cohorts. As such, the model may not capture variations in disease presentation across different ethnicities, and a more globally diverse population is needed to fully assess the model&#x2019;s generalizability across racial and ethnic backgrounds. Future work should focus on multi-center prospective studies that include diverse patient populations, environmental exposures, and healthcare systems to ensure that the model performs equitably in different clinical settings and regions.</p>
<p>In addition, the prospective validation of the model is crucial. While the model has shown strong results in retrospective evaluations, real-world data and ongoing, prospective clinical studies are essential to assess its clinical applicability in a live setting. We plan to incorporate real-time data streams, such as those from wearable devices and mobile health technologies, to enhance the model&#x2019;s ability to diagnose and monitor disease progression in real-world conditions. These prospective studies will also help refine the model&#x2019;s ability to detect early-stage Parkinson&#x2019;s disease, especially in prodromal stages, and track long-term progression in a clinical environment.</p>
<p>Prospective studies will allow us to evaluate the model&#x2019;s effectiveness in a dynamic clinical environment, where patient data continuously evolve over time. Collecting longitudinal data from diverse cohorts, alongside clinician feedback, will provide crucial insights into how the model can adapt and refine its predictions in response to real-world patient data. Furthermore, including data from various sensors and modalities such as gait monitoring, speech patterns, and cognitive assessments will further enhance the diagnostic capabilities of the model and enable a more comprehensive, personalized approach to Parkinson&#x2019;s disease diagnosis and management.</p>
<p>While MultimodalCNN-PD++ demonstrates strong performance, its scalability for large-scale clinical use requires careful consideration of several potential bottlenecks. One of the key challenges is the data preprocessing pipeline, which may become a bottleneck when handling large datasets or continuous data streams. Efficiently processing large-scale neuroimaging data and clinical metadata in real-time, especially in multi-center deployments, requires optimization through techniques such as parallel processing or distributed computing solutions.</p>
<p>The model&#x2019;s computational efficiency, although improved with lightweight components like EfficientNet-B0, may still face limitations in resource-constrained clinical settings, particularly when dealing with high-resolution imaging data or multi-modal datasets. For large-scale deployment, GPU memory and processing power may become limiting factors, especially in institutions with less powerful hardware. The model&#x2019;s memory usage during inference can be a significant consideration when processing large batches of patient data, potentially affecting deployment in busy clinical environments.</p>
<p>Multimodal data fusion poses another scalability challenge. Integrating diverse data sources, such as MRI scans, clinical metadata, and real-time wearable data from different devices, requires robust standardization and data fusion techniques to ensure consistent performance across varied data distributions and hardware configurations (<xref ref-type="bibr" rid="B27">Wong et al., 2023b</xref>). Ensuring that the model handles these data sources efficiently and accurately without introducing biases will be crucial for its real-world applicability.</p>
<p>Future work should explore cloud-based deployment and federated learning approaches to overcome the constraints of local processing power and memory, allowing real-time data streaming and continuous learning across multiple healthcare institutions. These approaches would also enable the model to handle missing modalities by leveraging cloud-based imputation techniques and real-time data augmentation. Additionally, edge computing solutions could be explored to process patient data directly at the point of care, ensuring fast and scalable decision-making without the need to transfer large datasets to central servers (<xref ref-type="bibr" rid="B25">Wong et al., 2023a</xref>). Such solutions will help the framework remain robust even when some data streams (e.g., MRI or clinical metadata) are incomplete, by utilizing generative models or robust loss functions for missing data handling.</p>
<p>By optimizing the data preprocessing pipeline, reducing computational bottlenecks, and leveraging distributed computing and cloud-based solutions, the MultimodalCNN-PD++ model can be scaled to effectively handle large datasets, including incomplete data, and be deployed across diverse clinical environments. This will ensure timely and accurate Parkinson&#x2019;s disease diagnosis and monitoring on a global scale, even in clinical settings with limited resources or missing data modalities.</p>
<p>While MultimodalCNN-PD++ demonstrates strong performance, its ability to generalize across various demographic subgroups requires further attention. The model has shown high accuracy on datasets like PPMI and OASIS-3, but these datasets are primarily composed of North American and European cohorts, which may not fully represent the diversity of global patient populations. As a result, there may be variations in the model&#x2019;s performance across different ethnicities, genders, and socioeconomic backgrounds.</p>
<p>To ensure that the model performs equitably across diverse populations, future work will focus on subgroup analysis to assess performance across key demographic groups. It is crucial to evaluate how well the model performs in populations that may differ in genetic factors, environmental exposures, and healthcare access. For example, the prevalence of Parkinson&#x2019;s disease may vary across different ethnic groups, and symptom presentation may differ between men and women, which could affect model predictions. A comprehensive subgroup analysis will help identify any potential biases in the model&#x2019;s predictions and ensure that it can be applied fairly in clinical settings worldwide.</p>
<p>Furthermore, addressing these potential biases will help reinforce the model&#x2019;s ethical rigor and clinical fairness. It is important that future validation efforts include multi-center studies with diverse patient populations to assess how the model adapts to different demographic groups. By identifying and mitigating any data biases, we can ensure that the model provides reliable and equitable diagnoses for all patients, regardless of gender, ethnicity, or socioeconomic status.</p>
</sec>
<sec id="S7" sec-type="conclusion">
<label>7</label>
<title>Conclusion</title>
<p>This work introduces MultimodalCNN-PD++, an enhanced deep learning framework that achieves new performance standards for automated Parkinson&#x2019;s disease diagnosis through effective integration of structural MRI neuroimaging with comprehensive clinical metadata. By synthesizing multiple architectural innovations including EfficientNet-B0 lightweight backbone, Mobile Convolutional Block Attention Modules, Meta-Guided Cross-Attention++ with dynamic head selection, BioClinicalBERT with LoRA fine-tuning, hierarchical three-stage feature selection, and sophisticated multi-component loss functions, the framework achieves 97.5% accuracy in three-class PD classification while dramatically reducing computational requirements. The 54.7% parameter reduction and 47.5% FLOPs reduction compared to baseline approaches enable practical clinical deployment on standard medical workstations and potentially mobile diagnostic platforms, addressing a critical barrier to real-world AI adoption in healthcare settings.</p>
<p>The model demonstrates exceptional clinical utility through its 99.3% recall for prodromal PD detection, enabling identification of at-risk individuals during the critical early window when disease-modifying interventions may be most effective. This high sensitivity, combined with 98.7% precision for diagnosed PD and balanced performance across all diagnostic categories, positions the framework as a reliable screening and diagnostic support tool that can augment neurologist expertise. The Grad-CAM++ visualization analysis confirms that predictions are grounded in anatomically plausible attention patterns focused on disease-relevant brain regions (substantia nigra, putamen, basal ganglia), substantially enhancing model interpretability and clinical trustworthiness. This transparency is essential for fostering clinician confidence and facilitating integration into existing diagnostic workflows.</p>
<p>Rigorous external validation on the OASIS-3 and PDBP datasets confirmed robust cross-dataset generalizability, with the model maintaining 96.2 and 95.8% accuracy despite substantial variations in scanner hardware, acquisition protocols, demographic characteristics, and disease prevalence. This transferability across diverse clinical settings provides strong evidence that the learned representations capture fundamental disease-related patterns rather than dataset-specific artifacts, supporting the framework&#x2019;s potential for widespread deployment across heterogeneous healthcare institutions. The comprehensive ablation studies quantitatively validated the necessity and synergistic contributions of each architectural component, demonstrating that the integration of multimodal data through sophisticated attention mechanisms yields substantial performance improvements over unimodal approaches.</p>
<p>The hierarchical feature selection strategy successfully identified a compact set of 23 clinically meaningful biomarkers from an initial pool of 127 features, balancing predictive performance with model interpretability and reducing the data collection burden for clinical applications. The multi-component loss function incorporating focal loss for class imbalance, triplet loss for discriminative embedding learning, and consistency regularization for multimodal alignment collectively contributed to the model&#x2019;s superior performance and robustness. The BioClinicalBERT-LoRA text encoding approach demonstrated that domain-specialized language models with parameter-efficient fine-tuning can effectively process medical metadata (such as UPDRS motor scores, MoCA cognitive assessments, and patient demographics) alongside unstructured clinical text (such as free-text clinical notes, physician observations, and patient histories). This integration allows the model to combine both structured and unstructured clinical information in a manner that improves diagnostic performance.</p>
<p>Unlike general-purpose language models, BioClinicalBERT-LoRA is fine-tuned specifically on medical datasets, enabling it to better understand and process clinical terminology, jargon, and context. Parameter-efficient fine-tuning using Low-Rank Adaptation (LoRA) ensures that the model can be adapted to medical data without requiring extensive retraining or large computational resources. This approach significantly reduces the computational overhead typically associated with training large language models, making it feasible for use in clinical environments where computational resources may be limited.</p>
<p>Computational efficiency achievements represent a significant advancement toward democratizing access to AI-powered diagnostic tools. By achieving state-of-the-art performance with dramatically reduced resource requirements, MultimodalCNN-PD++ enables deployment in resource-constrained clinical environments, including community hospitals, rural health centers, and developing regions where access to specialized neurological expertise may be limited. This accessibility has profound implications for global health equity, potentially enabling earlier PD detection and improved patient care management across diverse socioeconomic contexts. The framework&#x2019;s efficiency also facilitates integration into time-sensitive clinical workflows where rapid diagnostic support is required.</p>
<p>Looking forward, the MultimodalCNN-PD++ framework establishes a foundation for next-generation intelligent diagnostic systems that can incorporate diverse data modalities, provide transparent and interpretable predictions, operate efficiently on standard hardware, and generalize robustly across varied clinical settings. Future extensions incorporating additional biomarkers (DaT-SPECT, cerebrospinal fluid markers, genetic risk scores, wearable sensors, speech features), longitudinal progression modeling, uncertainty quantification, and federated learning capabilities promise to further enhance clinical utility and real-world impact. Prospective clinical trials validating the framework&#x2019;s effectiveness as a screening tool, progression biomarker, and patient stratification mechanism will be essential for regulatory approval and widespread clinical adoption.</p>
<p>MultimodalCNN-PD++ represents a significant step forward in AI-powered Parkinson&#x2019;s disease diagnosis, combining state-of-the-art classification performance (97.5% accuracy, 99.3% prodromal recall) with practical deployability through dramatic computational efficiency improvements (54.7% parameter reduction, 47.5% FLOPs reduction), enhanced interpretability via Grad-CAM++ visualization, and robust cross-dataset generalizability (96.2% on OASIS-3, 95.8% on PDBP). By effectively addressing key challenges in medical AI development including data scarcity, computational constraints, interpretability requirements, and generalization limitations, this framework establishes new benchmarks for multimodal neurodegenerative disease diagnosis and provides a blueprint for developing clinically deployable AI systems that can meaningfully improve patient care, enable earlier intervention, support personalized treatment planning, and ultimately enhance quality of life for individuals affected by Parkinson&#x2019;s disease and related neurological disorders.</p>
</sec>
</body>
<back>
<sec id="S8" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in this study are included in this article/supplementary material, further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec id="S9" sec-type="ethics-statement">
<title>Ethics statement</title>
<p>The studies involving humans were approved by Ethics Committee of the Department of Neurosurgery, Yancheng First Hospital Affiliated to Medical School of Nanjing University (Approval No. 2023-J-169). The studies were conducted in accordance with the local legislation and institutional requirements. The participants provided their written informed consent to participate in this study. Written informed consent was obtained from the individual(s) for the publication of any potentially identifiable images or data included in this article.</p>
</sec>
<sec id="S10" sec-type="author-contributions">
<title>Author contributions</title>
<p>TZ: Formal analysis, Methodology, Software, Writing &#x2013; original draft. HL: Formal analysis, Methodology, Visualization, Writing &#x2013; review &#x0026; editing. XW: Conceptualization, Investigation, Methodology, Supervision, Writing &#x2013; review &#x0026; editing. CM: Conceptualization, Methodology, Software, Writing &#x2013; review &#x0026; editing. UI: Conceptualization, Formal analysis, Methodology, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec id="S12" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="S13" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec id="S14" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Acharya</surname> <given-names>T.</given-names></name></person-group> (<year>2024</year>). <source><italic>Cybernetical Intelligence: Engineering Cybernetics with Machine Intelligence.</italic></source> <publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE Press</publisher-name>, <pub-id pub-id-type="doi">10.1002/9781394217519</pub-id></mixed-citation></ref>
<ref id="B2"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Al-Azzawi</surname> <given-names>M. A.</given-names></name> <name><surname>Al-Ani</surname> <given-names>A.</given-names></name></person-group> (<year>2024</year>). <article-title>Multimodal abstractive summarization using bidirectional encoder representations from transformers with attention mechanism.</article-title> <source><italic>Heliyon</italic></source> <volume>10</volume>:<fpage>e26162</fpage>. <pub-id pub-id-type="doi">10.1016/j.heliyon.2024.e26162</pub-id> <pub-id pub-id-type="pmid">38420442</pub-id></mixed-citation></ref>
<ref id="B3"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ayoub</surname> <given-names>M.</given-names></name> <name><surname>Liao</surname> <given-names>Z.</given-names></name> <name><surname>Li</surname> <given-names>L.</given-names></name> <name><surname>Wong</surname> <given-names>K.</given-names></name></person-group> (<year>2023</year>). <article-title>HViT: Hybrid vision inspired transformer for the assessment of carotid artery plaque by addressing the cross-modality domain adaptation problem in MRI.</article-title> <source><italic>Comput. Med. Imaging Graph.</italic></source> <volume>109</volume>:<fpage>102295</fpage>. <pub-id pub-id-type="doi">10.1016/j.compmedimag.2023.102295</pub-id> <pub-id pub-id-type="pmid">37717365</pub-id></mixed-citation></ref>
<ref id="B4"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ben&#x00ED;tez-Andrades</surname> <given-names>J.</given-names></name> <name><surname>Alija-P&#x00E9;rez</surname> <given-names>J.</given-names></name> <name><surname>Vidal</surname> <given-names>M.</given-names></name> <name><surname>Pastor-Vargas</surname> <given-names>R.</given-names></name> <name><surname>Garc&#x00ED;a-Ord&#x00E1;s</surname> <given-names>M.</given-names></name></person-group> (<year>2022</year>). <article-title>Traditional machine learning models and Bidirectional encoder representations from transformer (BERT)-based automatic classification of tweets about eating disorders: Algorithm development and validation study.</article-title> <source><italic>JMIR Med. Inform.</italic></source> <volume>10</volume>:<fpage>e34492</fpage>. <pub-id pub-id-type="doi">10.2196/34492</pub-id> <pub-id pub-id-type="pmid">35200156</pub-id></mixed-citation></ref>
<ref id="B5"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Benredjem</surname> <given-names>S.</given-names></name> <name><surname>Mekhaznia</surname> <given-names>T.</given-names></name> <name><surname>Rawad</surname> <given-names>A.</given-names></name> <name><surname>Turaev</surname> <given-names>S.</given-names></name> <name><surname>Bennour</surname> <given-names>A.</given-names></name> <name><surname>Sofiane</surname> <given-names>B.</given-names></name><etal/></person-group> (<year>2025</year>). <article-title>Parkinson&#x2019;s disease prediction: An attention-based multimodal fusion framework using handwriting and clinical data.</article-title> <source><italic>Diagnostics</italic></source> <volume>15</volume>:<fpage>4</fpage>. <pub-id pub-id-type="doi">10.3390/diagnostics15010004</pub-id> <pub-id pub-id-type="pmid">39795532</pub-id></mixed-citation></ref>
<ref id="B6"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bhagwat</surname> <given-names>N.</given-names></name> <name><surname>Viviano</surname> <given-names>J.</given-names></name> <name><surname>Voineskos</surname> <given-names>A.</given-names></name> <name><surname>Chakravarty</surname> <given-names>M.</given-names></name></person-group> (<year>2018</year>). <article-title>Modeling and prediction of clinical symptom trajectories in Alzheimer&#x2019;s disease using longitudinal data.</article-title> <source><italic>PLoS Comput. Biol.</italic></source> <volume>14</volume>:<fpage>e1006376</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pcbi.1006376</pub-id> <pub-id pub-id-type="pmid">30216352</pub-id></mixed-citation></ref>
<ref id="B7"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cao</surname> <given-names>Y.</given-names></name> <name><surname>Zhao</surname> <given-names>Z.</given-names></name> <name><surname>Huang</surname> <given-names>Y.</given-names></name> <name><surname>Lin</surname> <given-names>X.</given-names></name> <name><surname>Luo</surname> <given-names>S.</given-names></name> <name><surname>Xiang</surname> <given-names>B.</given-names></name><etal/></person-group> (<year>2023</year>). <article-title>Case Instance segmentation of small farmland based on mask R-CNN of feature pyramid network with double attention mechanism in high resolution satellite images.</article-title> <source><italic>Comput. Electron. Agric.</italic></source> <volume>212</volume>:<fpage>108073</fpage>. <pub-id pub-id-type="doi">10.1016/j.compag.2023.108073</pub-id></mixed-citation></ref>
<ref id="B8"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>B.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <name><surname>Liu</surname> <given-names>N.</given-names></name></person-group> (<year>2020</year>). <article-title>Spatiotemporal convolutional neural network with convolutional block attention module for micro-expression recognition.</article-title> <source><italic>Information</italic></source> <volume>11</volume>:<fpage>380</fpage>. <pub-id pub-id-type="doi">10.3390/info11080380</pub-id></mixed-citation></ref>
<ref id="B9"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dahbour</surname> <given-names>S.</given-names></name> <name><surname>Hashim</surname> <given-names>M.</given-names></name> <name><surname>Alhyasat</surname> <given-names>A.</given-names></name> <name><surname>Salameh</surname> <given-names>A.</given-names></name> <name><surname>Qtaishat</surname> <given-names>A.</given-names></name> <name><surname>Braik</surname> <given-names>R.</given-names></name><etal/></person-group> (<year>2021</year>). <article-title>Mini-mental state examination (MMSE) scores in elderly Jordanian population.</article-title> <source><italic>Cereb. Circ. Cogn. Behav.</italic></source> <volume>2</volume>:<fpage>100016</fpage>. <pub-id pub-id-type="doi">10.1016/j.cccb.2021.100016</pub-id> <pub-id pub-id-type="pmid">36324704</pub-id></mixed-citation></ref>
<ref id="B10"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Delfan</surname> <given-names>N.</given-names></name> <name><surname>Shahsavari</surname> <given-names>M.</given-names></name> <name><surname>Hussain</surname> <given-names>S.</given-names></name> <name><surname>Dama&#x0161;evi&#x010D;ius</surname> <given-names>R.</given-names></name> <name><surname>Acharya</surname> <given-names>U. R.</given-names></name></person-group> (<year>2024</year>). <article-title>A hybrid deep spatiotemporal attention-based model for Parkinson&#x2019;s disease diagnosis using resting state EEG signals.</article-title> <source><italic>Int. J. Imaging Syst. Technol.</italic></source> <volume>34</volume>:<fpage>e23120</fpage>. <pub-id pub-id-type="doi">10.1002/ima.23120</pub-id></mixed-citation></ref>
<ref id="B11"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dentamaro</surname> <given-names>V.</given-names></name> <name><surname>Impedovo</surname> <given-names>D.</given-names></name> <name><surname>Musti</surname> <given-names>L.</given-names></name> <name><surname>Pirlo</surname> <given-names>G.</given-names></name> <name><surname>Taurisano</surname> <given-names>P.</given-names></name></person-group> (<year>2024</year>). <article-title>Enhancing early Parkinson&#x2019;s disease detection through multimodal deep learning and explainable AI: Insights from the PPMI database.</article-title> <source><italic>Sci. Rep.</italic></source> <volume>14</volume>:<fpage>20941</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-024-70165-4</pub-id> <pub-id pub-id-type="pmid">39251639</pub-id></mixed-citation></ref>
<ref id="B12"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Heuveline</surname> <given-names>P.</given-names></name></person-group> (<year>2022</year>). <article-title>Global and national declines in life expectancy: An end-of-2021 assessment.</article-title> <source><italic>Popul. Dev. Rev.</italic></source> <volume>48</volume> <fpage>31</fpage>&#x2013;<lpage>50</lpage>. <pub-id pub-id-type="doi">10.1111/padr.12477</pub-id> <pub-id pub-id-type="pmid">37325186</pub-id></mixed-citation></ref>
<ref id="B13"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hwang</surname> <given-names>I.</given-names></name> <name><surname>Kang</surname> <given-names>H.</given-names></name></person-group> (<year>2023</year>). <article-title>Anomaly detection based on a 3D convolutional neural network combining convolutional block attention module using merged frames.</article-title> <source><italic>Sensors</italic></source> <volume>23</volume>:<fpage>9616</fpage>. <pub-id pub-id-type="doi">10.3390/s23239616</pub-id> <pub-id pub-id-type="pmid">38067989</pub-id></mixed-citation></ref>
<ref id="B14"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Kheradvar</surname> <given-names>A.</given-names></name> <name><surname>Mejia</surname> <given-names>M.</given-names></name></person-group> (<year>2024</year>). <article-title>Artificial intelligence applications in ophthalmology: Diagnostics, therapeutics, and personalized medicine.</article-title> <source><italic>Curr. Opin. Ophthalmol.</italic></source> <volume>35</volume> <fpage>245</fpage>&#x2013;<lpage>252</lpage>. <pub-id pub-id-type="doi">10.1097/ICU.0000000000001025</pub-id> <pub-id pub-id-type="pmid">38117663</pub-id></mixed-citation></ref>
<ref id="B15"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lu</surname> <given-names>M.</given-names></name> <name><surname>Poston</surname> <given-names>K.</given-names></name> <name><surname>Pfefferbaum</surname> <given-names>A.</given-names></name> <name><surname>Sullivan</surname> <given-names>E.</given-names></name> <name><surname>Fei-Fei</surname> <given-names>L.</given-names></name> <name><surname>Pohl</surname> <given-names>K.</given-names></name><etal/></person-group> (<year>2020</year>). <article-title>Vision-based estimation of MDS-UPDRS gait scores for assessing Parkinson&#x2019;s disease motor severity.</article-title> <source><italic>Med Image Comput Comput. Assist. Interv.</italic></source> <volume>12263</volume> <fpage>637</fpage>&#x2013;<lpage>647</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-59716-0_61</pub-id> <pub-id pub-id-type="pmid">33103164</pub-id></mixed-citation></ref>
<ref id="B16"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Marek</surname> <given-names>K.</given-names></name> <name><surname>Chowdhury</surname> <given-names>S.</given-names></name> <name><surname>Siderowf</surname> <given-names>A.</given-names></name> <name><surname>Lasch</surname> <given-names>S.</given-names></name> <name><surname>Coffey</surname> <given-names>C.</given-names></name> <name><surname>Caspell-Garcia</surname> <given-names>C.</given-names></name><etal/></person-group> (<year>2018</year>). <article-title>The Parkinson&#x2019;s progression markers initiative (PPMI) - establishing a PD biomarker cohort.</article-title> <source><italic>Ann. Clin. Transl. Neurol.</italic></source> <volume>5</volume> <fpage>1460</fpage>&#x2013;<lpage>1477</lpage>. <pub-id pub-id-type="doi">10.1002/acn3.644</pub-id> <pub-id pub-id-type="pmid">30564614</pub-id></mixed-citation></ref>
<ref id="B17"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Pereira</surname> <given-names>C. R.</given-names></name> <name><surname>Weber</surname> <given-names>S. A. T.</given-names></name> <name><surname>Hook</surname> <given-names>C.</given-names></name> <name><surname>Rosa</surname> <given-names>G. H.</given-names></name> <name><surname>Papa</surname> <given-names>J. P.</given-names></name></person-group> (<year>2016</year>). &#x201C;<article-title>Deep learning-aided Parkinson&#x2019;s disease diagnosis from handwritten dynamics</article-title>,&#x201D; in <source><italic>Proceedings of the 2016 29th SIBGRAPI Conference on Graphics, Patterns and Images (SIBGRAPI)</italic></source>, (<publisher-loc>Sao Paulo</publisher-loc>), <fpage>191</fpage>&#x2013;<lpage>197</lpage>. <pub-id pub-id-type="doi">10.1109/SIBGRAPI.2016.054</pub-id></mixed-citation></ref>
<ref id="B18"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Qin</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Zhuo</surname> <given-names>Z.</given-names></name> <name><surname>Liu</surname> <given-names>Z.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Ye</surname> <given-names>C.</given-names></name></person-group> (<year>2021</year>). <article-title>Multimodal super-resolved q-space deep learning.</article-title> <source><italic>Med. Image Anal.</italic></source> <volume>71</volume>:<fpage>102085</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2021.102085</pub-id> <pub-id pub-id-type="pmid">33971575</pub-id></mixed-citation></ref>
<ref id="B19"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Safai</surname> <given-names>A.</given-names></name> <name><surname>Vakharia</surname> <given-names>N.</given-names></name> <name><surname>Prasad</surname> <given-names>S.</given-names></name> <name><surname>Saini</surname> <given-names>J.</given-names></name> <name><surname>Shah</surname> <given-names>A.</given-names></name> <name><surname>Lenka</surname> <given-names>A.</given-names></name><etal/></person-group> (<year>2022</year>). <article-title>Multimodal brain connectomics-based prediction of Parkinson&#x2019;s disease using graph attention networks.</article-title> <source><italic>Front. Neurosci.</italic></source> <volume>15</volume>:<fpage>741489</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2021.741489</pub-id> <pub-id pub-id-type="pmid">35280342</pub-id></mixed-citation></ref>
<ref id="B20"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sar</surname> <given-names>A.</given-names></name> <name><surname>Puri</surname> <given-names>P.</given-names></name> <name><surname>Naz</surname> <given-names>H.</given-names></name> <name><surname>Aich</surname> <given-names>S.</given-names></name> <name><surname>Choudhury</surname> <given-names>T.</given-names></name> <name><surname>Gabralla</surname> <given-names>L.</given-names></name></person-group> (<year>2025</year>). <article-title>Multi-modal deep learning framework for early detection of Parkinson&#x2019;s disease using neurological and physiological data for high-fidelity diagnosis.</article-title> <source><italic>Sci. Rep.</italic></source> <volume>15</volume>:<fpage>34835</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-025-21407-6</pub-id> <pub-id pub-id-type="pmid">41057513</pub-id></mixed-citation></ref>
<ref id="B21"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shen</surname> <given-names>D.</given-names></name> <name><surname>Du</surname> <given-names>S.</given-names></name> <name><surname>Wang</surname> <given-names>S.</given-names></name> <name><surname>Yan</surname> <given-names>L.</given-names></name> <name><surname>Li</surname> <given-names>S.</given-names></name> <name><surname>Chen</surname> <given-names>X.</given-names></name></person-group> (<year>2026</year>). <article-title>An improved variational autoencoder and graph attention network method for wear prediction of aerospace self-lubricating bearing using acoustic emission signal.</article-title> <source><italic>IEEE Sensors J.</italic></source> in press. <pub-id pub-id-type="doi">10.1109/JSEN.2025.3650493</pub-id></mixed-citation></ref>
<ref id="B22"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tang</surname> <given-names>T.</given-names></name> <name><surname>Cui</surname> <given-names>Y.</given-names></name> <name><surname>Feng</surname> <given-names>R.</given-names></name> <name><surname>Xiang</surname> <given-names>D.</given-names></name></person-group> (<year>2024</year>). <article-title>Vehicle target recognition in SAR images with complex scenes based on mixed attention mechanism.</article-title> <source><italic>Information</italic></source> <volume>15</volume>:<fpage>159</fpage>. <pub-id pub-id-type="doi">10.3390/info15030159</pub-id></mixed-citation></ref>
<ref id="B23"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Teng</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Wei</surname> <given-names>B.</given-names></name></person-group> (<year>2025</year>). <article-title>ModFus-PD: Synergizing cross-modal attention and contrastive learning for enhanced multimodal diagnosis of Parkinson&#x2019;s disease.</article-title> <source><italic>Front. Comput. Neurosci.</italic></source> <volume>19</volume>:<fpage>1604399</fpage>. <pub-id pub-id-type="doi">10.3389/fncom.2025.1604399</pub-id> <pub-id pub-id-type="pmid">40741075</pub-id></mixed-citation></ref>
<ref id="B24"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>W.</given-names></name> <name><surname>Bao</surname> <given-names>H.</given-names></name> <name><surname>Dong</surname> <given-names>L.</given-names></name> <name><surname>Bjorck</surname> <given-names>J.</given-names></name> <name><surname>Peng</surname> <given-names>Z.</given-names></name> <name><surname>Liu</surname> <given-names>Q.</given-names></name><etal/></person-group> (<year>2023</year>). &#x201C;<article-title>Image as a foreign language: BEIT pretraining for vision and vision-language tasks</article-title>,&#x201D; in <source><italic>Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition</italic></source>, (<publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name>).</mixed-citation></ref>
<ref id="B25"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wong</surname> <given-names>K.</given-names></name> <name><surname>Ayoub</surname> <given-names>M.</given-names></name> <name><surname>Cao</surname> <given-names>Z.</given-names></name> <name><surname>Chen</surname> <given-names>C.</given-names></name> <name><surname>Chen</surname> <given-names>W.</given-names></name> <name><surname>Ghista</surname> <given-names>D.</given-names></name><etal/></person-group> (<year>2023a</year>). <article-title>The synergy of cybernetical intelligence with medical image analysis for deep medicine: A methodological perspective.</article-title> <source><italic>Comput. Methods Programs Biomed.</italic></source> <volume>240</volume>:<fpage>107677</fpage>. <pub-id pub-id-type="doi">10.1016/j.cmpb.2023.107677</pub-id> <pub-id pub-id-type="pmid">37390794</pub-id></mixed-citation></ref>
<ref id="B26"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wong</surname> <given-names>K.</given-names></name> <name><surname>Xu</surname> <given-names>J.</given-names></name> <name><surname>Chen</surname> <given-names>C.</given-names></name> <name><surname>Ghista</surname> <given-names>D.</given-names></name> <name><surname>Zhao</surname> <given-names>H.</given-names></name></person-group> (<year>2023c</year>). <article-title>Functional magnetic resonance imaging providing the brain effect mechanism of acupuncture and moxibustion treatment for depression.</article-title> <source><italic>Front. Neurol.</italic></source> <volume>14</volume>:<fpage>1151421</fpage>. <pub-id pub-id-type="doi">10.3389/fneur.2023.1151421</pub-id> <pub-id pub-id-type="pmid">37025199</pub-id></mixed-citation></ref>
<ref id="B27"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wong</surname> <given-names>K.</given-names></name> <name><surname>Xu</surname> <given-names>W.</given-names></name> <name><surname>Ayoub</surname> <given-names>M.</given-names></name> <name><surname>Fu</surname> <given-names>Y.</given-names></name> <name><surname>Xu</surname> <given-names>H.</given-names></name> <name><surname>Shi</surname> <given-names>R.</given-names></name><etal/></person-group> (<year>2023b</year>). <article-title>Brain image segmentation of the corpus callosum by combining Bi-directional convolutional LSTM and U-Net using multi-slice CT and MRI.</article-title> <source><italic>Comput. Methods Programs Biomed.</italic></source> <volume>238</volume>:<fpage>107602</fpage>. <pub-id pub-id-type="doi">10.1016/j.cmpb.2023.107602</pub-id> <pub-id pub-id-type="pmid">37244234</pub-id></mixed-citation></ref>
<ref id="B28"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xin</surname> <given-names>H.</given-names></name> <name><surname>Li</surname> <given-names>L.</given-names></name></person-group> (<year>2023</year>). <article-title>Arbitrary style transfer with fused convolutional block attention modules.</article-title> <source><italic>IEEE Access.</italic></source> <volume>11</volume> <fpage>4497</fpage>&#x2013;<lpage>4507</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2023.3273949</pub-id></mixed-citation></ref>
<ref id="B29"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xue</surname> <given-names>W.</given-names></name> <name><surname>Bowman</surname> <given-names>F.</given-names></name> <name><surname>Kang</surname> <given-names>J.</given-names></name></person-group> (<year>2018</year>). <article-title>A bayesian spatial model to predict disease status using imaging data from various modalities.</article-title> <source><italic>Front. Neurosci.</italic></source> <volume>12</volume>:<fpage>184</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2018.00184</pub-id> <pub-id pub-id-type="pmid">29632471</pub-id></mixed-citation></ref>
<ref id="B30"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>Y.</given-names></name> <name><surname>Hu</surname> <given-names>L.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Gu</surname> <given-names>W.</given-names></name> <name><surname>Lin</surname> <given-names>G.</given-names></name> <name><surname>Xie</surname> <given-names>Y.</given-names></name><etal/></person-group> (<year>2025</year>). <article-title>Identification of Parkinson&#x2019;s disease using MRI and genetic data from the PPMI cohort: An improved machine learning fusion approach.</article-title> <source><italic>Front. Aging Neurosci.</italic></source> <volume>17</volume>:<fpage>1510192</fpage>. <pub-id pub-id-type="doi">10.3389/fnagi.2025.1510192</pub-id> <pub-id pub-id-type="pmid">39968123</pub-id></mixed-citation></ref>
<ref id="B31"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zeng</surname> <given-names>T.</given-names></name> <name><surname>Chipusu</surname> <given-names>K.</given-names></name> <name><surname>Zhu</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>M.</given-names></name> <name><surname>Muhammad Ibrahim</surname> <given-names>U.</given-names></name> <name><surname>Huang</surname> <given-names>J.</given-names></name></person-group> (<year>2024</year>). <article-title>Differential evolutionary optimization fuzzy entropy for gland segmentation based on breast mammography imaging.</article-title> <source><italic>J. Radiat. Res. Appl. Sci.</italic></source> <volume>17</volume>:<fpage>100966</fpage>. <pub-id pub-id-type="doi">10.1016/j.jrras.2024.100966</pub-id></mixed-citation></ref>
<ref id="B32"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>P.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name></person-group> (<year>2022</year>). <article-title>Vision-language pre-training for multimodal image understanding.</article-title> <source><italic>arXiv [Preprint]</italic></source> <pub-id pub-id-type="doi">10.48550/arXiv.2306.06494</pub-id></mixed-citation></ref>
<ref id="B33"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>Y.</given-names></name> <name><surname>Lv</surname> <given-names>J.</given-names></name> <name><surname>Du</surname> <given-names>S.</given-names></name> <name><surname>Shen</surname> <given-names>X.</given-names></name> <name><surname>Liu</surname> <given-names>M.</given-names></name></person-group> (<year>2026</year>). &#x201C;<article-title>Multi-resource constrained flexible job shop scheduling with fixture-pallets and setup stations under pallet automation systems</article-title>,&#x201D; in <source><italic>IEEE Transactions on Systems, Man and Cybernetics: Systems</italic></source>, (<publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name>), in press. <pub-id pub-id-type="doi">10.1109/TSMC.2026.3655483</pub-id></mixed-citation></ref>
<ref id="B34"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>F.</given-names></name> <name><surname>Ning</surname> <given-names>P.</given-names></name> <name><surname>Zhu</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>L.</given-names></name> <name><surname>Li</surname> <given-names>K.</given-names></name><etal/></person-group> (<year>2024</year>). <article-title>Multimodal neuroimaging-based prediction of Parkinson&#x2019;s disease with mild cognitive impairment using machine learning technique.</article-title> <source><italic>NPJ Parkinsons Dis.</italic></source> <volume>10</volume>:<fpage>218</fpage>. <pub-id pub-id-type="doi">10.1038/s41531-024-00828-6</pub-id> <pub-id pub-id-type="pmid">39528560</pub-id></mixed-citation></ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by"><p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/675525/overview">Martin Jakobs</ext-link>, Toronto Western Hospital, Canada</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by"><p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3090852/overview">Mohammed Ali Shaik</ext-link>, SR University, India</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3173502/overview">Saif Al-Jumaili</ext-link>, Alt&#x0131;nba&#x015F; University, T&#x00FC;rkiye</p></fn>
</fn-group>
</back>
</article>