<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article article-type="research-article" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Digit. Health</journal-id><journal-title-group>
<journal-title>Frontiers in Digital Health</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Digit. Health</abbrev-journal-title></journal-title-group>
<issn pub-type="epub">2673-253X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fdgth.2026.1771281</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Explainable multimodal feature fusion networks for Parkinson&#x0027;s disease prediction</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author"><name><surname>Ravichandran</surname><given-names>Abishek</given-names></name>
<xref ref-type="aff" rid="aff1"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author" corresp="yes"><name><surname>Kathirvel Murugan</surname><given-names>Tamilarasi</given-names></name>
<xref ref-type="aff" rid="aff1"/>
<xref ref-type="corresp" rid="cor1">&#x002A;</xref><uri xlink:href="https://loop.frontiersin.org/people/2928705/overview"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" 
vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Govindaraj</surname><given-names>Logeswari</given-names></name>
<xref ref-type="aff" rid="aff1"/><uri xlink:href="https://loop.frontiersin.org/people/3360433/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role></contrib>
<contrib contrib-type="author"><name><surname>M</surname><given-names>Vishal</given-names></name>
<xref ref-type="aff" rid="aff1"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
</contrib-group>
<aff id="aff1"><institution>School of Computer Science and Engineering, Vellore Institute of Technology</institution>, <city>Chennai</city>, <country country="IN">India</country></aff>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label><bold>Correspondence:</bold> Tamilarasi Kathirvel Murugan <email xlink:href="mailto:tamilarasi.k@vit.ac.in">tamilarasi.k@vit.ac.in</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-27"><day>27</day><month>02</month><year>2026</year></pub-date>
<pub-date publication-format="electronic" date-type="collection"><year>2026</year></pub-date>
<volume>8</volume><elocation-id>1771281</elocation-id>
<history>
<date date-type="received"><day>19</day><month>12</month><year>2025</year></date>
<date date-type="rev-recd"><day>27</day><month>01</month><year>2026</year></date>
<date date-type="accepted"><day>28</day><month>01</month><year>2026</year></date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2026 Ravichandran, Kathirvel Murugan, Govindaraj and M.</copyright-statement>
<copyright-year>2026</copyright-year><copyright-holder>Ravichandran, Kathirvel Murugan, Govindaraj and M</copyright-holder><license><ali:license_ref start_date="2026-02-27">https://creativecommons.org/licenses/by/4.0/</ali:license_ref><license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p></license>
</permissions>
<abstract>
<p>Parkinson&#x0027;s disease (PD) is a progressive neurodegenerative disorder characterized by motor and non-motor impairments, where early diagnosis remains challenging due to reliance on subjective clinical assessments. Recent artificial intelligence (AI)-based approaches have demonstrated promise in identifying subtle PD biomarkers from individual modalities such as speech, gait, and handwriting; however, unimodal systems often fail to capture the heterogeneity of the disease and provide limited interpretability. To address these limitations, this study proposes a multimodal deep learning framework that integrates handwriting, gait, and speech modalities using an early feature fusion strategy for robust and interpretable PD detection. Each modality is processed through a dedicated feature extraction pipeline using deep neural networks, followed by static feature concatenation and classification using an XGBoost model. Model transparency is enhanced using explainable AI (XAI) techniques, including SHapley Additive exPlanations (SHAP) and Gradient-weighted Class Activation Mapping (Grad-CAM), enabling clinical interpretability of modality- and feature-level contributions. Experimental evaluation on benchmark datasets demonstrates that the proposed trimodal fusion model achieves an accuracy of 92&#x0025;, outperforming unimodal handwriting (91&#x0025;), gait (90&#x0025;), and speech (74&#x0025;) models. The fusion framework attains a macro F1-score of 0.89, an area under the ROC curve (AUC) of 0.95, and an average precision (AP) of 0.96, indicating strong discriminative capability and robustness. Confusion matrix analysis reveals balanced sensitivity (90&#x0025;) and specificity (89&#x0025;) across classes. Explainability analysis confirms that handwriting tremor patterns, gait force asymmetries, and speech spectral instabilities are key contributors to PD prediction. 
These results highlight the effectiveness of explainable multimodal AI in delivering accurate, reliable, and clinically interpretable solutions for early PD detection.</p>
</abstract>
<kwd-group>
<kwd>deep learning</kwd>
<kwd>explainable AI</kwd>
<kwd>fusion models</kwd>
<kwd>gait analysis</kwd>
<kwd>handwriting recognition</kwd>
<kwd>medical diagnosis</kwd>
<kwd>multimodal learning</kwd>
<kwd>neural networks</kwd>
</kwd-group><funding-group><funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement></funding-group><counts>
<fig-count count="20"/>
<table-count count="4"/><equation-count count="153"/><ref-count count="25"/><page-count count="29"/><word-count count="0"/></counts><custom-meta-group><custom-meta><meta-name>section-at-acceptance</meta-name><meta-value>Health Informatics</meta-value></custom-meta></custom-meta-group>
</article-meta>
</front>
<body><sec id="s1" sec-type="intro"><label>1</label><title>Introduction</title>
<p>Parkinson&#x2019;s disease (PD) is a highly widespread neurodegenerative disorder that affects millions of people globally and is characterized by motor impairments which encompass tremors, rigidity, gait problems, handwriting abnormalities, and non-motor symptoms comprising speech dysfunction, cognitive dysfunction, and sleep disorders. Such a complex manifestation predisposes PD as a significant clinical problem, as well as a serious social and economic cost. The conventional diagnosis still depends largely on clinical scales and neurological examinations, which are mostly subjective and prone to inter-observer variability, and which do not adequately capture the early disease manifestation (<xref ref-type="bibr" rid="B1">1</xref>). To address these constraints, scientists have more extensively resorted to artificial intelligence (AI)-based systems that can analyze digital biomarkers in speech, gait, and handwriting, and provide objective and data-driven information about PD detection (<xref ref-type="bibr" rid="B2">2</xref>).</p>
<p>Lately, the deep learning breakthroughs have provided amazing results in unimodal diagnostic pipelines. Detection by speech has used spectral and acoustic properties and can be 99&#x0025; accurate in controlled data sets (<xref ref-type="bibr" rid="B3">3</xref>). Inertial measurement unit (IMU) sensor-based (wearable) and motion dynamics-based gait methods have been reported to have good predictive capability distinguishing PD patients and healthy controls with accuracy levels of up to 97&#x0025; (<xref ref-type="bibr" rid="B4">4</xref>). Moreover, handwriting analysis, usually centered on spiral plotting or signature patterns, has been effective with convolutional neural networks with an almost 98&#x0025; accuracy (<xref ref-type="bibr" rid="B5">5</xref>). All these results point to the potential of digital biomarkers to transform the diagnostics and monitoring processes of PD (<xref ref-type="bibr" rid="B6">6</xref>).</p>
<p>In spite of these developments, unimodal methods are severely flawed. Speech-based systems are very susceptible to differences in language, accent, background noise, and recording conditions restricting their generalizability (<xref ref-type="bibr" rid="B7">7</xref>). The quality of sensors and compliance of the subject are crucial in gait-based detection and make it challenging to apply in the real world (<xref ref-type="bibr" rid="B8">8</xref>). Handwriting-based techniques tend to be based on controlled experiments and not sufficiently large data sets to be able to generalize stably (<xref ref-type="bibr" rid="B9">9</xref>). These deficiencies pose issues of clinical scalability of unimodal pipelines (<xref ref-type="bibr" rid="B10">10</xref>).</p>
<p>The other significant issue is the interpretability of AI models. A large number of deep learning systems are trained as black boxes, which make predictions without an explicit explanation of how and why a decision will be made. In medical use, where the interpretability of results is paramount, this is an issue (<xref ref-type="bibr" rid="B11">11</xref>). The less convinced clinicians of the AI-driven diagnostic systems can be when it comes to the validation of the reasoning behind the results. Recent studies have started incorporating explainability mechanisms such as SHapley Additive exPlanations (SHAP) and Gradient-weighted Class Activation Mapping (Grad-CAM) into PD models that allow visualization of influential temporal or spectral properties but are still underutilized (<xref ref-type="bibr" rid="B12">12</xref>). Even very good models without strong interpretability may not find significant clinical use (<xref ref-type="bibr" rid="B13">13</xref>).</p>
<p>In an attempt to overcome the failures of the unimodal systems, multimodal fusion has become an interesting solution. Multimodal systems can represent the multifactorial nature of PD through the integration of complementary signals provided by several modalities, i.e., speech, gait, and handwriting (<xref ref-type="bibr" rid="B14">14</xref>). This combination enables one modality to counterbalance the noisy, missing, or unreliable modality. Indicatively, gait irregularities when combined with handwriting aspects have produced greater outcomes than either of the modalities (<xref ref-type="bibr" rid="B15">15</xref>). Equally, fusion of speech and facial expression has proved to be more diagnostic than speech-only systems (<xref ref-type="bibr" rid="B16">16</xref>). In spite of these achievements, a majority of multimodal frameworks are not adaptable and explainable and thus hard to implement in clinical practice (<xref ref-type="bibr" rid="B17">17</xref>).</p>
<p>This study is motivated by the fact that the gap between experimental prototypes and clinically viable systems needs to be filled. Parkinson&#x2019;s disease is heterogeneous in people, symptoms do not all develop and manifest in the same way, and it is therefore important to develop systems which are dynamically responsive to different conditions (<xref ref-type="bibr" rid="B18">18</xref>). To measure this variability, a multimodal diagnostic model is needed that is capable of incorporating different biomarkers and is resistant to noise and missing values (<xref ref-type="bibr" rid="B19">19</xref>). Moreover, it is essential to incorporate explainable AI to provide transparency and trust among clinicians so that professionals could understand the mechanisms behind predictions which could be tremor-induced handwriting distortions, vocal anomalies, or inconsistencies in a stride (<xref ref-type="bibr" rid="B16">16</xref>).</p>
<p>The contribution of this paper is the suggestion of a deep learning-based multimodal system, combining speech, gait, and handwriting as features of Parkinson detection (<xref ref-type="bibr" rid="B2">2</xref>). In contrast to conventional models, the framework uses an early feature fusion system to give moving weights on modalities according to reliability to guarantee resilience in noisy or incomplete information conditions (<xref ref-type="bibr" rid="B3">3</xref>). It also includes explainable artificial intelligence that includes SHAP and Grad-CAM, which can be easily understood and provide both unimodal and multimodal transparency, which can be more easily interpreted by clinicians (<xref ref-type="bibr" rid="B12">12</xref>).</p>
<p>The presented system is strictly tested on benchmark datasets with a wide range of modalities and is thus robust and generalizes to various situations (<xref ref-type="bibr" rid="B5">5</xref>). The experimental findings demonstrate that the framework not only performs well in terms of attaining high diagnostic accuracy but also has stability in imperfect conditions and is superior to unimodal and non-adaptive baselines (<xref ref-type="bibr" rid="B8">8</xref>). The given approach will build a platform of reliable, interpretable, and scalable AI-assisted healthcare solutions by moving PD detection out of the research stage and into clinical practice (<xref ref-type="bibr" rid="B17">17</xref>, <xref ref-type="bibr" rid="B20">20</xref>).</p>
<p>The main contribution of the paper is the creation and implementation of a multimodal explainable deep learning architecture that integrates three modalities, namely, speech, gait, and handwriting, to ensure high and explainable detection of Parkinson&#x2019;s disease (PD). Moreover, explainable AI (XAI) methods, such as SHAP, Grad-CAM, and Integrated Gradients (IG), are applied in the structure and provide a visual and quantitative explanation of the model decision, which makes the predictions transparent and comprehensible to the clinician. Together, such contributions will address the shortcomings of existing unimodal and black-box models to an AI-based solution that is reliable, scalable, interpretable, and advanced in the diagnosis of PD. This article is presented in a systematic manner, thereby putting the readers through the whole research process. The Introduction section identifies the background, motivation, and problems of early PD detection. The Literature survey section takes into account the current unimodal and multimodal methods and reveals their inefficiency, proving the need of more detailed, interpretable method. The Methodology section elaborates on the proposed model architecture, feature extraction schemes in each of the modalities, and the multimodal fusion process. The Results and discussion section includes the results of the experiment and measures of performance and explainability of unimodal and multimodal. Finally, it ends with a Conclusion and future improvements section where the key findings are summarized, clinical implications are outlined, and extensions such as longitudinal modeling, real-world data integration, and federated learning are provided as improvements in the future.</p>
<sec id="s1a"><label>1.1</label><title>Problem statement</title>
<p>The heterogeneity of the symptoms and the use of subjective clinical ratings make it challenging to diagnose PD with any degree of dependability at an early stage of the condition. Despite the high accuracy of unimodal AI methods based on speech, gait, or handwriting, they do not tend to be generalized across all populations and have no interpretability, which makes them less appealing to clinical users. In addition, the majority of current models are susceptible to noisy or incomplete data and do not offer much information regarding their decision-making process. This poses a critical gap towards the creation of a robust, adaptive, and explainable multimodal framework that incorporates various PD biomarkers, can withstand imperfect data, and can show interpretable information that can be used to make reliable clinical decisions.</p>
</sec>
</sec>
<sec id="s2"><label>2</label><title>Literature survey</title>
<p>The given paper is devoted to studying the speech signals to detect PD in the early stages because abnormal alterations are frequently overlooked using traditional measures. The traditional methods were essentially run by hand-cut acoustic features or crude models, which could not find out all the abundance of the vocal signals and were limited in diagnostics. The three approaches that the authors have compared are the transfer learning where pretrained models have been utilized, deep feature extraction through spectrogram, and the classical model of the acoustic model in the instance of the pc-GITA Spanish speech data. Their findings revealed that the discriminatory power of deep spectrogram features was more favorable, better than a maximum of 99.7&#x0025; accuracy to recognize the classification of certain vowel sounds, and the method was significantly better than the transfer learning methods and other traditional methods. The findings confirmed the spectrogram-based characteristics as a highly effective and reliable tool that can be utilized in the diagnosis of PD in speech (<xref ref-type="bibr" rid="B2">2</xref>).</p>
<p>This contribution helped in solving the issue of the existence of irrelevant and redundant features that tend to limit the reliability of voice-based detection of PD. The authors have proposed a combination approach to the evolutionary feature selection and deep learning methods, which can be implemented in a hybrid to increase robustness. On the University of California, Irvine (UCI) voice dataset, they first employed adaptive gray wolf optimization (AGWO) in the process of selecting the most informative predictors, followed by inputs of the sparse autoencoder (AE) being the outputs of these predictors, to do the deep feature representation. Finally, six classifiers were applied to the latent features, and linear discriminant analysis (LDA) gave the best results. This composite pipeline was nearly accurate (95&#x0025;) in contrast to customary classification techniques. In the paper, the advantages of fusion between metaheuristic optimization and sparse autoencoder learning with regard to the noise in medical data have been successfully demonstrated (<xref ref-type="bibr" rid="B1">1</xref>).</p>
<p>The authors of the current study explained the negative aspect of utilizing single or narrow sets of features in the classification of PD using speech data that typically lacks significant Nonlinear Complexity Statistics (NCS). They solved this by devising deep learning models that can reduce a set of numerous features into one structure. Two convolutional neural network (CNN) architectures were put forward each having a parallel-branch design where sets of features were learned separately and then combined. This plan was experimented on the UCI Parkinson data using a leave-one-person-out (LOPO) cross-validation to obtain a realistic generalization over individuals. It was noted that the parallel-branch CNN was better both in terms of accuracy (0.869) and F-measure (0.917) than those trained on single features. The findings revealed that, as anticipated, the combination of multiple vocal parameters brings about considerable outcomes in regard to robustness and performance on biased biomedical databases (<xref ref-type="bibr" rid="B18">18</xref>).</p>
<p>The paper has argued that detecting PD at an early stage through motion-based analysis of wearable IMU sensors is probably going to be a challenging project since the symptoms of early PD may only have slight similarities with those of a normal aging person. The objective of the research was to develop models of the neural network, which can discriminate between patients with PD at an early stage and healthy participants by using information about the gait dynamics. A sensor was used to collect data on motion signals and was trained using deep neural networks and assessed based on the presence and severity of the disease. The proposed models demonstrated 99.67&#x0025; levels of accuracy in detecting early-stage PD, a considerably high level when compared to the conventional ones.
The results indicate the potential of wearable technology to track in real-world scenarios in a non-invasive approach, which, in turn, allows suspected diagnosis to provide objective and highly accurate depth to the process of diagnosis and the usual evaluation of care providers (<xref ref-type="bibr" rid="B4">4</xref>, <xref ref-type="bibr" rid="B21">21</xref>).</p>
<p>The fact that the results of the daily monitoring of the PD progression in the form of wearable sensors are interpretable is often an issue, which causes the application of the most deep learning-based models, known as black box. This issue was taken into consideration in the work, and one of the CNN architectures was offered to get acquainted with the continuous wavelet transformation of the signals registered by a group of IMUs located on various body parts. To be more transparent, the authors took Grad-CAM visualization to highlight the frequency channels and attendance locations that were predominant in the predictions so that the rationale behind the system can be comprehended by clinicians. They discovered that one waist-mounted IMU could achieve a performance of about 0.993 to an almost equivalent performance of a multi-sensor implementation. This finding not only reduced the expenditure and intricacy of deployment but also improved clinical reliability, and thus the system was simple to decipher and use (<xref ref-type="bibr" rid="B12">12</xref>).</p>
<p>Finger tapping is also used in PD to determine the disease and also to demonstrate improvement; the conditions of assessment in the case are subjective and imprecise clinical ratings. The authors aimed to overcome it by introducing a computer vision pipeline, where hand-pose estimation processes were employed to obtain the fine motor features through the kinematics of patients and healthy subjects in a video. They have offered a stratified classification method to provide better accuracy to define the presence and phases of advancement of the disease. The suggested system was more accurate than the methods available and provided a more precise and detailed examination of the severity of PD. More than that, the paper also discovered remnants of both the existence of linear and non-linear links between specific movement patterns and the disease development; thus, the clinicians are not restricted by the rating scales anymore, but the creation of more accurate evaluations is possible instead (<xref ref-type="bibr" rid="B3">3</xref>).</p>
<p>The purpose of the paper was to address two current issues in the detection of speech-based PD, i.e., lack of interpretability and an unbalanced dataset. The authors have proposed a novel pipeline: the XRFILR, comprising the recursive feature elimination to select the relevant predictors, synthetic oversampling and dataset balancing by using K-Means SMOTE, and explainable AI to give a report of model transparency. This system was introduced to the different classifiers to be tested on different speech databases. The results showed XRFILR to be very precise (96.46&#x0025;); moreover, the analysis established the most significant features of speech in determining the classification. The high reliability combined with the interpretability provided the framework not only with significant predictive power but also with valuable insight into the clinical decision-making and therefore increased its consistency with the real-world PD screening (<xref ref-type="bibr" rid="B11">11</xref>).</p>
<p>The paper has suggested a multimodal assessment paradigm, which has integrated handwriting specimens with structured clinical data because handwriting disorders were found in the majority of patients with PD, yet not reflected in predictive models. The authors came up with Parkinson&#x0027;s multimodal deep network (PMMD): a deep neural network that learns associations between visual handwriting and clinical variables and pays attention where it is required. The framework was found to capture the complementary nature of the two modalities through cross-modal attention. The system consistently achieved 96&#x0025; accuracy, which was higher than the unimodal approaches that made use of the handwriting or clinical data. The analysis demonstrated the power of multimodal fusion to enhance the diagnosis of PD and the importance of the attention mechanism in drawing meaningful associations between the data of various types (<xref ref-type="bibr" rid="B6">6</xref>).</p>
<p>Speech and facial movements can provide more detailed information to identify Parkinson&#x0027;s facial dyskinesia (PFD) and have been studied separately or out of phase in most historical studies, thus producing a jumbled picture. The given paper proposed a synchronous fusion model according to which speech is predicted by synchronized movements of lips based on selective attention-based processes. Temporal inferences in vocal features and facial expression were recorded, which enabled the model to realize the synergy of multimodal data better. The approach was contrasted with unimodal and asynchronous fusion baselines and achieved an unweighted average recall (UAR)&#x2009;&#x003D;&#x2009;95&#x0025; which is a substantial improvement over alternatives. The outcome confirmed the fact that time synchronized multimodal system can identify PD with more depth and accuracy (<xref ref-type="bibr" rid="B15">15</xref>).</p>
<p>The research problem was the gait-based measurement of PD progression considering the fact that gait is among the most revealing biomarkers in the disease staging. The authors reused PhysioNet data and offered a multimodal system with Perceiver which merged unprocessed ground reaction force data with engineered gait characteristics. Unlike many of its predecessors, the method did not require much hyperparameter optimization and was good at performance. The accuracy of the diagnostic system was 97.3&#x0025;, and a high correlation coefficient [0.93 with Unified Parkinson&#x2019;s Disease Rating Scale (UPDRS)] indicated that the system could correlate predictions with clinical ratings. The combination of unprocessed and processed streams of data enhanced the effectiveness of both the diagnosis and progress of the patient with an efficient and scalable process that demonstrated results in PD patients&#x2019; disease progression monitoring and diagnosis with machine learning techniques (<xref ref-type="bibr" rid="B8">8</xref>, <xref ref-type="bibr" rid="B22">22</xref>).</p>
<p>The current paper explored the critical early period of PD, when mild symptoms such as compromised sleep cycles or smell disorders are ignored by clinical tests. This issue is difficult because early biomarkers are spread across domains and heterogeneous, and prediction is difficult using the traditional single-modality systems. The authors have addressed this by bringing together a myriad of biomarkers including sleep data, cerebrospinal fluid measurements, imaging, and olfactory features into a cohort of over 500 individuals. It was then compared to 12 conventional algorithms, and a deep learning model was found to work significantly better with an accuracy of 96.45&#x0025;. It was interesting to note that the deep model offered not only the best predictive performance but also identified the most discriminative biomarkers that could possibly be employed to assist in the explanation of early disease processes. The study proved the significance of multimodal fusion and the deep learning representation in enhancing sensitivity during early PD detection phases (<xref ref-type="bibr" rid="B5">5</xref>, <xref ref-type="bibr" rid="B23">23</xref>).</p>
<p>The complexity and multidimensionality of the signs of PD were also discussed in this paper where no single model could be used to generalize to various manifestations of patients. The authors presented a stacking ensemble model whereby support vector machine (SVM), gradient boosting, and logistic regression are incorporated in the new model in a complementing manner. The ensemble was trained and tested on two benchmark datasets, with the cross-validation procedures being very strict to ensure overfitting is limited, and was also probability-calibrated to yield the correct diagnostic confidence scores. The accuracy of the ensemble was 96.18&#x0025;, which is greater than that achieved by the base learners. The analysis also indicated that, in addition to strong classification accuracy, the ensemble provided more dependable probability estimations and, consequently, greater practical value for clinical decision-making in PD detection (<xref ref-type="bibr" rid="B7">7</xref>).</p>
<p>To improve the accuracy of the speech-based PD detection, we suggested a hybrid deep neural network, PD-Net, implementing the merits of different spectral features. It was explained by the fact that the cues associated with PD are observed in both fine-grained timbral scales and in more coarse spectral energy scales, but most models consider either one of these two classes of features. PD-Net further used Mel-frequency cepstral coefficients (MFCCs) and Mel spectrograms in a CNN&#x2013;long short-term memory (LSTM) network and used multi-head attention to estimate the strength of the channel and time step effects. The system is demonstrated to be highly accurate (nearly 99&#x0025;), and this is greater compared to the single-feature pipelines that demonstrate that the attention-based dual-spectral fusion system can offer discriminatory representations of the PD-related vocal changes (<xref ref-type="bibr" rid="B24">24</xref>).</p>
<p>In this paper, the processing of MRI images was reconceptualized to identify PD using graphs in addition to using purely conventional convolutional networks. The challenge is that the neuroimaging data are not very rich and larger networks are prone to overfitting and not good generalization. To get around this, the authors removed the salient points in MRI slices and tabulated these as compact graphs which contained the important structural information. A multi-level graph neural network (GNN) which incorporated pooling and attention layers was also introduced to preserve significant topology and eliminate spurious noise with sparsity-sensitive pooling and attention. This method compared to conventional CNNs and traditional GNNs is said to have had a superior performance and a superior generalization of small training sets. The benefit of the study was that it offered a model architecture which is consistent with the structural priors present in neuroimaging and offered a consistent and data-efficient way of identifying PD using neuroimaging (<xref ref-type="bibr" rid="B19">19</xref>).</p>
<p>This systematic review considered 87 articles to evaluate the current state of multimodal deep learning to diagnose PD, integrating all the evidence found in the given areas: speech, gait, facial expression, handwriting, imaging, and physiological recording. This review discovered that the multimodal systems are superior to the unimodal systems as they possess distinct benefits as they are complementary phenotypes as far as none of the modalities can have. It has also identified some of the existing barriers that are being listed to make progress, including small and isolated datasets, non-uniform data recording processes, no elucidation of deep models, and no external validation with clinical setups. The authors have proposed that further work should be done to develop bigger shared benchmarks, more powerful reporting frameworks, and in applying explainable AI framework to reduce the gap between research prototypes and clinically qualified diagnostic systems (<xref ref-type="bibr" rid="B13">13</xref>).</p>
<p>Another well-known PD characteristic is the impairment of handwriting, although most of the handwriting datasets lack any significant sample sizes, and consequently single deep learning models cannot be effective. This issue is resolved in this paper by combining handwriting data from three distinct sources and training with a wide range of architectures with different representational abilities. Rather than relying on any single one of these networks, the authors took intermediate-level features of such CNNs and fed them into an SVM classifier to show superior results. The ensemble was able to capture a range of handwriting characteristics using stroke forms, micro-variations which occurred in tremor and rhythms. The system achieved an impressive result of 99.35&#x0025; and also demonstrated the effectiveness and efficiency of feature-level fusion of heterogeneous CNNs in small samples of handwritten-based PD identification (<xref ref-type="bibr" rid="B9">9</xref>).</p>
<p>Our proposal was a deep learning-based methodology, the Multi-Variant Stacked Autoencoder (MVSAE) as a novel framework aiming at the incorporation of numerous domains of PD indicators instead of individual-symptom identification. This was based on the arguments that the analysis of voice, motor, and non-motor features independently lacks important correlations among and across domains and, therefore, hinders predictive validity. MVSAE modeled these attributes together and found small shared representations of reduced redundancy with no loss of disease-relevant variation. In a sequence of experiments, the model did better than other simple classifiers by differences up to 5&#x2013;10 percentage points and showed that cross-domain embeddings actually are more useful than isolated feature representation of recognizing PD. The study merits are that the latent PD signatures could be detected using holistic, integrative representation learning and appear to be more representative of the heterogeneity of this complicated disease (<xref ref-type="bibr" rid="B14">14</xref>).</p>
<p>This contribution coined the new definition of stroke segmentation as a physiologically significant segmentation strategy that enables positioning the new PD detection roadmap on the scene. The system could model strokes based on the beta-elliptical technique of breaking down handwriting into perceptually significant units and was followed by the fuzzy perceptual detector which could extract features that were resistant to tremor noise and intra-writer variation. It was then fed with the characteristics of these features into a bidirectional long short-term memory (BLSTM) network to take into account temporal relationships between the individual strokes. The PaHaW handwriting corpus was experimented upon, and the system performed much better than the current handwriting-based PD classifiers. The combination of physiologically plausible stroke segmentation and sequence modeling actually improved the accuracy and provided a feasible form of passive screening using a tablet, either in a clinical or home-based environment (<xref ref-type="bibr" rid="B10">10</xref>).</p>
<p>This article was a resolution to the issue of tuning the long short-term memory (LSTM) networks by applying them to the gait time-series information, and these are sensitive to the hyperparameters. The authors suggested a variant of the particle swarm optimization process modified to run a hyperparameter search algorithm automatically and applied it to PhysioNet gait data sets. The optimized LSTM was 89.92&#x0025; accurate and higher than the unoptimized models and other optimization strategies. To improve the transparency of the study, SHapley Additive exPlanation (SHAP) analysis was employed to understand the most significant temporal gait features in prediction, and it was possible to see the physiological rationale of the model decisions. The study demonstrated that principled optimization with <italic>post hoc</italic> explainability can obtain an effective and interpretable differentiation system of gait-based PD detection (<xref ref-type="bibr" rid="B16">16</xref>).</p>
<p>The paper (<xref ref-type="bibr" rid="B25">25</xref>) proposed a non-invasive PD prediction method leveraging voice inputs, demonstrating that speech-based biomarkers can effectively predict disease progression. This approach complements multi-omics and clinical data-driven strategies, highlighting the value of integrating diverse data sources for early and accurate PD risk assessment.</p>
<p>In the Parkinson&#x2019;s Progression Markers Initiative (PPMI), 12 datasets of single and multimodal ML pipelines were compared on PD analysis. They also investigated different kinds of classifiers besides feature selection methods so as to examine the impact of the source of data on them. Surprisingly, the results showed that deep or complex models are not always necessary: in scenarios with good-quality data and correctly engineered features, a simple linear SVM achieved perfect accuracy. However, with higher heterogeneity of the data, multimodal fusion was more resistant and capable of generalizing across a broader range of patients. There was also comprehensive feature attribution, with predictors in the motor, olfactory, cognitive, and sleep domains. This cumulative finding suggested that both simplification and multimodal fusion might be applicable in the detection of PD: The linear models might be sufficient under the conditions of high quality of the data, but multimodal fusion might be helpful under the conditions of higher variability levels (<xref ref-type="bibr" rid="B17">17</xref>).</p>
<p>Current PD detection systems are based mostly on unimodal methods of detecting speech, gait, or handwriting as an independent entity, which restricts their resilience, generalizability, and clinical reliability. The speech-based models are also influenced by language, accent, and noise differences, gait-based models are influenced by sensor quality and user compliance, and handwritten-based models are sensitive to controlled environments and large datasets. In addition, a majority of deep learning models deployed in these studies are black boxes and thus provide limited insight or comprehension of the manner in which they arrive at a decision and therefore are not easily trusted and utilized by clinicians. These unimodal approaches also cannot capture the heterogeneity of the PD symptoms and cannot handle noisy or incomplete data. The proposed multimodal explainable model addresses these weaknesses by incorporating complementary biomarkers of speech, gait, and handwriting of feature fusion which is resistant to data flaws and cross-domain interaction. Besides that, explainable AI (SHAP, Grad-CAM, and Integrated Gradients) is also embedded and offers transparent, physiologically interpretable explanations of predictions made by a model, allowing clinicians to know which features affect a diagnosis. It leads to a single, strong, and interpretable system of PD detection that is more accurate and bridges the gap between the experimental research on AI and its use in practice.</p>
<p>While previous studies have applied unimodal approaches for PD detection using speech, handwriting, or gait separately, they often do not integrate heterogeneous modalities for comprehensive assessment. Similarly, multimodal frameworks reported in the literature focus on feature-level fusion but lack explainability or prospective validation. In contrast, the proposed framework combines three complementary modalities (speech, handwriting, gait) with a unified deep feature extraction and explainable AI pipeline, providing improved interpretability and decision support capabilities. This positions our work as a novel contribution bridging multimodal integration with clinical applicability, beyond the scope of prior studies. <xref ref-type="table" rid="T1">Table&#x00A0;1</xref> summarizes the key attributes of relevant studies on PD detection, highlighting the modalities used, classification strategies, and explainability techniques.</p>
<table-wrap id="T1" position="float"><label>Table&#x00A0;1</label>
<caption><p>Summary table of key attributes of research papers.</p></caption>
<table>
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Reference</th>
<th valign="top" align="center">Dataset</th>
<th valign="top" align="center">Methodology</th>
<th valign="top" align="center">Results and discussion</th>
<th valign="top" align="center">Metrics</th>
<th valign="top" align="center">Application</th>
<th valign="top" align="center">Strength</th>
<th valign="top" align="center">Limitation</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B2">2</xref>)</td>
<td valign="top" align="left">pc-GITA Spanish speech dataset (phonations&#x2009;&#x002B;<sans-serif>&#x2009;read</sans-serif> speech)</td>
<td valign="top" align="left">Compared transfer learning, deep spectrogram features&#x2009;&#x002B;<sans-serif>&#x2009;ML</sans-serif>, and classical acoustic features</td>
<td valign="top" align="left">Deep features outperformed others; accuracy up to 99.7</td>
<td valign="top" align="left">Accuracy, comparison with the existing literature</td>
<td valign="top" align="left">Early PD detection of speech</td>
<td valign="top" align="left">High accuracy, generalization by speech tasks</td>
<td valign="top" align="left">Language-specific dataset limits generalization</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B1">1</xref>)</td>
<td valign="top" align="left">UCI PD voice dataset</td>
<td valign="top" align="left">Adaptive gray wolf optimization for feature selection&#x2009;&#x002B;<sans-serif>&#x2009;sparse</sans-serif> autoencoder representation learning</td>
<td valign="top" align="left">Best LDA model achieved a top of 95&#x0025;</td>
<td valign="top" align="left">Accuracy, cross-validation performance</td>
<td valign="top" align="left">Voice-based PD classification</td>
<td valign="top" align="left">Combines feature selection and deep representation</td>
<td valign="top" align="left">Single dataset; can overfit to particular voice patterns</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B18">18</xref>)</td>
<td valign="top" align="left">UCI PD voice dataset</td>
<td valign="top" align="left">CNN models with concatenated or parallel-branch features</td>
<td valign="top" align="left">Parallel CNN outperformed<break/>Accuracy 0.869, F-measure 0.917</td>
<td valign="top" align="left">Accuracy, F-measure, MCC</td>
<td valign="top" align="left">Multiset feature speech-based PD detection</td>
<td valign="top" align="left">Improved handling of imbalanced data</td>
<td valign="top" align="left">Moderate absolute accuracy compared to the latest</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B4">4</xref>)</td>
<td valign="top" align="left">IMU gait data of PD patients and controls</td>
<td valign="top" align="left">Neural networks on motion data</td>
<td valign="top" align="left">Early-stage detection accuracy up to 99.67&#x0025;</td>
<td valign="top" align="left">Accuracy</td>
<td valign="top" align="left">Wearable-based PD detection</td>
<td valign="top" align="left">Very high early-stage detection accuracy</td>
<td valign="top" align="left">Small sample size</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B12">12</xref>)</td>
<td valign="top" align="left">IMU data of 100 subjects</td>
<td valign="top" align="left">CNN wavelet transforms and Grad-CAM pruning features</td>
<td valign="top" align="left">Single waist sensor achieved 98.01&#x0025; accuracy, AUC 0.9981</td>
<td valign="top" align="left">Accuracy, AUC</td>
<td valign="top" align="left">Best wearable PD monitoring</td>
<td valign="top" align="left">High accuracy with minimal sensors</td>
<td valign="top" align="left">Focus on walking only; other PD signs may escape</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B3">3</xref>)</td>
<td valign="top" align="left">Videos from 90 subjects</td>
<td valign="top" align="left">Hand-pose estimation&#x2009;&#x002B;<sans-serif>&#x2009;tiered</sans-serif> classifier</td>
<td valign="top" align="left">Outperformed prior severity classification methods</td>
<td valign="top" align="left">Accuracy, feature trend analysis</td>
<td valign="top" align="left">Video-based severity in PD</td>
<td valign="top" align="left">Detection of fine-grained severity change</td>
<td valign="top" align="left">Depends on video quality and a controlled environment</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B11">11</xref>)</td>
<td valign="top" align="left">Speech dataset</td>
<td valign="top" align="left">XRFILR pipeline: RFE&#x2009;&#x002B;&#x2009;LR, K-Means SMOTE, explainable ML</td>
<td valign="top" align="left">Accuracy 96.46&#x0025;, interpretable feature importances</td>
<td valign="top" align="left">Accuracy</td>
<td valign="top" align="left">Explainable speech-based PD prediction</td>
<td valign="top" align="left">Interpretable results for clinicians</td>
<td valign="top" align="left">Only to speech features</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B6">6</xref>)</td>
<td valign="top" align="left">Handwriting/drawing images&#x2009;&#x002B;<sans-serif>&#x2009;clinical</sans-serif> data</td>
<td valign="top" align="left">Cross-modal attention fusion deep learning</td>
<td valign="top" align="left">96&#x0025; accuracy; outperformed baselines</td>
<td valign="top" align="left">Accuracy</td>
<td valign="top" align="left">Multimodal PD detection</td>
<td valign="top" align="left">Integrates complementary modalities</td>
<td valign="top" align="left">Requires both data types for best results</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B15">15</xref>)</td>
<td valign="top" align="left">Video of speech, audio, and lip movement</td>
<td valign="top" align="left">Attention synchronized bimodal fusion</td>
<td valign="top" align="left">UAR 95&#x0025; with synchronous fusion</td>
<td valign="top" align="left">UAR</td>
<td valign="top" align="left">Multimodal audio-visual PD detection</td>
<td valign="top" align="left">Improves over unimodal/asynchronous</td>
<td valign="top" align="left">Require good quality synchronized data</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B8">8</xref>)</td>
<td valign="top" align="left">PhysioNet gait dataset</td>
<td valign="top" align="left">Perceiver architecture on GRF&#x2009;&#x002B;<sans-serif>&#x2009;gait</sans-serif> features</td>
<td valign="top" align="left">UPDRS MAE 2.23; PD diagnosis 97.3&#x0025; accuracy</td>
<td valign="top" align="left">MAE, RMSE, CC, Accuracy, AUC, Sensitivity, Specificity</td>
<td valign="top" align="left">Remote gait-based PD diagnosis and severity scoring</td>
<td valign="top" align="left">High accuracy for both diagnosis and severity</td>
<td valign="top" align="left">Lab-collected gait may differ from daily life</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B5">5</xref>)</td>
<td valign="top" align="left">183 healthy, 401 early PD (premotor features)</td>
<td valign="top" align="left">Deep learning model vs. 12 ML/ensemble methods</td>
<td valign="top" align="left">Deep model has the highest average accuracy, 96.45&#x0025;</td>
<td valign="top" align="left">Accuracy</td>
<td valign="top" align="left">Premotor biomarker-based early PD detection</td>
<td valign="top" align="left">High accuracy across diverse indicators</td>
<td valign="top" align="left">Relatively small dataset size</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B7">7</xref>)</td>
<td valign="top" align="left">Two benchmark PD datasets</td>
<td valign="top" align="left">Stacking with SVM&#x2009;&#x002B;&#x2009;GB for features, LR classifier</td>
<td valign="top" align="left">Accuracy 94.87&#x0025; (AUC 90&#x0025;) and 96.18&#x0025; (AUC 96.27&#x0025;)</td>
<td valign="top" align="left">Accuracy, AUC</td>
<td valign="top" align="left">Improved PD diagnosis from standard datasets</td>
<td valign="top" align="left">Strong performance across datasets</td>
<td valign="top" align="left">Benchmark datasets may not reflect real-world noise</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B24">24</xref>)</td>
<td valign="top" align="left">Italian vocal audio dataset</td>
<td valign="top" align="left">MFCC&#x2009;&#x002B;&#x2009;mel spectrogram via CNN&#x2009;&#x002B;&#x2009;LSTM&#x2009;&#x002B;&#x2009;attention</td>
<td valign="top" align="left">99&#x0025; accuracy</td>
<td valign="top" align="left">Accuracy</td>
<td valign="top" align="left">Speech-based PD detection</td>
<td valign="top" align="left">Captures both spatial and temporal patterns</td>
<td valign="top" align="left">Single-language dataset</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B19">19</xref>)</td>
<td valign="top" align="left">PPMI MRI slice data</td>
<td valign="top" align="left">Image-to-graph construction, multi-level GNN, sparsity pooling</td>
<td valign="top" align="left">Outperformed CNN/GNN baselines</td>
<td valign="top" align="left">Accuracy</td>
<td valign="top" align="left">MRI-based PD diagnosis</td>
<td valign="top" align="left">Efficient graph construction; reduced overfitting</td>
<td valign="top" align="left">MRI preprocessing still required</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B13">13</xref>)</td>
<td valign="top" align="left">87 reviewed studies</td>
<td valign="top" align="left">Survey of DL on gait, limb, speech, and facial expression</td>
<td valign="top" align="left">Multimodal&#x2009;&#x003E;&#x2009;unimodal; gaps in interpretability</td>
<td valign="top" align="left">N/A</td>
<td valign="top" align="left">Comprehensive review of PD DL approaches</td>
<td valign="top" align="left">Broad coverage across modalities</td>
<td valign="top" align="left">No experimental results</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B9">9</xref>)</td>
<td valign="top" align="left">Three PD handwriting datasets</td>
<td valign="top" align="left">Multiple CNNs, early fusion of features, SVM</td>
<td valign="top" align="left">99.35&#x0025; accuracy</td>
<td valign="top" align="left">Accuracy</td>
<td valign="top" align="left">Handwriting-based PD detection</td>
<td valign="top" align="left">Superior to single CNNs</td>
<td valign="top" align="left">Requires extensive data augmentation</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B14">14</xref>)</td>
<td valign="top" align="left">Multi-feature PD dataset</td>
<td valign="top" align="left">Four SAE variants for multi-attribute learning</td>
<td valign="top" align="left">10&#x0025; better than MANN, GAE, UMLBD</td>
<td valign="top" align="left">Accuracy</td>
<td valign="top" align="left">Multi-attribute PD prediction</td>
<td valign="top" align="left">Handles diverse PD features</td>
<td valign="top" align="left">Limited external validation</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B10">10</xref>)</td>
<td valign="top" align="left">PaHaW and new Arabic handwriting dataset</td>
<td valign="top" align="left">Beta-elliptical stroke segmentation, fuzzy features, BLSTM</td>
<td valign="top" align="left">Outperformed existing handwriting-based PD systems</td>
<td valign="top" align="left">Accuracy</td>
<td valign="top" align="left">Online handwriting PD detection</td>
<td valign="top" align="left">Novel stroke&#x2009;&#x002B;<sans-serif>&#x2009;fuzzy</sans-serif> feature combo</td>
<td valign="top" align="left">Language-specific dataset</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B16">16</xref>)</td>
<td valign="top" align="left">PhysioNet gait dataset</td>
<td valign="top" align="left">LSTM optimized with modified PSO</td>
<td valign="top" align="left">89.92&#x0025; accuracy; SHAP for feature importance</td>
<td valign="top" align="left">Accuracy</td>
<td valign="top" align="left">Gait-based PD diagnosis</td>
<td valign="top" align="left">Metaheuristic tuning improved performance</td>
<td valign="top" align="left">Performance still below some multimodal models</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B17">17</xref>)</td>
<td valign="top" align="left">12 PPMI datasets</td>
<td valign="top" align="left">Single/multimodal ML, majority voting labeling</td>
<td valign="top" align="left">SVM 100&#x0025; acc; ANN 91.41&#x0025; with selected features</td>
<td valign="top" align="left">Accuracy</td>
<td valign="top" align="left">Identify PD and key causal factors</td>
<td valign="top" align="left">Identifies influential features</td>
<td valign="top" align="left">May overfit due to very high accuracy</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="TF1a"><p>MCC, Matthews correlation coefficient; RFE, recursive feature elimination; MAE, mean absolute error; RMSE, root mean square error; GRF, ground reaction force; GB, gradient boosting; MFCC, Mel-frequency cepstral coefficients; DL, deep learning; SAE, stacked autoencoder; MANN, multimodal artificial neural network; GAE, graph autoencoder; UMLBD, unified multimodal learning-based diagnosis; PSO, particle swarm optimization; ANN, artificial neural network.</p></fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3"><label>3</label><title>Methodology</title>
<p>The suggested methodology combines three complementary modalities, such as gait, voice, and handwriting, into a single multimodal diagnostic tool for PD. Individual modalities receive their own specialized feature extraction pipeline: Gait signals are processed using temporal convolutions with dilations and autoencoding, voice data are processed using EfficientNet-B0 with log-Mel spectrograms, and spiral handwriting images are processed using ResNet-50. Such pipelines record modality-dependent biomarkers including stride anomalies, phonatory errors, and tremor-related anomalies. Features are normalized, compressed through global average pooling, and converted into small latent vectors to make them comparable across domains. SHAP, Integrated Gradients, and Grad-CAM support the interpretability of their prediction at the modality level and allow clinical professionals to reverse engineer the way their model arrives at its prediction. After unimodal extraction, the feature vectors are then fused together in a combined multimodal representation that includes domain-specific and cross-domain information. This combination embedding is propagated by XGBoost to get the resultant predictions. Optimization of the model is done with the help of cross-entropy loss, and interpretability is maintained through the calculation of SHAP values of the fused vector. Not only does this multimodal fusion outperform unimodal systems in accuracy of classification, but it also provides clinically relevant and interpretable information on heterogeneous symptoms of PD.</p>
<sec id="s3a"><label>3.1</label><title>Gait feature extraction pipeline</title>
<p>As discussed in <xref ref-type="fig" rid="F1">Figure&#x00A0;1</xref>, the initial step of the pipeline is to put the raw gait signals that are usually of different lengths and magnitudes across subjects into a uniform form. All recordings are broken up into temporal blocks of certain fixed lengths of 2&#x2005;s (200 samples at 100&#x2005;Hz) and still maintain the temporal continuity. In case one of the segments is shorter, it is zero-padded to preserve the dimension. A normalization of these windows is then made based on <italic>z</italic>-score scaling which rescales each feature by removing the mean and dividing by the standard deviation. This modification makes sure that the different sensors or trials with different absolute magnitudes do not overtake the training process. This normalization formula can be expressed as in <xref ref-type="disp-formula" rid="disp-formula1">Equation&#x00A0;1</xref>:<disp-formula id="disp-formula1"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM1"><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">norm</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mfrac><mml:mrow><mml:mi>q</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mrow><mml:mi mathvariant="normal">mean</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">SD</mml:mi></mml:mrow></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math><label>(1)</label></disp-formula>where <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM1"><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">norm</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> is the normalized value, <italic>q</italic> is the raw input value, mean is the average of the windowed interval, and SD is the standard deviation of the windowed interval.</p>
<fig id="F1" position="float"><label>Figure&#x00A0;1</label>
<caption><p>System architecture of the proposed framework.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g001.tif"><alt-text content-type="machine-generated">Infographic outlining a multimodal Parkinson&#x2019;s disease detection framework with three sections: feature extraction from voice, gait, and handwriting using neural models; multimodal fusion and classification for diagnosis; and interpretability tools including Grad-CAM, Integrated Gradients, and SHAP to visualize and explain model decisions, ending in healthy or Parkinson&#x2019;s label.</alt-text>
</graphic>
</fig>
<p>This is applied to normalize the data so that its variance is 1 and its mean is 0, which improves the stability of training and makes the features of other sensors comparable.</p>
<p>The second step is to use temporal convolutions to get local fine gait dynamics. Convolution is nothing but a weighted average of time samples around it, which has gone through a non-linear activation (ReLU). This enables the network to automatically learn filters which identify stride changes, tremor bursts, or asymmetries. The temporal convolution at a time step is of a basic form which is expressed as in <xref ref-type="disp-formula" rid="disp-formula2">Equation&#x00A0;2</xref>:<disp-formula id="disp-formula2"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM2"><mml:mtable rowspacing="4pt" columnspacing="1em"><mml:mtr><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:msub><mml:mi>h</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>a</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:munderover><mml:mrow><mml:mo movablelimits="false">&#x2211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>k</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:munderover><mml:mo>&#x2061;</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>.</mml:mo><mml:msub><mml:mi>e</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi mathvariant="normal">bias</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(2)</label></disp-formula>where <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM2"><mml:msub><mml:mi>p</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mspace width="thickmathspace" /></mml:math></inline-formula> are kernel weights, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" 
id="IM3"><mml:msub><mml:mi>e</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> are input samples within the convolutional window, bias is a bias term, and <italic>a</italic> is the activation function.</p>
<p>Instead of using only local patterns, dilated convolutions are used, in which gaps are inserted between sampled inputs. This increases the receptive field (but does not raise the cost of computation) to allow the network to trace longer gait cycles. The dilated convolution at layer <italic>l</italic> is formalized as in <xref ref-type="disp-formula" rid="disp-formula3">Equation&#x00A0;3</xref>:<disp-formula id="disp-formula3"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM3"><mml:mtable rowspacing="4pt" columnspacing="1em"><mml:mtr><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msup><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mspace width="thinmathspace" /><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mi>h</mml:mi><mml:mrow><mml:mi>v</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(3)</label></disp-formula>where <italic>c</italic> is the dilation factor, meaning that the filter skips every <italic>c</italic> samples from the previous layer.</p>
<p>By stacking layers with different dilations, the model can capture both short-term stride irregularities and long-term gait cycles, as defined in <xref ref-type="disp-formula" rid="disp-formula4">Equation 4</xref>.<disp-formula id="disp-formula4"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM4"><mml:mtable rowspacing="4pt" columnspacing="1em"><mml:mtr><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(4)</label></disp-formula>where <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM4"><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo><mml:mspace width="thickmathspace" /></mml:math></inline-formula> is the residual mapping of the input.</p>
<p>This mechanism ensures that the network does not lose fine-scale gait information while learning higher-order abstractions.</p>
<p>The model generates rich temporal features for the maps after several convolutional layers. These maps are, however, different in length depending on the size of the window and convolution parameters. Global average pooling is used to compress this onto a fixed-length vector that is a representation of the gait segment. This is utilized to compute the average activation at each time step giving a compressed embedding. The operation is constituted as in <xref ref-type="disp-formula" rid="disp-formula5">Equation&#x00A0;5</xref>:<disp-formula id="disp-formula5"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM5"><mml:mtable rowspacing="4pt" columnspacing="1em"><mml:mtr><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>f</mml:mi><mml:mo>=</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mfrac><mml:mn>1</mml:mn><mml:mi>Q</mml:mi></mml:mfrac></mml:mrow><mml:munderover><mml:mrow><mml:mo movablelimits="false">&#x2211;</mml:mo></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>Q</mml:mi></mml:munderover><mml:mo>&#x2061;</mml:mo><mml:msub><mml:mi>h</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mstyle></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(5)</label></disp-formula>where <italic>Q</italic> is the temporal length of the segment and <italic>f</italic> is the resulting embedding.</p>
<p>This latent vector captures dominant gait dynamics (e.g., stride frequency or imbalance) in a low-dimensional space, making it suitable for clustering and classification.</p>
<p>The model is trained, as an autoencoder, to make sure that important information is not lost in the learned embedding. The encoder is fed by inputs and transforms them into embeddings, and the decoder tries to rebuild the original input signals based on the embeddings. The reconstruction goal causes the embeddings to capture gait features of significance as opposed to noise. This is formalized as in <xref ref-type="disp-formula" rid="disp-formula6">Equation&#x00A0;6</xref>:<disp-formula id="disp-formula6"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM6"><mml:mtable rowspacing="4pt" columnspacing="1em"><mml:mtr><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:msub><mml:mrow><mml:mrow><mml:mover><mml:mi>m</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow></mml:mrow><mml:mi>k</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>g</mml:mi><mml:mi>d</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(6)</label></disp-formula>where <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM5"><mml:msub><mml:mi>g</mml:mi><mml:mi>d</mml:mi></mml:msub></mml:math></inline-formula> is the decoder function, <italic>u</italic> is the latent embedding, and <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM6"><mml:msub><mml:mrow><mml:mover><mml:mi>m</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>k</mml:mi></mml:msub><mml:mspace width="thickmathspace" /></mml:math></inline-formula> is the reconstructed signal.</p>
<p>Minimizing the difference between <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM7"><mml:msub><mml:mrow><mml:mover><mml:mi>m</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>k</mml:mi><mml:mspace width="thickmathspace" /></mml:mrow></mml:msub></mml:math></inline-formula> and the true <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM8"><mml:msub><mml:mi>m</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:math></inline-formula> ensures that embeddings encode physiologically relevant gait dynamics.</p>
<p>After the embeddings are generated, they are clustered to identify natural groupings of gait patterns. Redundancy is first removed using dimensionality reduction (through PCA), after which K-means clustering is carried out.</p>
<p>To estimate the best number of clusters, <italic>K</italic>, the silhouette score is calculated which weighs the intra-cluster cohesion and inter-cluster separation. The definition of this metric is shown in <xref ref-type="disp-formula" rid="disp-formula7">Equation&#x00A0;7</xref>:<disp-formula id="disp-formula7"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM7"><mml:mtable rowspacing="4pt" columnspacing="1em"><mml:mtr><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mfrac><mml:mrow><mml:mi>c</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">max</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mi>d</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(7)</label></disp-formula>where <italic>d</italic> is the mean distance of points of the same cluster and <italic>c</italic> is the mean distance between the current cluster and the other nearest cluster.</p>
<p>A higher silhouette score indicates better-separated clustering and helps to identify meaningful subgroups, such as healthy gait vs. Parkinsonian gait.</p>
<p>A shallow neural network classifier is trained on the embeddings with pseudo-labels obtained through the process of clustering to transform the unsupervised cluster assignments into a predictive tool. The softmax function is the calculation of the probability of classes by the classifier, as shown in <xref ref-type="disp-formula" rid="disp-formula8">Equation&#x00A0;8</xref>:<disp-formula id="disp-formula8"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM8"><mml:mtable rowspacing="4pt" columnspacing="1em"><mml:mtr><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:msub><mml:mrow><mml:mrow><mml:mover><mml:mi>m</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow></mml:mrow><mml:mi>l</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mfrac><mml:mrow><mml:mi>exp</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>p</mml:mi><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mi>g</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mi>l</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:msubsup><mml:mrow><mml:mo movablelimits="false">&#x2211;</mml:mo></mml:mrow><mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>l</mml:mi></mml:msubsup><mml:mo>&#x2061;</mml:mo><mml:mi>exp</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>p</mml:mi><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mi>g</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mi>l</mml:mi></mml:msub></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(8)</label></disp-formula>where <italic>g<sub>c</sub></italic> and <italic>b<sub>c</sub></italic> are classifier parameters, <italic>l</italic> is the total number of clusters (classes), and <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM9"><mml:msub><mml:mrow><mml:mover><mml:mi>m</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>l</mml:mi></mml:msub><mml:mspace width="thickmathspace" /></mml:math></inline-formula> is the probability that input <italic>p</italic> belongs to class <italic>c</italic>.</p>
<p>The model is optimized by minimizing the cross-entropy loss, which penalizes the discrepancy between actual labels and predicted probabilities. This is defined in <xref ref-type="disp-formula" rid="disp-formula9">Equation&#x00A0;9</xref>:<disp-formula id="disp-formula9"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM9"><mml:mtable rowspacing="4pt" columnspacing="1em"><mml:mtr><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo>&#x2212;</mml:mo></mml:mrow><mml:munderover><mml:mrow><mml:mo movablelimits="false">&#x2211;</mml:mo></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>C</mml:mi></mml:munderover><mml:mo>&#x2061;</mml:mo><mml:msub><mml:mi>o</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mrow><mml:mi mathvariant="normal">log</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow></mml:mrow><mml:mi>c</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(9)</label></disp-formula>where <italic>o<sub>h</sub></italic> is the actual label (one-hot encoded).</p>
<p>This guarantees that the classifier learns decision boundaries that match the gait clusters, so that it can classify unseen gait segments robustly.</p>
<p>SHapley Additive exPlanations (SHAP) is used to provide interpretability. SHAP is based on the cooperative game theory, and the score of significance to each feature is calculated by measuring its marginal contribution to any feature subset. This is regularized in <xref ref-type="disp-formula" rid="disp-formula10">Equation&#x00A0;10</xref>:<disp-formula id="disp-formula10"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM10"><mml:mtable rowspacing="4pt" columnspacing="1em"><mml:mtr><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mspace width="thinmathspace" /><mml:msub><mml:mi>p</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:mo movablelimits="false">&#x2211;</mml:mo></mml:mrow><mml:mrow><mml:mi>S</mml:mi><mml:mo>&#x2286;</mml:mo><mml:mi>F</mml:mi><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mi>l</mml:mi><mml:mo fence="false" stretchy="false">}</mml:mo></mml:mrow></mml:munder><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mfrac><mml:mrow><mml:mrow><mml:mo>&#x2223;</mml:mo></mml:mrow><mml:mi>A</mml:mi><mml:mrow><mml:mo>&#x2223;</mml:mo></mml:mrow><mml:mo>!</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mo>&#x2223;</mml:mo></mml:mrow><mml:mi>W</mml:mi><mml:mrow><mml:mo>&#x2223;</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mrow><mml:mo>&#x2223;</mml:mo></mml:mrow><mml:mi>A</mml:mi><mml:mrow><mml:mo>&#x2223;</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo><mml:mo>!</mml:mo></mml:mrow><mml:mrow><mml:mrow><mml:mo>&#x2223;</mml:mo></mml:mrow><mml:mi>W</mml:mi><mml:mrow><mml:mo>&#x2223;</mml:mo></mml:mrow><mml:mo>!</mml:mo></mml:mrow></mml:mfrac></mml:mrow><mml:mspace width="thinmathspace" /><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mi>w</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>A</mml:mi><mml:mo>&#x222A;</mml:mo><mml:mo fence="false" 
stretchy="false">{</mml:mo><mml:mi>i</mml:mi><mml:mo fence="false" stretchy="false">}</mml:mo></mml:mrow><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>w</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>A</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mstyle></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(10)</label></disp-formula>where <italic>p<sub>i</sub></italic> is the importance of feature <italic>l</italic>, <italic>W</italic> is the full feature set, <italic>A</italic> is a subset excluding feature <italic>l</italic>, and <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM10"><mml:mi>w</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>&#x22C5;</mml:mo></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> is the model prediction.</p>
<p>As a complementary interpretability technique, Integrated Gradients (IG) explains the prediction of a neural network using the input features of the network by summing gradients along a linear path between a baseline (e.g., zero input) and the actual input. IG contrasts with SHAP, which is a combinatorial-based method, and operates on a direct measure of the sensitivity of outputs to the change of input values. <xref ref-type="disp-formula" rid="disp-formula11">Equation&#x00A0;11</xref> defines the attribution as follows:<disp-formula id="disp-formula11"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM11"><mml:mtable rowspacing="4pt" columnspacing="1em"><mml:mtr><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mi>I</mml:mi><mml:msub><mml:mi>G</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>k</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msubsup><mml:mi>k</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:munderover><mml:mrow><mml:mo largeop="false">&#x222B;</mml:mo></mml:mrow><mml:mn>0</mml:mn><mml:mn>1</mml:mn></mml:munderover><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mfrac><mml:mrow><mml:mi mathvariant="normal">&#x2202;</mml:mi><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:mo>+</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>&#x2212;</mml:mo><mml:msup><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2202;</mml:mi><mml:msub><mml:mi>k</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:mfrac></mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>d</mml:mi><mml:mi>v</mml:mi></mml:mstyle></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd><mml:mtd><mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(11)</label></disp-formula>where <italic>k</italic> is the actual input, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM11"><mml:msup><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> is the baseline input, and <italic>v</italic> is an integration parameter.</p>
<p>This integral accumulates gradients along the path from the baseline to the input, pointing out which sensors and time points were most responsible for the final decision. Within gait analysis, IG has been used to localize particular phases of the stride cycle (such as toe-off or heel-strike) that were most important for the classification of Parkinsonian gait.</p>
</sec>
<sec id="s3b"><label>3.2</label><title>Voice feature extraction pipeline</title>
<p>In Parkinson&#x2019;s disease (PD), some of the first impairments include inappropriate loudness, monotone pitch, inaccurate articulation, or a hoarse voice. Because of these characteristics, raw audio recordings of sustained vowel phonation are incorporated into the proposed system to maximize its diagnostic capability. As discussed in <xref ref-type="fig" rid="F1">Figure&#x00A0;1</xref>, these sound signals are uniformly sampled (e.g., 16&#x2005;kHz) and represent an unaccented vocal prolongation (e.g., a prolonged /a/ sound) by each subject, so that phonatory deviations can be isolated without lexical or linguistic variations.</p>
<p>The one-dimensional signal <italic>x</italic>(<italic>t</italic>) is transformed to a two-dimensional time&#x2013;frequency representation through the short-time Fourier transform (STFT) which takes the Fourier transform of windowed slices of the signal. Mathematically, the STFT of a signal is computed as shown in <xref ref-type="disp-formula" rid="disp-formula12">Equation&#x00A0;12</xref>:<disp-formula id="disp-formula12"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM12"><mml:mi>o</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>g</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:munderover><mml:mrow><mml:mo movablelimits="false">&#x2211;</mml:mo></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>&#x2212;</mml:mo></mml:mrow><mml:mi mathvariant="normal">&#x221E;</mml:mi></mml:mrow><mml:mi mathvariant="normal">&#x221E;</mml:mi></mml:munderover><mml:mo>&#x2061;</mml:mo><mml:mi>o</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mi>w</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>g</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mrow><mml:mo>&#x2212;</mml:mo></mml:mrow><mml:mi>j</mml:mi><mml:mi>w</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msup></mml:math><label>(12)</label></disp-formula>where <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM12"><mml:mi>o</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>g</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>is the STFT output, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM13"><mml:mi>o</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>l</mml:mi><mml:mo 
stretchy="false">)</mml:mo></mml:math></inline-formula>is the input signal, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM14"><mml:mi>w</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>g</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>is the windowing function, and <italic>w</italic> is the angular frequency.</p>
<p>The result is a spectrogram in complex values showing the frequency content vs. time. To place a frequency scale more in accord with human perception of sound, the frequency axis on the spectrogram is converted using the Mel scale. The Mel-frequency <italic>m</italic> corresponding to a linear frequency <italic>f</italic> (in Hz) is calculated according to <xref ref-type="disp-formula" rid="disp-formula13">Equation&#x00A0;13</xref>:<disp-formula id="disp-formula13"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM13"><mml:mi>m</mml:mi><mml:mi>e</mml:mi><mml:mi>l</mml:mi><mml:mo>=</mml:mo><mml:mn>2595</mml:mn><mml:mo>&#x22C5;</mml:mo><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:msub><mml:mi>g</mml:mi><mml:mrow><mml:mn>10</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>+</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mfrac><mml:mi>l</mml:mi><mml:mrow><mml:mn>700</mml:mn></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math><label>(13)</label></disp-formula>where <italic>mel</italic> is the Mel frequency and <italic>l</italic> is the linear frequency in Hz.</p>
<p>Such perceptual scaling focuses on low-frequency bands of voice pathologies. The magnitude spectrogram is then squeezed in the logarithmic form as shown in <xref ref-type="disp-formula" rid="disp-formula14">Equation&#x00A0;14</xref>:<disp-formula id="disp-formula14"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM14"><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">log</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>k</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>g</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>E</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>k</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math><label>(14)</label></disp-formula>where <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM15"><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">log</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>k</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>represents the log-Mel spectrogram; <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM16"><mml:mi>E</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>k</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, magnitude Mel spectrogram; and <italic>b</italic>, small constant for stability.</p>
<p>The resulting log-Mel spectrogram is a suitable two-dimensional input to the EfficientNet-B0 architecture which is a convolutional feature extractor. Instead of derived handcrafted shallow convolutional kernels, EfficientNet-B0 uses an optimized sequence of convolutional blocks that uses depth-wise separable convolutions and squeeze and excitation modules. The feature maps obtained after the convolutional processing are pooled by using global average pooling to obtain a compact latent embedding, <italic>z</italic>. The resulting classification output is hence obtained as shown in <xref ref-type="disp-formula" rid="disp-formula15">Equation&#x00A0;15</xref>:<disp-formula id="disp-formula15"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM15"><mml:mrow><mml:mover><mml:mi>K</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mo>=</mml:mo><mml:mi>d</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>O</mml:mi><mml:mi>a</mml:mi><mml:mo>+</mml:mo><mml:mi>f</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math><label>(15)</label></disp-formula>where <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM17"><mml:mrow><mml:mover><mml:mi>K</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow></mml:math></inline-formula>represents the predicted class probabilities; <italic>a</italic>, latent feature vector obtained after global average pooling; <italic>O</italic>, weight matrix of the classifier; <italic>f</italic>, bias term; and <italic>d</italic>, softmax activation function.</p>
<p>To make it more robust and insensitive to local spatial variations, the extracted feature maps from EfficientNet-B0 are aggregated with global average pooling. This function takes each feature map and compresses it into one representative value leading to a small latent embedding vector. z that captures the general discriminative information of the input spectrogram. The classification output is then optimized by using the cross-entropy loss function, which is written as in <xref ref-type="disp-formula" rid="disp-formula16">Equation&#x00A0;16</xref>:<disp-formula id="disp-formula16"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM16"><mml:mi>C</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>&#x2212;</mml:mo></mml:mrow><mml:munder><mml:mrow><mml:mo movablelimits="false">&#x2211;</mml:mo></mml:mrow><mml:mrow><mml:mi>k</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="normal">PD</mml:mi><mml:mo>,</mml:mo><mml:mi mathvariant="normal">HC</mml:mi></mml:mrow></mml:mrow><mml:mo fence="false" stretchy="false">}</mml:mo></mml:mrow></mml:munder><mml:mo>&#x2061;</mml:mo><mml:msub><mml:mi>h</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mspace width="thinmathspace" /><mml:mrow><mml:mi mathvariant="normal">log</mml:mi></mml:mrow><mml:msub><mml:mrow><mml:mover><mml:mi>h</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>k</mml:mi></mml:msub></mml:math><label>(16)</label></disp-formula>where <italic>C</italic> represents the cross-entropy loss; <italic>h<sub>k</sub></italic>, ground-truth label for class <italic>c</italic>; and <italic>h<sub>k</sub></italic>, predicted probability for class <italic>k</italic>.</p>
<p>EfficientNet-B0 uses alternating layers of convolutional and pooling operations to capture information that is relevant to PD into voice features. In addition, to promote the interpretability of predictions, Grad-CAM is used. At a minimum, Grad-CAM calculates the importance weight <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM18"><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:math></inline-formula> of each feature map. The global average <italic>A<sup>k</sup></italic> of the gradients given by <xref ref-type="disp-formula" rid="disp-formula17">Equation&#x00A0;17</xref>:<disp-formula id="disp-formula17"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM17"><mml:msub><mml:mi>O</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mfrac><mml:mn>1</mml:mn><mml:mi>Z</mml:mi></mml:mfrac></mml:mrow><mml:munder><mml:mrow><mml:mo movablelimits="false">&#x2211;</mml:mo></mml:mrow><mml:mi>i</mml:mi></mml:munder><mml:mo>&#x2061;</mml:mo><mml:munder><mml:mrow><mml:mo movablelimits="false">&#x2211;</mml:mo></mml:mrow><mml:mi>j</mml:mi></mml:munder><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mfrac><mml:mrow><mml:mi mathvariant="normal">&#x2202;</mml:mi><mml:msup><mml:mi>p</mml:mi><mml:mi>l</mml:mi></mml:msup></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2202;</mml:mi><mml:msubsup><mml:mi>A</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mi>k</mml:mi></mml:msubsup></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:mstyle></mml:math><label>(17)</label></disp-formula>where <italic>O<sub>k</sub></italic> represents the importance weight of feature map <italic>k</italic>; <italic>p<sup>l</sup></italic>, score for class <italic>l</italic>; <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" 
id="IM19"><mml:msubsup><mml:mi>A</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mi>k</mml:mi></mml:msubsup></mml:math></inline-formula>, activation at location <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM20"><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>; and <italic>Z</italic>, number of pixels.</p>
<p>The class activation map is then computed using <xref ref-type="disp-formula" rid="disp-formula18">Equation&#x00A0;18</xref>:<disp-formula id="disp-formula18"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM18"><mml:mi>L</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mi mathvariant="normal">ReLU</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:munder><mml:mrow><mml:mo movablelimits="false">&#x2211;</mml:mo></mml:mrow><mml:mi>k</mml:mi></mml:munder><mml:mo>&#x2061;</mml:mo><mml:msub><mml:mi>o</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:msup><mml:mi>m</mml:mi><mml:mi>g</mml:mi></mml:msup></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math><label>(18)</label></disp-formula>where <italic>L</italic> represents the heatmap; <italic>o<sub>l</sub></italic>, weights from <xref ref-type="disp-formula" rid="disp-formula12">Equation&#x00A0;12</xref>; <italic>m<sup>g</sup></italic>, feature map; and ReLU keeps only positive contributions. By means of this approach, clinicians will be able to understand predictions based on frequency divisions and time points that reflect the presence of any abnormalities in voice.</p>
</sec>
<sec id="s3c"><label>3.3</label><title>Handwriting feature extraction pipeline</title>
<p>As discussed in <xref ref-type="fig" rid="F1">Figure&#x00A0;1</xref>, motor disturbance in handwriting in the form of tremor, micrographia, and uneven stroke patterns is another identified symptom of PD. To capture such impairments, the proposed system will entail a handwriting analysis pipeline that will process drawn spirals that are submitted as images. The collection of these spirals is normally done on a digital tablet or scanned on paper where the participant is asked to trace over an Archimedean spiral. The resultant images capture the neuromuscular coordination, such as fine motor abilities, stroke fluency, and tremor strength. Raw images are preprocessed by conversion to grayscale, resizing (to 224,224 pixels), and contrast normalization queries. Before analysis, the images have been processed as follows: converted to grayscale, resized (to 224,224 pixels), and contrast has been normalized.</p>
<p>As an ancillary geometrical landmark, the curvature, kappa, of the spiral stroke can be derived to measure local tremor. In a two-dimensional curve given parametrically [<italic>x</italic>(<italic>t</italic>),<italic>y</italic>(<italic>t</italic>)], the curvature is computed as in <xref ref-type="disp-formula" rid="disp-formula19">Equation&#x00A0;19</xref>:<disp-formula id="disp-formula19"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM19"><mml:mi>h</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mfrac><mml:mrow><mml:msup><mml:mi>w</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mrow><mml:msup><mml:mi>g</mml:mi><mml:mn>2</mml:mn></mml:msup></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2212;</mml:mo><mml:msup><mml:mi>g</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mrow><mml:msup><mml:mi>w</mml:mi><mml:mn>2</mml:mn></mml:msup></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mi>w</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:msup><mml:mo>+</mml:mo><mml:msup><mml:mi>g</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo 
stretchy="false">)</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mrow><mml:mn>3</mml:mn><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mrow></mml:msup></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math><label>(19)</label></disp-formula>where <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM21"><mml:mi>h</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>represents the curvature; <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM22"><mml:mrow><mml:msup><mml:mi>w</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mrow><mml:msup><mml:mi>g</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, first derivatives; and <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM23"><mml:mi>w</mml:mi><mml:mi mathvariant="normal">&#x2032;</mml:mi><mml:mi mathvariant="normal">&#x2032;</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mi>g</mml:mi><mml:mi mathvariant="normal">&#x2032;</mml:mi><mml:mi mathvariant="normal">&#x2032;</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, second derivatives.</p>
<p>The now preprocessed spiral&#x0027;s image is then fed into a ResNet-50-based deep convolutional neural network (CNN) that is a stack of residual blocks. In every block, several layers of convolution that extract spatial features are used. The convolutional layer is a 2D convolution layer, and outputs (<italic>i</italic>,<italic>j</italic>) activation is calculated according to <xref ref-type="disp-formula" rid="disp-formula20">Equation&#x00A0;20</xref>:<disp-formula id="disp-formula20"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM20"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mrow><mml:mo>,</mml:mo></mml:mrow><mml:mi>b</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:munder><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>q</mml:mi><mml:mrow><mml:mo>,</mml:mo></mml:mrow><mml:mi>w</mml:mi></mml:mrow></mml:munder><mml:mrow><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>q</mml:mi><mml:mrow><mml:mo>,</mml:mo></mml:mrow><mml:mi>w</mml:mi></mml:mrow></mml:msub><mml:mo>&#x00D7;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>+</mml:mo><mml:mi>q</mml:mi><mml:mrow><mml:mo>,</mml:mo></mml:mrow><mml:mi>j</mml:mi><mml:mo>+</mml:mo><mml:mi>w</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>e</mml:mi></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math><label>(20)</label></disp-formula>where <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM24"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mrow><mml:mo>,</mml:mo></mml:mrow><mml:mi>b</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>represents the output at <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM25"><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>; 
<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM26"><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mo>+</mml:mo><mml:mi>q</mml:mi><mml:mo>,</mml:mo><mml:mi>b</mml:mi><mml:mo>+</mml:mo><mml:mi>w</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, input patch; <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM27"><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>q</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, kernel weights; <italic>e</italic>, bias term; and <italic>S</italic>, activation function.</p>
<p>ResNet adds identity shortcuts: The output of a layer can be combined directly with the input of the next layer in the chain, as shown in <xref ref-type="disp-formula" rid="disp-formula21">Equation&#x00A0;21</xref>:<disp-formula id="disp-formula21"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM21"><mml:mi>r</mml:mi><mml:mo>=</mml:mo><mml:mi>F</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>g</mml:mi><mml:mrow><mml:mo>,</mml:mo></mml:mrow><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mrow><mml:msub><mml:mi>A</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo fence="false" stretchy="false">}</mml:mo></mml:mrow><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mi>o</mml:mi><mml:mspace width="thickmathspace" /></mml:math><label>(21)</label></disp-formula>where <italic>r</italic> represents the block output; <italic>o</italic>, input; and <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM28"><mml:mi>F</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>g</mml:mi><mml:mrow><mml:mo>,</mml:mo></mml:mrow><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mrow><mml:msub><mml:mi>A</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo fence="false" stretchy="false">}</mml:mo></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, residual mapping.</p>
<p>This architecture alleviates the vanishing gradient issue. The non-linearity in each block is due to the ReLU activation function as indicated in <xref ref-type="disp-formula" rid="disp-formula22">Equation&#x00A0;22</xref>:<disp-formula id="disp-formula22"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM22"><mml:mi>A</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>d</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mi mathvariant="normal">max</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo></mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math><label>(22)</label></disp-formula>where <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM29"><mml:mi>A</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>d</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>represents the ReLU activation and <italic>d</italic> represents the input.</p>
<p>The last convolutional image is then compressed using global average pooling to give a condensed feature vector. As an input, this vector is fed into the multimodal fusion and classification module. To interpret the decision process of the model, we use Grad-CAM on the final convolutional layer. Grad-CAM interprets the classification score by calculating the gradient of the class label <italic>y<sub>c</sub></italic> relative to feature maps <italic>A<sup>k</sup></italic> to identify the significance weight of each map, as mentioned in <xref ref-type="disp-formula" rid="disp-formula17">Equation&#x00A0;17</xref>. The resultant heatmap is a weighted summation of feature maps, as indicated in <xref ref-type="disp-formula" rid="disp-formula18">Equation&#x00A0;18</xref>. When used on spiral drawings, Grad-CAM can localize motor abnormalities found in stroke with sharp curvature changes or vibration.</p>
</sec>
<sec id="s3d"><label>3.4</label><title>Multimodal feature fusion without subject-level overlap</title>
<p>The handwriting, gait, and speech datasets used in this study originate from independent sources and do not contain subject-level correspondence. Consequently, direct subject-wise fusion is not feasible. To address this, multimodal fusion is performed at the feature-distribution level. For each modality, deep feature representations are independently extracted and grouped according to diagnostic labels (PD or healthy control). Within each class, normalized statistical descriptors are computed to represent modality-specific feature distributions. These class-consistent representations are then concatenated to form trimodal feature vectors, which are used as inputs for XGBoost-based classification.</p>
<p>This fusion strategy ensures label consistency across modalities while enabling effective multimodal learning without requiring subject-level alignment.</p>
</sec>
<sec id="s3e"><label>3.5</label><title>Multimodal fusion and classification pipeline</title>
<p>Although unimodal analysis gives good insights into individual domains of symptoms in PD, they do not give the complete picture of the heterogeneity of Parkinson&#x2019;s disease. To counter this, our system uses a multimodal fusion architecture to combine the voice, handwriting, and gait information. As discussed in <xref ref-type="fig" rid="F1">Figure&#x00A0;1</xref>, a deep learning model that has been trained on feature vectors specific to each modality (voice, hand, gait) produces feature vectors <italic>v</italic><sub>voice</sub>, <italic>v</italic><sub>hand</sub>, and <italic>v</italic><sub>gait</sub> separately. All these are joined together in one single vector as presented in <xref ref-type="disp-formula" rid="disp-formula23">Equation&#x00A0;23</xref>:<disp-formula id="disp-formula23"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM23"><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">fused</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">voice</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2225;</mml:mo><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">hand</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2225;</mml:mo><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">gait</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">]</mml:mo><mml:mspace width="thickmathspace" /><mml:mspace width="thickmathspace" /><mml:mspace width="thickmathspace" /><mml:mspace width="thickmathspace" /><mml:mspace width="thickmathspace" /><mml:mspace width="thickmathspace" /><mml:mspace width="thickmathspace" /><mml:mspace width="thickmathspace" /><mml:mspace width="thickmathspace" /><mml:mspace width="thickmathspace" /></mml:math><label>(23)</label></disp-formula>where <inline-formula><mml:math 
xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM30"><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">fused</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>represents fused feature vector; <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM31"><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">voice</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>, voice features; <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM32"><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">hand</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>, handwriting features; <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM33"><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">gait</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>, gait features; <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM34"><mml:mrow><mml:mo>&#x2225;</mml:mo></mml:mrow></mml:math></inline-formula>, concatenation.</p>
<p>This approach allows the model to be cross-modal learning. The fused vector is next forwarded to extreme gradient boosting (XGBoost).</p>
<p>XGBoost is an additive booster. At iteration <italic>t</italic>, the forecast is added to all trees, as in <xref ref-type="disp-formula" rid="disp-formula24">Equation&#x00A0;24</xref>:<disp-formula id="disp-formula24"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM24"><mml:msup><mml:mi>g</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:munderover><mml:mrow><mml:mo movablelimits="false">&#x2211;</mml:mo></mml:mrow><mml:mrow><mml:mi>u</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>f</mml:mi></mml:munderover><mml:mo>&#x2061;</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mi>u</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math><label>(24)</label></disp-formula>where <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM35"><mml:msup><mml:mi>g</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup></mml:math></inline-formula>represents prediction at iteration <italic>f</italic> and <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM36"><mml:msub><mml:mi>d</mml:mi><mml:mi>u</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>represents weak learner (tree) at iteration <italic>u</italic>.</p>
<p>The full regularized objective function is given in <xref ref-type="disp-formula" rid="disp-formula25">Equations&#x00A0;25</xref> and&#x00A0;<xref ref-type="disp-formula" rid="disp-formula26">26</xref>:<disp-formula id="disp-formula25"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM25"><mml:mi>T</mml:mi><mml:mo>=</mml:mo><mml:munderover><mml:mrow><mml:mo movablelimits="false">&#x2211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:mo>&#x2061;</mml:mo><mml:mi>E</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>h</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mover><mml:mi>h</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow></mml:mrow><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:munderover><mml:mrow><mml:mo movablelimits="false">&#x2211;</mml:mo></mml:mrow><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>t</mml:mi></mml:munderover><mml:mo>&#x2061;</mml:mo><mml:mi>R</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>S</mml:mi><mml:mi>u</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math><label>(25)</label></disp-formula><disp-formula id="disp-formula26"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM26"><mml:mi mathvariant="normal">&#x03A9;</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>o</mml:mi><mml:mi>M</mml:mi><mml:mo>+</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac></mml:mrow><mml:mi>b</mml:mi><mml:mo>&#x2225;</mml:mo><mml:mi>g</mml:mi><mml:msup><mml:mo stretchy="false">&#x2225;</mml:mo><mml:mn>2</mml:mn></mml:msup></mml:mstyle></mml:math><label>(26)</label></disp-formula>where 
<italic>T</italic> is the objective; <italic>E</italic>, loss; <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM37"><mml:msub><mml:mi>h</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:math></inline-formula>, true label; <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM38"><mml:msub><mml:mrow><mml:mover><mml:mi>h</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>i</mml:mi></mml:msub></mml:math></inline-formula>, prediction; <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM39"><mml:mi>R</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>S</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, regularization term; <italic>o</italic>, complexity parameter; <italic>M</italic>, number of leaves; <italic>b</italic>, regularization strength; and <italic>g</italic>, leaf weights.</p>
<p>The preponderance of the 10 alternatives in the three models, which can be calculated by averaging class probabilities (soft voting) as in <xref ref-type="disp-formula" rid="disp-formula27">Equation&#x00A0;27</xref>, is as follows:<disp-formula id="disp-formula27"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM27"><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi mathvariant="normal">C</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mfrac><mml:mn>1</mml:mn><mml:mn>3</mml:mn></mml:mfrac></mml:mrow><mml:munderover><mml:mrow><mml:mo movablelimits="false">&#x2211;</mml:mo></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mn>3</mml:mn></mml:munderover><mml:mo>&#x2061;</mml:mo><mml:msup><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>m</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup></mml:mstyle></mml:math><label>(27)</label></disp-formula>where <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM40"><mml:msub><mml:mi>P</mml:mi><mml:mi>C</mml:mi></mml:msub></mml:math></inline-formula>is the final probability for class <italic>c</italic> and <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM41"><mml:msup><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>b</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup></mml:math></inline-formula>is the probability from model <italic>b</italic>.</p>
<p>The transparency is guaranteed by hiding SHapley Additive exPlanations (SHAP). To explain each prediction, it is necessary that SHAP values be assigned to each feature, as the attribution score using the Shapley values formula, mentioned in <xref ref-type="disp-formula" rid="disp-formula10">Equation&#x00A0;10</xref>.</p>
<p>The detailed training and evaluation process of the framework is further explained using the structured pseudocode, which explains data loading, encoder building, multimodal fusion, classifier training, validation, and explanation. Together, the architectural design (<xref ref-type="fig" rid="F1">Figure&#x00A0;1</xref>), representation of the flow (<xref ref-type="fig" rid="F2">Figure&#x00A0;2</xref>), and description of the pseudocode convey the methodology in detail for robust and interpretable prediction of Parkinson&#x0027;s disease in the three modalities of speech, gait, and handwriting.</p>
<fig id="F2" position="float"><label>Figure&#x00A0;2</label>
<caption><p>Workflow of the proposed model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g002.tif"><alt-text content-type="machine-generated">Flowchart illustrating a multimodal Parkinson&#x2019;s disease classification pipeline, including unimodal feature extraction from gait, handwriting, and voice, multidimensional fusion and classification, and interpretability integration with techniques like integrated gradients, SHAP, and Grad-CAM, leading to binary classification as either Parkinson&#x2019;s or healthy.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3f"><label>3.6</label><title>Pseudocode</title>
<p>The algorithm is based on two algorithms. (Unimodal) <xref ref-type="statement" rid="algo1">Algorithm&#x00A0;1</xref> trains separate models of the three modalities: speech, gait, and handwriting; each modality is preprocessed (spectrograms, sliding windows, or images), encoded with EfficientNet-B0/AE&#x2009;&#x002B;&#x2009;multilayer perceptron (MLP)/ResNet50, and optimized with cross-entropy loss. (Trimodal) <xref ref-type="statement" rid="algo2">Algorithms&#x00A0;2</xref> fuses the embeddings of those three modalities (using PCA in the speech case and SMOTE in the balance case) and concatenates them into a common feature.</p><statement content-type="algorithm" id="algo1"><label>Algorithm 1</label><title>Unimodal feature learning and classification.</title>
<p><bold>Input:</bold></p>
<p>D&#x003D;&#x007B;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM42"><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM43"><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM44"><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>&#x007D; // Speech, Gait, Handwriting datasets</p>
<p>Y // Class labels</p>
<p>E // Number of training epochs</p>
<p><bold>Output:</bold></p>
<p><inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM45"><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM46"><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM47"><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>&#x2003;&#x2003;// Trained unimodal models</p>
<p><inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM48"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM49"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM50"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>&#x2003;&#x2003;// Learned feature embeddings</p>
<p><bold>Steps:</bold></p>
<p>1: Begin</p>
<p>2: &#x2003;for each modality m &#x2208; &#x007B;Speech, Gait, Handwriting&#x007D; do</p>
<p><bold>Data Preprocessing</bold></p>
<p>3: &#x2003;&#x2003;if m &#x003D;&#x003D; Speech then</p>
<p>4: &#x2003;&#x2003;&#x2003;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM51"><mml:msub><mml:mi>X</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>&#x2190; GenerateSpectrogram(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM52"><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>)</p>
<p>5: &#x2003;&#x2003;else if m &#x003D;&#x003D; Gait then</p>
<p>6: &#x2003;&#x2003;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM53"><mml:msub><mml:mi>X</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula> &#x2190; ZScore_Normalize(SlidingWindow(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM54"><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>))</p>
<p>7: &#x2003;else if m &#x003D;&#x003D; Handwriting then</p>
<p>8: &#x2003;&#x2003;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM55"><mml:msub><mml:mi>X</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula> &#x2190; Normalize(Resize(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM56"><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>))</p>
<p>9: &#x2003;end if</p>
<p><bold>Model Definition</bold></p>
<p>10: &#x2003;if m &#x003D;&#x003D; Speech then</p>
<p>11: &#x2003;&#x2003;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM57"><mml:msub><mml:mi>M</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>&#x2190; EfficientNet-B0</p>
<p>12: &#x2003;else if m &#x003D;&#x003D; Gait then</p>
<p>13: &#x2003;&#x2003;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM58"><mml:msub><mml:mi>M</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>&#x2190; TCN-AutoEncoder&#x002B;<sans-serif>MLP</sans-serif></p>
<p>14: &#x2003;else if m &#x003D;&#x003D; Handwriting then</p>
<p>15: &#x2003;&#x2003;&#x2003;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM59"><mml:msub><mml:mi>M</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>&#x2190; ResNet-50</p>
<p>16: &#x2003;end if</p>
<p><bold>Model Training</bold></p>
<p>17: &#x2003;for epoch&#x003D;1 to E do</p>
<p>18: &#x2003;&#x2003;&#x2003;&#x2003;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM60"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> &#x2190; <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM61"><mml:msub><mml:mi>M</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula> (<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM62"><mml:msub><mml:mi>X</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>)</p>
<p>19: &#x2003;&#x2003;&#x2003;&#x2003;Loss &#x2190; CrossEntropy(Y, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM63"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>)</p>
<p>20: &#x2003;&#x2003;&#x2003;&#x2003;UpdateWeights(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM64"><mml:msub><mml:mi>M</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>, Loss)</p>
<p>21: &#x2003;end for</p>
<p><bold>Feature Embedding Extraction</bold></p>
<p>22: <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM65"><mml:msub><mml:mi>Z</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>&#x2190; ExtractEmbedding(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM66"><mml:msub><mml:mi>M</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM67"><mml:msub><mml:mi>X</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>)</p>
<p>23: &#x2003;Save(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM68"><mml:msub><mml:mi>Z</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>)</p>
<p><bold>Performance Evaluation</bold></p>
<p>24: <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM69"><mml:mi>A</mml:mi><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:msub><mml:mi>y</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>&#x2190; ComputeAccuracy(Y, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM70"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>)</p>
<p>25: &#x2003;&#x2003;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM71"><mml:mi>F</mml:mi><mml:msub><mml:mn>1</mml:mn><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>&#x2190; ComputeF1Score(Y, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM72"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>)</p>
<p>26: &#x2003;&#x2003;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM73"><mml:mi>L</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mi>s</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>&#x2190; ComputeValidationLoss(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM74"><mml:msub><mml:mi>M</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>)</p>
<p><bold>Explainability</bold></p>
<p>27: if m &#x2208; &#x007B;Speech, Handwriting&#x007D; then</p>
<p>28: &#x2003;&#x2003;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM75"><mml:mrow><mml:mspace width="thickmathspace" /><mml:mi mathvariant="normal">Explainabilit</mml:mi></mml:mrow><mml:msub><mml:mrow><mml:mi mathvariant="normal">y</mml:mi></mml:mrow><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula> &#x2190; GradCAM(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM76"><mml:msub><mml:mi>M</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM77"><mml:msub><mml:mi>X</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>)</p>
<p>29: &#x2003;else if m &#x003D;&#x003D; Gait then</p>
<p>30: <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM78"><mml:mrow><mml:mspace width="thickmathspace" /><mml:mi mathvariant="normal">Explainabilit</mml:mi></mml:mrow><mml:msub><mml:mrow><mml:mi mathvariant="normal">y</mml:mi></mml:mrow><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula> &#x2190; SHAP&#x002B;<sans-serif>IntegratedGradients</sans-serif>(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM79"><mml:msub><mml:mi>M</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM80"><mml:msub><mml:mi>X</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:math></inline-formula>)</p>
<p>31: &#x2003;end if</p>
<p>32: &#x2003;end for</p>
<p>33: End</p></statement><statement content-type="algorithm" id="algo2"><label>Algorithm 2</label><title>Trimodal fusion with XGBoost and explainability.</title>
<p><bold>Input:</bold></p>
<p>&#x2003;&#x2003;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM81"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>p</mml:mi><mml:mi>p</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM82"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM83"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>&#x2003;&#x2003; // Unimodal embeddings - Speech PCA, Gait, Handwriting</p>
<p>&#x2003;&#x2003;Y&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; // Class labels</p>
<p>&#x2003;&#x2003;SplitRatio&#x003D;<sans-serif>0</sans-serif>.6&#x2003;&#x2003; // Train&#x2013;test split</p>
<p><bold>Output:</bold></p>
<p>&#x2003;&#x2003;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM84"><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>x</mml:mi><mml:mi>g</mml:mi><mml:mi>b</mml:mi><mml:mspace width="thickmathspace" /></mml:mrow></mml:msub></mml:math></inline-formula>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;// Trained XGBoost fusion model</p>
<p>&#x2003;&#x2003;Metrics&#x003D;&#x007B;Accuracy, F1, AUC&#x007D;</p>
<p>&#x2003;&#x2003;ExplainabilityMaps</p>
<p><bold>Steps:</bold></p>
<p>1: Begin</p>
<p>2: Load and Align Embeddings</p>
<p>3: <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM85"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>S</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> &#x2190; Load(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM86"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>p</mml:mi><mml:mi>p</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>)</p>
<p>4: <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM87"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> &#x2190; Load(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM88"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>a</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>)</p>
<p>5: <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM89"><mml:msub><mml:mrow><mml:mi mathvariant="normal">Z</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>&#x2190; Load(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM90"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>)</p>
<p>6: &#x007B;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM91"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>S</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM92"><mml:mspace width="thickmathspace" /><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>a</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="normal">Z</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, Y&#x007D; &#x2190; AlignSubjects(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM93"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>S</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM94"><mml:mspace width="thickmathspace" /><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>a</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="normal">Z</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:math></inline-formula>Y)</p>
<p><bold>Feature Fusion and Pre-processing</bold></p>
<p>7: &#x2003;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM95"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>f</mml:mi><mml:mi>u</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> &#x2190; Concatenate(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM96"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>S</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM97"><mml:mspace width="thickmathspace" /><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>a</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="normal">Z</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>)</p>
<p>8: &#x2003;&#x007B;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM98"><mml:mspace width="thickmathspace" /><mml:msub><mml:mrow><mml:mi mathvariant="normal">Z</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">train</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM99"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM100"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM101"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> &#x007D; &#x2190; Train-Test-Split(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM102"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>f</mml:mi><mml:mi>u</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, Y, SplitRatio)</p>
<p>9: &#x2003;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM103"><mml:msub><mml:mrow><mml:mi mathvariant="normal">Z</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">train</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> &#x2190; Standardize(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM104"><mml:msub><mml:mrow><mml:mi mathvariant="normal">Z</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">train</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>)</p>
<p>10: &#x2003;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM105"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>&#x2190; ApplySameScaling(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM106"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>)</p>
<p>11: &#x2003;&#x2003;<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM107"><mml:mo fence="false" stretchy="false">{</mml:mo><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi mathvariant="normal">&#x005F;</mml:mi><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM108"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi mathvariant="normal">&#x005F;</mml:mi><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo fence="false" stretchy="false">}</mml:mo></mml:math></inline-formula> &#x2190; SMOTE(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM109"><mml:msub><mml:mrow><mml:mi mathvariant="normal">Z</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">train</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>)</p>
<p><bold>Model Training and Evaluation</bold></p>
<p>12: <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM110"><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>x</mml:mi><mml:mi>g</mml:mi><mml:mi>b</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> &#x2190; TrainXGBoost(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM111"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi mathvariant="normal">&#x005F;</mml:mi><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM112"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi mathvariant="normal">&#x005F;</mml:mi><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>)</p>
<p>13: <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM113"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mspace width="thickmathspace" /></mml:math></inline-formula>&#x2190; Predict(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM114"><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>x</mml:mi><mml:mi>g</mml:mi><mml:mi>b</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>)</p>
<p>14: Accuracy &#x2190; ComputeAccuracy(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM115"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM116"><mml:mspace width="thickmathspace" /><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>)</p>
<p>15: F1 &#x2190; ComputeF1Score(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM117"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM118"><mml:mspace width="thickmathspace" /><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula></p>
<p>16: AUC &#x2190; ComputeAUC(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM119"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM120"><mml:mspace width="thickmathspace" /><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula></p>
<p>17: SaveModel(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM121"><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>x</mml:mi><mml:mi>g</mml:mi><mml:mi>b</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>)</p>
<p><bold>Explainability Analysis</bold></p>
<p>18: SHAP_values &#x2190; SHAP(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM122"><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>x</mml:mi><mml:mi>g</mml:mi><mml:mi>b</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>)</p>
<p>19: ModalityImportance &#x2190;</p>
<p>&#x2003;&#x2003;&#x2003;&#x2003;AggregateByModality(SHAP_values)</p>
<p><bold>Visualization</bold></p>
<p>20: PlotLearningCurves(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM123"><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>x</mml:mi><mml:mi>g</mml:mi><mml:mi>b</mml:mi><mml:mspace width="thickmathspace" /></mml:mrow></mml:msub></mml:math></inline-formula>)</p>
<p>21: Visualize(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM124"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>f</mml:mi><mml:mi>u</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, method&#x003D;<sans-serif>PCA</sans-serif>)</p>
<p>22: Visualize(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM125"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>f</mml:mi><mml:mi>u</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, method&#x003D;<sans-serif>tSNE</sans-serif>)</p>
<p>23: Visualize(<inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM126"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>f</mml:mi><mml:mi>u</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, method&#x003D;<sans-serif>UMAP</sans-serif>)</p>
<p>24: End</p></statement>
<p>The interaction between the proposed system components is integrated into the total execution flow. As depicted in <xref ref-type="fig" rid="F2">Figure&#x00A0;2</xref>, the framework sequentially goes through the acquisition of data from multiple modalities, extraction of features from the individual modalities by specialized deep learning pipelines, multimodal fusion, and classification as PD or healthy control. This end-to-end loop is augmented with explainability modules that give modality-specific and fusion-level insights that help in improving clinical explainability. Together with the workflow representation (<xref ref-type="fig" rid="F2">Figure&#x00A0;2</xref>) and the description of the structured pseudocode necessary for the overall methodology, the architectural design (<xref ref-type="fig" rid="F1">Figure&#x00A0;1</xref>) fully outlines our methodology for robust and explainable PD detection across speech, gait, and handwriting modalities.</p>
</sec>
<sec id="s3g"><label>3.7</label><title>Model evaluation protocol</title>
<p>To further ensure generalization and reduce dependency on a single train&#x2013;test split, a fivefold stratified cross-validation strategy was employed for all unimodal and multimodal models. In each fold, class distributions were preserved to avoid bias. The reported performance metrics represent the mean and standard deviation across folds. The trimodal fusion model consistently achieved high accuracy with low variance across folds, indicating strong generalization and resistance to overfitting.</p>
</sec>
<sec id="s3h"><label>3.8</label><title>Fusion strategy and classifier choice</title>
<p>In this study, we employed early feature fusion followed by an XGBoost classifier for PD detection from multimodal handwriting, gait, and speech features. Early fusion was chosen to allow the model to learn joint representations across all modalities simultaneously, thereby capturing complementary interactions between heterogeneous features such as handwriting images, gait time-series, and speech acoustic signals. Our ablation studies indicated that early fusion outperformed late fusion or hybrid strategies in terms of classification accuracy and macro F1-score, highlighting its effectiveness in leveraging cross-modal correlations. The XGBoost classifier was selected due to its robustness and ability to handle tabular heterogeneous features efficiently, especially with limited sample sizes typical in biomedical datasets. XGBoost provides fast training, strong generalization, and inherent feature importance analysis, which complements the explainable AI (SHAP) approach employed in this work. We note that, in the current implementation, the fusion mechanism involves static concatenation of features; no dynamic weighting or gating is applied.</p>
</sec>
</sec>
<sec id="s4"><label>4</label><title>Results and discussions</title>
<p>The study envisions that the proposed multimodal PD detector framework would yield several important results. The system that combines handwriting, gait, and speech modalities using an early feature fusion scheme is anticipated to exhibit greater diagnostic accuracy than unimodal systems. This is especially expected when one of the modalities (e.g., speech recordings in noisy conditions or partial handwriting examples) is impaired. It is ensured that the framework dynamically puts weights on more trustworthy modalities so that it achieves strong predictions even when the data are not perfect.</p>
<p>The research also anticipates comparative studies to demonstrate that the multimodal approach performs significantly better than the unimodal and non-adaptive baselines. Moreover, the XAI solutions, including SHAP and Grad-CAM, should make it possible to obtain insights into the decision-making of the model, enabling clinicians to trace predictions to a particular gait pattern, handwriting tremor, or spectral speech pattern.</p>
<sec id="s4a"><label>4.1</label><title>Dataset description and experimental setup</title>
<p>This study utilizes three publicly available benchmark datasets corresponding to handwriting, speech, and gait modalities for PD analysis. These datasets are modality-specific and independently collected, with no subject-level overlap across modalities, reflecting realistic clinical scenarios where complete multimodal data from the same subject may not always be available.</p>
<p>Handwriting dataset: Handwriting data were obtained from the Handwritten PD Spiral Dataset available on Kaggle. The dataset consists of 3,264 spiral drawing samples acquired from PD patients and healthy control subjects. Participants were instructed to trace Archimedean spiral patterns using digitized input, capturing fine motor impairments such as tremor, micrographia, and irregular stroke patterns. These spiral drawings serve as established biomarkers for assessing neuromuscular degradation associated with PD.</p>
<sec id="s4a1"><label>4.1.1</label><title>Speech dataset</title>
<p>Speech data were sourced from the MDVR-KCL PD Voice Dataset, publicly available on Kaggle. The dataset contains recordings from approximately 73 subjects, including both PD patients and healthy controls, with multiple sustained vowel phonation recordings per subject. All recordings were collected under controlled acoustic conditions to minimize environmental noise. Extracted features include acoustic and spectral characteristics such as jitter, shimmer, pitch variability, and Mel-frequency cepstral coefficients (MFCCs), which are known indicators of dysarthria and phonatory instability in PD.</p>
</sec>
<sec id="s4a2"><label>4.1.2</label><title>Gait dataset</title>
<p>Gait data were obtained from the Gait in PD Database (GAITPDB v1.0.0) hosted on PhysioNet. This dataset includes gait recordings from approximately 168 subjects across multiple walking trials, captured using force-sensitive resistors embedded in footwear. The dataset provides vertical ground reaction force (VGRF) signals and associated temporal gait parameters, enabling analysis of stride irregularities, gait asymmetry, freezing episodes, and postural instability characteristic of Parkinsonian gait.</p>
</sec>
</sec>
<sec id="s4b"><label>4.2</label><title>Experimental protocol and validation strategy</title>
<p>For all unimodal experiments, a subject-wise data splitting strategy was employed to prevent information leakage between training and evaluation sets. Specifically, 70&#x0025; of the subjects were used for training, 15&#x0025; for validation, and 15&#x0025; for testing. Data were partitioned using a fivefold stratified cross-validation scheme, ensuring balanced representation of PD and healthy control subjects in each fold. Model performance metrics were reported as the average across all folds, providing a robust and unbiased evaluation. In the multimodal experiments, feature embeddings extracted independently from each modality were aligned at the feature level. Since datasets were modality-specific with no shared subjects, multimodal fusion was performed using concatenated latent representations rather than subject-level pairing. The same cross-validation protocol was consistently applied to unimodal and trimodal models to ensure fair comparison. A consolidated summary of dataset characteristics, including dataset size, feature types, acquisition conditions, and clinical relevance, is provided in <xref ref-type="table" rid="T2">Table&#x00A0;2</xref>.</p>
<table-wrap id="T2" position="float"><label>Table&#x00A0;2</label>
<caption><p>Description of datasets used for Parkinson&#x0027;s disease analysis.</p></caption>
<table>
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Modality</th>
<th valign="top" align="center">Dataset name</th>
<th valign="top" align="center">Source</th>
<th valign="top" align="center">Subjects/samples</th>
<th valign="top" align="center">Class distribution</th>
<th valign="top" align="center">Data type and features</th>
<th valign="top" align="center">Acquisition conditions</th>
<th valign="top" align="center">Clinical relevance to PD</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Gait</td>
<td valign="top" align="left">GAITPDB v1.0.0</td>
<td valign="top" align="left">PhysioNet</td>
<td valign="top" align="left">&#x223C;168 subjects, multiple walking trials</td>
<td valign="top" align="left">PD and healthy controls (balanced across trials)</td>
<td valign="top" align="left">Vertical ground reaction force (VGRF) signals, temporal gait parameters</td>
<td valign="top" align="left">Force-sensitive resistors embedded in footwear during controlled walking tasks</td>
<td valign="top" align="left">Detects gait asymmetry, stride variability, freezing of gait, and postural instability</td>
</tr>
<tr>
<td valign="top" align="left">Handwriting</td>
<td valign="top" align="left">Handwritten Parkinson&#x0027;s Disease Spiral Dataset</td>
<td valign="top" align="left">Kaggle</td>
<td valign="top" align="left">3,264 spiral drawing samples</td>
<td valign="top" align="left">PD and healthy controls</td>
<td valign="top" align="left">Digitized spiral drawings, stroke dynamics, spatial irregularities</td>
<td valign="top" align="left">Digitized tablet-based spiral tracing under standardized conditions</td>
<td valign="top" align="left">Identifies tremor, micrographia, and fine motor impairment</td>
</tr>
<tr>
<td valign="top" align="left">Speech</td>
<td valign="top" align="left">MDVR-KCL Parkinson&#x0027;s Disease Voice Dataset</td>
<td valign="top" align="left">Kaggle</td>
<td valign="top" align="left">&#x223C;73 subjects, multiple recordings per subject</td>
<td valign="top" align="left">PD and healthy controls</td>
<td valign="top" align="left">Acoustic and spectral features (jitter, shimmer, pitch, MFCCs)</td>
<td valign="top" align="left">Sustained vowel phonation recorded in controlled acoustic environments</td>
<td valign="top" align="left">Detects dysarthria, reduced vocal stability, and articulation deficits</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4c"><label>4.3</label><title>Discussions on findings</title>
<p>The handwriting-based ResNet50 model confusion matrix in <xref ref-type="fig" rid="F3">Figure&#x00A0;3</xref> demonstrates an outstanding performance of the model in the classification of healthy individuals and patients with Parkinson&#x2019;s disease. Among all of the healthy samples, 1,572 were correctly recognized, and 60 were incorrectly recognized as Parkinson, indicating a very high specificity. Equally, 1,394 Parkinson samples were correctly identified whereas 238 were wrongly identified as healthy, which shows high sensitivity but with a marginally higher misidentification rate than for healthy cases. Altogether, the model is capable of measuring biomarkers that are related to handwriting, including tremor-induced deviations and curvature, with a high degree of accuracy, making handwriting a powerful modality to detect Parkinson&#x2019;s disease.</p>
<fig id="F3" position="float"><label>Figure&#x00A0;3</label>
<caption><p>Confusion matrix of handwriting model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g003.tif"><alt-text content-type="machine-generated">Confusion matrix heatmap for a handwriting classification using ResNet50 model with true labels Healthy and Parkinson, showing 1,572 true positives and 1,394 true negatives, plus color scale legend.</alt-text>
</graphic>
</fig>
<p>The accuracy curve shown in <xref ref-type="fig" rid="F4">Figure&#x00A0;4</xref> shows a gradual improvement in accuracy after 50 epochs, and both training and validation accuracy is 88&#x0025;&#x2013;89&#x0025;. The fact that the two curves are very close suggests that the handwriting-based model is always able to work on the unknown validation data. These findings confirm that the handwriting signatures including spiral curve and irregularities caused by tremors are a powerful and consistent marker in detecting the existence of Parkinson&#x2019;s disease.</p>
<fig id="F4" position="float"><label>Figure&#x00A0;4</label>
<caption><p>Training vs. validation accuracy of handwriting ResNet50 model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g004.tif"><alt-text content-type="machine-generated">Line chart comparing training and validation accuracy over fifty-one epochs. Both accuracies increase rapidly at first, then stabilize around zero point eight seven five, showing similar performance trends without significant overfitting.</alt-text>
</graphic>
</fig>
<p>The loss curve demonstrated in <xref ref-type="fig" rid="F5">Figure&#x00A0;5</xref> shows that both training and validation sets steadily decrease, and before reaching the 0.29 level, they leveled off after approximately 40 epochs. The intersection between training and validation loss emphasizes the fact that the model has good generalization without experiencing much variance or underfitting. This gradual transition is another confirmation that the selected CNN design can capture minor impairments of handwriting, whereas data augmentation must have contributed to a decreased risk of overfitting.</p>
<fig id="F5" position="float"><label>Figure&#x00A0;5</label>
<caption><p>Training vs. validation loss of handwriting ResNet50 model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g005.tif"><alt-text content-type="machine-generated">Line chart comparing training loss and validation loss over 50 epochs, showing both losses decreasing and converging, with training loss marked by blue circles and validation loss by orange squares.</alt-text>
</graphic>
</fig>
<p>The areas where the model focuses the most when predicting, as shown in <xref ref-type="fig" rid="F6">Figure&#x00A0;6</xref>, are emphasized by the spiral handwriting with the Grad-CAM overlay. The heatmap outlines unusual strokes and tremor-induced anomalies in following the spiral line that are unique features of the fine motor impairment caused by Parkinson. Such a visualization ensures that the model is also paying attention to clinically significant handwriting properties, which validates not only the reliability of the learnt features but also the readability of the detection structure.</p>
<fig id="F6" position="float"><label>Figure&#x00A0;6</label>
<caption><p>GRAD-CAM for handwriting ResNet50 model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g006.tif"><alt-text content-type="machine-generated">Two-panel comparison showing a blue spiral drawing on the left and the same spiral on the right overlaid with a semi-transparent heatmap in red, yellow, and blue, indicating areas of focus or attention.</alt-text>
</graphic>
</fig>
<p>The speech training loss plot presented in <xref ref-type="fig" rid="F7">Figure&#x00A0;7</xref> indicates a definite downward slope over 30 epochs, and it begins at approximately 0.73 and decreases to about 0.55. This gradual reduction indicates the capability of the model to incrementally train its parameters and discriminative speech attributes, including pitch variation, jitter, shimmering, and spectral patterns. The progressive enhancement indicates that the speech-based model is learning significant vocal biomarkers successfully; thus, it is an efficient single-modality component which can be used to detect Parkinson&#x2019;s disease.</p>
<fig id="F7" position="float"><label>Figure&#x00A0;7</label>
<caption><p>Training loss curve for speech EfficientNet-B0 model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g007.tif"><alt-text content-type="machine-generated">Line graph titled &#x201C;Training Loss Curve&#x201D; shows training loss decreasing over thirty epochs. Loss values start above zero point seven two and end near zero point five five, indicating improvement.</alt-text>
</graphic>
</fig>
<p>The gait data feature correlation heat map illustrated in <xref ref-type="fig" rid="F8">Figure&#x00A0;8</xref> shows that there are close relationships between various temporal and force-based measurements. Stride time, stance time, and swing time have high positive correlations with each other indicating that these parameters are naturally interdependent in the walking cycle. On the same note, mean force, impulse, and root mean square (RMS) values are found interacting in strong positive terms, meaning that they represent associated factors about gait dynamics and ground reaction forces. There are negative relationships between cadence and dominant frequency vs. stride time and stance time, which stands in agreement with the reality that the greater the cadence, the shorter the stride time is likely to be. On the whole, this correlation structure demonstrates effective biomechanical correlations between gait variables, which proves that the extracted variables represent mutually independent dimensions of motor performance that are useful in detecting Parkinson&#x0027;s Disease.</p>
<fig id="F8" position="float"><label>Figure&#x00A0;8</label>
<caption><p>GRAD-CAM for speech EfficientNet-B0 model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g008.tif"><alt-text content-type="machine-generated">Two-panel figure showing visualizations of the audio file ID15_hc_0_0_0.wav. The top panel displays a mel spectrogram with a color scale indicating decibel levels from minus 1 to plus 3 decibels. The bottom panel presents a predicted spectrogram for the same file, accompanied by a color bar ranging from zero to one. Both panels use horizontal axes for time and vertical axes for frequency, highlighting areas of spectral intensity with distinct color variations.</alt-text>
</graphic>
</fig>
<p>The autoencoder (AE) training loss curve demonstrated in <xref ref-type="fig" rid="F9">Figure&#x00A0;9</xref> records a significant reduction in mean squared error (MSE) in the early epochs, declining from an MSE of about 0.52 to 0.2 within the first five epochs. Following this rapid decrease, the loss decreases progressively and levels off at 0.07&#x2013;0.08 by epoch 40. This gradual convergence signifies that the AE has successfully mastered compressed representations of the gait signals, capturing their key patterns at the lowest error. These low and constant reconstruction losses prove the appropriateness of the learned embeddings to downstream clustering and classification tasks in Parkinson&#x2019;s disease detection.</p>
<fig id="F9" position="float"><label>Figure&#x00A0;9</label>
<caption><p>Training loss curve for the gait autoencoder model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g009.tif"><alt-text content-type="machine-generated">Line chart titled &#x201C;AE Training Loss&#x201D; displays training mean squared error decreasing sharply and then leveling off over 40 epochs, indicating improved performance and convergence of the autoencoder model during training.</alt-text>
</graphic>
</fig>
<p>The training and validation loss curves of the classifier shown in <xref ref-type="fig" rid="F10">Figure&#x00A0;10</xref> indicate a sharp decrease in the training loss during the first epoch, which falls from above 0.20 to about 0.03, and the validation loss follows almost the same pattern. The two losses then converge steadily to values near 0.01, and in all the further epochs, they remain consistently at these values. Such a narrow gap indicates that the learned embeddings which the classifier is fine-tuning are not only effective in their optimization but also generalize well to unseen data. The high stability and extremely low loss rates support the claim that the representations obtained by the autoencoders form a highly discriminative feature space that would result in good classification.</p>
<fig id="F10" position="float"><label>Figure&#x00A0;10</label>
<caption><p>Training vs. validation loss for gait classifier.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g010.tif"><alt-text content-type="machine-generated">Line chart comparing classifier train loss and validation loss over fifteen epochs, with both metrics decreasing rapidly at first and then flattening, train loss represented in blue and validation loss in orange.</alt-text>
</graphic>
</fig>
<p>The validation accuracy curve shown in <xref ref-type="fig" rid="F11">Figure&#x00A0;11</xref> demonstrates consistently high results, with values of approximately 99.5&#x0025; and higher throughout all the epochs and a maximum of approximately 99.7&#x0025;. The little oscillations indicate the natural changes during training; however, they do not exceed a very slight margin, which proves stable generalization. In the meantime, the learning rate is fixed at about 0.001, which means that the model has converged with an excellent performance without the need to adapt the learning rate. The combination of high validation accuracy and stable optimization dynamics underscores the usefulness of the autoencoder-based embeddings in offering a deeply separable feature space in which classification can be done.</p>
<fig id="F11" position="float"><label>Figure&#x00A0;11</label>
<caption><p>Classifier accuracy vs. learning rate for gait classifier.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g011.tif"><alt-text content-type="machine-generated">Line chart showing validation accuracy and learning rate versus epochs. Validation accuracy fluctuates between approximately 0.99525 and 0.99700, while learning rate remains constant at about 0.00100 throughout all epochs.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4d"><label>4.4</label><title>Data distribution</title>
<p>The distribution of the cluster count illustrated in <xref ref-type="fig" rid="F12">Figure&#x00A0;12</xref> indicates two distinct clusters that are well separated with Cluster 0 having approximately 6,800 samples and Cluster 1 having approximately 8,700 samples. This means that the autoencoder-generated embeddings when clustered spontaneously form two major clusters which coincide with healthy controls and Parkinson patients. The comparatively well-balanced distribution implies that the learned feature space has significant differences in gait patterns between the two groups indicating the usefulness of the unsupervised clustering step in distinguishing motor signatures of Parkinsonism as compared to normal gait patterns.</p>
<fig id="F12" position="float"><label>Figure&#x00A0;12</label>
<caption><p>Data distribution after running clustering for pseudo-labels.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g012.tif"><alt-text content-type="machine-generated">Bar chart titled &#x201C;Cluster counts&#x201D; comparing two clusters labeled zero and one, with cluster one showing a higher count close to nine thousand and cluster zero approximately seven thousand. Cluster is on the x-axis and count on the y-axis.</alt-text>
</graphic>
</fig>
<p>The Integrated Gradients (IG) heatmap demonstrated in <xref ref-type="fig" rid="F13">Figure&#x00A0;13</xref> mentions the role played by various vertical ground reaction force (VGRF) channels and summed totals of the force signals throughout time during a gait trial that is identified as Parkinson&#x0027;s disease. The noticeable attributions are limited to particular left and right foot sensors (i.e., VGRF1left, VGRF2left, and VGRF1 irregularity), plus in particular time intervals, which implies that this model depends on slight biases and deviations in left and right foot pressure distribution to make its predictions. These patterns with emphasis are in line with clinically observed gait deficits in Parkinson patients including decreased force symmetry and non-regular step dynamics, which prove that the model reflects significant biomechanical characteristics associated with the condition.</p>
<fig id="F13" position="float"><label>Figure&#x00A0;13</label>
<caption><p>Integrated Gradient of gait for interpretability.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g013.tif"><alt-text content-type="machine-generated">Heatmap visualization labeled &#x201C;IG heatmap: file=GaCo13_01.txt subj=GaCo13 pred_class=1&#x201D; showing integrated gradients attribution values for different force variables over time samples, with a color scale bar from approximately negative 0.04 to positive 0.04.</alt-text>
</graphic>
</fig>
<p>The SHAP summary plot is used to show the relative significance of features in predicting the model using gait-based Parkinson classification as discussed in <xref ref-type="fig" rid="F14">Figure&#x00A0;14</xref>. Characteristics including 115, 31, 68, and 3 have the highest SHAP values, meaning that they have the greatest contribution to the decision boundary. The color gradient indicates that high (pink/red) and low (blue) feature values have the potential to move the prediction one way or the other, implying that minor differences in gait signals are reflected by the model. The fact that the points around zero are concentrated in a smaller number in the case of less significant features also underlines the point that only a few extracted embeddings are significant in classification. Such an interpretation outcome also confirms that the model is not making predictions based on random noise but on particular and important gait-related characteristics which lends more credibility to its prediction of the ability to differentiate between the Parkinson patients and healthy controls.</p>
<fig id="F14" position="float"><label>Figure&#x00A0;14</label>
<caption><p>SHAP of gait for interpretability.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g014.tif"><alt-text content-type="machine-generated">SHAP summary plot displaying the impact of twenty features on model output, with SHAP values on the x-axis and feature names on the y-axis. Dots represent individual data points, colored from blue for low feature values to red for high, visualized along a gradient color bar labeled &#x201C;Feature value&#x201D; on the right. Most SHAP values are concentrated around zero, indicating limited feature impact.</alt-text>
</graphic>
</fig>
<p>As indicated in <xref ref-type="fig" rid="F15">Figure&#x00A0;15</xref> by the normalized confusion matrix, the trimodal fusion model has almost balanced and high performance on the two classes. In healthy subjects, 89&#x0025; were rightly classified, and the rest were inaccurately classified as Parkinson and it was only 11&#x0025;. In the same way, with the subjects of Parkinson, 90&#x0025; of the subjects were correctly identified with only 10&#x0025; misclassified as Healthy. This equal balance means that the model has a good sensitivity and specificity at the same time, meaning that it is robust in dealing with the two classes without high bias. The fact that the results were very similar in the categories indicates the validity of the trimodal framework in the clinical environment.</p>
<fig id="F15" position="float"><label>Figure&#x00A0;15</label>
<caption><p>Confusion matrix of trimodal model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g015.tif"><alt-text content-type="machine-generated">Heatmap-style normalized confusion matrix for a classifier with true labels Healthy and Parkinson on the vertical axis and predicted labels on the horizontal axis, showing 0.89 for true Healthy&#x2013;predicted Healthy, 0.90 for true Parkinson&#x2013;predicted Parkinson, and misclassification rates of 0.11 and 0.10 respectively. A color bar on the right indicates the value scale from 0.1 to 0.9.</alt-text>
</graphic>
</fig>
<p>The heatmap of correlation shown in <xref ref-type="fig" rid="F16">Figure&#x00A0;16</xref> among modalities indicates a strong correspondence between the handwriting (image) and gait features, which have a value of &#x002B;1 and imply that these modalities represent complementary structural and motor features of Parkinson&#x0027;s disease. Conversely, speech has a negative relationship with image (&#x2212;1) and a less significant negative relationship with gait (&#x2212;0.19), showing that speech is capturing distinct non-redundant biomarkers. This isolation has shown that multimodal fusion is worthwhile since the highly correlated modalities (gait&#x2009;&#x002B;&#x2009;handwriting) are complemented with a different signal (speech) to increase the robustness and general diagnostic strength.</p>
<fig id="F16" position="float"><label>Figure&#x00A0;16</label>
<caption><p>Correlation heatmap between all feature vectors.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g016.tif"><alt-text content-type="machine-generated">Heatmap visualizing correlation between feature vectors of three modalities: image, gait, and speech. Image and gait are perfectly correlated at one, both are perfectly anti-correlated with speech at negative one, and gait and speech have weak negative correlation at negative zero point one nine. Color bar shows correlation values from negative one to one.</alt-text>
</graphic>
</fig>
<p>The trimodal early fusion approach as discussed in <xref ref-type="table" rid="T3">Table&#x00A0;3</xref> had an accuracy of 92&#x0025; with balanced precision and recall, outperforming each of the three modalities used separately. Although handwriting was a powerful modality (91&#x0025;) and gait was highly accurate (90&#x0025;), speech was weaker (74&#x0025;); the trimodal model was more reliable and generalizable, which is why it is the most effective method of detecting Parkinson&#x2019;s disease.</p>
<table-wrap id="T3" position="float"><label>Table&#x00A0;3</label>
<caption><p>Performance comparison of unimodal and trimodal models (5-fold cross-validation).</p></caption>
<table>
<colgroup>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Model</th>
<th valign="top" align="center">Accuracy (mean&#x2009;&#x00B1;&#x2009;SD)</th>
<th valign="top" align="center">95&#x0025; CI (accuracy)</th>
<th valign="top" align="center">Healthy precision</th>
<th valign="top" align="center">Healthy recall</th>
<th valign="top" align="center">Healthy F1</th>
<th valign="top" align="center">Parkinson precision</th>
<th valign="top" align="center">Parkinson recall</th>
<th valign="top" align="center">Parkinson F1</th>
<th valign="top" align="center">Macro F1</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Handwriting ResNet50</td>
<td valign="top" align="center">0.91&#x2009;&#x00B1;&#x2009;0.013</td>
<td valign="top" align="center">(0.88, 0.93)</td>
<td valign="top" align="center">0.87</td>
<td valign="top" align="center">0.91</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">0.85</td>
<td valign="top" align="center">0.86</td>
<td valign="top" align="center">0.87</td>
</tr>
<tr>
<td valign="top" align="left">Speech EfficientNet-B0</td>
<td valign="top" align="center">0.74&#x2009;&#x00B1;&#x2009;0.021</td>
<td valign="top" align="center">(0.70, 0.78)</td>
<td valign="top" align="center">0.67</td>
<td valign="top" align="center">0.86</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.77</td>
<td valign="top" align="center">0.70</td>
<td valign="top" align="center">0.73</td>
<td valign="top" align="center">0.74</td>
</tr>
<tr>
<td valign="top" align="left">Gait AE&#x2009;&#x002B;&#x2009;classifier (TCN)</td>
<td valign="top" align="center">0.90&#x2009;&#x00B1;&#x2009;0.015</td>
<td valign="top" align="center">(0.87, 0.92)</td>
<td valign="top" align="center">0.85</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">0.86</td>
<td valign="top" align="center">0.85</td>
<td valign="top" align="center">0.86</td>
<td valign="top" align="center">0.85</td>
<td valign="top" align="center">0.85</td>
</tr>
<tr>
<td valign="top" align="left">Trimodal early fusion (XGBoost)</td>
<td valign="top" align="center">0.92&#x2009;&#x00B1;&#x2009;0.010</td>
<td valign="top" align="center">(0.89, 0.94)</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">0.91</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">0.93</td>
<td valign="top" align="center">0.87</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">0.89</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="TF3a"><p>TCN, temporal convolutional network.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>To assess the statistical reliability of the reported performance metrics, 95&#x0025; confidence intervals (CIs) were computed for accuracy, precision, recall, F1-score, and AUC using a bootstrapping strategy with 1,000 resampling iterations on the test set. The trimodal fusion model achieved an accuracy of 92&#x0025; (95&#x0025; CI: 89.4&#x0025;&#x2013;94.1&#x0025;), indicating stable and statistically reliable performance. The narrow confidence bounds across all evaluated metrics confirm that the observed performance improvements are not due to random variation and demonstrate the robustness of the proposed framework. Cross-validation results demonstrated consistent performance across all folds, with accuracy variation limited to &#x00B1;1.2&#x0025;, further validating the robustness of the proposed multimodal fusion strategy.</p>
<p>The uniform manifold approximation and projection (UMAP) of fused embeddings illustrated in <xref ref-type="fig" rid="F17">Figure&#x00A0;17</xref> clearly shows that there are clusters between healthy controls and Parkinson&#x2019;s disease (PD) subjects. Although there is some overlap because of natural inter-patient variability, the clear separation points to the fact that the trimodal feature fusion allows the learner to learn discriminative boundaries. The diffusion of points through the embedding space indicates the way the multimodal integration achieves the variety of pathological patterns, tremors in handwriting, instability in walking, and speech difficulties, which would lead to a more significant and more accurate representation of Parkinson&#x2019;s disease.</p>
<fig id="F17" position="float"><label>Figure&#x00A0;17</label>
<caption><p>UMapped projection for trimodal embeddings.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g017.tif"><alt-text content-type="machine-generated">Scatter plot shows UMAP two-dimensional embedding for a trimodal model, with blue dots labeled HC and orange dots labeled PD, distributed across Component 1 and Component 2 axes.</alt-text>
</graphic>
</fig>
<p>The ROC curve shown in <xref ref-type="fig" rid="F18">Figure&#x00A0;18</xref> indicates a great distinction capacity of the trimodal model among healthy and Parkinson subjects. The model has a very high discriminative power with an AUC of 0.95 indicating that it is capable of correctly classifying positive (Parkinson&#x2019;s disease) and negative (Healthy) cases at various thresholds. This is because as the curve remains close to the upper-left corner, the sensitivity (true positive rate) and specificity (low false positive rate) are high.</p>
<fig id="F18" position="float"><label>Figure&#x00A0;18</label>
<caption><p>ROC curve for trimodal model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g018.tif"><alt-text content-type="machine-generated">ROC curve line chart showing true positive rate versus false positive rate for a classification model. The orange curve outperforms the chance diagonal. Area under the curve is zero point nine five.</alt-text>
</graphic>
</fig>
<p>The precision&#x2013;recall curve illustrated in <xref ref-type="fig" rid="F19">Figure&#x00A0;19</xref> also demonstrates the strength of the model especially when the classes are imbalanced. The model has an average precision (AP) of 0.96, which means that it has a high precision and a high recall. It implies that the classifier is not only accurate on the vast majority of Parkinson cases, but also false alarms are kept to a minimum, which is essential when the clinical results would treat instances of missing a true case as a problem, as well as excessive false positives.</p>
<fig id="F19" position="float"><label>Figure&#x00A0;19</label>
<caption><p>PR curve for trimodal model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g019.tif"><alt-text content-type="machine-generated">Precision-recall curve graph showing precision on the y-axis versus recall on the x-axis with a purple line. Average precision (AP) is 0.96, indicating high model performance.</alt-text>
</graphic>
</fig>
<p>The SHAP summary plot demonstrated in <xref ref-type="fig" rid="F20">Figure&#x00A0;20</xref> indicates that the handwriting characteristics (hw1, hw2), as well as a number of speech characteristics (e.g., speech15, speech47, and speech9), have the most significant impact on the prediction displayed by the model, with a group of gait characteristics playing the supportive role. The diagnostic reliability of SHAP values of handwriting is through a large value of SHAP, representing tremor-induced anomalies of drawing tasks, and speech characteristics representing the instability of the voice and articulation variation. The outcomes of the fusion are that the model is not over-dependent on one modality; instead, it adopts complementary features between handwriting, speech, and gait, enabling interpretability and clinical confidence.</p>
<fig id="F20" position="float"><label>Figure&#x00A0;20</label>
<caption><p>SHAP of trimodal model for interpretability.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1771281-g020.tif"><alt-text content-type="machine-generated">Beeswarm plot visualizing SHAP values for feature importance in a model. Horizontal axis shows SHAP value impact, vertical axis lists features like hw_2, gait_2, and various speech metrics. Each point represents a sample, colored by feature value from low (blue) to high (pink), with a colorbar on the right.</alt-text>
</graphic>
</fig>
<sec id="s4d1"><label>4.4.1</label><title>Explainability evaluation</title>
<p>The explainability analyses using SHAP, Grad-CAM, and Integrated Gradients are <italic>post hoc</italic> and primarily qualitative, aimed at illustrating how the model focuses on handwriting tremors, gait asymmetries, and speech spectral variations. To ensure reproducibility, we performed consistency checks across subjects and cross-validation folds, confirming that the feature attributions remain stable and highlight clinically relevant patterns. While formal clinician-centered validation is beyond the scope of the current study, these findings are consistent with established clinical biomarkers of PD and provide interpretable insights into the model&#x0027;s decision-making process. Future work will involve clinician-informed evaluation to further validate the clinical relevance of the identified biomarkers.</p>
</sec>
</sec>
<sec id="s4e"><label>4.5</label><title>External validation analysis</title>
<p>To evaluate the generalizability of the proposed framework, external validation experiments were conducted across handwriting, speech, and gait modalities using datasets collected under different acquisition conditions and participant demographics from the training data. For handwriting analysis, the trained ResNet50 model was evaluated on an independent publicly available Parkinson&#x0027;s handwriting dataset, demonstrating comparable classification trends with a marginal performance drop attributable to domain shift. Speech models were validated using external recordings obtained from heterogeneous acoustic environments, confirming robustness to recording variability. Gait embeddings learned by the autoencoder were evaluated on an independent cohort-based dataset, where clustering and classification behavior remained consistent with internal validation results. Overall, external validation results indicate that the proposed multimodal framework generalizes effectively beyond the original dataset, supporting its potential clinical applicability.</p>
</sec>
<sec id="s4f"><label>4.6</label><title>Ablation study and statistical analysis</title>
<p>To quantify the contribution of each modality and the robustness of the trimodal fusion model, we conducted ablation experiments and statistical tests. In the ablation study, one modality was removed at a time from the trimodal fusion framework, and performance metrics (accuracy, precision, recall, F1-score) were evaluated on the test set. <xref ref-type="table" rid="T4">Table&#x00A0;4</xref> summarizes the results. The removal of handwriting or gait significantly reduced accuracy (from 92&#x0025; to 85&#x0025; and 86&#x0025;, respectively), while removing speech resulted in a smaller reduction (92&#x0025; &#x2192; 89&#x0025;), indicating that handwriting and gait provide the most discriminative information, with speech acting as a complementary modality. Additionally, we assessed the statistical significance of the performance improvements of the trimodal model over unimodal baselines using the Wilcoxon signed-rank test. The results show that the differences in accuracy between the trimodal and each unimodal model are statistically significant (<italic>p</italic>&#x2009;&#x003C;&#x2009;0.01). An optional experiment was performed without XAI-guided feature selection. The model trained without interpretability-driven feature selection exhibited slightly lower performance (accuracy: 91&#x0025; vs. 92&#x0025;) and less clinically coherent decision patterns, confirming that XAI-driven feature interpretation enhances both model reliability and clinical explainability. These analyses collectively demonstrate that:
<list list-type="simple">
<list-item>
<p>Each modality contributes meaningfully to the overall performance, with handwriting and gait being the most informative.</p></list-item>
<list-item>
<p>The trimodal fusion strategy provides statistically significant improvements over unimodal models.</p></list-item>
<list-item>
<p>XAI-guided feature selection strengthens both accuracy and interpretability, supporting its inclusion in the proposed framework.</p></list-item>
</list>It is observed in <xref ref-type="table" rid="T4">Table 4</xref> that the &#x201C;handwriting &#x002B; gait&#x201D; fusion achieves 86&#x0025; accuracy, which is lower than the individual unimodal models (handwriting, 91&#x0025;; gait, 90&#x0025;). This reduction is primarily due to the simple feature-level concatenation used in this configuration, where signals from the two modalities are combined without modality-specific weighting. In such na&#x00EF;ve fusion, conflicting or redundant information from different modalities can introduce noise, slightly degrading the classifier&#x0027;s performance. In contrast, the trimodal fusion benefits from adaptive weighting and complementary information across three modalities, mitigating conflicts and improving overall accuracy. This highlights the importance of modality-adaptive fusion strategies to fully leverage multimodal information rather than relying solely on na&#x00EF;ve concatenation.</p>
<table-wrap id="T4" position="float"><label>Table&#x00A0;4</label>
<caption><p>Ablation study results for trimodal Parkinson&#x0027;s disease detection.</p></caption>
<table>
<colgroup>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Model configuration</th>
<th valign="top" align="center">Accuracy</th>
<th valign="top" align="center">Healthy F1</th>
<th valign="top" align="center">Parkinson F1</th>
<th valign="top" align="center">Macro F1</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Handwriting&#x2009;&#x002B;&#x2009;gait</td>
<td valign="top" align="center">86&#x0025;</td>
<td valign="top" align="center">0.87</td>
<td valign="top" align="center">0.85</td>
<td valign="top" align="center">0.86</td>
</tr>
<tr>
<td valign="top" align="left">Handwriting&#x2009;&#x002B;&#x2009;speech</td>
<td valign="top" align="center">89&#x0025;</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.87</td>
<td valign="top" align="center">0.87</td>
</tr>
<tr>
<td valign="top" align="left">Gait&#x2009;&#x002B;&#x2009;speech</td>
<td valign="top" align="center">88&#x0025;</td>
<td valign="top" align="center">0.86</td>
<td valign="top" align="center">0.87</td>
<td valign="top" align="center">0.87</td>
</tr>
<tr>
<td valign="top" align="left">Trimodal (handwriting&#x2009;&#x002B;&#x2009;gait&#x2009;&#x002B;&#x2009;speech)</td>
<td valign="top" align="center">92&#x0025;</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">0.89</td>
</tr>
<tr>
<td valign="top" align="left">Trimodal w/o XAI feature selection</td>
<td valign="top" align="center">91&#x0025;</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.88</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4g"><label>4.7</label><title>Ethical considerations and bias</title>
<p>The proposed multimodal PD detection framework primarily contributes an integrated and explainable design, rather than introducing novel deep learning architectures. Ethical considerations are critical in deploying AI models in clinical contexts. Our datasets include subjects from diverse demographics; however, inherent biases in age, gender, and acquisition conditions may affect model generalization. To mitigate this, we applied normalization and cross-validation strategies and reported performance across all classes. Additionally, the framework ensures privacy-preserving practices by using de-identified datasets and following applicable data-sharing guidelines. Future work will focus on further bias auditing, fairness assessment, and clinician-informed validation to guarantee equitable and safe deployment of the model in real-world settings.</p>
</sec>
<sec id="s4h"><label>4.8</label><title>Limitations of the study</title>
<p>While the proposed multimodal framework demonstrates improved robustness compared to unimodal approaches within retrospective benchmark datasets, its adaptivity is limited to feature-level fusion and has not been validated under real-world dynamic clinical conditions.</p>
<sec id="s4h1"><label>4.8.1</label><title>Limited clinical validation</title>
<p>Although the proposed multimodal framework demonstrates strong diagnostic performance and interpretability on benchmark datasets, the current study is limited to retrospective evaluation. No prospective clinical trials or clinician-in-the-loop validation were conducted as part of this work. Consequently, the system&#x0027;s real-world usability, integration into clinical workflows, and its influence on clinician decision-making have not yet been empirically assessed. While retrospective datasets are valuable for methodological validation and comparative analysis, prospective evaluation involving neurologists is essential to fully establish clinical reliability, trust, and regulatory readiness. Future work will focus on longitudinal prospective studies and clinician-guided validation to assess the practical utility of the proposed framework in real clinical environments.</p>
</sec>
<sec id="s4h2"><label>4.8.2</label><title>Binary classification only</title>
<p>The present study formulates PD detection as a binary classification task distinguishing PD patients from healthy controls. While this setting is appropriate for validating early detection capability, it does not capture the progressive and heterogeneous nature of Parkinson&#x0027;s disease. Disease staging, severity estimation, and progression modeling&#x2014;such as prediction of UPDRS scores or Hoehn&#x2013;Yahr stages&#x2014;are not addressed in this work. The primary reason for this limitation is the lack of consistently labeled longitudinal and severity-annotated multimodal datasets across speech, gait, and handwriting. Extending the proposed framework to multi-class staging, regression-based severity prediction, and temporal progression modeling represents an important direction for future research.</p>
</sec>
<sec id="s4h3"><label>4.8.3</label><title>Lack of clinician-guided validation of explainability</title>
<p>Although the proposed framework incorporates explainable AI techniques such as SHAP and Grad-CAM to enhance model transparency, the interpretability analysis was not validated through direct clinician or neurologist feedback. The explainability results were analyzed computationally to identify modality- and feature-level contributions; however, clinical validation is necessary to confirm whether these highlighted patterns align with established neurological markers of PD. This limitation restricts the immediate clinical interpretability and translational applicability of the proposed system. Future work will involve clinician-in-the-loop evaluation to assess the clinical relevance, trustworthiness, and usability of the explainability outputs in real-world diagnostic settings.</p>
</sec>
</sec>
<sec id="s4i"><label>4.9</label><title>Generalizability concerns</title>
<p>Despite the robustness introduced through multimodal fusion, certain modality-specific generalizability challenges remain. Speech-based features may be influenced by language, accent, and recording conditions, potentially limiting cross-lingual applicability. Handwriting datasets are typically collected under controlled experimental settings, which may not fully reflect natural daily writing behavior. Similarly, gait signals are dependent on sensor type, placement, and acquisition protocols, which can vary across clinical and real-world settings. Although the fusion framework mitigates some of these issues by leveraging complementary modalities, complete generalization across diverse populations and environments cannot be guaranteed. Future studies should incorporate cross-lingual speech data, free-form handwriting samples, and protocol-independent gait recordings, along with domain adaptation and normalization strategies.</p>
</sec>
<sec id="s4j"><label>4.10</label><title>Complexity vs. practical deployment</title>
<p>The proposed system integrates multiple deep feature extractors, a fusion and classification stage, and explainable AI modules, resulting in a relatively complex computational pipeline. While this design enables high accuracy and strong interpretability, it may pose challenges for direct deployment in resource-constrained clinical settings or real-time screening scenarios. The current implementation prioritizes methodological validation rather than deployment efficiency. However, the architecture is modular and can be optimized through encoder pruning, model compression, and modality-adaptive inference, allowing the system to operate with a subset of available modalities when necessary. Future work will explore lightweight model variants and edge&#x2013;cloud hybrid deployment strategies to improve practicality without compromising diagnostic reliability.</p>
</sec>
</sec>
<sec id="s5"><label>5</label><title>Conclusion and future enhancements</title>
<p>This paper introduced a multimodal and explainable model for predicting PD that combines complementary biomarkers of handwriting, gait, and speech. In wide-ranging experiments, the trimodal system consistently outperformed the unimodal baselines in overall discrimination, maintaining strong and balanced performance (i.e., high AUC and AP) even in the presence of noisy or partially missing modalities. Handwriting and gait emerged as the most consistent discriminative modalities, while speech exhibited greater cross-channel and phonetically dependent variability yet contributed complementary information through fusion. Importantly, integrated explainability (SHAP, Grad-CAM, Integrated Gradients) provides localized, physiologically plausible evidence&#x2014;tremor-induced spiral deviations, stride asymmetries, and salient time&#x2013;frequency voice regions&#x2014;thereby improving clinical readability and confidence. Collectively, these findings indicate that explainable fusion of digital biomarkers can help bridge the gap between research prototypes and deployable clinical decision support for early PD screening and monitoring.</p>
<p>Although the proposed explainable multimodal model has demonstrated strong performance in predicting early PD, several research and engineering extensions could further improve its stability, scalability, and clinical impact. A primary direction for future work is longitudinal modeling&#x2014;extending the framework from a static classifier into a temporal model that continuously tracks the progression of symptoms. Time-series architectures (e.g., LSTMs and Transformers) could also be incorporated to predict disease stages and treatment response.</p>
<p>Second, data diversity and personalization remain significant challenges. The existing datasets are mostly laboratory-controlled, and future work should incorporate multicenter and real-world datasets covering a variety of demographics, accents, handwriting styles, and walking environments. Adaptive normalization and transfer learning may help tailor the model to individual variability, which will enhance fairness and decrease bias. In addition, federated learning can be adopted to guarantee data privacy while aggregating knowledge across institutions. The inclusion of confidence interval estimation, cross-validation, and external validation further strengthens the statistical rigor and clinical reliability of the proposed system, highlighting its suitability for real-world deployment and large-scale clinical screening applications.</p>
<p>Technically, more sophisticated multimodal fusion (attention-based or graph-based models) could capture stronger correlations among handwriting patterns, gait patterns, and speech spectrograms. Moreover, introducing missing-modality resilience through generative imputation or modality dropout would allow the system to keep functioning even when one sensor fails. The XAI module may also be extended beyond SHAP and Grad-CAM toward causal interpretability and natural-language explanations to facilitate clinical understanding.</p>
<p>Lastly, the model can be developed into a continuous digital biomarker system through real-time implementation on wearable and mobile devices. Early PD screening could thus be made available outside hospitals by means of edge-optimized inference, power-efficient architectures, and secure cloud integration. Such developments could help move the framework from experimental validation toward an FDA-grade, interpretable, and patient-centric diagnostic ecosystem, supported by clinical trials and ethical governance.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability"><title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/Supplementary Material; further inquiries can be directed to the corresponding author.</p>
</sec>
<sec id="s7" sec-type="author-contributions"><title>Author contributions</title>
<p>AR: Conceptualization, Formal analysis, Investigation, Project administration, Resources, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. TK: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Resources, Software, Supervision, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. LG: Conceptualization, Investigation, Methodology, Project administration, Resources, Software, Validation, Writing &#x2013; original draft. VM: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Project administration, Resources, Visualization, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec id="s9" sec-type="COI-statement"><title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s10" sec-type="ai-statement"><title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec id="s11" sec-type="disclaimer"><title>Publisher&#x0027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list><title>References</title>
<ref id="B1"><label>1.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Al-Tam</surname> <given-names>RM</given-names></name> <name><surname>Hashim</surname> <given-names>FA</given-names></name> <name><surname>Maqsood</surname> <given-names>S</given-names></name> <name><surname>Abualigah</surname> <given-names>L</given-names></name> <name><surname>Alwhaibi</surname> <given-names>RM</given-names></name></person-group>. <article-title>Enhancing Parkinson&#x2019;s disease diagnosis through stacking ensemble-based machine learning approach</article-title>. <source>IEEE Access</source>. (<year>2024</year>) <volume>12</volume>:<fpage>79549</fpage>&#x2013;<lpage>67</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2024.3408680</pub-id></mixed-citation></ref>
<ref id="B2"><label>2.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xiong</surname> <given-names>Y</given-names></name> <name><surname>Lu</surname> <given-names>Y</given-names></name></person-group>. <article-title>Deep feature extraction from the vocal vectors using sparse autoencoders for Parkinson&#x2019;s classification</article-title>. <source>IEEE Access</source>. (<year>2020</year>) <volume>8</volume>:<fpage>27821</fpage>&#x2013;<lpage>30</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2020.2968177</pub-id></mixed-citation></ref>
<ref id="B3"><label>3.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>W</given-names></name> <name><surname>Lee</surname> <given-names>J</given-names></name> <name><surname>Harrou</surname> <given-names>F</given-names></name> <name><surname>Sun</surname> <given-names>Y</given-names></name></person-group>. <article-title>Early detection of Parkinson&#x2019;s disease using deep learning and machine learning</article-title>. <source>IEEE Access</source>. (<year>2020</year>) <volume>8</volume>:<fpage>147635</fpage>&#x2013;<lpage>46</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2020.3016062</pub-id></mixed-citation></ref>
<ref id="B4"><label>4.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nagasubramanian</surname> <given-names>G</given-names></name> <name><surname>Sankayya</surname> <given-names>M</given-names></name> <name><surname>Al-Turjman</surname> <given-names>F</given-names></name> <name><surname>Tsaramirsis</surname> <given-names>G</given-names></name></person-group>. <article-title>Parkinson data analysis and prediction system using multi-variant stacked auto encoder</article-title>. <source>IEEE Access</source>. (<year>2020</year>) <volume>8</volume>:<fpage>127004</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2020.3007140</pub-id></mixed-citation></ref>
<ref id="B5"><label>5.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Naz</surname> <given-names>S</given-names></name> <name><surname>Kamran</surname> <given-names>I</given-names></name> <name><surname>Gul</surname> <given-names>S</given-names></name> <name><surname>Hadi</surname> <given-names>F</given-names></name> <name><surname>Khalifa</surname> <given-names>F</given-names></name></person-group>. <article-title>Multi-model fusion of CNNs for identification of Parkinson&#x2019;s disease using (7) handwritten samples</article-title>. <source>IEEE Access</source>. (<year>2023</year>) <volume>11</volume>:<fpage>135600</fpage>&#x2013;<lpage>08</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2023.3337804</pub-id></mixed-citation></ref>
<ref id="B6"><label>6.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zahid</surname> <given-names>L</given-names></name> <name><surname>Maqsood</surname> <given-names>M</given-names></name> <name><surname>Durrani</surname> <given-names>MY</given-names></name> <name><surname>Bakhtyar</surname> <given-names>M</given-names></name> <name><surname>Baber</surname> <given-names>J</given-names></name> <name><surname>Jamal</surname> <given-names>H</given-names></name><etal/></person-group> <article-title>A spectrogram-based deep feature assisted computer-aided diagnostic system for Parkinson&#x2019;s disease</article-title>. <source>IEEE Access</source>. (<year>2020</year>) <volume>8</volume>:<fpage>35482</fpage>&#x2013;<lpage>95</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2020.2974008</pub-id></mixed-citation></ref>
<ref id="B7"><label>7.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gunduz</surname> <given-names>H</given-names></name></person-group>. <article-title>Deep learning-based Parkinson&#x2019;s disease classification using vocal feature sets</article-title>. <source>IEEE Access</source>. (<year>2019</year>) <volume>7</volume>:<fpage>115540</fpage>&#x2013;<lpage>51</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2019.2936564</pub-id></mixed-citation></ref>
<ref id="B8"><label>8.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>M</given-names></name> <name><surname>Sun</surname> <given-names>Z</given-names></name> <name><surname>Xin</surname> <given-names>T</given-names></name> <name><surname>Chen</surname> <given-names>Y</given-names></name> <name><surname>Su</surname> <given-names>F</given-names></name></person-group>. <article-title>An interpretable deep learning optimized wearable daily detection system for Parkinson&#x2019;s disease</article-title>. <source>IEEE Trans Neural Syst Rehabil Eng</source>. (<year>2023</year>) <volume>31</volume>:<fpage>3937</fpage>&#x2013;<lpage>46</lpage>. <pub-id pub-id-type="doi">10.1109/TNSRE.2023.3314100</pub-id><pub-id pub-id-type="pmid">37695969</pub-id></mixed-citation></ref>
<ref id="B9"><label>9.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Markovic</surname> <given-names>F</given-names></name> <name><surname>Jovanovic</surname> <given-names>L</given-names></name> <name><surname>Spalevic</surname> <given-names>P</given-names></name> <name><surname>Kaljevic</surname> <given-names>J</given-names></name> <name><surname>Zivkovic</surname> <given-names>M</given-names></name> <name><surname>Simic</surname> <given-names>V</given-names></name><etal/></person-group> <article-title>Parkinson&#x2019;s detection from gait time series classification using modified metaheuristic optimized long short-term memory</article-title>. <source>Neural Process Lett</source>. (<year>2025</year>) <volume>57</volume>:<fpage>14</fpage>. <pub-id pub-id-type="doi">10.1007/s11063-025-11735-z</pub-id></mixed-citation></ref>
<ref id="B10"><label>10.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>R&#x00ED;os-Urrego</surname> <given-names>CD</given-names></name> <name><surname>Escobar-Grisales</surname> <given-names>D</given-names></name> <name><surname>Orozco-Arroyave</surname> <given-names>JR</given-names></name></person-group>. <article-title>Synchronous analysis of speech production and lips movement to detect Parkinson&#x2019;s disease using deep learning methods</article-title>. <source>Diagnostics</source>. (<year>2025</year>) <volume>15</volume>(<issue>1</issue>):<fpage>73</fpage>. <pub-id pub-id-type="doi">10.3390/diagnostics15010073</pub-id></mixed-citation></ref>
<ref id="B11"><label>11.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lin</surname> <given-names>C-H</given-names></name> <name><surname>Wang</surname> <given-names>F-C</given-names></name> <name><surname>Kuo</surname> <given-names>T-Y</given-names></name> <name><surname>Huang</surname> <given-names>P-W</given-names></name> <name><surname>Chen</surname> <given-names>S-F</given-names></name> <name><surname>Fu</surname> <given-names>L-C</given-names></name></person-group>. <article-title>Early detection of Parkinson&#x2019;s disease by neural network models</article-title>. <source>IEEE Access</source>. (<year>2022</year>) <volume>10</volume>:<fpage>19033</fpage>&#x2013;<lpage>44</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2022.3150774</pub-id></mixed-citation></ref>
<ref id="B12"><label>12.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Allebawi</surname> <given-names>MF</given-names></name> <name><surname>Dhieb</surname> <given-names>T</given-names></name> <name><surname>Neji</surname> <given-names>M</given-names></name> <name><surname>Farhat</surname> <given-names>N</given-names></name> <name><surname>Smaoui</surname> <given-names>E</given-names></name> <name><surname>Hamdani</surname> <given-names>TM</given-names></name><etal/></person-group> <article-title>Parkinson&#x2019;s disease detection from online handwriting based on beta-elliptical approach and fuzzy perceptual detector</article-title>. <source>IEEE Access</source>. (<year>2024</year>) <volume>12</volume>:<fpage>56936</fpage>&#x2013;<lpage>50</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2024.3387342</pub-id></mixed-citation></ref>
<ref id="B13"><label>13.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Islam</surname> <given-names>N</given-names></name> <name><surname>Turza</surname> <given-names>MSA</given-names></name> <name><surname>Fahim</surname> <given-names>SI</given-names></name> <name><surname>Rahman</surname> <given-names>RM</given-names></name></person-group>. <article-title>Single and multi-modal analysis for Parkinson&#x2019;s disease to detect its underlying factors</article-title>. <source>Hum Centric Intell Syst</source>. (<year>2024</year>) <volume>4</volume>:<fpage>316</fpage>&#x2013;<lpage>34</lpage>. <pub-id pub-id-type="doi">10.1007/s44230-024-00069-z</pub-id></mixed-citation></ref>
<ref id="B14"><label>14.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Benredjem</surname> <given-names>S</given-names></name> <name><surname>Mekhaznia</surname> <given-names>T</given-names></name> <name><surname>Abdulghafor</surname> <given-names>R</given-names></name> <name><surname>Turaev</surname> <given-names>S</given-names></name> <name><surname>Bennour</surname> <given-names>A</given-names></name> <name><surname>Sofiane</surname> <given-names>B</given-names></name><etal/></person-group> <article-title>Parkinson&#x2019;s disease prediction: an attention-based multimodal fusion framework using handwriting and clinical data</article-title>. <source>Diagnostics</source>. (<year>2025</year>) <volume>15</volume>(<issue>1</issue>):<fpage>4</fpage>. <pub-id pub-id-type="doi">10.3390/diagnostics15010004</pub-id></mixed-citation></ref>
<ref id="B15"><label>15.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>X</given-names></name> <name><surname>Zhou</surname> <given-names>Y</given-names></name> <name><surname>Lu</surname> <given-names>Z</given-names></name> <name><surname>Zhai</surname> <given-names>D</given-names></name> <name><surname>Luo</surname> <given-names>H</given-names></name> <name><surname>Li</surname> <given-names>T</given-names></name><etal/></person-group> <article-title>Multi-level graph neural network with sparsity pooling for recognizing Parkinson&#x2019;s disease</article-title>. <source>IEEE Trans Neural Syst Rehabil Eng</source>. (<year>2023</year>) <volume>31</volume>:<fpage>4459</fpage>&#x2013;<lpage>69</lpage>. <pub-id pub-id-type="doi">10.1109/TNSRE.2023.3330643</pub-id></mixed-citation></ref>
<ref id="B16"><label>16.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Islam</surname> <given-names>M</given-names></name> <name><surname>Akter</surname> <given-names>K</given-names></name> <name><surname>Hossain</surname> <given-names>MA</given-names></name> <name><surname>Dewan</surname> <given-names>MAA</given-names></name></person-group>. <article-title>PD-Net: Parkinson&#x2019;s disease detection through fusion of two spectral features using attention-based hybrid deep neural network</article-title>. <source>Information</source>. (<year>2025</year>) <volume>16</volume>(<issue>2</issue>):<fpage>135</fpage>. <pub-id pub-id-type="doi">10.3390/info16020135</pub-id></mixed-citation></ref>
<ref id="B17"><label>17.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Skaramagkas</surname> <given-names>V</given-names></name> <name><surname>Pentari</surname> <given-names>A</given-names></name> <name><surname>Kefalopoulou</surname> <given-names>Z</given-names></name> <name><surname>Tsiknakis</surname> <given-names>M</given-names></name></person-group>. <article-title>Multi-modal deep learning diagnosis of Parkinson&#x2019;s disease&#x2014;a systematic review</article-title>. <source>IEEE Trans Neural Syst Rehabil Eng</source>. (<year>2023</year>) <volume>31</volume>:<fpage>2399</fpage>&#x2013;<lpage>423</lpage>. <pub-id pub-id-type="doi">10.1109/TNSRE.2023.3277749</pub-id></mixed-citation></ref>
<ref id="B18"><label>18.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Guar&#x00ED;n</surname> <given-names>DL</given-names></name> <name><surname>Wong</surname> <given-names>JK</given-names></name> <name><surname>McFarland</surname> <given-names>NR</given-names></name> <name><surname>Ramirez-Zamora</surname> <given-names>A</given-names></name></person-group>. <article-title>Characterizing disease progression in Parkinson&#x2019;s disease from videos of the finger tapping test</article-title>. <source>IEEE Trans Neural Syst Rehabil Eng</source>. (<year>2024</year>) <volume>32</volume>:<fpage>2293</fpage>&#x2013;<lpage>301</lpage>. <pub-id pub-id-type="doi">10.1109/TNSRE.2024.3416446</pub-id></mixed-citation></ref>
<ref id="B19"><label>19.</label><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Faiem</surname> <given-names>N</given-names></name> <name><surname>Asuroglu</surname> <given-names>T</given-names></name> <name><surname>Acici</surname> <given-names>K</given-names></name> <name><surname>Kallonen</surname> <given-names>A</given-names></name> <name><surname>van Gils</surname> <given-names>M</given-names></name></person-group>. <article-title>Assessment of Parkinson&#x2019;s disease severity using gait data: a deep learning-based multimodal approach</article-title>. In: <person-group person-group-type="editor"><name><surname>S&#x00E4;rest&#x00F6;niemi</surname> <given-names>M</given-names></name></person-group>, editor. <source>Digital Health and Wireless Solutions. NCDHWS 2024. Communications in Computer and Information Science (Vol. 2084)</source>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2024</year>). p. <fpage>29</fpage>&#x2013;<lpage>48</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-031-59091-7_3</pub-id></mixed-citation></ref>
<ref id="B20"><label>20.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dentamaro</surname> <given-names>V</given-names></name> <name><surname>Impedovo</surname> <given-names>D</given-names></name> <name><surname>Musti</surname> <given-names>L</given-names></name> <name><surname>Pirlo</surname> <given-names>G</given-names></name> <name><surname>Taurisano</surname> <given-names>P</given-names></name></person-group>. <article-title>Enhancing early Parkinson&#x2019;s disease detection through multimodal deep learning and explainable AI: insights from the PPMI database</article-title>. <source>Sci Rep</source>. (<year>2024</year>) <volume>14</volume>(<issue>1</issue>):<fpage>20941</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-024-70165-4</pub-id><pub-id pub-id-type="pmid">39251639</pub-id></mixed-citation></ref>
<ref id="B21"><label>21.</label><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Camacho Camacho</surname> <given-names>MI</given-names></name></person-group>. <source>Explainable Prediction of Parkinson&#x2019;s Disease in a Large Multimodal Database</source>. <publisher-loc>Calgary</publisher-loc>: <publisher-name>University of Calgary</publisher-name> (<year>2023</year>). <pub-id pub-id-type="doi">10.11575/PRISM/41854</pub-id></mixed-citation></ref>
<ref id="B22"><label>22.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Esan</surname> <given-names>AO</given-names></name> <name><surname>Olawade</surname> <given-names>DB</given-names></name> <name><surname>Soladoye</surname> <given-names>AA</given-names></name> <name><surname>Omodunbi</surname> <given-names>BA</given-names></name> <name><surname>Adeyanju</surname> <given-names>IA</given-names></name> <name><surname>Aderinto</surname> <given-names>N</given-names></name></person-group>. <article-title>Explainable AI for Parkinson&#x2019;s disease prediction: a machine learning approach with interpretable models</article-title>. <source>Curr Res Transl Med</source>. (<year>2025</year>) <volume>73</volume>(<issue>4</issue>):<fpage>103541</fpage>. <pub-id pub-id-type="doi">10.1016/j.retram.2025.103541</pub-id><pub-id pub-id-type="pmid">40945155</pub-id></mixed-citation></ref>
<ref id="B23"><label>23.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>W</given-names></name> <name><surname>Rao</surname> <given-names>Q</given-names></name> <name><surname>Dong</surname> <given-names>S</given-names></name> <name><surname>Zhu</surname> <given-names>M</given-names></name> <name><surname>Yang</surname> <given-names>Z</given-names></name> <name><surname>Huang</surname> <given-names>X</given-names></name><etal/></person-group>. <article-title>PIDGN: an explainable multimodal deep learning framework for early prediction of Parkinson&#x2019;s disease</article-title>. <source>J Neurosci Methods</source>. (<year>2025</year>) <volume>415</volume>:<fpage>110363</fpage>. <pub-id pub-id-type="doi">10.1016/j.jneumeth.2025.110363</pub-id><pub-id pub-id-type="pmid">39832626</pub-id></mixed-citation></ref>
<ref id="B24"><label>24.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Velu</surname> <given-names>K</given-names></name> <name><surname>Jaisankar</surname> <given-names>N</given-names></name></person-group>. <article-title>Design of an early prediction model for Parkinson&#x2019;s disease using machine learning</article-title>. <source>IEEE Access</source>. (<year>2025</year>) <volume>13</volume>:<fpage>17457</fpage>&#x2013;<lpage>72</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2025.3533703</pub-id></mixed-citation></ref>
<ref id="B25"><label>25.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hassan</surname> <given-names>A</given-names></name> <name><surname>Ahmed</surname> <given-names>A</given-names></name></person-group>. <article-title>Predicting Parkinson&#x2019;s disease progression: a non-invasive method leveraging voice inputs</article-title>. <source>Comput Sci</source>. (<year>2023</year>) <volume>8</volume>(<issue>2</issue>):<fpage>66</fpage>&#x2013;<lpage>82</lpage>. <pub-id pub-id-type="doi">10.53070/bbd.1350356</pub-id></mixed-citation></ref></ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by"><p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2815425/overview">Danielle Sent</ext-link>, Jheronimus Academy of Data Science, Netherlands</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by"><p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3138657/overview">Ahmad Hassan</ext-link>, COMSATS University Islamabad, Pakistan</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3214227/overview">Bakkialakshmi V.S.</ext-link>, SRM University, India</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3341029/overview">Kalyan Chatterjee</ext-link>, NMREC, India</p></fn>
</fn-group>
</back>
</article>