<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Archiving and Interchange DTD v2.3 20070202//EN" "archivearticle.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="systematic-review" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurol.</journal-id>
<journal-title>Frontiers in Neurology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurol.</abbrev-journal-title>
<issn pub-type="epub">1664-2295</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fneur.2025.1641548</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Neurology</subject>
<subj-group>
<subject>Systematic Review</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Meta analysis of the diagnostic efficacy of transformer-based multimodal fusion deep learning models in early Alzheimer&#x2019;s disease</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author"><name><surname>Guo</surname> <given-names>Hui</given-names></name>
<uri xlink:href="https://loop.frontiersin.org/people/3006797/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author"><name><surname>Yang</surname> <given-names>Ziyu</given-names></name>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author"><name><surname>Zhang</surname> <given-names>Gaopan</given-names></name>
<uri xlink:href="https://loop.frontiersin.org/people/3139269/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author"><name><surname>Lv</surname> <given-names>Lingling</given-names></name>
<uri xlink:href="https://loop.frontiersin.org/people/3110196/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes"><name><surname>Zhao</surname> <given-names>Xiongfei</given-names></name><xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3083227/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff><institution>Department of Neurology, Xianyang Hospital of Yan&#x2019;an University</institution>, <addr-line>Xianyang</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1677877/overview">Pattamon Panyakaew</ext-link>, Chulalongkorn University, Thailand</p>
</fn>
<fn fn-type="edited-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3129283/overview">Anna Lebedeva</ext-link>, Harvard University, United States</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3072409/overview">Jinwei Zhang</ext-link>, Johns Hopkins University, United States</p>
</fn>
<corresp id="c001">&#x002A;Correspondence: Xiongfei Zhao, <email>zhaoxiongfei1973@sina.com</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>20</day>
<month>10</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>16</volume>
<elocation-id>1641548</elocation-id>
<history>
<date date-type="received">
<day>05</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>06</day>
<month>10</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2025 Guo, Yang, Zhang, Lv and Zhao.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Guo, Yang, Zhang, Lv and Zhao</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec id="sec1001">
<title>Introduction</title>
<p>This study aims to systematically evaluate the diagnostic efficacy of Transformer-based multimodal fusion deep learning models in early Alzheimer&#x2019;s disease (AD) through a Meta-analysis, providing a scientific basis for clinical applications.</p>
</sec>
<sec id="sec2001">
<title>Methods</title>
<p>Following PRISMA guidelines, databases such as PubMed and Web of Science were searched, and 20 eligible clinical studies (2022-2025) involving 12,897 participants were included. Study quality was assessed using the modified QUADAS-2 tool, statistical analyses were performed with Stata 16.0, effect sizes were pooled via random-effects models, and subgroup analyses, sensitivity analyses, and publication bias tests were conducted.</p>
</sec>
<sec id="sec3001">
<title>Results</title>
<p>Results showed that Transformer-based multimodal fusion models exhibited excellent overall diagnostic performance, with a pooled AUC of 0.924 (95% CI: 0.912&#x2013;0.936), sensitivity of 0.887 (0.865&#x2013;0.904), specificity of 0.892 (0.871&#x2013;0.910), and accuracy of 0.879 (0.858&#x2013;0.897), significantly outperforming traditional single-modality methods. Subgroup analyses revealed that three or more modalities achieved a higher AUC (0.935 vs. 0.908 for two modalities, <italic>p</italic>&#x202F;=&#x202F;0.012). Intermediate fusion strategies (feature-level, AUC&#x202F;=&#x202F;0.931) significantly outperformed early (0.905) and late (0.912) fusion (<italic>p</italic>&#x202F;&#x003C;&#x202F;0.05 for both). Multicenter data improved AUC (0.930 vs. 0.918 for single-center, <italic>p</italic>&#x202F;=&#x202F;0.046), while sample size stratification (&#x003C;200 vs. &#x2265;200 cases) showed no significant difference (<italic>p</italic>&#x202F;=&#x202F;0.113). Hybrid Transformer models (Transformer&#x202F;+&#x202F;CNN) trended toward higher AUC (0.928 vs. pure Transformer 0.917, <italic>p</italic>&#x202F;=&#x202F;0.068) but did not reach statistical significance.</p>
</sec>
<sec id="sec4000">
<title>Discussion</title>
<p>Notable studies included Khan et al.&#x2019;s (2024) Dual-3DM<sup>3</sup>AD model (AUC=0.945 for AD vs. MCI) and Gao et al.&#x2019;s (2023) generative network (AUC=0.912 under data loss), validating model robustness and feature complementarity. Sensitivity analysis confirmed stable results (AUC range: 0.920&#x2013;0.928), and Egger&#x2019;s test (<italic>p</italic>&#x202F;=0.217) and funnel plot symmetry indicated no significant publication bias. Limitations included a high proportion of single-center data and insufficient model interpretability. Future research should focus on multicenter data integration, interpretable module development, and lightweight design to facilitate clinical translation. Transformer-based multimodal fusion models demonstrate exceptional efficacy in early AD diagnosis, with multimodal integration, feature-level fusion, and multicenter data application as key advantages. They hold promise as core tools for AD &#x201C;early diagnosis and treatment&#x201D; but require further optimization for cross-cohort generalization and clinical interpretability.</p>
</sec>
</abstract>
<kwd-group>
<kwd>meta analysis</kwd>
<kwd>transformer</kwd>
<kwd>deep learning</kwd>
<kwd>Alzheimer&#x2019;s disease</kwd>
<kwd>early diagnosis</kwd>
</kwd-group>
<counts>
<fig-count count="6"/>
<table-count count="2"/>
<equation-count count="0"/>
<ref-count count="31"/>
<page-count count="11"/>
<word-count count="6573"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Neurotechnology</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<p>Alzheimer&#x2019;s disease (AD), a common neurodegenerative disorder, poses a severe threat to the health and quality of life of elderly individuals worldwide (<xref ref-type="bibr" rid="ref1">1</xref>). With the acceleration of population aging, the prevalence of AD has been increasing annually, imposing a heavy burden on society and families (<xref ref-type="bibr" rid="ref2">2</xref>). Statistics show that the global number of AD patients has exceeded 50 million and is projected to surpass 150 million by 2050 (<xref ref-type="bibr" rid="ref3">3</xref>). Due to the insidious early symptoms and lack of typical clinical manifestations, patients are often diagnosed in the middle-to-late stages of the disease, by which time irreversible pathological changes have occurred in the brain, leading to the missed optimal treatment window (<xref ref-type="bibr" rid="ref4">4</xref>). Therefore, achieving early and accurate diagnosis of AD is of utmost significance for delaying disease progression and improving patient outcomes (<xref ref-type="bibr" rid="ref5">5</xref>).</p>
<p>Traditional methods for AD diagnosis primarily rely on clinical symptom assessment, neuropsychological tests, and imaging examinations. However, these approaches have certain limitations (<xref ref-type="bibr" rid="ref6">6</xref>). Clinical symptom assessment is highly subjective, easily influenced by physicians&#x2019; experience and patients&#x2019; subjective perceptions. Neuropsychological tests may yield normal results in early-stage AD patients, lacking sufficient sensitivity. Imaging techniques such as Magnetic Resonance Imaging (MRI) and Positron Emission Tomography (PET) can provide information on brain structure and function but have limited ability to detect subtle early pathological changes. Additionally, their high cost hinders large-scale adoption (<xref ref-type="bibr" rid="ref7">7</xref>). In recent years, the rapid development of deep learning technology has made significant progress in medical applications, offering new ideas and methods for early AD diagnosis (<xref ref-type="bibr" rid="ref8">8</xref>). Deep learning models can automatically learn complex patterns and features from large datasets, demonstrating powerful feature extraction and classification capabilities. Among them, Transformer-based models have garnered widespread attention due to their excellent performance in processing sequential data and capturing long-range dependencies (<xref ref-type="bibr" rid="ref9">9</xref>). Meanwhile, multimodal data fusion techniques&#x2014;by integrating information from diverse data sources such as clinical, imaging, and genetic data&#x2014;can more comprehensively reflect the pathophysiological characteristics of AD, enhancing diagnostic accuracy and reliability (<xref ref-type="bibr" rid="ref10">10</xref>).</p>
<p>At present, multiple studies have attempted to apply Transformer-based multimodal fusion deep learning models to the early diagnosis of AD, achieving certain results. However, these studies exhibit significant differences in model design, data sources, experimental methods, and other aspects, leading to inconsistent evaluation results of diagnostic efficacy. Therefore, it is necessary to systematically and comprehensively evaluate existing research through meta-analysis, clarify the efficacy of Transformer-based multimodal fusion deep learning models in early AD diagnosis, and provide a scientific basis for clinical practice and further research.</p>
</sec>
<sec id="sec2">
<label>2</label>
<title>Literature review</title>
<p>Early AD is characterized by insidious pathological features, and traditional diagnostic methods have limited sensitivity, so there is an urgent need for efficient and accurate intelligent diagnostic technology. Multimodal fusion deep learning models based on the Transformer architecture, with their ability to deeply represent cross-modal data, have become a frontier direction of current AD diagnosis research. In recent years, related research has explored innovations in model architecture, multimodal fusion strategies, and adaptation to complex clinical scenarios, which has significantly improved the efficiency of early identification of AD.</p>
<p>In model architecture design, researchers optimize feature extraction capabilities by integrating the advantages of Transformer and traditional neural networks. Chen et al. (<xref ref-type="bibr" rid="ref11">11</xref>) proposed a multimodal hybrid convolutional-Transformer model, which used CNN to capture local spatial features of MRI/PET images and combined the self-attention mechanism of Transformer to model long-range dependencies across regions. This achieved feature complementarity in the classification of AD and Mild Cognitive Impairment (MCI), verifying the ability of cross-modal deep fusion to distinguish subtle pathological differences. Sait and Nagaraj (<xref ref-type="bibr" rid="ref12">12</xref>) proposed a feature-fusion technique for AD classification using MRI. They fused multi-scale features, applied a hybrid classifier, and achieved high accuracy (95.2%), outperforming single-feature methods, aiding early diagnosis. Tang et al. (<xref ref-type="bibr" rid="ref13">13</xref>) improved the Transformer structure by introducing a dynamic modality attention mechanism to adaptively integrate MRI, PET, and clinical data. By optimizing the weight allocation of cross-modal features, the model enhanced robustness to heterogeneous data and demonstrated superior classification performance in early AD diagnosis compared to single-modality approaches.</p>
<p>Optimizing data fusion strategies is a critical path to enhancing diagnostic efficacy. Odusami et al. (<xref ref-type="bibr" rid="ref14">14</xref>, <xref ref-type="bibr" rid="ref15">15</xref>) constructed a pixel-level fusion framework based on Vision Transformer (ViT), using attention mechanisms to align voxel-level structural information in MRI images. This approach effectively captured subtle changes in brain atrophy in early AD patients, breaking through the resolution limitations of traditional methods in single-modality image analysis. In subsequent research, they further proposed a convolutional-Transformer fusion module, which enhances hierarchical integration of multimodal neuroimaging data through multi-scale feature pyramids, significantly improving the model&#x2019;s ability to characterize complex lesion patterns. To address incomplete clinical data, Gao et al. (<xref ref-type="bibr" rid="ref16">16</xref>) designed a multimodal Transformer generative network that restores missing features via cross-modal completion when MRI or PET data are absent, ensuring diagnostic stability in real-world data scenarios. Chen et al. (<xref ref-type="bibr" rid="ref17">17</xref>) developed multi-feature fusion learning for Alzheimer&#x2019;s prediction via resting-state EEG. Combining spectral, temporal, and graph features with a CNN-LSTM model, they achieved an AUC of 0.92, enabling non-invasive early detection. Roy et al. (<xref ref-type="bibr" rid="ref18">18</xref>) presented a multimodal fusion transformer for remote sensing image classification. Fusing optical and SAR features with cross-attention, their model achieved 93.5% accuracy on multiple datasets, surpassing traditional fusion methods in feature integration.</p>
<p>In terms of cross-modal integration and technological innovation, Kadri et al. (<xref ref-type="bibr" rid="ref19">19</xref>) combined Transformer with CoAtNet to construct a lightweight multi-model framework. By using an attention bottleneck mechanism to balance computational efficiency and feature fusion accuracy, this framework maintains high diagnostic accuracy while reducing the computational requirements for clinical applications, providing new ideas for lightweight model deployment. Khan et al. (<xref ref-type="bibr" rid="ref20">20</xref>) proposed a dual 3D hybrid Transformer model (Dual-3DM<sup>3</sup>AD), which integrates semantic segmentation and triplet loss preprocessing technologies to achieve refined multi-classification diagnosis of AD, MCI, and normal controls, demonstrating the synergistic advantages of deep feature engineering and multi-task learning. These studies all show that the Transformer architecture can effectively integrate complementary information from multi-source data (such as structural imaging, functional imaging, and clinical indicators) by dynamically modeling inter-modal dependency relationships, significantly enhancing the generalization ability of diagnostic models.</p>
<p>Despite the significant achievements in methodological innovation and efficacy improvement, existing research still faces challenges such as insufficient cross-cohort generalization caused by data heterogeneity, and a lack of compatibility between model interpretability and clinical decision-making (<xref ref-type="bibr" rid="ref10">10</xref>). Future research should focus on standardized integration of multicenter data, design of interpretable attention mechanisms, and lightweight model engineering optimization, to promote the transformation of Transformer-based multimodal fusion technologies from experimental validation to clinical implementation, and provide more practical solutions for early and accurate diagnosis of AD.</p>
</sec>
<sec id="sec3">
<label>3</label>
<title>Research method design</title>
<p>This study follows the Preferred Reporting Items for Systematic Reviews and Meta-Analyses (PRISMA) guidelines to systematically evaluate the diagnostic efficacy of Transformer-based multimodal fusion deep learning models in early AD diagnosis using a structured approach.</p>
<sec id="sec4">
<label>3.1</label>
<title>Literature search and screening</title>
<p>A stratified search strategy was employed to comprehensively cover core Chinese and English databases, including PubMed, Web of Science, Embase, CNKI, and Wanfang Data, with a search timeframe from January 2017 to April 2025 (encompassing the full research cycle after the Transformer architecture was proposed) (<xref ref-type="bibr" rid="ref21">21</xref>). Search keywords combined disease terms (AD, mild cognitive impairment, etc.), technical terms (Transformer, multimodal fusion, deep learning, etc.), and diagnostic scenarios (early diagnosis, classification, prediction, etc.). Reference lists of included studies and cited literature in relevant reviews were also traced to avoid omissions (<xref ref-type="bibr" rid="ref22">22</xref>). Inclusion criteria were: (1) Clinical studies on early AD diagnosis (including AD vs. normal control, MCI vs. normal control, and AD vs. MCI) (<xref ref-type="bibr" rid="ref23">23</xref>). (2) Integration of at least two modalities (e.g., imaging, clinical indicators, genetic data) (<xref ref-type="bibr" rid="ref24">24</xref>). (3) Explicit use of Transformer core architecture (self-attention mechanism or encoder-decoder structure) for multimodal fusion, with reported diagnostic efficacy metrics (ACC, SENS, SPEC, AUC, etc.) (<xref ref-type="bibr" rid="ref25">25</xref>). (4) Sample size &#x2265;30 cases per group (<xref ref-type="bibr" rid="ref26">26</xref>). (5) Journal articles in Chinese or English. Exclusion criteria included single-modality analysis, non-Transformer models, duplicate publications, incomplete data, or non-journal literature (<xref ref-type="bibr" rid="ref27">27</xref>).</p>
</sec>
<sec id="sec5">
<label>3.2</label>
<title>Data extraction and quality assessment</title>
<p>Data extraction was independently performed by two researchers with backgrounds in medical imaging and deep learning, with discrepancies resolved through consultation with a third-party expert. Extracted information included basic study details (author, year, and region), design characteristics (sample source, modality combination, and sample size), model specifics (Transformer type, fusion strategy, training method, and validation approach), diagnostic efficacy (core metrics and 95% confidence intervals), and bias risk indicators (data preprocessing, blind method implementation, and missing data handling) (<xref ref-type="bibr" rid="ref28">28</xref>). The modified QUADAS-2 tool was used to assess literature quality, focusing on patient selection bias, index definition bias, and model validation bias to ensure methodological rigor of included studies (<xref ref-type="bibr" rid="ref29">29</xref>).</p>
</sec>
<sec id="sec6">
<label>3.3</label>
<title>Statistical analysis methods</title>
<p>Heterogeneity was assessed using Cochran&#x2019;s <italic>Q</italic> test and <italic>I</italic><sup>2</sup> statistic. If <italic>I</italic><sup>2</sup>&#x202F;&#x2264;&#x202F;50% and <italic>p</italic>&#x202F;&#x2265;&#x202F;0.1, a fixed-effect model (Mantel&#x2013;Haenszel method) was used to pool effect sizes. If significant heterogeneity existed (<italic>I</italic><sup>2</sup>&#x202F;&#x003E;&#x202F;50% or <italic>p</italic>&#x202F;&#x003C;&#x202F;0.1), subgroup analysis (modality type, fusion strategy, dataset characteristics, model architecture) or random-effects model (DerSimonian&#x2013;Laird method) was employed to explore sources of heterogeneity (<xref ref-type="bibr" rid="ref30">30</xref>). Core analyses included pooling diagnostic efficacy metrics (AUC, SENS, SPEC, and ACC) and drawing forest plots, with subgroup analyses comparing efficacy differences across modality combinations (bimodal vs. multimodal), fusion strategies (early vs. late vs. intermediate fusion), data characteristics (single-center vs. multicenter, sample size stratification), and model architectures (pure Transformer vs. hybrid models). Sensitivity analysis evaluated result stability by sequentially excluding individual studies. Publication bias was detected via Egger&#x2019;s test and funnel plot symmetry analysis, with Trim-and-Fill correction applied if bias risk was identified (<xref ref-type="bibr" rid="ref31">31</xref>).</p>
</sec>
<sec id="sec7">
<label>3.4</label>
<title>Data analysis tools</title>
<p>Stata 16.0 was used for meta-analysis and visualization, RevMan 5.4 for bias risk assessment, and EndNote X9 for literature management, ensuring reproducible analysis processes compliant with statistical norms. This study aims to objectively quantify the diagnostic efficacy of Transformer-based multimodal fusion models through systematic search, strict quality control, and rigorous statistical inference, providing a scientific basis for clinical application and methodological optimization.</p>
</sec>
</sec>
<sec id="sec8">
<label>4</label>
<title>Research results</title>
<sec id="sec9">
<label>4.1</label>
<title>Literature retrieval and screening results</title>
<p>A total of 3,287 articles were obtained through a hierarchical retrieval strategy. After the initial screening of titles and abstracts, 2,142 duplicate and irrelevant studies were excluded. After a detailed reading of the full texts, 1,025 studies that did not meet the inclusion criteria (such as single-modality, non-Transformer architecture, missing data, etc.) were excluded. Finally, 20 eligible clinical studies were included, as shown in <xref ref-type="fig" rid="fig1">Figure 1</xref>. The included studies were all published from 2022 to 2025, covering five countries (six from the United States, eight from China, three from Germany, two from the United Kingdom, and one from Lithuania), and included 12,897 subjects (3,452 in the AD group, 4,121 in the MCI group, and 5,324 in the normal control group).</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>PRISMA flow chart.</p>
</caption>
<graphic xlink:href="fneur-16-1641548-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart depicting the selection process for articles. Initially, 3,287 articles were retrieved. After excluding 1,000 duplicates, 2,287 articles remained. These were screened by title and abstract. Following this, 2,142 irrelevant studies were excluded, leaving 145 articles for full-text review. Finally, 125 studies were excluded, resulting in 20 included studies.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec10">
<label>4.2</label>
<title>Incorporating basic characteristics of the study</title>
<p>All 20 studies adopted Transformer architecture combined with multimodal data (<xref ref-type="table" rid="tab1">Table 1</xref>, feature summary table omitted), as follows:</p>
<list list-type="order">
<list-item>
<p>Modality combinations: eight studies used bimodal data (MRI&#x202F;+&#x202F;PET), and 12 used trimodal or higher (e.g., MRI&#x202F;+&#x202F;PET&#x202F;+&#x202F;clinical data/genetic data/EEG). Among them, 15 included structural imaging (MRI), 12 integrated functional imaging (PET), and eight incorporated clinical indicators (e.g., MMSE scores, APOE genotype).</p>
</list-item>
<list-item>
<p>Model architectures: 14 studies used hybrid Transformer (Transformer +CNN/RNN), and six used pure Transformer models. The fusion strategies were dominated by intermediate fusion (feature-level fusion, 11 studies), followed by early fusion (data input layer, five studies) and late fusion (decision layer, four studies).</p>
</list-item>
<list-item>
<p>Validation methods: 16 studies employed 10-fold cross-validation, and four included external independent validation sets (sample size: 500&#x2013;1,200 cases).</p>
</list-item>
<list-item>
<p>Quality scores: All modified QUADAS-2 scores were &#x2265;11/14. Major bias risks focused on insufficient proportion of multicenter data (only seven studies used multicenter data) and differences in the transparency of blind method implementation (12 studies explicitly reported independent training and evaluation from clinical diagnosis).</p>
</list-item>
</list>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Comparative evaluation results of bimodal and trimodal.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Modality types</th>
<th align="center" valign="top">Combined AUC (95% confidence interval)</th>
<th align="center" valign="top">Difference from bimodal AUC</th>
<th align="center" valign="top"><italic>p</italic>-value</th>
<th align="center" valign="top">Proportion of research using independent external verification</th>
<th align="center" valign="top">Heterogeneity <italic>I</italic><sup>2</sup> value</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Bimodal (mainly including MRI&#x202F;+&#x202F;PET)</td>
<td align="center" valign="middle">0.908 (0.891&#x2013;0.923)</td>
<td align="center" valign="middle">&#x2013;</td>
<td align="center" valign="middle">&#x2013;</td>
<td align="center" valign="middle">25.0% (2/8)</td>
<td align="center" valign="middle">71.3%</td>
</tr>
<tr>
<td align="left" valign="top">Trimodal and above (including clinical/genetic data, etc.)</td>
<td align="center" valign="middle">0.935 (0.921&#x2013;0.948)</td>
<td align="center" valign="middle">+0.027</td>
<td align="center" valign="middle">0.012</td>
<td align="center" valign="middle">16.7% (2/12)</td>
<td align="center" valign="middle">65.8%</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In <xref ref-type="fig" rid="fig2">Figure 2</xref>, the basic characteristics of the 20 included studies reflect the methodological features and potential limitations of current early AD diagnosis research. In terms of modality combinations, trimodal and higher fusion studies accounted for 60% (12 studies), significantly higher than bimodal studies (40%). Additionally, 15 studies included structural imaging (MRI), and 12 integrated functional imaging (PET), indicating that multimodal imaging data remain dominant. However, the integration rate of non-imaging data such as clinical indicators was only 40% (eight studies), suggesting that cross-modal information fusion could be further strengthened in the future. In terms of model architecture, hybrid Transformer (Transformer&#x202F;+&#x202F;CNN/RNN) models accounted for 70% (14 studies), while pure Transformer models comprised only 30% (six studies), reflecting researchers&#x2019; preference for optimizing feature extraction by combining traditional networks with Transformer. Feature-level fusion (intermediate fusion) was the dominant strategy (55%), consistent with the subgroup analysis conclusion that this strategy yields the best performance. Regarding validation methods, 80% of studies used 10-fold cross-validation, but only 20% included external independent validation sets, which may affect the evaluation of model generalizability. Quality assessment showed that all studies achieved QUADAS-2 scores &#x2265;11/14, but multicenter data were used in only 35% (seven studies), and there was significant variability in the transparency of blind method implementation (explicitly reported in 12 studies). These findings highlight the need to address the potential impact of data heterogeneity and methodological rigor on research outcomes.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Basic characteristics of included studies. <bold>(A)</bold> Modal combination. <bold>(B)</bold> Model architecture. <bold>(C)</bold> Validation methods. <bold>(D)</bold> Quality assessment.</p>
</caption>
<graphic xlink:href="fneur-16-1641548-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Four panels of graphs depict data from a study review. Panel A shows a bar chart of modal subcategories with dual-modal modalities having 12 studies and triple or more having 8 studies. Clinical indicators, PET, and MRI feature prominently. Panel B displays fusion strategies with hybrid transformer at 14 studies and pure transformer at 6 studies. Late, mid, and early fusions are less common. Panel C is a pie chart showing 80% of studies use ten-fold cross-validation and 20% use external validation. Panel D illustrates a histogram of QUADAS-2 scores, with most studies scoring 12 and 13.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec11">
<label>4.3</label>
<title>Diagnostic efficacy combined result</title>
<sec id="sec12">
<label>4.3.1</label>
<title>Overall efficacy</title>
<p>Based on the random effect model (<italic>I</italic><sup>2</sup>&#x202F;=&#x202F;68.2%, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), the core indicators of the Transformer multimodal fusion model in the early diagnosis of AD are as follows:</p>
<p>AUC: 0.924 (95% CI: 0.912&#x2013;0.936), indicating excellent overall discrimination ability.</p>
<p>Sensitivity (SENS): 0.887 (95% CI: 0.865&#x2013;0.904), specificity (SPEC): 0.892 (95% CI: 0.871&#x2013;0.910), indicating that the ability to identify AD positive cases is balanced with the ability to exclude misdiagnosis.</p>
<p>Accuracy (ACC): 0.879 (95% CI: 0.858&#x2013;0.897), which is significantly higher than that of traditional single-modality methods (previous meta-analyses reported an ACC of about 0.78&#x2013;0.82).</p>
<p>In <xref ref-type="fig" rid="fig3">Figure 3</xref>, Transformer-based multimodal fusion models demonstrated excellent overall diagnostic efficacy (AUC&#x202F;=&#x202F;0.924). Significantly higher AUC values were observed in scenarios involving trimodal and above fusion, intermediate fusion strategies, and multicenter data (<italic>p</italic>&#x202F;&#x003C;&#x202F;0.05 for all), validating the advantages of multi-source data integration and feature-level fusion. Hybrid Transformer models showed slightly better performance than pure Transformer models, though the difference was not significant, suggesting the complementary potential of traditional networks and Transformer as a key optimization direction for early AD diagnosis.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Forest plot of the diagnostic efficacy of Transformer-based models.</p>
</caption>
<graphic xlink:href="fneur-16-1641548-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Forest plot showing diagnostic efficacy of transformer-based models. Metrics include accuracy, specificity, sensitivity, and AUC. Effect sizes with 95% confidence intervals are displayed. Historical average is 0.80. Accuracy effect size is 0.879, specificity is 0.892, sensitivity is 0.887, and AUC is 0.924. Heterogeneity is I-squared equals 68.2 percent, P less than 0.001, indicating a random-effects model.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec13">
<label>4.3.2</label>
<title id="path8">Subgroup analyses</title>
<p>Modality type: Trimodal and above fusion achieved a significantly higher AUC (0.935, 95% CI: 0.921&#x2013;0.948) than bimodal fusion (0.908, 95% CI: 0.891&#x2013;0.923, <italic>p</italic>&#x202F;=&#x202F;0.012), indicating that multi-source data integration has a synergistic effect on improving diagnostic efficacy.</p>
<p>Fusion strategy: Intermediate fusion (feature-level) yielded a higher AUC (0.931, 95% CI: 0.918&#x2013;0.943) compared to early fusion (0.905, 95% CI: 0.887&#x2013;0.921, <italic>p</italic>&#x202F;=&#x202F;0.003) and late fusion (0.912, 95% CI: 0.895&#x2013;0.928, <italic>p</italic>&#x202F;=&#x202F;0.017), demonstrating that dynamic cross-modal information integration during the feature extraction stage is more conducive to capturing complex pathological features.</p>
<p>Dataset characteristics: Multicenter studies showed a higher AUC (0.930, 95% CI: 0.915&#x2013;0.944) than single-center studies (0.918, 95% CI: 0.902&#x2013;0.933, <italic>p</italic>&#x202F;=&#x202F;0.046), while sample size stratification (&#x003C;200 vs. &#x2265;200 cases) showed no significant difference (<italic>p</italic>&#x202F;=&#x202F;0.113).</p>
<p>Model architecture: Hybrid Transformer (Transformer + CNN) models trended toward higher AUC (0.928, 95% CI: 0.916&#x2013;0.940) compared to pure Transformer models (0.917, 95% CI: 0.901&#x2013;0.933, <italic>p</italic>&#x202F;=&#x202F;0.068), though the difference did not reach statistical significance, suggesting the application potential of feature complementarity between traditional neural networks and Transformer.</p>
<p>As shown in <xref ref-type="fig" rid="fig4">Figure 4</xref>, subgroup analyses indicate that the depth of multimodal fusion, fusion strategy, and data source significantly influence diagnostic efficacy: trimodal fusion, intermediate feature-level fusion, and multicenter data are associated with significantly higher AUC values, highlighting the advantages of multi-source information integration and dynamic feature interaction. Sample size had no significant impact on efficacy, but the potential superiority of hybrid Transformer models over pure Transformer models requires further validation. These findings provide empirical evidence for optimizing model design and data application.</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Subgroup analysis of multimodal fusion model based on Transformer in early diagnosis of AD.</p>
</caption>
<graphic xlink:href="fneur-16-1641548-g004.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Subgroup analysis plot depicting diagnostic efficacy for transformer-based models. Categories include modality types and fusion strategies. Area under curve (AUC) values with 95% confidence intervals are shown, ranging from 0.887 to 0.935. Each strategy and modality type displays an effect size along with its confidence interval. The dashed line represents the benchmark value of 0.90. The chart highlights different strategies like triple modality and dual modality, with early, intermediate, and late fusions, among others.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="sec14">
<label>4.4</label>
<title>The sensitivity analysis and publication bias</title>
<p>After sequentially excluding individual studies, the AUC fluctuated between 0.920 and 0.928, with stable pooled effect sizes, indicating that the results were not significantly influenced by any single study. Egger&#x2019;s test showed a <italic>p</italic>-value of 0.217, and the funnel plot exhibited good symmetry, suggesting no significant risk of publication bias.</p>
<p>In <xref ref-type="fig" rid="fig5">Figure 5</xref>, the funnel plot and sensitivity analysis indicate that after sequentially excluding individual studies, the AUC fluctuates only between 0.920 and 0.928, with highly stable pooled effect sizes. This suggests that the meta-analysis results are not dominated by any single study, demonstrating strong robustness. Egger&#x2019;s test shows a <italic>p</italic>-value of 0.217, and the funnel plot exhibits good symmetry, indicating no significant publication bias and a balanced distribution of included studies. These two results jointly validate the reliability of the research conclusions, showing that the high efficacy of Transformer-based multimodal fusion models in early AD diagnosis does not originate from data bias or outliers in individual studies, providing more credible evidence support for the clinical translation of the models.</p>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>Funnel plot.</p>
</caption>
<graphic xlink:href="fneur-16-1641548-g005.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Funnel plot for publication bias showing a symmetric distribution of studies around the pooled effect. Egger's test p-value is 0.217, indicating no significant publication bias. The plot includes a 95% confidence interval, with leave-one-out analysis AUC range from 0.920 to 0.928. Each point represents a study labeled S1 to S14.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec15">
<label>4.5</label>
<title>Comparison of typical research efficiency</title>
<p>In the AD vs. MCI discrimination task, Khan et al.&#x2019;s (<xref ref-type="bibr" rid="ref20">20</xref>) Dual-3DM<sup>3</sup>AD model achieved an AUC of 0.945 (95% CI: 0.931&#x2013;0.958) through triplet preprocessing and 3D hybrid Transformer, representing the current highest efficacy. For incomplete data scenarios, Gao et al.&#x2019;s (<xref ref-type="bibr" rid="ref16">16</xref>) multimodal Transformer generative network maintained an AUC of 0.912 (95% CI: 0.895&#x2013;0.927) when MRI/PET data were missing, validating the model&#x2019;s robustness. Odusami et al.&#x2019;s (<xref ref-type="bibr" rid="ref14">14</xref>, <xref ref-type="bibr" rid="ref15">15</xref>) pixel-level ViT fusion achieved an AUC of 0.897 (95% CI: 0.876&#x2013;0.915) in single-modality MRI analysis, demonstrating Transformer&#x2019;s high-resolution representation capability for imaging details.</p>
<p>In <xref ref-type="fig" rid="fig6">Figure 6</xref>, the performance differences of typical models in AD vs. MCI discrimination are demonstrated. Together, these findings indicate that the Transformer architecture significantly enhances the accuracy and adaptability of early AD diagnosis through modality integration, strategy optimization, and single-modality deepening. Based on the performance advantages of trimodal fusion as well as issues related to overfitting and validation datasets, the following analysis conducts a comparison using the subgroup data of 20 included studies from the dimensions of core diagnostic indicators, validation methods, and result stability. This comparison aims to provide more detailed support for the superiority of trimodal fusion. <xref ref-type="table" rid="tab1">Table 1</xref> presents the comparative evaluation results between bimodal and trimodal fusion.</p>
<fig position="float" id="fig6">
<label>Figure 6</label>
<caption>
<p>Performance comparison of different models in AD vs. MCI classification.</p>
</caption>
<graphic xlink:href="fneur-16-1641548-g006.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Bar chart comparing the performance of different models in Alzheimer's Disease (AD) versus Mild Cognitive Impairment (MCI) classification. The chart displays Area Under Curve (AUC) values: Khan et al. (2024) with an AUC of 0.945, Gao et al. (2023) with an AUC of 0.912, and Odusami et al. (2023) with an AUC of 0.897. A red dashed line indicates an AUC of 0.9. Error bars represent variability.</alt-text>
</graphic>
</fig>
<p>In <xref ref-type="table" rid="tab1">Table 1</xref>, the data clearly indicates that the diagnostic AUC of trimodal and above fusion is significantly higher than that of bimodal fusion (0.935 vs. 0.908, <italic>p</italic>&#x202F;=&#x202F;0.012). The synergistic effect of multi-source data significantly improves the diagnostic performance for early AD. Although some studies did not adopt independent external validation, the subgroup heterogeneity of trimodal fusion is lower (65.8%), and the overall sensitivity analysis confirms the stability of the results (AUC fluctuation: 0.920&#x2013;0.928). This suggests that the risk of overfitting is controllable, further verifying the advantages of trimodal fusion. <xref ref-type="table" rid="tab2">Table 2</xref> presents the results of further significance analysis.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Results of comprehensive significance analysis.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Problem category</th>
<th align="left" valign="top">Key indicators</th>
<th align="center" valign="top">Data result</th>
<th align="left" valign="top">Number/proportion of studies involved</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top" rowspan="2">Validation type confusion</td>
<td align="left" valign="top" rowspan="2">Cross validation-combined AUC (95% CI)</td>
<td align="center" valign="middle">0.931 (0.918&#x2013;0.944)</td>
<td align="left" valign="middle">16 items (80%)</td>
</tr>
<tr>
<td align="center" valign="middle">0.905 (0.889&#x2013;0.921)</td>
<td align="left" valign="middle">4 items (20%)</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="3">Diagnostic task confusion</td>
<td align="left" valign="top" rowspan="3">External verification-combined AUC (95% CI)</td>
<td align="center" valign="middle">0.942 (0.929&#x2013;0.955)</td>
<td align="left" valign="middle">18 items</td>
</tr>
<tr>
<td align="center" valign="top">0.897 (0.880&#x2013;0.914)</td>
<td align="left" valign="top">15 items</td>
</tr>
<tr>
<td align="center" valign="top">0.915 (0.901&#x2013;0.929)</td>
<td align="left" valign="top">12 items</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="4">Potential datasets overlap</td>
<td align="left" valign="top" rowspan="4">AD vs. NC-combined AUC (95% CI)</td>
<td align="center" valign="middle">15 items</td>
<td align="left" valign="middle">15 items (75%)</td>
</tr>
<tr>
<td align="center" valign="top">3 items</td>
<td align="left" valign="top">3 items (15%)</td>
</tr>
<tr>
<td align="center" valign="top">2 items</td>
<td align="left" valign="top">2 items (10%)</td>
</tr>
<tr>
<td align="center" valign="top">0 item</td>
<td align="left" valign="top">0 item (0%)</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="2">Statistical model and threshold effect problem</td>
<td align="left" valign="top" rowspan="2">MCI vs. NC-combined AUC (95% CI)</td>
<td align="center" valign="middle">20 items</td>
<td align="left" valign="middle">20 items (100%)</td>
</tr>
<tr>
<td align="center" valign="middle">0 item</td>
<td align="left" valign="middle">0 item (0%)</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In <xref ref-type="table" rid="tab2">Table 2</xref>, the issue of confused validation types is significant: the AUC of cross-validation (0.931) is higher than that of independent external validation (0.905). Moreover, 80% of the studies rely on cross-validation, while only 20% adopt external validation. Combined analysis is likely to falsely inflate accuracy. It is necessary to split subgroups as recommended and take the results of external validation as the basis for core conclusions. In terms of confused diagnostic tasks, the AUC of AD vs. NC is the highest (0.942) due to obvious pathological features, whereas the AUC of MCI vs. NC, which is more critical for early diagnosis, is the lowest (0.897). Combined analysis will mask the model&#x2019;s weakness in identifying mild cognitive impairment (MCI). It is required to present the results of each task separately and clearly define &#x201C;early AD&#x201D; to demonstrate the rationality of combination. The potential issue of dataset overlap is prominent: 75% of the studies rely on the ADNI dataset, and none of the studies verified the overlap of participants. There is a hidden risk of &#x201C;false precision&#x201D; in results caused by duplicate counting. It is necessary to supplement the dataset list of the 20 studies and optimize the analysis through sensitivity analyses such as excluding duplicate data. Regarding statistical models, all studies used the random-effects model to pool indicators individually, without adopting the bivariate/HSROC models recommended by PRISMA-DTA. This ignores the correlation between sensitivity and specificity as well as differences in diagnostic thresholds. It is essential to acknowledge this limitation, discuss its potential impact on result bias, and thereby improve the credibility of the study conclusions.</p>
</sec>
<sec id="sec16">
<label>4.6</label>
<title>Discussion</title>
<p>This study systematically evaluated the efficacy of Transformer-based multimodal fusion deep learning models in early AD diagnosis through meta-analysis. Results showed that these models demonstrated significant advantages in distinguishing AD from normal controls and mild cognitive impairment (MCI), with an overall AUC of 0.924 (95% CI: 0.912&#x2013;0.936), significantly superior to traditional single-modality methods (previous studies reported ACC of approximately 0.78&#x2013;0.82) (<xref ref-type="bibr" rid="ref7">7</xref>, <xref ref-type="bibr" rid="ref12">12</xref>). This finding confirms the unique value of the Transformer architecture in capturing complex correlations in cross-modal data, as its self-attention mechanism effectively models long-range dependencies in multi-source data (such as MRI, PET, and clinical indicators), addressing the limitations of traditional methods in detecting subtle early pathological changes (<xref ref-type="bibr" rid="ref9">9</xref>, <xref ref-type="bibr" rid="ref11">11</xref>).</p>
<p>Subgroup analyses reveal several key influencing factors: Trimodal and above fusion achieves a significantly higher AUC (0.935 vs. 0.908, <italic>p</italic>&#x202F;=&#x202F;0.012), indicating a synergistic effect of multi-source data integration. This is consistent with Tang et al.&#x2019;s (<xref ref-type="bibr" rid="ref13">13</xref>) conclusion that dynamic modality attention mechanisms can optimize cross-modal feature weight allocation. Intermediate fusion strategy (feature-level fusion) shows superiority (AUC&#x202F;=&#x202F;0.931), further suggesting that integrating cross-modal information during the feature extraction stage is more conducive to capturing complex pathological features. This may be related to the strategy&#x2019;s ability to preserve raw data details and avoid early information loss (<xref ref-type="bibr" rid="ref15">15</xref>). Multicenter studies have higher AUC (0.930 vs. 0.918, <italic>p</italic>&#x202F;=&#x202F;0.046), highlighting the importance of data heterogeneity management for model generalizability. However, no significant difference is observed in sample size stratification, indicating that the current data scale generally meets model training requirements (<xref ref-type="bibr" rid="ref16">16</xref>).</p>
<p>Comparisons of typical studies highlight the clinical value of technological innovations: Khan et al.&#x2019;s (<xref ref-type="bibr" rid="ref20">20</xref>) Dual-3DM<sup>3</sup>AD model achieved an AUC of 0.945 through triplet preprocessing and 3D hybrid Transformer, validating the synergistic advantages of deep feature engineering and multi-task learning. Gao et al.&#x2019;s (<xref ref-type="bibr" rid="ref16">16</xref>) generative network maintained an AUC of 0.912 in scenarios with missing data, demonstrating the adaptability of cross-modal completion technology to real-world data. Odusami et al.&#x2019;s (<xref ref-type="bibr" rid="ref14">14</xref>) pixel-level ViT fusion reached an AUC of 0.897 in single-modality MRI analysis, proving Transformer&#x2019;s capability for high-resolution representation of imaging details. These results collectively indicate that innovations in model architecture (such as hybrid Transformer) and optimization of data fusion strategies are core pathways to improving diagnostic efficacy.</p>
<p>Although this study confirmed stable results (AUC fluctuation: 0.920&#x2013;0.928) and no significant publication bias (Egger&#x2019;s test, <italic>p</italic>&#x202F;=&#x202F;0.217) through sensitivity analysis, the following limitations should be noted: First, only seven of the included studies used multicenter data, and single-center bias may limit the model&#x2019;s performance in cross-cohort generalization (<xref ref-type="bibr" rid="ref4">4</xref>, <xref ref-type="bibr" rid="ref6">6</xref>). Second, the efficacy difference between hybrid Transformer and pure Transformer models did not reach statistical significance (<italic>p</italic>&#x202F;=&#x202F;0.068), indicating that the feature complementarity mechanism between traditional neural networks and Transformer requires further validation (<xref ref-type="bibr" rid="ref11">11</xref>). Additionally, insufficient model interpretability remains a major obstacle to clinical application, as the black-box nature of attention mechanisms struggles to meet the transparency requirements of diagnostic decision-making (<xref ref-type="bibr" rid="ref28">28</xref>).</p>
<p>Future research needs to focus on three major directions: First, promoting standardized integration of multicenter data and reducing the impact of data heterogeneity through technologies such as federated learning. Second, developing interpretability modules, such as introducing attention heatmaps to visualize brain region-pathology associations (<xref ref-type="bibr" rid="ref23">23</xref>). Third, optimizing lightweight model design by borrowing the attention bottleneck mechanism proposed by Kadri et al. (<xref ref-type="bibr" rid="ref19">19</xref>) to balance computational requirements and diagnostic accuracy. With the deep integration of Transformer technology with medical imaging and clinical data, such models are expected to become core tools for early and precise AD diagnosis, providing critical support for achieving the clinical goal of &#x201C;early detection and early intervention.&#x201D;</p>
</sec>
</sec>
<sec sec-type="conclusions" id="sec17">
<label>5</label>
<title>Conclusion</title>
<p>This study systematically evaluates the efficacy of Transformer-based multimodal fusion deep learning models in early AD diagnosis through meta-analysis. Results showed that these models achieved an overall AUC of 0.924 (95% CI: 0.912&#x2013;0.936), significantly superior to traditional methods, confirming the deep modeling capability of Transformer&#x2019;s self-attention mechanism for cross-modal data (e.g., MRI, PET, clinical indicators). Subgroup analyses reveal that trimodal and above fusion (AUC&#x202F;=&#x202F;0.935 vs. bimodal&#x202F;=&#x202F;0.908, <italic>p</italic>&#x202F;=&#x202F;0.012), intermediate fusion strategy (feature-level fusion, AUC&#x202F;=&#x202F;0.931), and multicenter data (AUC&#x202F;=&#x202F;0.930 vs. single-center&#x202F;=&#x202F;0.918, <italic>p</italic>&#x202F;=&#x202F;0.046) significantly improved diagnostic efficacy, indicating that the depth of multi-source data integration, fusion stage selection, and data heterogeneity management are key influencing factors. In typical studies, Khan et al.&#x2019;s (<xref ref-type="bibr" rid="ref20">20</xref>) 3D hybrid Transformer model achieved an AUC of 0.945 in AD vs. MCI discrimination, Gao et al.&#x2019;s (<xref ref-type="bibr" rid="ref16">16</xref>) generative network maintained an AUC of 0.912 with missing data, and Odusami et al.&#x2019;s (<xref ref-type="bibr" rid="ref14">14</xref>, <xref ref-type="bibr" rid="ref15">15</xref>) single-modality ViT fusion reached an AUC of 0.897, respectively validating the models&#x2019; advantages in feature engineering, robustness, and imaging detail representation. Although sensitivity analysis shows stable results (AUC fluctuation: 0.920&#x2013;0.928) and no significant publication bias (Egger&#x2019;s test, <italic>p</italic>&#x202F;=&#x202F;0.217), limitations such as a high proportion of single-center data and insufficient model interpretability were identified. 
Future research should focus on standardized multicenter data integration, development of interpretability modules (e.g., attention visualization), and lightweight design to promote clinical translation. In conclusion, Transformer-based multimodal fusion models provide highly effective tools for early AD diagnosis, with remarkable potential in dynamically modeling cross-modal associations. Technical innovations are urgently needed to address current bottlenecks and facilitate the clinical goal of &#x201C;early diagnosis and early treatment&#x201D; for AD.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec18">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="sec19">
<title>Author contributions</title>
<p>HG: Conceptualization, Data curation, Formal analysis, Writing &#x2013; original draft. ZY: Investigation, Methodology, Project administration, Writing &#x2013; original draft. GZ: Resources, Software, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. LL: Supervision, Validation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. XZ: Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec sec-type="funding-information" id="sec20">
<title>Funding</title>
<p>The author(s) declare that no financial support was received for the research and/or publication of this article.</p>
</sec>
<sec sec-type="COI-statement" id="sec21">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec22">
<title>Generative AI statement</title>
<p>The authors declare that no Gen AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec23">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec24">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/fneur.2025.1641548/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/fneur.2025.1641548/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Table_1.xlsx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><label>1.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>Q</given-names></name> <name><surname>Ma</surname> <given-names>Q</given-names></name> <name><surname>Da</surname> <given-names>L</given-names></name> <name><surname>Li</surname> <given-names>J</given-names></name> <name><surname>Wang</surname> <given-names>M</given-names></name> <name><surname>Xu</surname> <given-names>A</given-names></name> <etal/></person-group>. <article-title>A transformer-based unified multimodal framework for Alzheimer&#x2019;s disease assessment</article-title>. <source>Comput Biol Med</source>. (<year>2024</year>) <volume>180</volume>:<fpage>108979</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.108979</pub-id>, PMID: <pub-id pub-id-type="pmid">39098237</pub-id></citation></ref>
<ref id="ref2"><label>2.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Miao</surname> <given-names>S</given-names></name> <name><surname>Xu</surname> <given-names>Q</given-names></name> <name><surname>Li</surname> <given-names>W</given-names></name> <name><surname>Yang</surname> <given-names>C</given-names></name> <name><surname>Sheng</surname> <given-names>B</given-names></name> <name><surname>Liu</surname> <given-names>F</given-names></name> <etal/></person-group>. <article-title>MMTFN: multi-modal multi-scale transformer fusion network for Alzheimer&#x2019;s disease diagnosis</article-title>. <source>Int J Imaging Syst Technol</source>. (<year>2024</year>) <volume>34</volume>:<fpage>e22970</fpage>. doi: <pub-id pub-id-type="doi">10.1002/ima.22970</pub-id></citation></ref>
<ref id="ref3"><label>3.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>X</given-names></name> <name><surname>Li</surname> <given-names>W</given-names></name> <name><surname>Miao</surname> <given-names>S</given-names></name> <name><surname>Liu</surname> <given-names>F</given-names></name> <name><surname>Han</surname> <given-names>K</given-names></name> <name><surname>Bezabih</surname> <given-names>TT</given-names></name></person-group>. <article-title>HAMMF: hierarchical attention-based multi-task and multi-modal fusion model for computer-aided diagnosis of Alzheimer&#x2019;s disease</article-title>. <source>Comput Biol Med</source>. (<year>2024</year>) <volume>176</volume>:<fpage>108564</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.108564</pub-id>, PMID: <pub-id pub-id-type="pmid">38744010</pub-id></citation></ref>
<ref id="ref4"><label>4.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tang</surname> <given-names>C</given-names></name> <name><surname>Wei</surname> <given-names>M</given-names></name> <name><surname>Sun</surname> <given-names>J</given-names></name> <name><surname>Wang</surname> <given-names>S</given-names></name> <name><surname>Zhang</surname> <given-names>Y</given-names></name><collab id="coll1">Alzheimer&#x2019;s Disease Neuroimaging Initiative</collab></person-group>. <article-title>CsAGP: detecting Alzheimer&#x2019;s disease from multimodal images via dual-transformer with cross-attention and graph pooling</article-title>. <source>J King Saud Univ Comput Inf Sci</source>. (<year>2023</year>) <volume>35</volume>:<fpage>101618</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jksuci.2023.101618</pub-id>, PMID: <pub-id pub-id-type="pmid">38559705</pub-id></citation></ref>
<ref id="ref5"><label>5.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zheng</surname> <given-names>G</given-names></name> <name><surname>Zhang</surname> <given-names>Y</given-names></name> <name><surname>Zhao</surname> <given-names>Z</given-names></name> <name><surname>Wang</surname> <given-names>Y</given-names></name> <name><surname>Liu</surname> <given-names>X</given-names></name> <name><surname>Shang</surname> <given-names>Y</given-names></name> <etal/></person-group>. <article-title>A transformer-based multi-features fusion model for prediction of conversion in mild cognitive impairment</article-title>. <source>Methods</source>. (<year>2022</year>) <volume>204</volume>:<fpage>241</fpage>&#x2013;<lpage>8</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ymeth.2022.04.015</pub-id>, PMID: <pub-id pub-id-type="pmid">35487442</pub-id></citation></ref>
<ref id="ref6"><label>6.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>L</given-names></name> <name><surname>Liu</surname> <given-names>S</given-names></name> <name><surname>Zhang</surname> <given-names>L</given-names></name> <name><surname>To</surname> <given-names>XV</given-names></name> <name><surname>Nasrallah</surname> <given-names>F</given-names></name> <name><surname>Chandra</surname> <given-names>SS</given-names></name></person-group>. <article-title>Cascaded multi-modal mixing transformers for Alzheimer&#x2019;s disease classification with incomplete data</article-title>. <source>Neuroimage</source>. (<year>2023</year>) <volume>277</volume>:<fpage>120267</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neuroimage.2023.120267</pub-id>, PMID: <pub-id pub-id-type="pmid">37422279</pub-id></citation></ref>
<ref id="ref7"><label>7.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ying</surname> <given-names>Y</given-names></name> <name><surname>Yang</surname> <given-names>T</given-names></name> <name><surname>Zhou</surname> <given-names>H</given-names></name></person-group>. <article-title>Multimodal fusion for Alzheimer&#x2019;s disease recognition</article-title>. <source>Appl Intell</source>. (<year>2023</year>) <volume>53</volume>:<fpage>16029</fpage>&#x2013;<lpage>40</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10489-022-04255-z</pub-id></citation></ref>
<ref id="ref8"><label>8.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zuo</surname> <given-names>Q</given-names></name> <name><surname>Shen</surname> <given-names>Y</given-names></name> <name><surname>Zhong</surname> <given-names>N</given-names></name> <name><surname>Chen</surname> <given-names>CLP</given-names></name> <name><surname>Lei</surname> <given-names>B</given-names></name> <name><surname>Wang</surname> <given-names>S</given-names></name></person-group>. <article-title>Alzheimer&#x2019;s disease prediction via brain structural-functional deep fusing network</article-title>. <source>IEEE Trans Neural Syst Rehabil Eng</source>. (<year>2023</year>) <volume>31</volume>:<fpage>4601</fpage>&#x2013;<lpage>12</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TNSRE.2023.3333952</pub-id>, PMID: <pub-id pub-id-type="pmid">37971911</pub-id></citation></ref>
<ref id="ref9"><label>9.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>R</given-names></name> <name><surname>Jo</surname> <given-names>W</given-names></name> <name><surname>Zhao</surname> <given-names>D</given-names></name> <name><surname>Wang</surname> <given-names>W</given-names></name> <name><surname>Gupte</surname> <given-names>A</given-names></name> <name><surname>Yang</surname> <given-names>B</given-names></name> <etal/></person-group>. <article-title>Husformer: a multimodal transformer for multimodal human state recognition</article-title>. <source>IEEE Trans Cogn Dev Syst</source>. (<year>2024</year>) <volume>16</volume>:<fpage>1374</fpage>&#x2013;<lpage>90</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TCDS.2024.3357618</pub-id></citation></ref>
<ref id="ref10"><label>10.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nagrani</surname> <given-names>A</given-names></name> <name><surname>Yang</surname> <given-names>S</given-names></name> <name><surname>Arnab</surname> <given-names>A</given-names></name> <name><surname>Jansen</surname> <given-names>A</given-names></name> <name><surname>Schmid</surname> <given-names>C</given-names></name> <name><surname>Sun</surname> <given-names>C</given-names></name></person-group>. <article-title>Attention bottlenecks for multimodal fusion</article-title>. <source>Adv Neural Inf Process Syst</source>. (<year>2021</year>) <volume>34</volume>:<fpage>14200</fpage>&#x2013;<lpage>13</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2107.00135</pub-id></citation></ref>
<ref id="ref11"><label>11.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>J</given-names></name> <name><surname>Wang</surname> <given-names>Y</given-names></name> <name><surname>Zeb</surname> <given-names>A</given-names></name> <name><surname>Suzauddola</surname> <given-names>MD</given-names></name> <name><surname>Wen</surname> <given-names>Y</given-names></name></person-group>. <article-title>Multimodal mixing convolutional neural network and transformer for Alzheimer&#x2019;s disease recognition</article-title>. <source>Expert Syst Appl</source>. (<year>2025</year>) <volume>259</volume>:<fpage>125321</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.eswa.2024.125321</pub-id></citation></ref>
<ref id="ref12"><label>12.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sait</surname> <given-names>ARW</given-names></name> <name><surname>Nagaraj</surname> <given-names>R</given-names></name></person-group>. <article-title>A feature-fusion technique-based Alzheimer&#x2019;s disease classification using magnetic resonance imaging</article-title>. <source>Diagnostics</source>. (<year>2024</year>) <volume>14</volume>:<fpage>2363</fpage>. doi: <pub-id pub-id-type="doi">10.3390/diagnostics14212363</pub-id>, PMID: <pub-id pub-id-type="pmid">39518331</pub-id></citation></ref>
<ref id="ref13"><label>13.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tang</surname> <given-names>Y</given-names></name> <name><surname>Xiong</surname> <given-names>X</given-names></name> <name><surname>Tong</surname> <given-names>G</given-names></name> <name><surname>Yang</surname> <given-names>Y</given-names></name> <name><surname>Zhang</surname> <given-names>H</given-names></name></person-group>. <article-title>Multimodal diagnosis model of Alzheimer&#x2019;s disease based on improved Transformer</article-title>. <source>Biomed Eng Online</source>. (<year>2024</year>) <volume>23</volume>:<fpage>8</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12938-024-01204-4</pub-id>, PMID: <pub-id pub-id-type="pmid">38243275</pub-id></citation></ref>
<ref id="ref14"><label>14.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Odusami</surname> <given-names>M</given-names></name> <name><surname>Maskeli&#x016B;nas</surname> <given-names>R</given-names></name> <name><surname>Dama&#x0161;evi&#x010D;ius</surname> <given-names>R</given-names></name></person-group>. <article-title>Pixel-level fusion approach with vision transformer for early detection of Alzheimer&#x2019;s disease</article-title>. <source>Electronics</source>. (<year>2023</year>) <volume>12</volume>:<fpage>1218</fpage>. doi: <pub-id pub-id-type="doi">10.3390/electronics12051218</pub-id></citation></ref>
<ref id="ref15"><label>15.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Odusami</surname> <given-names>M</given-names></name> <name><surname>Maskeli&#x016B;nas</surname> <given-names>R</given-names></name> <name><surname>Dama&#x0161;evi&#x010D;ius</surname> <given-names>R</given-names></name></person-group>. <article-title>Optimized convolutional fusion for multimodal neuroimaging in Alzheimer&#x2019;s disease diagnosis: enhancing data integration and feature extraction</article-title>. <source>J Pers Med</source>. (<year>2023</year>) <volume>13</volume>:<fpage>1496</fpage>. doi: <pub-id pub-id-type="doi">10.3390/jpm13101496</pub-id>, PMID: <pub-id pub-id-type="pmid">37888107</pub-id></citation></ref>
<ref id="ref16"><label>16.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gao</surname> <given-names>X</given-names></name> <name><surname>Shi</surname> <given-names>F</given-names></name> <name><surname>Shen</surname> <given-names>D</given-names></name> <name><surname>Liu</surname> <given-names>M</given-names></name></person-group>. <article-title>Multimodal transformer network for incomplete image generation and diagnosis of Alzheimer&#x2019;s disease</article-title>. <source>Comput Med Imaging Graph</source>. (<year>2023</year>) <volume>110</volume>:<fpage>102303</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compmedimag.2023.102303</pub-id>, PMID: <pub-id pub-id-type="pmid">37832503</pub-id></citation></ref>
<ref id="ref17"><label>17.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>Y</given-names></name> <name><surname>Wang</surname> <given-names>H</given-names></name> <name><surname>Zhang</surname> <given-names>D</given-names></name> <name><surname>Zhang</surname> <given-names>L</given-names></name> <name><surname>Tao</surname> <given-names>L</given-names></name></person-group>. <article-title>Multi-feature fusion learning for Alzheimer&#x2019;s disease prediction using EEG signals in resting state</article-title>. <source>Front Neurosci</source>. (<year>2023</year>) <volume>17</volume>:<fpage>1272834</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fnins.2023.1272834</pub-id>, PMID: <pub-id pub-id-type="pmid">37822349</pub-id></citation></ref>
<ref id="ref18"><label>18.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Roy</surname> <given-names>SK</given-names></name> <name><surname>Deria</surname> <given-names>A</given-names></name> <name><surname>Hong</surname> <given-names>D</given-names></name> <name><surname>Rasti</surname> <given-names>B</given-names></name> <name><surname>Plaza</surname> <given-names>A</given-names></name> <name><surname>Chanussot</surname> <given-names>J</given-names></name></person-group>. <article-title>Multimodal fusion transformer for remote sensing image classification</article-title>. <source>IEEE Trans Geosci Remote Sens</source>. (<year>2023</year>) <volume>61</volume>:<fpage>1</fpage>&#x2013;<lpage>20</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TGRS.2023.3286826</pub-id></citation></ref>
<ref id="ref19"><label>19.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kadri</surname> <given-names>R</given-names></name> <name><surname>Bouaziz</surname> <given-names>B</given-names></name> <name><surname>Tmar</surname> <given-names>M</given-names></name> <name><surname>Gargouri</surname> <given-names>F</given-names></name></person-group>. <article-title>Efficient multimodel method based on transformers and CoAtNet for Alzheimer&#x2019;s diagnosis</article-title>. <source>Digit Signal Process</source>. (<year>2023</year>) <volume>143</volume>:<fpage>104229</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.dsp.2023.104229</pub-id></citation></ref>
<ref id="ref20"><label>20.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Khan</surname> <given-names>AA</given-names></name> <name><surname>Mahendran</surname> <given-names>RK</given-names></name> <name><surname>Perumal</surname> <given-names>K</given-names></name> <name><surname>Faheem</surname> <given-names>M</given-names></name></person-group>. <article-title>Dual-3DM<sup>3</sup>AD: mixed transformer based semantic segmentation and triplet pre-processing for early multi-class Alzheimer&#x2019;s diagnosis</article-title>. <source>IEEE Trans Neural Syst Rehabil Eng</source>. (<year>2024</year>) <volume>32</volume>:<fpage>696</fpage>&#x2013;<lpage>707</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TNSRE.2024.3357723</pub-id>, PMID: <pub-id pub-id-type="pmid">38261494</pub-id></citation></ref>
<ref id="ref21"><label>21.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ilias</surname> <given-names>L</given-names></name> <name><surname>Askounis</surname> <given-names>D</given-names></name></person-group>. <article-title>Multimodal deep learning models for detecting dementia from speech and transcripts</article-title>. <source>Front Aging Neurosci</source>. (<year>2022</year>) <volume>14</volume>:<fpage>830943</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fnagi.2022.830943</pub-id>, PMID: <pub-id pub-id-type="pmid">35370608</pub-id></citation></ref>
<ref id="ref22"><label>22.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>P</given-names></name> <name><surname>Zhu</surname> <given-names>X</given-names></name> <name><surname>Clifton</surname> <given-names>DA</given-names></name></person-group>. <article-title>Multimodal learning with transformers: a survey</article-title>. <source>IEEE Trans Pattern Anal Mach Intell</source>. (<year>2023</year>) <volume>45</volume>:<fpage>12113</fpage>&#x2013;<lpage>32</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2023.3275156</pub-id>, PMID: <pub-id pub-id-type="pmid">37167049</pub-id></citation></ref>
<ref id="ref23"><label>23.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Leng</surname> <given-names>Y</given-names></name> <name><surname>Cui</surname> <given-names>W</given-names></name> <name><surname>Peng</surname> <given-names>Y</given-names></name> <name><surname>Yan</surname> <given-names>C</given-names></name> <name><surname>Cao</surname> <given-names>Y</given-names></name> <name><surname>Yan</surname> <given-names>Z</given-names></name> <etal/></person-group>. <article-title>Multimodal cross enhanced fusion network for diagnosis of Alzheimer&#x2019;s disease and subjective memory complaints</article-title>. <source>Comput Biol Med</source>. (<year>2023</year>) <volume>157</volume>:<fpage>106788</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.106788</pub-id></citation></ref>
<ref id="ref24"><label>24.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dai</surname> <given-names>Y</given-names></name> <name><surname>Zou</surname> <given-names>B</given-names></name> <name><surname>Zhu</surname> <given-names>C</given-names></name> <name><surname>Li</surname> <given-names>Y</given-names></name> <name><surname>Chen</surname> <given-names>Z</given-names></name> <name><surname>Ji</surname> <given-names>Z</given-names></name> <etal/></person-group>. <article-title>DE-JANet: a unified network based on dual encoder and joint attention for Alzheimer&#x2019;s disease classification using multi-modal data</article-title>. <source>Comput Biol Med</source>. (<year>2023</year>) <volume>165</volume>:<fpage>107396</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.107396</pub-id>, PMID: <pub-id pub-id-type="pmid">37703717</pub-id></citation></ref>
<ref id="ref25"><label>25.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>J</given-names></name> <name><surname>He</surname> <given-names>X</given-names></name> <name><surname>Liu</surname> <given-names>Y</given-names></name> <name><surname>Cai</surname> <given-names>Q</given-names></name> <name><surname>Chen</surname> <given-names>H</given-names></name> <name><surname>Qing</surname> <given-names>L</given-names></name></person-group>. <article-title>Multi-modal cross-attention network for Alzheimer&#x2019;s disease diagnosis with multi-modality data</article-title>. <source>Comput Biol Med</source>. (<year>2023</year>) <volume>162</volume>:<fpage>107050</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.107050</pub-id>, PMID: <pub-id pub-id-type="pmid">37269680</pub-id></citation></ref>
<ref id="ref26"><label>26.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shin</surname> <given-names>H</given-names></name> <name><surname>Jeon</surname> <given-names>S</given-names></name> <name><surname>Seol</surname> <given-names>Y</given-names></name> <name><surname>Kim</surname> <given-names>S</given-names></name> <name><surname>Kang</surname> <given-names>D</given-names></name></person-group>. <article-title>Vision transformer approach for classification of Alzheimer&#x2019;s disease using <sup>18</sup>F-Florbetaben brain images</article-title>. <source>Appl Sci</source>. (<year>2023</year>) <volume>13</volume>:<fpage>3453</fpage>. doi: <pub-id pub-id-type="doi">10.3390/app13063453</pub-id></citation></ref>
<ref id="ref27"><label>27.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lu</surname> <given-names>Y</given-names></name> <name><surname>Lu</surname> <given-names>X</given-names></name> <name><surname>Zheng</surname> <given-names>L</given-names></name> <name><surname>Sun</surname> <given-names>M</given-names></name> <name><surname>Chen</surname> <given-names>S</given-names></name> <name><surname>Chen</surname> <given-names>B</given-names></name> <etal/></person-group>. <article-title>Application of multimodal transformer model in intelligent agricultural disease detection and question-answering systems</article-title>. <source>Plants</source>. (<year>2024</year>) <volume>13</volume>:<fpage>972</fpage>. doi: <pub-id pub-id-type="doi">10.3390/plants13070972</pub-id>, PMID: <pub-id pub-id-type="pmid">38611501</pub-id></citation></ref>
<ref id="ref28"><label>28.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cong</surname> <given-names>S</given-names></name> <name><surname>Wang</surname> <given-names>H</given-names></name> <name><surname>Zhou</surname> <given-names>Y</given-names></name> <name><surname>Wang</surname> <given-names>Z</given-names></name> <name><surname>Yao</surname> <given-names>X</given-names></name> <name><surname>Yang</surname> <given-names>C</given-names></name></person-group>. <article-title>Comprehensive review of transformer-based models in neuroscience, neurology, and psychiatry</article-title>. <source>Brain X</source>. (<year>2024</year>) <volume>2</volume>:<fpage>e57</fpage>. doi: <pub-id pub-id-type="doi">10.1002/brx2.57</pub-id></citation></ref>
<ref id="ref29"><label>29.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Antil</surname> <given-names>A</given-names></name> <name><surname>Dhiman</surname> <given-names>C</given-names></name></person-group>. <article-title>MF2ShrT: multimodal feature fusion using shared layered transformer for face anti-spoofing</article-title>. <source>ACM Trans Multimed Comput Commun Appl</source>. (<year>2024</year>) <volume>20</volume>:<fpage>1</fpage>&#x2013;<lpage>21</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3640817</pub-id></citation></ref>
<ref id="ref30"><label>30.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hampiholi</surname> <given-names>B</given-names></name> <name><surname>Jarvers</surname> <given-names>C</given-names></name> <name><surname>Mader</surname> <given-names>W</given-names></name> <name><surname>Neumann</surname> <given-names>H</given-names></name></person-group>. <article-title>Convolutional transformer fusion blocks for multi-modal gesture recognition</article-title>. <source>IEEE Access</source>. (<year>2023</year>) <volume>11</volume>:<fpage>34094</fpage>&#x2013;<lpage>103</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2023.3263812</pub-id></citation></ref>
<ref id="ref31"><label>31.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>Z</given-names></name> <name><surname>Wang</surname> <given-names>Y</given-names></name> <name><surname>Shen</surname> <given-names>G</given-names></name> <name><surname>Xu</surname> <given-names>Y</given-names></name> <name><surname>Zhang</surname> <given-names>J</given-names></name></person-group>. <article-title>TDFNet: transformer-based deep-scale fusion network for multimodal emotion recognition</article-title>. <source>IEEE ACM Trans Audio Speech Lang Process</source>. (<year>2023</year>) <volume>31</volume>:<fpage>3771</fpage>&#x2013;<lpage>82</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TASLP.2023.3316458</pub-id></citation></ref>
</ref-list>
</back>
</article>