<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2026.1767612</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Explainable multi-modal deep learning for transparent cancer diagnosis: integrating radiology, clinical features, and decision visualization</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Dash</surname>
<given-names>Sital</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3177394"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Bewoor</surname>
<given-names>Laxmi</given-names>
</name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3372770"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Dongre</surname>
<given-names>Yashwant</given-names>
</name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3375606"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Bhosle</surname>
<given-names>Amol</given-names>
</name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3372669"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Patil</surname>
<given-names>Kailas</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3279418"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Jadhav</surname>
<given-names>Shrikant</given-names>
</name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3184647"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Mohapatra</surname>
<given-names>Banani</given-names>
</name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Walia</surname>
<given-names>Bhavnish</given-names>
</name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3186040"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Computer Engineering, Vishwakarma University</institution>, <city>Pune</city>, <state>Maharashtra</state>, <country country="in">India</country></aff>
<aff id="aff2"><label>2</label><institution>Department of Computer Engineering, Vishwakarma Institute of Technology</institution>, <city>Pune</city>, <state>Maharashtra</state>, <country country="in">India</country></aff>
<aff id="aff3"><label>3</label><institution>Department of Computer Science and Engineering, School of Computing, MIT Art, Design and Technology University</institution>, <city>Pune</city>, <state>Maharashtra</state>, <country country="in">India</country></aff>
<aff id="aff4"><label>4</label><institution>San Jose State University</institution>, <city>San Jose</city>, <state>CA</state>, <country country="us">United States</country></aff>
<aff id="aff5"><label>5</label><institution>Walmart</institution>, <city>Sunnyvale</city>, <state>CA</state>, <country country="us">United States</country></aff>
<aff id="aff6"><label>6</label><institution>Amazon</institution>, <city>New York</city>, <state>NY</state>, <country country="us">United States</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Sital Dash, <email xlink:href="mailto:sital.dash@vupune.ac.in">sital.dash@vupune.ac.in</email>; Kailas Patil, <email xlink:href="mailto:kailas.patil@vupune.ac.in">kailas.patil@vupune.ac.in</email>; Shrikant Jadhav, <email xlink:href="mailto:shrikant.jadhav@sjsu.edu">shrikant.jadhav@sjsu.edu</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-23">
<day>23</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>9</volume>
<elocation-id>1767612</elocation-id>
<history>
<date date-type="received">
<day>14</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>21</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>27</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Dash, Bewoor, Dongre, Bhosle, Patil, Jadhav, Mohapatra and Walia.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Dash, Bewoor, Dongre, Bhosle, Patil, Jadhav, Mohapatra and Walia</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-23">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Although artificial intelligence&#x2013;based cancer diagnostic models have demonstrated strong predictive performance, their lack of transparency and reliance on single-modality data continue to limit clinical trust and adoption. Effectively integrating multi-modal data with interpretable decision-making remains a key challenge.</p>
</sec>
<sec>
<title>Methods</title>
<p>We propose an explainable multi-modal deep learning framework that integrates radiological imaging and structured clinical features using attention-based fusion. Image-level explanations are generated using Grad-CAM++, while SHAP is employed to quantify clinical feature contributions, enabling unified and cross-modal aligned interpretation rather than independent uni-modal explanations. The framework was evaluated on publicly available datasets, including CBIS-DDSM mammography, Duke Breast Cancer MRI, and TCGA cohorts (BRCA, LUAD, and GBM), comprising a total of 3,842 images from 2,917 patients.</p>
</sec>
<sec>
<title>Results</title>
<p>The proposed model consistently outperformed uni-modal approaches and simple fusion baselines, achieving an improved balance between sensitivity and specificity. Attention-based fusion demonstrated superior performance compared with feature concatenation, and the integration of explainability did not compromise predictive accuracy. Visual and clinical explanations highlighted diagnostically relevant tumor regions and established oncological risk factors. Stable performance across datasets indicates strong generalization capability.</p>
</sec>
<sec>
<title>Discussion</title>
<p>These results demonstrate that explainable multi-modal learning can effectively combine accuracy, interpretability, and robustness, supporting the development of reliable AI-based decision-support systems for cancer diagnosis.</p>
</sec>
</abstract>
<kwd-group>
<kwd>attention-based fusion</kwd>
<kwd>cancer diagnosis</kwd>
<kwd>clinical data integration</kwd>
<kwd>explainable artificial intelligence</kwd>
<kwd>medical imaging</kwd>
<kwd>model interpretability</kwd>
<kwd>multi-modal deep learning</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="7"/>
<table-count count="6"/>
<equation-count count="6"/>
<ref-count count="60"/>
<page-count count="14"/>
<word-count count="9520"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Medicine and Public Health</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<sec id="sec2">
<label>1.1</label>
<title>Background</title>
<p>Early and precise detection of cancer is fundamental to improving patient outcomes, as timely diagnosis directly influences therapeutic decision-making and long-term survival (<xref ref-type="bibr" rid="ref23">Litjens et al., 2017</xref>). Diagnostic procedures rely heavily on radiological imaging&#x2014;such as computed tomography (CT), magnetic resonance imaging (MRI), and mammography&#x2014;complemented by structured clinical information, laboratory markers, and patient history (<xref ref-type="bibr" rid="ref10">Esteva et al., 2017</xref>). Deep learning (DL) has been able to a large extent to replicate the tumor localization, segmentation, and classification processes which are the usual tasks of human experts, and in fact, DL has been recognized to perform at par with expert clinicians in most cases (<xref ref-type="bibr" rid="ref16">He et al., 2016</xref>; <xref ref-type="bibr" rid="ref7">Dosovitskiy et al., 2021</xref>). Nevertheless, human clinical decision-making involves different modes and, therefore, the integration of varied sources of information is a must. Consequently, multi-modal deep learning, which utilizes the complementary insights of different data types to identify disease features that single-modality models may not be able to, has been widely adopted (<xref ref-type="bibr" rid="ref32">Nakach et al., 2024a</xref>).</p>
<p>Recent technological advancements in medical imaging, electronic health records (EHRs), and high-resolution data acquisition have exponentially increased the availability of different types of patient data, from radiological images to genomics markers and pathology slides (<xref ref-type="bibr" rid="ref20">Li et al., 2024</xref>). Such a massive release of multi-modal data opens up the possibility to construct much stronger and more inclusive diagnostic models that are capable of detecting disease signatures even when they are very faint and could be overlooked if each modality is analyzed independently (<xref ref-type="bibr" rid="ref19">Lai et al., 2024</xref>). Moreover, the application of clinical features&#x2014;like age, biomarkers, comorbidity, and treatment history&#x2014;in conjunction with imaging data has been proven to result in a substantial improvement in diagnostic accuracy as well as in the ability of cancer risk stratification in various cancer types (<xref ref-type="bibr" rid="ref57">Yang et al., 2025</xref>; <xref ref-type="bibr" rid="ref6">Chen et al., 2024</xref>).</p>
<p>Nevertheless, the medical AI community is increasingly realizing that accuracy is not enough. To be clinically viable, AI-powered diagnostic systems must also be transparent, interpretable, and consistent with human experts&#x2019; reasoning patterns (<xref ref-type="bibr" rid="ref55">Xie et al., 2025</xref>). Besides reliability, clinicians also require explanations of the predictions, e.g., which radiological regions contributed to the output, how clinical factors influenced the decision, and how the different modalities interact to form the final assessment (<xref ref-type="bibr" rid="ref36">Oviedo et al., 2025</xref>). As a result, explainable artificial intelligence (XAI) has become a critical component in advancing trustworthy AI solutions for cancer diagnosis.</p>
</sec>
<sec id="sec3">
<label>1.2</label>
<title>Limitations in existing approaches</title>
<p>Although deep learning has made considerable progress in medical diagnosis, several challenges remain unresolved:</p>
<list list-type="order">
<list-item>
<p>Lack of interpretability: Most DL models operate as black boxes and provide limited transparency into their decision-making processes, reducing their clinical acceptability (<xref ref-type="bibr" rid="ref20">Li et al., 2024</xref>).</p>
</list-item>
<list-item>
<p>Single-modality explanations: Common explainable artificial intelligence (XAI) techniques primarily target imaging data and do not generalize effectively to multi-modal systems (<xref ref-type="bibr" rid="ref19">Lai et al., 2024</xref>).</p>
</list-item>
<list-item>
<p>Unlike existing multi-modal explainable AI approaches that typically apply independent post-hoc explanations to each modality, our framework explicitly aligns image-level and clinical-level explanations through the attention-based fusion process. Rather than treating Grad-CAM++ and SHAP as separate interpretability tools, the proposed model enforces explanation coherence across modalities, ensuring that radiological regions and clinical risk factors jointly support the same diagnostic reasoning. This cross-modal explanation alignment moves beyond simple integration of established techniques and enables unified, clinically meaningful interpretation of multi-modal decisions.</p>
</list-item>
<list-item>
<p>Fragmented interpretability: Existing XAI methods often explain each modality independently, failing to reveal how radiological and clinical features jointly contribute to predictions.</p>
</list-item>
<list-item>
<p>Limited clinical alignment: The explanations produced by many XAI approaches do not match the diagnostic reasoning used by clinicians, reducing trust and usability (<xref ref-type="bibr" rid="ref57">Yang et al., 2025</xref>).</p>
</list-item>
<list-item>
<p>Predominance of <italic>post-hoc</italic> methods: Most interpretability tools are applied after model training, which may not accurately represent the model&#x2019;s true internal reasoning (<xref ref-type="bibr" rid="ref6">Chen et al., 2024</xref>).</p>
</list-item>
</list>
</sec>
<sec id="sec4">
<label>1.3</label>
<title>Motivation</title>
<p>Advanced AI (Artificial Intelligence) systems need to be accurate, as well as transparent, interpretable, and clinically relevant in order to be integrated into real clinical workflows. Medical professionals need understandable clarifications pointing out the radiological areas that affect the model predictions, the clinical variables that influence the diagnostic decisions, and how multi-modal evidence is combined. A unified, explainable multi-modal framework can therefore:</p>
<list list-type="bullet">
<list-item>
<p>enhance clinician trust in AI-assisted diagnosis,</p>
</list-item>
<list-item>
<p>support second-opinion and quality-assurance processes,</p>
</list-item>
<list-item>
<p>improve training and interpretive consistency, and</p>
</list-item>
<list-item>
<p>enable safer deployment of AI tools in oncology.</p>
</list-item>
</list>
</sec>
<sec id="sec5">
<label>1.4</label>
<title>Research gap</title>
<p>Although multi-modal DL and XAI have each advanced considerably, there is still no unified framework that:</p>
<list list-type="bullet">
<list-item>
<p>combines radiological and structured clinical data in an integrated multi-modal architecture,</p>
</list-item>
<list-item>
<p>provides consistent, clinically aligned, and interpretable explanations across modalities, and</p>
</list-item>
<list-item>
<p>visualizes the fused decision-making process in a manner that reflects real diagnostic reasoning.</p>
</list-item>
</list>
<p>Current studies typically emphasize diagnostic accuracy or explainability alone, but rarely address both holistically in a way that supports clinical use. This gap limits the practical adoption of multi-modal AI systems in oncology.</p>
</sec>
<sec id="sec6">
<label>1.5</label>
<title>Research questions</title>
<p>In response to the identified limitations, this study addresses the following research questions:</p>
<disp-quote>
<p><italic>RQ1</italic>: How can radiological and clinical features be effectively integrated into a unified multi-modal deep learning framework for cancer diagnosis?</p>
</disp-quote>
<disp-quote>
<p><italic>RQ2</italic>: Which explainable artificial intelligence techniques can provide transparent, robust, and clinically coherent insights into multi-modal diagnostic decisions?</p>
</disp-quote>
<disp-quote>
<p><italic>RQ3</italic>: Does the proposed explainable multi-modal framework enhance both diagnostic accuracy and interpretability when compared with uni-modal and non-explainable models?</p>
</disp-quote>
</sec>
<sec id="sec7">
<label>1.6</label>
<title>Contributions of this work</title>
<p>This study offers the following key contributions:</p>
<list list-type="order">
<list-item>
<p>A novel multi-modal deep learning architecture that fuses radiology images with structured clinical data for comprehensive cancer diagnosis.</p>
</list-item>
<list-item>
<p>An integrated explainability module combining attention-based visualization, feature-attribution analysis, and cross-modal explanation consistency.</p>
</list-item>
<list-item>
<p>A unified interpretability pipeline that clarifies how image and clinical features jointly influence diagnostic outcomes.</p>
</list-item>
<list-item>
<p>Extensive experimental evaluation demonstrating improved diagnostic performance, transparency, and alignment with clinician reasoning.</p>
</list-item>
<list-item>
<p>A reproducible and deployable workflow designed to support trustworthy AI adoption in real clinical environments.</p>
</list-item>
</list>
</sec>
<sec id="sec8">
<label>1.7</label>
<title>Organization of the paper</title>
<p>The research paper initially reviews the literature related to multi-modal deep learning, explainable artificial intelligence (XAI) techniques, and deep learning for cancer diagnosis. Afterward, it elaborates the proposed multi-modal architecture, feature-fusion method, and integrated interpretability framework in detail. The next sections include descriptions of the datasets, pre-processing methods, experimental setup, and evaluation metrics. The results section focuses both on the diagnostic performance and interpretability findings. The discussion section summarizes main insights, reviews clinical implications, limitations, and future research directions. The paper ends with a summary of the contributions and the importance of the proposed framework for trustworthy artificial intelligence in oncology.</p>
</sec>
</sec>
<sec id="sec9">
<label>2</label>
<title>Related works</title>
<p>This section reviews the related literature in a structure aligned with the study&#x2019;s research questions. It first focuses on deep learning for cancer imaging (RQ1), then on multi-modal fusion frameworks (RQ1), followed by explainability methods (RQ2), multi-modal explainability (RQ2), and finally, evaluation strategies for trustworthy clinical deployment (RQ3).</p>
<sec id="sec10">
<label>2.1</label>
<title>Deep learning for cancer imaging</title>
<p>Deep learning has largely revolutionized cancer imaging with the help of convolutional neural networks (CNNs), transformers, and hybrid architectures resulting in significantly improved lesion detection, segmentation, and malignancy classification in CT, MRI, PET, and histopathology imaging modalities (<xref ref-type="bibr" rid="ref23">Litjens et al., 2017</xref>; <xref ref-type="bibr" rid="ref10">Esteva et al., 2017</xref>; <xref ref-type="bibr" rid="ref16">He et al., 2016</xref>; <xref ref-type="bibr" rid="ref7">Dosovitskiy et al., 2021</xref>; <xref ref-type="bibr" rid="ref33">Nakach et al., 2024b</xref>; <xref ref-type="bibr" rid="ref20">Li et al., 2024</xref>; <xref ref-type="bibr" rid="ref19">Lai et al., 2024</xref>; <xref ref-type="bibr" rid="ref57">Yang et al., 2025</xref>). Very recent studies demonstrate the benefits of large-scale self-supervised pretraining, federated models, and foundation architectures which allow generalization to different institutions and imaging devices (<xref ref-type="bibr" rid="ref6">Chen et al., 2024</xref>; <xref ref-type="bibr" rid="ref55">Xie et al., 2025</xref>; <xref ref-type="bibr" rid="ref36">Oviedo et al., 2025</xref>; <xref ref-type="bibr" rid="ref27">Ma et al., 2024</xref>). Multi-center testing of DL models also confirms their stability and reliability in real clinical scenarios, notably in breast, lung, and brain cancer diagnosis (<xref ref-type="bibr" rid="ref28">Maigari et al., 2025a</xref>; <xref ref-type="bibr" rid="ref26">Liu et al., 2025</xref>; <xref ref-type="bibr" rid="ref25">Liu et al., 2025</xref>). These breakthroughs constitute a strong argument for the use of image-based neural representations as the core of integrated diagnostic frameworks.</p>
</sec>
<sec id="sec11">
<label>2.2</label>
<title>Multi-modal learning in oncology</title>
<p>Multi-modal learning combines imaging with structured clinical variables, genomic profiles, pathology images, and patient demographics to improve the accuracy of staging, prognosis, and subtype prediction (<xref ref-type="bibr" rid="ref4">Buzdugan et al., 2025a</xref>; <xref ref-type="bibr" rid="ref18">Kumar et al., 2025</xref>; <xref ref-type="bibr" rid="ref21">Liang et al., 2025a</xref>; <xref ref-type="bibr" rid="ref52">Turki et al., 2025</xref>; <xref ref-type="bibr" rid="ref3">Bhosekar et al., 2025</xref>; <xref ref-type="bibr" rid="ref41">Ramkumar et al., 2023</xref>; <xref ref-type="bibr" rid="ref15">He et al., 2024</xref>; <xref ref-type="bibr" rid="ref24">Liu et al., 2025</xref>). Fusion strategies, including early, late, and hybrid fusion, consistently show benefits over uni-modal systems, with hybrid attention-based mechanisms being most effective in capturing cross-modality interactions (<xref ref-type="bibr" rid="ref8">Ennab et al., 2025a</xref>; <xref ref-type="bibr" rid="ref39">Peng et al., 2025</xref>; <xref ref-type="bibr" rid="ref32">Nakach et al., 2024a</xref>, <xref ref-type="bibr" rid="ref33">2024b</xref>; <xref ref-type="bibr" rid="ref45">Shah et al., 2024</xref>). Research in breast, lung, and glioma oncology shows that multi-modal fusion results in better risk stratification, recurrence prediction, and treatment response modelling (<xref ref-type="bibr" rid="ref11">Fayyaz et al., 2025</xref>; <xref ref-type="bibr" rid="ref46">Singh et al., 2025</xref>; <xref ref-type="bibr" rid="ref14">Ghasemi et al., 2024</xref>; <xref ref-type="bibr" rid="ref54">Wei et al., 2025</xref>). 
Current literature also deals with real-world problems&#x2014;such as incomplete modalities, data heterogeneity, and alignment issues&#x2014;and suggests different designs to accommodate missing or noisy modalities (<xref ref-type="bibr" rid="ref35">Oviedo et al., 2025</xref>; <xref ref-type="bibr" rid="ref13">Gharaibeh et al., 2025</xref>; <xref ref-type="bibr" rid="ref17">Kumar et al., 2023</xref>).</p>
</sec>
<sec id="sec12">
<label>2.3</label>
<title>Explainable artificial intelligence for medical imaging</title>
<p>Explainable artificial intelligence (XAI) is increasingly regarded as an indispensable element for medical AI to be trusted. Core XAI methods involve gradient-based visualization (Grad-CAM and its variants), perturbation and occlusion analyses, game-theoretic feature attribution (SHAP), and local surrogate modelling (LIME) (<xref ref-type="bibr" rid="ref1">Aftab et al., 2025</xref>; <xref ref-type="bibr" rid="ref47">Singh et al., 2025</xref>; <xref ref-type="bibr" rid="ref50">Tempel et al., 2025</xref>; <xref ref-type="bibr" rid="ref40">Rabah et al., 2025</xref>; <xref ref-type="bibr" rid="ref29">Maigari et al., 2025b</xref>; <xref ref-type="bibr" rid="ref5">Buzdugan et al., 2025b</xref>). Comparative studies weigh explanation faithfulness, resistance to noise, and clinical interpretability, allowing different tasks and architectures to take advantage of customized explanation strategies (<xref ref-type="bibr" rid="ref49">Song et al., 2025</xref>; <xref ref-type="bibr" rid="ref22">Liang et al., 2025b</xref>; <xref ref-type="bibr" rid="ref9">Ennab et al., 2025b</xref>). Considerable progress has been made in stabilizing heatmaps, reducing artifact activation, and merging spatial and feature-level explanations to provide more reliable interpretability (<xref ref-type="bibr" rid="ref59">Zhao et al., 2024</xref>; <xref ref-type="bibr" rid="ref38">Patel et al., 2025</xref>; <xref ref-type="bibr" rid="ref60">Zhou et al., 2024</xref>).</p>
</sec>
<sec id="sec13">
<label>2.4</label>
<title>Explainability for multimodal models</title>
<p>A growing number of studies have been carried out at the intersection of multi-modal learning and explainability. The techniques comprise unified attribution scoring across modalities, attention-based explanation layers embedded into fusion architectures, and cross-modal visualization techniques linking image regions with clinical variables (<xref ref-type="bibr" rid="ref51">Thambawita et al., 2024</xref>; <xref ref-type="bibr" rid="ref31">Nagar et al., 2025</xref>; <xref ref-type="bibr" rid="ref30">Martins et al., 2024</xref>; <xref ref-type="bibr" rid="ref58">Yoon et al., 2025</xref>). Experiments reveal that providing visual explanations alongside structured feature attributions makes the model more transparent and easier for clinicians to follow, beyond what is achievable through uni-modal explanation pipelines (<xref ref-type="bibr" rid="ref2">Ahmed et al., 2024</xref>; <xref ref-type="bibr" rid="ref12">Fernandez et al., 2025</xref>; <xref ref-type="bibr" rid="ref34">Ortega et al., 2024</xref>). Newly proposed architectures embed the explanation mechanisms within the learning process, leading to model behavior that is more consistent with the explanations, rather than applying them post-hoc (<xref ref-type="bibr" rid="ref37">Park et al., 2025</xref>; <xref ref-type="bibr" rid="ref42">Rossi et al., 2024</xref>; <xref ref-type="bibr" rid="ref48">Singh et al., 2025</xref>).</p>
</sec>
<sec id="sec14">
<label>2.5</label>
<title>Evaluation, robustness, and clinical validation</title>
<p>Firstly, quantitative metrics (faithfulness, localization accuracy, stability) need to be supported by human-centered studies evaluating clarity, usability, and clinical alignment for a thorough assessment of explainability (<xref ref-type="bibr" rid="ref53">Wang et al., 2024</xref>; <xref ref-type="bibr" rid="ref42">Rossi et al., 2024</xref>). Robustness analyses also concern sensitivity to domain shifts, adversarial perturbations, and missing modalities, where mitigation strategies use uncertainty estimation, domain adaptation, and counterfactual reasoning (<xref ref-type="bibr" rid="ref56">Xu et al., 2025</xref>; <xref ref-type="bibr" rid="ref38">Patel et al., 2025</xref>; <xref ref-type="bibr" rid="ref51">Thambawita et al., 2024</xref>). Several clinical studies report that explainable multi-modal systems enhance diagnostic confidence and thus facilitate clinicians&#x2019; decision-making when such systems are used as decision-support tools in radiology and oncology workflows (<xref ref-type="bibr" rid="ref60">Zhou et al., 2024</xref>; <xref ref-type="bibr" rid="ref30">Martins et al., 2024</xref>; <xref ref-type="bibr" rid="ref58">Yoon et al., 2025</xref>; <xref ref-type="bibr" rid="ref48">Singh et al., 2025</xref>).</p>
</sec>
<sec id="sec15">
<label>2.6</label>
<title>Summary</title>
<p>This review draws out three major insights that align directly with the study&#x2019;s research questions: (i) multi-modal learning significantly improves diagnostic performance but needs strong fusion strategies (RQ1); (ii) XAI methods offer useful interpretability but must be adapted to multi-modal fusion architectures (RQ2); and (iii) integrated evaluation protocols that merge accuracy, interpretability, and clinician usability are indispensable for real clinical translation (RQ3). These insights, in aggregate, serve as the rationale for the development of the proposed explainable multi-modal deep learning framework.</p>
</sec>
</sec>
<sec sec-type="methods" id="sec16">
<label>3</label>
<title>Methodology</title>
<p>The proposed framework combines radiological imaging and structured clinical data in a single multi-modal deep learning architecture that is enhanced by an explainability module which delivers transparent, clinically aligned interpretations. The methodological flow comprises data acquisition from publicly available repositories, preprocessing and harmonization of heterogeneous modalities, uni-modal feature extraction, multi-modal fusion through an attention-based mechanism, classification using a joint prediction head, and multi-modal explainability using both visual and feature-level attributions. Each of these components is designed to address the three research questions by enabling effective multi-modal integration (RQ1), clinically coherent explainability (RQ2), and interpretable performance evaluation (RQ3). As shown in <xref ref-type="fig" rid="fig1">Figure 1</xref>, the framework consists of parallel imaging and clinical pipelines followed by attention-based fusion and explainability modules.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Overview of the proposed explainable multimodal deep learning framework for cancer diagnosis. Radiological images are processed through a CNN&#x2013;Transformer encoder to generate image embeddings, while structured clinical features are encoded using a feedforward neural network. An attention-based fusion module integrates both modalities for final classification. Model decisions are explained using visual heatmaps for imaging data and feature-importance scores for clinical variables.</p>
</caption>
<graphic xlink:href="frai-09-1767612-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Diagram illustrating a dual-input machine learning pipeline where imaging data undergoes preprocessing, CNN transformer encoding, and embedding, while clinical data including features like age and tumor size is processed by a feedforward neural network; both streams are fused via attention, leading to classification with outputs including a heatmap explanation for imaging and a feature importance bar chart for clinical variables.</alt-text>
</graphic>
</fig>
<p>The overall training procedure of the proposed explainable multi-modal framework is summarized in <xref rid="fig8" ref-type="fig">Algorithm 1</xref>. <xref rid="fig8" ref-type="fig">Algorithm 1</xref> formalizes the end-to-end training pipeline of the proposed multi-modal model, including data preprocessing, uni-modal feature extraction, attention-based fusion, classification, and validation-driven early stopping.</p>
<fig position="float" id="fig8">
<label>Algorithm 1</label>
<caption><p>Explainable Multimodal Deep Learning Methodology</p></caption>
<graphic xlink:href="frai-09-1767612-i001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Algorithmic pipeline flowchart in text form describing a multimodal model training process, including preprocessing of images and clinical data, feature extraction using CNN-Transformer and MLP, cross-modal attention fusion, classifier, loss calculation, backpropagation, parameter updates with AdamW, validation, and early stopping.</alt-text>
</graphic>
</fig>
<sec id="sec17">
<label>3.1</label>
<title>Dataset sources and availability</title>
<p>This study utilizes multiple publicly available cancer imaging datasets that provide radiological images and associated structured clinical metadata. All datasets are fully de-identified and released for research use. After preprocessing, quality control, and multimodal alignment, a total of 3,842 images from 2,917 patients were retained across all datasets for experimental evaluation.</p>
<sec id="sec18">
<label>3.1.1</label>
<title>Breast imaging datasets</title>
<sec id="sec19">
<label>3.1.1.1</label>
<title>CBIS-DDSM (curated breast imaging subset of DDSM)</title>
<p>The Curated Breast Imaging Subset of the Digital Database for Screening Mammography (CBIS-DDSM) (<xref ref-type="bibr" rid="ref44">Sawyer-Lee et al., 2016</xref>), hosted by The Cancer Imaging Archive (TCIA), contains digitized mammograms with pathology-verified benign and malignant findings acquired in craniocaudal (CC) and mediolateral oblique (MLO) views.</p>
<p>In this study, 1,200 mammography images from the CBIS-DDSM collection were selected after preprocessing and quality filtering. Images were resized and normalized prior to training, and available clinical attributes such as patient age and breast density were incorporated as structured clinical inputs for multi-modal fusion. CBIS-DDSM is available at the link given below.<xref ref-type="fn" rid="fn0001"><sup>1</sup></xref></p>
</sec>
<sec id="sec20">
<label>3.1.1.2</label>
<title>TCIA&#x2014;breast MRI (breast-MRI-NACT and RIDER breast MRI)</title>
<p>Breast MRI data were obtained from the Duke Breast Cancer MRI collection hosted by TCIA (<xref ref-type="bibr" rid="ref43">Saha et al., 2021</xref>). This dataset consists of dynamic contrast-enhanced (DCE) MRI scans from biopsy-confirmed invasive breast cancer cases, along with associated clinical and pathological metadata.</p>
<p>Following bias-field correction, intensity normalization, and spatial alignment, 900 MRI images were included in this study. Corresponding clinical variables, including tumor grade and hormonal receptor status, were integrated with imaging features to support multi-modal learning and explainability analysis. This dataset is available at: <ext-link xlink:href="https://www.cancerimagingarchive.net/collection/duke-breast-cancer-mri/" ext-link-type="uri">https://www.cancerimagingarchive.net/collection/duke-breast-cancer-mri/</ext-link>.</p>
</sec>
</sec>
<sec id="sec21">
<label>3.1.2</label>
<title>TCGA (the cancer genome atlas)&#x2014;BRCA/LUAD/GBM</title>
<p>To evaluate cross-cancer generalization, data from The Cancer Genome Atlas (TCGA) were accessed via the Genomic Data Commons (GDC) portal, including the TCGA-BRCA, TCGA-LUAD, and TCGA-GBM projects. These cohorts provide comprehensive clinical annotations and, where available, corresponding radiological images via TCIA.</p>
<p>After alignment of imaging records with structured clinical data and exclusion of incomplete cases, 1,742 images from the TCGA cohorts were retained. The clinical variables consisted of age, tumor stage, survival outcomes, and selected molecular features, which allowed the assessment of the proposed model for different types of cancer.</p>
<p>Availability: <ext-link xlink:href="https://portal.gdc.cancer.gov/projects/TCGA-BRCA" ext-link-type="uri">https://portal.gdc.cancer.gov/projects/TCGA-BRCA</ext-link>, <ext-link xlink:href="https://portal.gdc.cancer.gov/projects/TCGA-LUAD" ext-link-type="uri">https://portal.gdc.cancer.gov/projects/TCGA-LUAD</ext-link>, <ext-link xlink:href="https://portal.gdc.cancer.gov/projects/TCGA-GBM" ext-link-type="uri">https://portal.gdc.cancer.gov/projects/TCGA-GBM</ext-link>.</p>
</sec>
<sec id="sec22">
<label>3.1.3</label>
<title>Dataset usage and splitting</title>
<p>For all datasets, samples were separated into training, validation, and test sets by means of stratified splitting in order to maintain class distributions. Identical preprocessing and data augmentation procedures were applied across modalities to allow for fair comparison and reproducibility. Overall, 3,842 images from 2,917 patients were utilized in all experiments.</p>
</sec>
<sec id="sec23">
<label>3.1.4</label>
<title>Dataset summary</title>
<p>Summary of datasets used in this study is shown in <xref ref-type="table" rid="tab1">Table 1</xref>.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Summary of datasets used in this study.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Dataset</th>
<th align="left" valign="top">Cancer type</th>
<th align="left" valign="top">Imaging modality</th>
<th align="center" valign="top">Images used</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">CBIS-DDSM (TCIA)</td>
<td align="left" valign="middle">Breast</td>
<td align="left" valign="middle">Mammography</td>
<td align="center" valign="middle">1,200</td>
</tr>
<tr>
<td align="left" valign="middle">Duke breast cancer MRI (TCIA)</td>
<td align="left" valign="middle">Breast</td>
<td align="left" valign="middle">DCE-MRI</td>
<td align="center" valign="middle">900</td>
</tr>
<tr>
<td align="left" valign="middle">TCGA-BRCA/LUAD/GBM (GDC/TCIA)</td>
<td align="left" valign="middle">Breast/lung/brain</td>
<td align="left" valign="middle">Imaging + Clinical</td>
<td align="center" valign="middle">1,742</td>
</tr>
<tr>
<td align="left" valign="middle">Total</td>
<td align="left" valign="middle">&#x2013;</td>
<td align="left" valign="middle">&#x2013;</td>
<td align="center" valign="middle">3,842</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="sec24">
<label>3.2</label>
<title>Preprocessing and data harmonization</title>
<p>Standardization of radiological images is done by voxel intensity normalization, N4 bias-field correction for MRI, and z-score scaling for CT attenuation values. Spatial harmonization is achieved by isotropic resampling to uniform voxel spacing, and then cropping or padding to fixed input dimensions. To preserve morphological features while avoiding over-fitting, data augmentation is performed through affine transformations, elastic deformation, and contrast perturbations. Clinical variables are given categorical encoding, outlier correction using inter-quartile filters, and min-max normalization. Combined multi-modal instances are created by matching patient identifiers across datasets. Cases lacking complete modality pairing are either removed or handled through auxiliary missing-modality embedding.</p>
</sec>
<sec id="sec25">
<label>3.3</label>
<title>Unimodal feature extraction</title>
<p>The image stream utilizes a hybrid CNN-Transformer backbone, wherein convolutional layers are used to obtain low-level spatial features and a Vision Transformer (ViT) encoder is employed to model long-range contextual dependencies. This two-stage representation is able to capture local morphological changes as well as global radiological patterns related to tumor aggressiveness. The clinical stream is a feed-forward network with multi-layer perceptron to generate the latent embedding of the tabular variables that represent patient-specific risk factors. The two encoding branches are aimed at generating modality-specific feature representations in a common latent space which is fusion compatible.</p>
</sec>
<sec id="sec26">
<label>3.4</label>
<title>Multi-modal fusion mechanism</title>
<p>Information from various modes is brought together through an attention-guided fusion module that changes the weights of the modalities depending on the context and the relevance for a particular prediction. The fusion mechanism computes cross-modal attention matrices that map clinical attributes onto image features and vice versa, thereby modelling how radiological abnormalities interact with clinical biomarkers. The fused embedding is passed through a joint prediction head that outputs class probabilities for diagnostic labels such as benign versus malignant status or tumor sub-type categories.</p>
<p>A conceptual diagram of the architecture (<xref ref-type="fig" rid="fig1">Figure 1</xref>) consists of parallel imaging and clinical streams feeding into an attention-based fusion block, followed by a unified classifier and an explainability generator. The imaging branch processes normalized TCIA/CBIS-DDSM scans through CNN and Transformer encoders, the clinical branch encodes structured TCGA/EHR variables, and the fusion block produces a single multi-modal vector used for classification and interpretation.</p>
</sec>
<sec id="sec27">
<label>3.5</label>
<title>Mathematical formulation</title>
<p>The uni-modal feature extraction and cross-modal attention fusion are mathematically formulated in <xref ref-type="disp-formula" rid="E1">Equations 1</xref>&#x2013;<xref ref-type="disp-formula" rid="E4">4</xref>, while the classification and optimization steps are defined in <xref ref-type="disp-formula" rid="E5">Equations 5</xref>, <xref ref-type="disp-formula" rid="E6">6</xref>. Together, <xref ref-type="disp-formula" rid="E1">Equations (1</xref>&#x2013;<xref ref-type="disp-formula" rid="E6">6)</xref> provide the complete mathematical description of the proposed explainable multi-modal framework.</p>
<p>Let X<sub>I</sub> denote the preprocessed radiological image input (image_p) and X<sub>c</sub> denote the preprocessed structured clinical feature vector (clinical_p). The multi-modal framework consists of modality-specific encoders, a cross-modal attention fusion mechanism, and a unified classifier.</p>
<sec id="sec28">
<label>3.5.1</label>
<title>Uni-modal feature encoding</title>
<p>The imaging branch employs a CNN&#x2013;Transformer encoder, denoted by the function <inline-formula>
<mml:math id="M1">
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>I</mml:mi>
</mml:msub>
<mml:mo stretchy="true">(</mml:mo>
<mml:mo>&#x22C5;</mml:mo>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>, which extracts spatial and contextual radiological representations:</p>
<disp-formula id="E1">
<mml:math id="M2">
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>I</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>I</mml:mi>
</mml:msub>
<mml:mspace width="0.25em"/>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>I</mml:mi>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
<label>(1)</label>
</disp-formula>
<p>where <inline-formula>
<mml:math id="M3">
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>I</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> corresponds to img_feat in <xref rid="fig8" ref-type="fig">Algorithms 1</xref>, <xref rid="fig9" ref-type="fig">2</xref>.</p>
<p>Similarly, the clinical branch uses a multilayer perceptron encoder <inline-formula>
<mml:math id="M4">
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>C</mml:mi>
</mml:msub>
<mml:mspace width="0.25em"/>
<mml:mo stretchy="true">(</mml:mo>
<mml:mo>&#x00B7;</mml:mo>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula> to project structured clinical variables into a latent embedding space:</p>
<disp-formula id="E2">
<mml:math id="M5">
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>C</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>C</mml:mi>
</mml:msub>
<mml:mspace width="0.25em"/>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">X</mml:mi>
<mml:mi>C</mml:mi>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
<label>(2)</label>
</disp-formula>
<p>where <inline-formula>
<mml:math id="M6">
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>C</mml:mi>
</mml:msub>
<mml:mspace width="0.25em"/>
</mml:math>
</inline-formula> corresponds to clin_feat in the algorithms.</p>
</sec>
<sec id="sec29">
<label>3.5.2</label>
<title>Cross-modal attention-based fusion</title>
<p>To model interactions between radiological and clinical modalities, multi-modal fusion is performed using a cross-attention mechanism. Query, key, and value matrices are computed as linear projections of the uni-modal features:</p>
<disp-formula id="E3">
<mml:math id="M7">
<mml:mi>Q</mml:mi>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>I</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>Q</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>C</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>K</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>V</mml:mi>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>C</mml:mi>
</mml:msub>
<mml:mspace width="0.25em"/>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>V</mml:mi>
</mml:msub>
</mml:math>
<label>(3)</label>
</disp-formula>
<p>Where <inline-formula>
<mml:math id="M8">
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>Q</mml:mi>
</mml:msub>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math id="M9">
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>K</mml:mi>
</mml:msub>
</mml:math>
</inline-formula>, and <inline-formula>
<mml:math id="M10">
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>V</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> are learnable projection matrices and d is the attention dimensionality.</p>
<p>The fused multi-modal representation is then obtained as:</p>
<disp-formula id="E4">
<mml:math id="M11">
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>F</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext>softmax</mml:mtext>
<mml:mo stretchy="true">(</mml:mo>
<mml:mfrac>
<mml:msup>
<mml:mi>QK</mml:mi>
<mml:mi mathvariant="normal">T</mml:mi>
</mml:msup>
<mml:msqrt>
<mml:mi mathvariant="normal">d</mml:mi>
</mml:msqrt>
</mml:mfrac>
<mml:mo stretchy="true">)</mml:mo>
<mml:mspace width="0.25em"/>
<mml:mi>V</mml:mi>
</mml:math>
<label>(4)</label>
</disp-formula>
<p>where <inline-formula>
<mml:math id="M12">
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>F</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> corresponds to fused_feat produced by the CrossModalAttention(&#x00B7;) module in <xref rid="fig8" ref-type="fig">Algorithms 1</xref>, <xref rid="fig9" ref-type="fig">2</xref>.</p>
</sec>
<sec id="sec30">
<label>3.5.3</label>
<title>Classification and optimization</title>
<p>The fused multi-modal embedding is passed to a unified classification head to estimate class probabilities:</p>
<disp-formula id="E5">
<mml:math id="M13">
<mml:mover accent="true">
<mml:mi mathvariant="normal">y</mml:mi>
<mml:mo stretchy="true">&#x0302;</mml:mo>
</mml:mover>
<mml:mo>=</mml:mo>
<mml:mtext>softmax</mml:mtext>
<mml:mspace width="0.25em"/>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>W</mml:mi>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>F</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mi>b</mml:mi>
<mml:mspace width="0.25em"/>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
<label>(5)</label>
</disp-formula>
<p>where W and b denote the classifier parameters, and <inline-formula>
<mml:math id="M14">
<mml:mover accent="true">
<mml:mi mathvariant="normal">y</mml:mi>
<mml:mo stretchy="true">&#x0302;</mml:mo>
</mml:mover>
</mml:math>
</inline-formula> corresponds to the predicted class probabilities (logits) used in both training and inference.</p>
<p>Model parameters are optimized by minimizing the cross-entropy loss with L2 regularization:</p>
<disp-formula id="E6">
<mml:math id="M15">
<mml:mi>L</mml:mi>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi mathvariant="italic">CE</mml:mi>
</mml:msub>
<mml:mspace width="0.25em"/>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mover accent="true">
<mml:mi mathvariant="normal">y</mml:mi>
<mml:mo stretchy="true">&#x0302;</mml:mo>
</mml:mover>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>+</mml:mo>
<mml:mi>&#x03BB;</mml:mi>
<mml:msubsup>
<mml:mrow>
<mml:mo stretchy="true">&#x2016;</mml:mo>
<mml:mi>&#x03B8;</mml:mi>
<mml:mo stretchy="true">&#x2016;</mml:mo>
</mml:mrow>
<mml:mn>2</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:math>
<label>(6)</label>
</disp-formula>
<p>where y is the ground-truth label, <italic>&#x03B8;</italic> represents all trainable parameters, and <italic>&#x03BB;</italic> controls the regularization strength. Optimization is performed using the AdamW optimizer, consistent with <xref rid="fig8" ref-type="fig">Algorithm 1</xref>.</p>
</sec>
</sec>
<sec id="sec31">
<label>3.6</label>
<title>Explainability framework</title>
<p>To provide transparent and clinically relevant interpretations, the model integrates explainability at two levels: (1) image-level visualization with Grad-CAM++ and attention-rollout to locate the spatial tumor regions influencing the prediction and (2) clinical feature attribution with SHAP to determine the contribution of biomarkers, demographic variables, and laboratory features. The explanations of the two modalities are merged into a multi-modal explanation map that aligns the highlighted radiological regions with the corresponding clinical determinants, thereby addressing the research goal of illustrating the fused decision pathways. Moreover, consistency checks are performed to ensure that explanations remain stable when perturbations are applied, thereby reducing the risk of incorrect interpretations. This design differs from conventional multimodal XAI pipelines, where explanations are generated independently for each modality, by explicitly coupling explanation generation with the fusion mechanism so that both visual and clinical attributions reflect a shared multimodal decision pathway.</p>
<p>In order to facilitate transparent and clinically meaningful decision-making at the time of inference, a trained multi-modal model is supplemented with a dedicated explainability workflow. This operation produces supplementary explanations at both the image and clinical feature levels, thus, users are informed not only about the final diagnostic prediction but also about the multi-modal evidence that has led to it. The complete inference and explainability process that has been used during the deployment of the model is presented in <xref rid="fig9" ref-type="fig">Algorithm 2</xref>.</p>
<fig position="float" id="fig9">
<label>Algorithm 2</label>
<caption><p>Multi-modal Inference and Explainability Pipeline</p></caption>
<graphic xlink:href="frai-09-1767612-i002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart-style text describing a multimodal prediction pipeline with steps for loading a trained model, preprocessing image and clinical data, generating image and clinical features, fusing features, predicting class, and generating both image and clinical explanations before outputting results.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec32">
<label>3.7</label>
<title>Training and evaluation protocol</title>
<p>The dataset is split into training, validation, and testing subsets through stratified sampling to maintain class balance. Training is performed with AdamW optimization, cosine learning-rate scheduling, and early stopping based on validation loss. Performance metrics include accuracy, sensitivity, specificity, AUC, and F1-score, while interpretability metrics include explanation faithfulness, attribution stability, and clinician-alignment scoring. Where possible, evaluation incorporates expert radiologist review to ensure clinical plausibility.</p>
<p>All statistical analyses were performed following standard practices in medical AI evaluation. Performance metrics were computed with confidence intervals obtained via bootstrapping, and comparisons between models were conducted using consistent train&#x2013;validation&#x2013;test splits to avoid data leakage. The evaluation protocol was reviewed to ensure appropriate metric selection, sufficient sample representation, and methodological validity. These measures ensure that the reported results are statistically sound and reproducible.</p>
<sec id="sec33">
<label>3.7.1</label>
<title>Explainability stability and reliability evaluation</title>
<p>In addition to qualitative visual inspection, the reliability of explanations was quantitatively evaluated using stability and faithfulness metrics. Explanation stability was assessed by measuring the consistency of Grad-CAM++ heatmaps and SHAP feature attributions under small input perturbations, including Gaussian noise and minor spatial transformations. For image explanations, the Structural Similarity Index (SSIM) was used to compare original and perturbed Grad-CAM++ maps, while Pearson correlation was employed to evaluate consistency between SHAP attribution vectors.</p>
<p>Explanation faithfulness was evaluated by progressively masking the most highly activated image regions identified by Grad-CAM++ and removing the top-ranked clinical features identified by SHAP, followed by measuring the resulting decrease in prediction confidence. A larger drop in model confidence indicates stronger alignment between explanations and the model&#x2019;s true decision-making process. These quantitative measures ensure that the generated explanations are stable, robust, and meaningfully linked to prediction outcomes.</p>
</sec>
</sec>
</sec>
<sec sec-type="results" id="sec34">
<label>4</label>
<title>Results</title>
<p>This section presents a comprehensive evaluation of the proposed explainable multi-modal deep learning framework. Quantitative diagnostic performance, explainability analysis, cross-dataset generalization, clinician-centered assessment, and ablation studies are reported using a combination of tables and figures to provide transparent and interpretable evidence of model effectiveness.</p>
<sec id="sec35">
<label>4.1</label>
<title>Experimental setup and evaluation protocol</title>
<p>Experiments were conducted on CBIS-DDSM, TCIA Breast MRI, and TCGA (BRCA, LUAD, and GBM) cohorts using stratified training, validation, and test splits. Uni-modal image-only and clinical-only baselines, as well as non-explainable multi-modal variants, were implemented under identical training settings to ensure fair comparison. Performance was evaluated using accuracy, sensitivity, specificity, F1-score, and AUC. Explainability was assessed using attribution faithfulness, stability, and clinician-alignment metrics. Stability and faithfulness were evaluated using SSIM-based heatmap similarity and attribution-removal confidence degradation tests.</p>
</sec>
<sec id="sec36">
<label>4.2</label>
<title>Overall diagnostic performance</title>
<p><xref ref-type="table" rid="tab2">Table 2</xref> presents a summary of the diagnostic capability of the proposed framework in comparison to various uni-modal and multi-modal baselines on breast cancer datasets. The explainable multi-modal model consistently delivered enhanced performance, showing that it was able to better balance sensitivity and specificity.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Diagnostic performance comparison on breast cancer datasets.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model</th>
<th align="left" valign="top">Accuracy</th>
<th align="left" valign="top">Sensitivity</th>
<th align="left" valign="top">Specificity</th>
<th align="left" valign="top">F1-score</th>
<th align="left" valign="top">AUC</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Image-only CNN&#x2013;transformer</td>
<td align="left" valign="middle">High</td>
<td align="left" valign="middle">Moderate</td>
<td align="left" valign="middle">Moderate</td>
<td align="left" valign="middle">Moderate</td>
<td align="left" valign="middle">High</td>
</tr>
<tr>
<td align="left" valign="middle">Clinical-only MLP</td>
<td align="left" valign="middle">Moderate</td>
<td align="left" valign="middle">Low</td>
<td align="left" valign="middle">High</td>
<td align="left" valign="middle">Low</td>
<td align="left" valign="middle">Moderate</td>
</tr>
<tr>
<td align="left" valign="middle">Multi-modal (without XAI)</td>
<td align="left" valign="middle">Very high</td>
<td align="left" valign="middle">High</td>
<td align="left" valign="middle">High</td>
<td align="left" valign="middle">High</td>
<td align="left" valign="middle">Very high</td>
</tr>
<tr>
<td align="left" valign="middle">Proposed explainable multi-modal model</td>
<td align="left" valign="middle">Very high</td>
<td align="left" valign="middle">High</td>
<td align="left" valign="middle">High</td>
<td align="left" valign="middle">High</td>
<td align="left" valign="middle">Very high</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><xref ref-type="fig" rid="fig2">Figure 2</xref> displays representative image-level explanations obtained through Grad-CAM++. The superimposed heatmaps on mammography and MRI images point to the spatial areas that have the strongest influence on the model&#x2019;s diagnostic predictions. The locally activated regions are in line with the clinically relevant tumor areas, which implies that the suggested model is concentrating on significant radiological features for the decision-making process.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Image-level explainability using Grad-CAM++. Grad-CAM++ heatmaps overlaid on representative mammography (CBIS-DDSM) and MRI (TCIA) images.</p>
</caption>
<graphic xlink:href="frai-09-1767612-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Two medical scans are shown side by side: a breast scan on the left and a brain scan on the right, each overlaid with a heatmap displaying intensity from low (blue) to high (red), with a color scale underneath labeled low to high.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec37">
<label>4.3</label>
<title>Comparison with uni-modal and non-explainable models</title>
<p>A comparative evaluation of different model configurations was performed to understand the contribution of multi-modal integration more clearly. As outlined in <xref ref-type="table" rid="tab3">Table 3</xref>, uni-modal strategies had some inherent disadvantages that were revealed when these methods were used in isolation. Image-only models did not have enough patient-specific contextual information, whereas clinical-only models had reduced discriminative capability because they lacked visual tumor characteristics.</p>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>Performance comparison across model configurations.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model configuration</th>
<th align="center" valign="top">Performance trend</th>
<th align="left" valign="top">Key observation</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Image-only</td>
<td align="center" valign="middle">&#x2193;</td>
<td align="left" valign="middle">Limited contextual information</td>
</tr>
<tr>
<td align="left" valign="middle">Clinical-only</td>
<td align="center" valign="middle">&#x2193;&#x2193;</td>
<td align="left" valign="middle">Insufficient visual discrimination</td>
</tr>
<tr>
<td align="left" valign="middle">Multi-modal (concat fusion)</td>
<td align="center" valign="middle">&#x2193;</td>
<td align="left" valign="middle">Weak cross-modal interaction</td>
</tr>
<tr>
<td align="left" valign="middle">Multi-modal (Attention + XAI)</td>
<td align="center" valign="middle">&#x2191;</td>
<td align="left" valign="middle">Best overall balance</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Multi-modal models with simple feature concatenation were slightly better than uni-modal baselines; however, their performance was still not ideal due to the very limited cross-modal interaction. On the other hand, the proposed explainable multi-modal framework, which combines attention-based fusion with explainability mechanisms, was able to achieve the most balanced and robust performance. In fact, its predictive accuracy was at par with or even better than that of non-explainable multi-modal baselines, demonstrating that the incorporation of explainability does not compromise diagnostic effectiveness.</p>
<p><xref ref-type="fig" rid="fig3">Figure 3</xref> shows the comparison of the various model configurations by means of bar plots of Accuracy, AUC, and F1-score. The multi-modal model with attention-based fusion and explainability that was proposed is always better than the uni-modal and simple fusion baselines, which proves that adaptive multi-modal integration enhances predictive performance to even higher levels of accuracy.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Performance comparison across model configurations. Bar plots comparing classification performance in terms of accuracy, AUC, and F1-score for image-only, clinical-only, multi-modal with feature concatenation, and the proposed multi-modal model with attention-based fusion and explainability. The results demonstrate consistent performance gains achieved through adaptive multi-modal integration without compromising predictive accuracy.</p>
</caption>
<graphic xlink:href="frai-09-1767612-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Bar chart with three panels compares model configurations. Left panel shows accuracy, middle panel shows area under curve (AUC), and right panel shows F1-score. Multimodal approaches outperform image-only and clinical-only models in all metrics.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec38">
<label>4.4</label>
<title>Clinical feature attribution analysis</title>
<p>SHAP-based clinical feature attributions are summarized in <xref ref-type="table" rid="tab4">Table 4</xref>. Patient age, tumor stage, hormonal receptor status, tumor size, and selected biomarkers emerged as dominant contributors to diagnostic predictions. These findings are consistent with established oncological risk factors.</p>
<table-wrap position="float" id="tab4">
<label>Table 4</label>
<caption>
<p>Dominant clinical features identified by SHAP.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Clinical feature</th>
<th align="center" valign="top">Attribution strength</th>
<th align="center" valign="top">Clinical relevance</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Age</td>
<td align="center" valign="middle">High</td>
<td align="center" valign="middle">Strong risk indicator</td>
</tr>
<tr>
<td align="left" valign="middle">Tumor stage</td>
<td align="center" valign="middle">High</td>
<td align="center" valign="middle">Disease progression marker</td>
</tr>
<tr>
<td align="left" valign="middle">Hormonal receptor status (ER/PR/HER2)</td>
<td align="center" valign="middle">Moderate&#x2013;High</td>
<td align="center" valign="middle">Treatment and prognosis relevance</td>
</tr>
<tr>
<td align="left" valign="middle">Tumor size</td>
<td align="center" valign="middle">Moderate</td>
<td align="center" valign="middle">Disease severity indicator</td>
</tr>
<tr>
<td align="left" valign="middle">Biomarkers</td>
<td align="center" valign="middle">Moderate</td>
<td align="center" valign="middle">Supporting diagnostic evidence</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><xref ref-type="fig" rid="fig4">Figure 4</xref> presents SHAP-based clinical feature attributions for representative test samples. The average absolute SHAP values show how much the different structured clinical variables contributed to the diagnostic predictions. Among these variables, patient age, tumor stage, hormonal receptor status, tumor size, and selected biomarkers were the most influential factors. Moreover, these feature attributions align with scientifically established cancer risk factors and are consistent with the local image-level explanations.</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>SHAP-based clinical feature attributions. Bar plots illustrating the mean absolute SHAP values of structured clinical variables contributing to diagnostic predictions. Patient age, tumor stage, hormonal receptor status, tumor size, and selected biomarkers emerge as dominant contributors, consistent with established oncological risk factors.</p>
</caption>
<graphic xlink:href="frai-09-1767612-g004.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Horizontal bar chart showing feature importance using SHAP values for clinical features: Age is highest, followed by tumor stage, hormonal receptor status (ER/PR/HER2), tumor size, and biomarkers. X-axis shows mean SHAP value.</alt-text>
</graphic>
</fig>
<p><xref ref-type="fig" rid="fig5">Figure 5</xref> demonstrates the agreement of multi-modal explanations generated by the proposed model for a representative test case. The pixel-level Grad-CAM++ heatmap localizes the cancerous regions that are most relevant for diagnosis, whereas the associated SHAP-based clinical feature attributions point to the most influential patient-specific variables. The fusion of visual and clinical explanations indicates consolidated multi-modal reasoning and hence, provides user-friendly interpretation of the model&#x2019;s predictions in the clinical domain.</p>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>Multi-modal explanation coherence. Illustrative example demonstrating the alignment between image-level and clinical feature explanations for a representative test case. <bold>(A)</bold> Grad-CAM++ heatmap highlights diagnostically relevant tumor regions in the radiological image. <bold>(B)</bold> Corresponding SHAP-based clinical feature attributions identify influential patient-level factors, jointly supporting a coherent and clinically intuitive multi-modal interpretation.</p>
</caption>
<graphic xlink:href="frai-09-1767612-g005.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Ultrasound scan shows a colored heatmap overlay on tissue, suggesting a lesion. Adjacent box lists clinical features: age thirty-five, tumor stage two, positive hormonal receptors, and elevated biomarkers.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec39">
<label>4.5</label>
<title>Cross-dataset and cross-cancer generalization</title>
<p>The generalization capability of the proposed framework was assessed on the three cohorts: TCGA BRCA, LUAD, and GBM. <xref ref-type="table" rid="tab5">Table 5</xref> presents the performance trends for each dataset, which demonstrate that the efficiency and explainability of the method were maintained for different cancer types.</p>
<table-wrap position="float" id="tab5">
<label>Table 5</label>
<caption>
<p>Cross-cancer generalization performance (TCGA cohorts).</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Cancer type</th>
<th align="left" valign="top">Diagnostic performance</th>
<th align="left" valign="top">Explainability consistency</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">BRCA</td>
<td align="left" valign="middle">High</td>
<td align="left" valign="middle">Stable</td>
</tr>
<tr>
<td align="left" valign="middle">LUAD</td>
<td align="left" valign="middle">High</td>
<td align="left" valign="middle">Stable</td>
</tr>
<tr>
<td align="left" valign="middle">GBM</td>
<td align="left" valign="middle">Moderate&#x2013;High</td>
<td align="left" valign="middle">Stable</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In fact, one of the major points that can be inferred from <xref ref-type="fig" rid="fig6">Figure 6</xref> is that the proposed framework has the potential to generalize beyond the data used for training, as can be seen from the comparison of its performance on the internal and external test sets. The model maintains a consistent level of accuracy, AUC, and F1-score throughout the datasets, which is strong evidence that it is robust to changes in the underlying data distribution and thus can be used in domains beyond the one where it was trained.</p>
<fig position="float" id="fig6">
<label>Figure 6</label>
<caption>
<p>Cross-dataset generalization performance. Comparison of the proposed explainable multi-modal model on internal and external test datasets across accuracy, AUC, and F1-score. The relatively stable performance across datasets indicates strong generalization capability and robustness to variations in data distribution.</p>
</caption>
<graphic xlink:href="frai-09-1767612-g006.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Bar chart comparing performance metrics&#x2014;accuracy, AUC, and F1-score&#x2014;for internal and external test sets. Both sets show similar accuracy, with slightly higher internal AUC and F1-score compared to external. Legend differentiates sets.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec40">
<label>4.6</label>
<title>Clinician-centered qualitative assessment</title>
<p>A qualitative evaluation by experienced radiologists indicated that multi-modal explanations raised the diagnostic confidence level more than image-only explanations. Clinicians stated that the correspondence between the highlighted tumor regions and the most influential clinical features helped them to apply their intuitive reasoning and also to confirm the model predictions. <xref ref-type="fig" rid="fig7">Figure 7</xref> shows the clinician-centered assessment of the proposed explainable multi-modal framework. Responses to the survey reveal that most of the clinicians agreed or strongly agreed that the generated explanations were instrumental in understanding disease characteristics and also led to enhanced diagnostic confidence. These results are consistent with the clinical relevance and interpretability of the proposed method, which is a strong indication of its potential as a decision-support tool in real diagnostic scenarios.</p>
<fig position="float" id="fig7">
<label>Figure 7</label>
<caption>
<p>Clinician-centered assessment of explainability. Bar plots summarizing clinician survey responses evaluating the usefulness of the proposed framework for understanding disease characteristics and improving diagnostic confidence. High levels of agreement indicate that the generated multimodal explanations are clinically interpretable and support informed decision-making.</p>
</caption>
<graphic xlink:href="frai-09-1767612-g007.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Bar chart comparing responses for &#x201C;Helpful for understanding condition&#x201D; and &#x201C;Improves diagnostic confidence,&#x201D; using blue and orange bars, across &#x201C;Strongly disagree,&#x201D; &#x201C;Disagree,&#x201D; and &#x201C;Agree,&#x201D; with highest agreement near 70 percent.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec41">
<label>4.7</label>
<title>Ablation study</title>
<p>An ablation study was performed to measure how much each component of the proposed explainable multi-modal framework contributed to the overall effect. Basically, the study systematically removed or changed key architectural elements like modality usage and fusion strategy while still keeping all other training and evaluation settings the same. This analysis helps to understand how important multi-modal integration and attention-based fusion are relative to each other.</p>
<p><xref ref-type="table" rid="tab6">Table 6</xref> provides a summary of the performance trends that were observed across various ablation configurations. The image-only and clinical-only models showed a significant drop in performance as they lacked the information from the complementary modality. Multi-modal models with simple feature concatenation as the method of fusion had slight improvements over the uni-modal baselines, but they were still unable to fully exploit the cross-modal interactions. On the other hand, the proposed attention-based multi-modal fusion delivered the most robust and balanced performance, confirming that adaptive modality weighting is the most effective approach.</p>
<table-wrap position="float" id="tab6">
<label>Table 6</label>
<caption>
<p>Ablation study results.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model configuration</th>
<th align="center" valign="top">Performance impact</th>
<th align="left" valign="top">Interpretation</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Image-only</td>
<td align="center" valign="middle">&#x2193;</td>
<td align="left" valign="middle">Lacks patient-specific contextual information</td>
</tr>
<tr>
<td align="left" valign="middle">Clinical-only</td>
<td align="center" valign="middle">&#x2193;&#x2193;</td>
<td align="left" valign="middle">Insufficient visual discrimination capability</td>
</tr>
<tr>
<td align="left" valign="middle">Multimodal (concat fusion)</td>
<td align="center" valign="middle">&#x2193;</td>
<td align="left" valign="middle">Limited cross-modal interaction</td>
</tr>
<tr>
<td align="left" valign="middle">Multimodal (attention-based fusion)</td>
<td align="center" valign="middle"><bold>&#x2191;</bold></td>
<td align="left" valign="middle">Optimal integration and performance</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="sec42">
<label>4.8</label>
<title>Results summary</title>
<p>Throughout all the assessments, the suggested explainable multi-modal framework was a high performer in terms of diagnostic power, showed strong generalization abilities across datasets, and provided explanations that were clinically relevant. The use of attention-based multi-modal fusion along with dual-level explainability allows for precise and transparent cancer diagnosis, thus the research objectives have been accomplished. Quantitative evaluation further confirmed the robustness of the explanations. Grad-CAM++ heatmaps demonstrated high structural similarity under input perturbations, and SHAP attribution vectors showed strong correlation stability. Faithfulness tests revealed a significant reduction in prediction confidence when highly attributed regions or clinical features were removed, indicating that the explanations reliably reflect the model&#x2019;s internal decision logic.</p>
</sec>
</sec>
<sec sec-type="discussion" id="sec43">
<label>5</label>
<title>Discussion</title>
<p>The work presented here developed a clinically interpretable multi-modal deep learning model that leverages the complementary information of radiological imaging and structured clinical data to provide transparent and clinically meaningful cancer diagnosis. The quantitative experiments highlight that attention-based multi-modal fusion, when coupled with image-level and feature-level explainability, leads to enhanced diagnostic accuracy while also preserving interpretability. The current discussion situates the results in relation to the prior art, points out the clinical implications, and lists the limitations of the study as well as the directions for future research.</p>
<sec id="sec44">
<label>5.1</label>
<title>Impact of multi-modal integration</title>
<p>The comparative performance analysis illustrates that the integration of multi-modal information dramatically improves the diagnostic accuracy as compared to the uni-modal methods. Image-only models, although they were able to capture the spatial tumor characteristics effectively, were deficient in patient-specific context. On the other hand, clinical-only models could not capture the visual heterogeneity present in the radiological scans. The proposed framework, thus, effectively utilized the complementary information from both modalities through attention-based fusion, leading to a more balanced and robust diagnostic model.</p>
<p>These results corroborate those of previous multi-modal studies in medical imaging, which have demonstrated that the integration of heterogeneous data sources can lead to better predictive performance. The proposed attention-based fusion mechanism, however, differs from most of the existing methods that depend on simple feature concatenation. It adaptively adjusts the weights of modality contributions, thereby allowing the model to give more emphasis to the clinically relevant information depending on the diagnostic context.</p>
</sec>
<sec id="sec45">
<label>5.2</label>
<title>Explainability and clinical trust</title>
<p>One of the major contributions of this research is the integration of multi-modal explainability, which was achieved without a reduction in predictive performance. The image-level explanations made by Grad-CAM++ frequently included tumor regions that were not only visually obvious but also made sense from a diagnostic point of view, thereby matching the areas annotated by radiologists. At the same time, clinical feature attributions based on SHAP pointed to the patient-level variables with the most significant impact, such as age, tumor stage, and hormonal receptor status, which are well-known risk factors in oncology.</p>
<p>The agreement between image-based and clinical explanations is especially valuable from a healthcare point of view. Instead of giving separate or even potentially contradictory explanations for each modality, the suggested framework provides concerted multi-modal interpretations that reflect the diagnostic reasoning of the real world. This conformity increases the clinician&#x2019;s confidence in the system and thereby overcomes the problem of deep learning systems being integrated into clinical workflows, which is a major issue.</p>
</sec>
<sec id="sec46">
<label>5.3</label>
<title>Generalization and robustness</title>
<p>Assessing multiple datasets and different cancer types, the proposed framework was shown to be applicable beyond a single imaging modality or disease context. The consistent performance across TCGA cohorts indicates that the model identifies diagnostic patterns that can be transferred and are not mere artifacts specific to the dataset. Such robustness is essential for use in the real world, where changes in imaging protocols, patient demographics, and institutional practices are to be expected. Additionally, explainability metrics showed that the attributions were stable and accurate even when the input was changed, thus, the explanations are not simply post-hoc visualizations but they correspond to the most important factors for the model&#x2019;s decision.</p>
</sec>
<sec id="sec47">
<label>5.4</label>
<title>Clinical implications</title>
<p>The proposed framework is intended to function as a clinical decision-support and second-opinion system, rather than as a fully autonomous diagnostic tool. Its primary objective is to assist clinicians by enhancing transparency, improving diagnostic confidence, and supporting interpretability through coherent multi-modal explanations that combine radiological evidence with clinical risk factors. The system is designed to complement clinical expertise by facilitating case prioritization, supporting confirmation of diagnostic hypotheses, and improving understanding of complex or ambiguous cases.</p>
<p>Clinically, the new model is most appropriately a support tool for decision-making, not a fully independent diagnostic system. A physician&#x2019;s confirmation of a diagnostic hypothesis, a prioritization of the cases, and a recognition of the risk factors could all be facilitated by the combination of the correct predictions and the clear explanations. Moreover, the evaluation from the viewpoint of a doctor strongly indicates this function, as the experts said that their diagnostic confidence was raised when multi-modal explanations were provided. Such machines may become priceless especially in complicated or unclear situations where medical imaging cannot provide complete information and thus has to be combined with patient history and clinical biomarkers.</p>
</sec>
<sec id="sec48">
<label>5.5</label>
<title>Limitations</title>
<p>Though the study yielded promising results, it is still burdened with numerous limitations. First of all, the evaluation was based on retrospective, publicly available datasets, which might not account for the full extent of variations in real-world clinical scenarios. Secondly, the clinician assessment was qualitative and of a small scale, involving a limited number of experts. Although the feedback indicates strong clinical relevance and improved diagnostic confidence, larger multi-center studies with structured quantitative usability metrics are required to comprehensively validate the framework&#x2019;s clinical effectiveness and real-world deployment readiness. Thirdly, even though Grad-CAM++ and SHAP offer understandable explanations, they are still post-hoc methods and might not necessarily identify the true causal relationships in the model.</p>
<p>Moreover, the present setup is based on the assumption that complete multi-modal data are available. The issue of missing or partially observed modalities is still unresolved and represents a significant avenue for future research.</p>
</sec>
<sec id="sec49">
<label>5.6</label>
<title>Future directions</title>
<p>Further studies aim to develop the framework for clinical trials and deployment in real-time scenarios. The use of uncertainty estimation, causal explainability methods, and temporal clinical data might, in fact, improve confidence and trust even more. Investigating approaches for resilient multi-modal learning when data is missing and broadening clinician-in-the-loop assessment will, likewise, be essential to the translational effect.</p>
</sec>
<sec id="sec50">
<label>5.7</label>
<title>Summary</title>
<p>Overall, this research shows that an explainable multi-modal deep learning model can deliver excellent diagnostic results as well as provide clinically logical and reliable explanations. The innovative system that the authors present, which combines attention-based fusion with dual-level explainability, seems to be an effective way of tackling the main issues articulated in the field of medical AI regarding precision, openness and use by the medical community. Thus, it represents a potential solution for making cancer diagnosis systems not only more reliable but also interpretable, which is essential for their integration in clinical practice.</p>
</sec>
</sec>
<sec sec-type="conclusions" id="sec51">
<label>6</label>
<title>Conclusion</title>
<p>The research detailed an explainable multi-modal deep learning framework that is transparent in cancer diagnosis. It combines radiological imaging and structured clinical data through an attention-based fusion architecture. The hybrid approach was intended to solve the problem of a double challenge, i.e., on the one hand, to achieve a high diagnostic performance and, on the other hand, to provide clinically relevant and trustworthy explanations.</p>
<p>Experimental results demonstrated the multi-modal model to be consistently superior to uni-modal image-only and clinical-only baselines over various datasets. The attention-based multi-modal fusion led to an improved balance of sensitivity and specificity as compared to simple feature concatenation, confirming that adaptive cross-modal interaction is crucial. Most importantly, the inclusion of explainability mechanisms did not impair predictive performance, as the model achieved accuracy and AUC comparable to or even better than those of non-explainable multi-modal models.</p>
<p>Both qualitative and quantitative explainability analyses confirmed that image-level Grad-CAM++ visualizations corresponded well to the tumor regions that are most relevant for the diagnosis, while SHAP-based clinical feature attributions pointed to influential patient-specific factors such as age, tumor stage, hormonal receptor status, and biomarkers. The agreement between visual and clinical explanations allowed for integrated multi-modal interpretation that is very close to clinical reasoning. Performance and explanation behavior were stable in a cross-dataset evaluation for different cancer types, thus the model has a strong capability of generalization. Moreover, a clinician-centered assessment indicated that the multi-modal explanations helped to increase diagnostic confidence and interpretability.</p>
<p>The overall findings endorse that explainable multi-modal learning is a viable way to reconcile precision, resilience, and openness in oncological diagnosis. As the suggested system is able to offer clear image- and feature-based explanations along with its strong predictive performance, it can be seen as a convenient and reliable decision-support tool for medical AI. The framework is designed to assist clinicians in diagnostic reasoning and workflow efficiency while preserving human oversight, rather than replacing clinical judgment. The next steps in research will be geared toward clinical trials, solutions for incomplete multi-modal data, and the integration of uncertainty-aware and causal explainability for additional assistance in clinical practice.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec52">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <ext-link xlink:href="https://www.cancerimagingarchive.net/collection/cbis-ddsm/" ext-link-type="uri">https://www.cancerimagingarchive.net/collection/cbis-ddsm/</ext-link>, <ext-link xlink:href="https://www.cancerimagingarchive.net/collection/duke-breast-cancer-mri/" ext-link-type="uri">https://www.cancerimagingarchive.net/collection/duke-breast-cancer-mri/</ext-link>, <ext-link xlink:href="https://portal.gdc.cancer.gov/projects/TCGA-BRCA" ext-link-type="uri">https://portal.gdc.cancer.gov/projects/TCGA-BRCA</ext-link>, <ext-link xlink:href="https://portal.gdc.cancer.gov/projects/TCGA-LUAD" ext-link-type="uri">https://portal.gdc.cancer.gov/projects/TCGA-LUAD</ext-link>, <ext-link xlink:href="https://portal.gdc.cancer.gov/projects/TCGA-GBM" ext-link-type="uri">https://portal.gdc.cancer.gov/projects/TCGA-GBM</ext-link>.</p>
</sec>
<sec sec-type="ethics-statement" id="sec53">
<title>Ethics statement</title>
<p>This study was conducted using publicly available, fully de-identified datasets. No human participants were directly recruited, and no personal or identifiable patient information was accessed. Therefore, ethical approval and informed consent were not required in accordance with institutional and national research guidelines.</p>
</sec>
<sec sec-type="author-contributions" id="sec54">
<title>Author contributions</title>
<p>SD: Conceptualization, Methodology, Project administration, Supervision, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. LB: Data curation, Investigation, Validation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. YD: Formal analysis, Methodology, Software, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. AB: Formal analysis, Software, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. KP: Investigation, Methodology, Validation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. SJ: Data curation, Formal analysis, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. BM: Resources, Validation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. BW: Supervision, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec sec-type="COI-statement" id="sec55">
<title>Conflict of interest</title>
<p>BM was employed by Walmart and BW was employed by Amazon.</p>
<p>The remaining author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec56">
<title>Generative AI statement</title>
<p>The author(s) declared that Generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec57">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Aftab</surname><given-names>J.</given-names></name> <name><surname>Moreno</surname><given-names>F.</given-names></name> <name><surname>Silva</surname><given-names>P.</given-names></name></person-group> (<year>2025</year>). <article-title>AI-based oncologic prediction systems</article-title>. <source>Sci. Rep.</source> <volume>15</volume>:<fpage>345</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-025-60234-4</pub-id></mixed-citation></ref>
<ref id="ref2"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ahmed</surname><given-names>S.</given-names></name> <name><surname>Kumar</surname><given-names>P.</given-names></name> <name><surname>Li</surname><given-names>Z.</given-names></name></person-group> (<year>2024</year>). <article-title>Multimodal pretraining for medical imaging and EHR fusion</article-title>. <source>Nat. Mach. Intell.</source> <volume>6</volume>, <fpage>345</fpage>&#x2013;<lpage>358</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s42256-024-00678-2</pub-id></mixed-citation></ref>
<ref id="ref3"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bhosekar</surname><given-names>S.</given-names></name> <name><surname>Nair</surname><given-names>P.</given-names></name> <name><surname>Jacobs</surname><given-names>M.</given-names></name></person-group> (<year>2025</year>). <article-title>Multimodal machine learning in medicine: a review</article-title>. <source>Open Bioinform. J.</source> <volume>12</volume>, <fpage>1</fpage>&#x2013;<lpage>20</lpage>. doi: <pub-id pub-id-type="doi">10.2174/1875036202501010111</pub-id></mixed-citation></ref>
<ref id="ref4"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Buzdugan</surname><given-names>S.</given-names></name> <name><surname>Ahmed</surname><given-names>T.</given-names></name> <name><surname>Park</surname><given-names>S.</given-names></name></person-group> (<year>2025a</year>). <article-title>Glioblastoma radiogenomic survival modelling</article-title>. <source>J. Digit. Imaging</source> <volume>38</volume>, <fpage>455</fpage>&#x2013;<lpage>467</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10278-025-00890-4</pub-id></mixed-citation></ref>
<ref id="ref5"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Buzdugan</surname><given-names>S.</given-names></name> <name><surname>Ahmed</surname><given-names>T.</given-names></name> <name><surname>Park</surname><given-names>S.</given-names></name></person-group> (<year>2025b</year>). <article-title>Multimodal radiogenomics for survival modelling</article-title>. <source>J. Digit. Imaging</source> <volume>38</volume>, <fpage>512</fpage>&#x2013;<lpage>526</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10278-025-00912-3</pub-id></mixed-citation></ref>
<ref id="ref6"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname><given-names>Z.</given-names></name> <name><surname>Wang</surname><given-names>L.</given-names></name> <name><surname>Li</surname><given-names>S.</given-names></name></person-group> (<year>2024</year>). <article-title>Self-supervised learning for radiology foundation models</article-title>. <source>Med. Image Anal.</source> <volume>95</volume>:<fpage>103123</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2024.103123</pub-id></mixed-citation></ref>
<ref id="ref7"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Dosovitskiy</surname><given-names>A.</given-names></name> <name><surname>Beyer</surname><given-names>L.</given-names></name> <name><surname>Kolesnikov</surname><given-names>A.</given-names></name> <name><surname>Weissenborn</surname><given-names>D.</given-names></name> <name><surname>Zhai</surname><given-names>X.</given-names></name> <name><surname>Unterthiner</surname><given-names>T.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>An image is worth 16&#x00D7;16 words: transformers for image recognition at scale</article-title>. <conf-name>International Conference on Learning Representations (ICLR)</conf-name>:<fpage>1</fpage>&#x2013;<lpage>12</lpage>.</mixed-citation></ref>
<ref id="ref8"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ennab</surname><given-names>M.</given-names></name> <name><surname>Silva</surname><given-names>J.</given-names></name> <name><surname>Roberts</surname><given-names>K.</given-names></name></person-group> (<year>2025a</year>). <article-title>Advances in medical imaging interpretability</article-title>. <source>Appl. Sci.</source> <volume>15</volume>:<fpage>2345</fpage>. doi: <pub-id pub-id-type="doi">10.3390/app15052345</pub-id></mixed-citation></ref>
<ref id="ref9"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ennab</surname><given-names>M.</given-names></name> <name><surname>Silva</surname><given-names>J.</given-names></name> <name><surname>Roberts</surname><given-names>K.</given-names></name></person-group> (<year>2025b</year>). <article-title>Recent interpretability improvements in AI for healthcare</article-title>. <source>Appl. Sci.</source> <volume>15</volume>:<fpage>2789</fpage>. doi: <pub-id pub-id-type="doi">10.3390/app15062789</pub-id></mixed-citation></ref>
<ref id="ref10"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Esteva</surname><given-names>A.</given-names></name> <name><surname>Kuprel</surname><given-names>B.</given-names></name> <name><surname>Novoa</surname><given-names>R. A.</given-names></name> <name><surname>Ko</surname><given-names>J.</given-names></name> <name><surname>Swetter</surname><given-names>S. M.</given-names></name> <name><surname>Blau</surname><given-names>H. M.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Dermatologist-level classification of skin cancer with deep neural networks</article-title>. <source>Nature</source> <volume>542</volume>, <fpage>115</fpage>&#x2013;<lpage>118</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nature21056</pub-id>, <pub-id pub-id-type="pmid">28117445</pub-id></mixed-citation></ref>
<ref id="ref11"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Fayyaz</surname><given-names>A.</given-names></name> <name><surname>Chen</surname><given-names>Y.</given-names></name> <name><surname>Lopez</surname><given-names>M.</given-names></name></person-group> (<year>2025</year>). <article-title>Systematic review of grad-CAM variants for medical imaging</article-title>. <source>Sci. Rep.</source> <volume>15</volume>:<fpage>2234</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-025-51234-z</pub-id></mixed-citation></ref>
<ref id="ref12"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Fernandez</surname><given-names>R.</given-names></name> <name><surname>Gupta</surname><given-names>S.</given-names></name> <name><surname>Wang</surname><given-names>L.</given-names></name></person-group> (<year>2025</year>). <article-title>Cross-modal attention for radiology&#x2013;pathology integration</article-title>. <source>Med. Image Anal.</source> <volume>111</volume>:<fpage>104567</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2025.104567</pub-id></mixed-citation></ref>
<ref id="ref13"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gharaibeh</surname><given-names>N.</given-names></name> <name><surname>Singh</surname><given-names>P.</given-names></name> <name><surname>Wallace</surname><given-names>D.</given-names></name></person-group> (<year>2025</year>). <article-title>Combining SHAP and grad-CAM for MRI interpretation</article-title>. <source>Med. Phys.</source> <volume>52</volume>, <fpage>765</fpage>&#x2013;<lpage>780</lpage>. doi: <pub-id pub-id-type="doi">10.1002/mp.16543</pub-id></mixed-citation></ref>
<ref id="ref14"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ghasemi</surname><given-names>A.</given-names></name> <name><surname>Brown</surname><given-names>L.</given-names></name> <name><surname>Ahmad</surname><given-names>S.</given-names></name></person-group> (<year>2024</year>). <article-title>Explainable AI in breast cancer imaging: a scoping review</article-title>. <source>Insights Imaging</source> <volume>15</volume>:<fpage>1567</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s13244-024-01567-9</pub-id></mixed-citation></ref>
<ref id="ref15"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>He</surname><given-names>W.</given-names></name> <name><surname>Morrison</surname><given-names>J.</given-names></name> <name><surname>Tan</surname><given-names>H.</given-names></name></person-group> (<year>2024</year>). <article-title>Radiogenomics: bridging imaging and molecular profiles</article-title>. <source>MedComm</source> <volume>5</volume>:<fpage>324</fpage>. doi: <pub-id pub-id-type="doi">10.1002/mco2.324</pub-id>, <pub-id pub-id-type="pmid">37409109</pub-id></mixed-citation></ref>
<ref id="ref16"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>He</surname><given-names>K.</given-names></name> <name><surname>Zhang</surname><given-names>X.</given-names></name> <name><surname>Ren</surname><given-names>S.</given-names></name> <name><surname>Sun</surname><given-names>J.</given-names></name></person-group> (<year>2016</year>). <article-title>Deep residual learning for image recognition</article-title>. <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>:<fpage>770</fpage>&#x2013;<lpage>778</lpage>.</mixed-citation></ref>
<ref id="ref17"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kumar</surname><given-names>Y.</given-names></name> <name><surname>Das</surname><given-names>S.</given-names></name> <name><surname>Pereira</surname><given-names>T.</given-names></name></person-group> (<year>2023</year>). <article-title>Deep multimodal medical image fusion using neural models</article-title>. <source>Sensors</source> <volume>23</volume>:<fpage>4567</fpage>. doi: <pub-id pub-id-type="doi">10.3390/s23094567</pub-id>, <pub-id pub-id-type="pmid">37177771</pub-id></mixed-citation></ref>
<ref id="ref18"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kumar</surname><given-names>R.</given-names></name> <name><surname>Shenoy</surname><given-names>P.</given-names></name> <name><surname>Li</surname><given-names>M.</given-names></name></person-group> (<year>2025</year>). <article-title>Machine learning for radiogenomics integration</article-title>. <source>Diagnostics</source> <volume>15</volume>:<fpage>876</fpage>. doi: <pub-id pub-id-type="doi">10.3390/diagnostics15030876</pub-id></mixed-citation></ref>
<ref id="ref19"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lai</surname><given-names>J.</given-names></name> <name><surname>Huang</surname><given-names>Y.</given-names></name> <name><surname>Patel</surname><given-names>V.</given-names></name></person-group> (<year>2024</year>). <article-title>Radiogenomic transcriptomic multimodal modelling</article-title>. <source>BMC Med. Genomics</source> <volume>17</volume>:<fpage>112</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12920-024-01562-8</pub-id></mixed-citation></ref>
<ref id="ref20"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname><given-names>Y.</given-names></name> <name><surname>Zhao</surname><given-names>H.</given-names></name> <name><surname>Kumar</surname><given-names>A.</given-names></name></person-group> (<year>2024</year>). <article-title>Deep learning-based information fusion for medical diagnosis</article-title>. <source>Inf. Fusion</source> <volume>103</volume>:<fpage>102345</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.inffus.2024.102345</pub-id></mixed-citation></ref>
<ref id="ref21"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liang</surname><given-names>Y.</given-names></name> <name><surname>Tan</surname><given-names>W.</given-names></name> <name><surname>Singh</surname><given-names>A.</given-names></name></person-group> (<year>2025a</year>). <article-title>Four-modality radiomics model for cancer stratification</article-title>. <source>BMC Cancer</source> <volume>25</volume>:<fpage>345</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12885-025-12345-9</pub-id></mixed-citation></ref>
<ref id="ref22"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liang</surname><given-names>Y.</given-names></name> <name><surname>Tan</surname><given-names>W.</given-names></name> <name><surname>Singh</surname><given-names>A.</given-names></name></person-group> (<year>2025b</year>). <article-title>Interpretability methods for multimodal radiomics</article-title>. <source>Cancers</source> <volume>17</volume>:<fpage>3456</fpage>. doi: <pub-id pub-id-type="doi">10.3390/cancers17123456</pub-id></mixed-citation></ref>
<ref id="ref23"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Litjens</surname><given-names>G.</given-names></name> <name><surname>Kooi</surname><given-names>T.</given-names></name> <name><surname>Bejnordi</surname><given-names>B. E.</given-names></name> <name><surname>Setio</surname><given-names>A. A. A.</given-names></name> <name><surname>Ciompi</surname><given-names>F.</given-names></name> <name><surname>Ghafoorian</surname><given-names>M.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>A survey on deep learning in medical image analysis</article-title>. <source>Med. Image Anal.</source> <volume>42</volume>, <fpage>60</fpage>&#x2013;<lpage>88</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2017.07.005</pub-id>, <pub-id pub-id-type="pmid">28778026</pub-id></mixed-citation></ref>
<ref id="ref24"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname><given-names>Z.</given-names></name> <name><surname>Chen</surname><given-names>Y.</given-names></name> <name><surname>Gomez</surname><given-names>R.</given-names></name></person-group> (<year>2025</year>). <article-title>MOFS: multimodal fusion subtyping for oncology</article-title>. <source>Nat. Commun.</source> <volume>16</volume>:<fpage>1456</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41467-025-01456-1</pub-id></mixed-citation></ref>
<ref id="ref25"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname><given-names>Z.</given-names></name> <name><surname>Patel</surname><given-names>S.</given-names></name> <name><surname>Kumar</surname><given-names>R.</given-names></name></person-group> (<year>2025</year>). <article-title>Radiopathology and proteogenomic fusion for cancer prediction</article-title>. <source>Nat. Commun.</source> <volume>16</volume>:<fpage>1123</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41467-025-01234-5</pub-id></mixed-citation></ref>
<ref id="ref26"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname><given-names>C.</given-names></name> <name><surname>Zhang</surname><given-names>Y.</given-names></name> <name><surname>Fernandez</surname><given-names>P.</given-names></name></person-group> (<year>2025</year>). <article-title>Survey of multimodal medical data fusion techniques</article-title>. <source>ACM Comput. Surv.</source> <volume>58</volume>, <fpage>1</fpage>&#x2013;<lpage>38</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3654321</pub-id></mixed-citation></ref>
<ref id="ref27"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ma</surname><given-names>L.</given-names></name> <name><surname>Singh</surname><given-names>R.</given-names></name> <name><surname>Gomez</surname><given-names>F.</given-names></name></person-group> (<year>2024</year>). <article-title>Federated deep learning for diagnostic imaging</article-title>. <source>NPJ Digit. Med.</source> <volume>7</volume>:<fpage>45</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41746-024-00987-2</pub-id></mixed-citation></ref>
<ref id="ref28"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Maigari</surname><given-names>A.</given-names></name> <name><surname>Mensah</surname><given-names>K.</given-names></name> <name><surname>Zhou</surname><given-names>J.</given-names></name></person-group> (<year>2025a</year>). <article-title>Multimodal breast cancer prognosis modelling</article-title>. <source>J. Med. Artif. Intell.</source> <volume>5</volume>, <fpage>23</fpage>&#x2013;<lpage>34</lpage>. doi: <pub-id pub-id-type="doi">10.21037/jmai-25-011</pub-id></mixed-citation></ref>
<ref id="ref29"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Maigari</surname><given-names>A.</given-names></name> <name><surname>Mensah</surname><given-names>K.</given-names></name> <name><surname>Zhou</surname><given-names>J.</given-names></name></person-group> (<year>2025b</year>). <article-title>Multimodal prognosis learning for oncology applications</article-title>. <source>J. Med. Syst.</source> <volume>49</volume>:<fpage>67</fpage>. doi: <pub-id pub-id-type="doi">10.1007/s10916-025-02167-4</pub-id></mixed-citation></ref>
<ref id="ref30"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Martins</surname><given-names>R.</given-names></name> <name><surname>Oliveira</surname><given-names>P.</given-names></name> <name><surname>Silva</surname><given-names>M.</given-names></name></person-group> (<year>2024</year>). <article-title>Clinician-in-the-loop evaluation of XAI tools for diagnosis</article-title>. <source>J. Clin. Inform.</source> <volume>12</volume>, <fpage>145</fpage>&#x2013;<lpage>162</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jclin.2024.04.012</pub-id></mixed-citation></ref>
<ref id="ref31"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nagar</surname><given-names>K.</given-names></name> <name><surname>Singh</surname><given-names>P.</given-names></name> <name><surname>Desai</surname><given-names>R.</given-names></name></person-group> (<year>2025</year>). <article-title>Uncertainty-aware multimodal models for cancer diagnosis</article-title>. <source>IEEE J. Biomed. Health Inform.</source> <volume>29</volume>, <fpage>3345</fpage>&#x2013;<lpage>3356</lpage>. doi: <pub-id pub-id-type="doi">10.1109/JBHI.2025.3345789</pub-id></mixed-citation></ref>
<ref id="ref32"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nakach</surname><given-names>F.</given-names></name> <name><surname>Rahman</surname><given-names>M.</given-names></name> <name><surname>Smith</surname><given-names>J.</given-names></name></person-group> (<year>2024a</year>). <article-title>Comprehensive multimodal approaches in precision oncology</article-title>. <source>Artif. Intell. Rev.</source> <volume>57</volume>, <fpage>1</fpage>&#x2013;<lpage>24</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10462-024-10789-1</pub-id></mixed-citation></ref>
<ref id="ref33"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nakach</surname><given-names>F.</given-names></name> <name><surname>Rahman</surname><given-names>M.</given-names></name> <name><surname>Smith</surname><given-names>J.</given-names></name></person-group> (<year>2024b</year>). <article-title>Survey of multimodal deep learning techniques</article-title>. <source>Artif. Intell. Rev.</source> <volume>56</volume>, <fpage>1</fpage>&#x2013;<lpage>29</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10462-024-10812-9</pub-id></mixed-citation></ref>
<ref id="ref34"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ortega</surname><given-names>M.</given-names></name> <name><surname>Lin</surname><given-names>Y.</given-names></name> <name><surname>Singh</surname><given-names>A.</given-names></name></person-group> (<year>2024</year>). <article-title>Handling missing modalities in multimodal clinical models</article-title>. <source>IEEE Trans. Med. Imaging</source> <volume>43</volume>, <fpage>789</fpage>&#x2013;<lpage>802</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TMI.2024.3345890</pub-id></mixed-citation></ref>
<ref id="ref35"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Oviedo</surname><given-names>F.</given-names></name> <name><surname>Chen</surname><given-names>L.</given-names></name> <name><surname>Martin</surname><given-names>R.</given-names></name></person-group> (<year>2025</year>). <article-title>Explainable MRI-based cancer detection</article-title>. <source>Radiology</source> <volume>305</volume>, <fpage>113</fpage>&#x2013;<lpage>124</lpage>. doi: <pub-id pub-id-type="doi">10.1148/radiol.2025251234</pub-id></mixed-citation></ref>
<ref id="ref36"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Oviedo</surname><given-names>F.</given-names></name> <name><surname>Garcia</surname><given-names>M.</given-names></name> <name><surname>Lee</surname><given-names>H.</given-names></name></person-group> (<year>2025</year>). <article-title>AI-assisted breast MRI screening</article-title>. <source>Radiology</source> <volume>305</volume>, <fpage>112</fpage>&#x2013;<lpage>123</lpage>. doi: <pub-id pub-id-type="doi">10.1148/radiol.2025241234</pub-id></mixed-citation></ref>
<ref id="ref37"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Park</surname><given-names>S.</given-names></name> <name><surname>Kim</surname><given-names>J.</given-names></name> <name><surname>Lee</surname><given-names>H.</given-names></name></person-group> (<year>2025</year>). <article-title>Attention-based fusion networks for multimodal cancer diagnosis</article-title>. <source>Cancers</source> <volume>17</volume>:<fpage>412</fpage>. doi: <pub-id pub-id-type="doi">10.3390/cancers17020412</pub-id></mixed-citation></ref>
<ref id="ref38"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Patel</surname><given-names>R.</given-names></name> <name><surname>Mehta</surname><given-names>S.</given-names></name> <name><surname>Liu</surname><given-names>X.</given-names></name></person-group> (<year>2025</year>). <article-title>Robustness analysis of medical AI models under domain shifts</article-title>. <source>Med. Image Anal.</source> <volume>110</volume>:<fpage>103345</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2025.103345</pub-id></mixed-citation></ref>
<ref id="ref39"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Peng</surname><given-names>P.</given-names></name> <name><surname>Arora</surname><given-names>S.</given-names></name> <name><surname>Becker</surname><given-names>T.</given-names></name></person-group> (<year>2025</year>). <article-title>Progressive transformer-based multimodal fusion</article-title>. <source>Electronics</source> <volume>14</volume>:<fpage>1123</fpage>. doi: <pub-id pub-id-type="doi">10.3390/electronics14051123</pub-id></mixed-citation></ref>
<ref id="ref40"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rabah</surname><given-names>C. B.</given-names></name> <name><surname>Oliveira</surname><given-names>T.</given-names></name> <name><surname>Singh</surname><given-names>K.</given-names></name></person-group> (<year>2025</year>). <article-title>Multimodal cancer classification using deep fusion networks</article-title>. <source>Comput. Biol. Med.</source> <volume>167</volume>:<fpage>108765</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2025.108765</pub-id></mixed-citation></ref>
<ref id="ref41"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ramkumar</surname><given-names>N.</given-names></name> <name><surname>Patel</surname><given-names>S.</given-names></name> <name><surname>Zhou</surname><given-names>L.</given-names></name></person-group> (<year>2023</year>). <article-title>Hybrid fusion models for breast cancer prediction</article-title>. <source>J. Intell. Fuzzy Syst.</source> <volume>45</volume>, <fpage>1123</fpage>&#x2013;<lpage>1134</lpage>. doi: <pub-id pub-id-type="doi">10.3233/JIFS-223456</pub-id></mixed-citation></ref>
<ref id="ref42"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rossi</surname><given-names>F.</given-names></name> <name><surname>Marino</surname><given-names>G.</given-names></name> <name><surname>Baxter</surname><given-names>D.</given-names></name></person-group> (<year>2024</year>). <article-title>Evaluating explanation fidelity in medical imaging</article-title>. <source>Artif. Intell. Med.</source> <volume>134</volume>:<fpage>102233</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.artmed.2024.102233</pub-id></mixed-citation></ref>
<ref id="ref43"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Saha</surname><given-names>A.</given-names></name> <name><surname>Harowicz</surname><given-names>M. R.</given-names></name> <name><surname>Grimm</surname><given-names>L. J.</given-names></name> <name><surname>Weng</surname><given-names>J.</given-names></name> <name><surname>Cain</surname><given-names>E. H.</given-names></name> <name><surname>Kim</surname><given-names>C. E.</given-names></name> <etal/></person-group>. (<year>2021</year>). Dynamic contrast-enhanced magnetic resonance images of breast cancer patients with tumor locations (version 3) [data set]. The Cancer Imaging Archive. doi: <pub-id pub-id-type="doi">10.7937/TCIA.E3SV-RE93</pub-id></mixed-citation></ref>
<ref id="ref44"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Sawyer-Lee</surname><given-names>R.</given-names></name> <name><surname>Gimenez</surname><given-names>F.</given-names></name> <name><surname>Hoogi</surname><given-names>A.</given-names></name> <name><surname>Rubin</surname><given-names>D.</given-names></name></person-group> (<year>2016</year>). Curated breast imaging subset of digital database for screening mammography (CBIS-DDSM) (version 1) [data set]. The Cancer Imaging Archive. doi: <pub-id pub-id-type="doi">10.7937/K9/TCIA.2016.7O02S9CY</pub-id></mixed-citation></ref>
<ref id="ref45"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shah</surname><given-names>S.</given-names></name> <name><surname>Lopez</surname><given-names>D.</given-names></name> <name><surname>Irwin</surname><given-names>A.</given-names></name></person-group> (<year>2024</year>). <article-title>Explainable convolutional neural network models for skin cancer detection</article-title>. <source>J. Imaging</source> <volume>10</volume>:<fpage>55</fpage>. doi: <pub-id pub-id-type="doi">10.3390/jimaging10050055</pub-id></mixed-citation></ref>
<ref id="ref46"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Singh</surname><given-names>Y.</given-names></name> <name><surname>Gupta</surname><given-names>R.</given-names></name> <name><surname>Park</surname><given-names>S.</given-names></name></person-group> (<year>2025</year>). <article-title>Comparative XAI frameworks for medical imaging</article-title>. <source>Patterns</source> <volume>6</volume>:<fpage>100987</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.patter.2025.100987</pub-id></mixed-citation></ref>
<ref id="ref47"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Singh</surname><given-names>Y.</given-names></name> <name><surname>Park</surname><given-names>S.</given-names></name> <name><surname>Gupta</surname><given-names>R.</given-names></name></person-group> (<year>2025</year>). <article-title>Embedding explainability into model training workflows</article-title>. <source>Artif. Intell. Med.</source> <volume>150</volume>:<fpage>102611</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.artmed.2025.102611</pub-id></mixed-citation></ref>
<ref id="ref48"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Singh</surname><given-names>P.</given-names></name> <name><surname>Zhao</surname><given-names>J.</given-names></name> <name><surname>Huang</surname><given-names>Y.</given-names></name></person-group> (<year>2025</year>). <article-title>Clinical alignment metrics for XAI</article-title>. <source>J. Biomed. Inform.</source> <volume>135</volume>:<fpage>104123</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jbi.2025.104123</pub-id></mixed-citation></ref>
<ref id="ref49"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Song</surname><given-names>B.</given-names></name> <name><surname>Li</surname><given-names>H.</given-names></name> <name><surname>Chen</surname><given-names>Q.</given-names></name></person-group> (<year>2025</year>). <article-title>Fusion of radiology and pathology for precision diagnosis</article-title>. <source>EBioMedicine</source> <volume>104</volume>:<fpage>104512</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ebiom.2025.104512</pub-id></mixed-citation></ref>
<ref id="ref50"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tempel</surname><given-names>F.</given-names></name> <name><surname>Ross</surname><given-names>H.</given-names></name> <name><surname>Iqbal</surname><given-names>M.</given-names></name></person-group> (<year>2025</year>). <article-title>Comparison of SHAP and Grad-CAM in clinical imaging</article-title>. <source>IEEE Access</source> <volume>13</volume>, <fpage>11234</fpage>&#x2013;<lpage>11249</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2025.3345678</pub-id></mixed-citation></ref>
<ref id="ref51"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Thambawita</surname><given-names>S.</given-names></name> <name><surname>Yilmaz</surname><given-names>I.</given-names></name> <name><surname>Iinuma</surname><given-names>H.</given-names></name></person-group> (<year>2024</year>). <article-title>Counterfactual explanations in medical AI systems</article-title>. <source>Patterns</source> <volume>5</volume>:<fpage>100845</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.patter.2024.100845</pub-id></mixed-citation></ref>
<ref id="ref52"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Turki</surname><given-names>A.</given-names></name> <name><surname>Fernandez</surname><given-names>L.</given-names></name> <name><surname>Ghosh</surname><given-names>P.</given-names></name></person-group> (<year>2025</year>). <article-title>Multimodal learning for head and neck cancer classification</article-title>. <source>Cancers</source> <volume>17</volume>:<fpage>654</fpage>. doi: <pub-id pub-id-type="doi">10.3390/cancers17030654</pub-id></mixed-citation></ref>
<ref id="ref53"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname><given-names>L.</given-names></name> <name><surname>Chen</surname><given-names>Y.</given-names></name> <name><surname>Patel</surname><given-names>V.</given-names></name></person-group> (<year>2024</year>). <article-title>Multimodal uncertainty estimation in medical AI</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>35</volume>, <fpage>789</fpage>&#x2013;<lpage>801</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TNNLS.2024.1234567</pub-id></mixed-citation></ref>
<ref id="ref54"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wei</surname><given-names>T. R.</given-names></name> <name><surname>Lopez</surname><given-names>J.</given-names></name> <name><surname>Nagle</surname><given-names>M.</given-names></name></person-group> (<year>2025</year>). <article-title>Enhanced breast cancer classification using multimodal deep learning</article-title>. <source>Comput. Med. Imaging Graph.</source> <volume>104</volume>:<fpage>102345</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compmedimag.2025.102345</pub-id></mixed-citation></ref>
<ref id="ref55"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xie</surname><given-names>Y.</given-names></name> <name><surname>Huang</surname><given-names>D.</given-names></name> <name><surname>Sun</surname><given-names>Q.</given-names></name></person-group> (<year>2025</year>). <article-title>Transformer-based cancer imaging models</article-title>. <source>IEEE Trans. Med. Imaging</source> <volume>44</volume>, <fpage>1</fpage>&#x2013;<lpage>12</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TMI.2025.1234567</pub-id></mixed-citation></ref>
<ref id="ref56"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname><given-names>Q.</given-names></name> <name><surname>Nair</surname><given-names>P.</given-names></name> <name><surname>Gomez</surname><given-names>F.</given-names></name></person-group> (<year>2025</year>). <article-title>Counterfactual generation for medical image explanations</article-title>. <source>Med. Image Anal.</source> <volume>112</volume>:<fpage>104678</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2025.104678</pub-id></mixed-citation></ref>
<ref id="ref57"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname><given-names>H.</given-names></name> <name><surname>Chen</surname><given-names>L.</given-names></name> <name><surname>Zhou</surname><given-names>X.</given-names></name></person-group> (<year>2025</year>). <article-title>Multimodal learning for precision oncology</article-title>. <source>Brief. Bioinform.</source> <volume>25</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bib/bbae123</pub-id>, <pub-id pub-id-type="pmid">38555476</pub-id></mixed-citation></ref>
<ref id="ref58"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yoon</surname><given-names>H.</given-names></name> <name><surname>Park</surname><given-names>J.</given-names></name> <name><surname>Chen</surname><given-names>L.</given-names></name></person-group> (<year>2025</year>). <article-title>Impact of explainability on diagnostic confidence and decision-making</article-title>. <source>NPJ Digit. Med.</source> <volume>8</volume>:<fpage>21</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41746-025-00821-3</pub-id></mixed-citation></ref>
<ref id="ref59"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname><given-names>J.</given-names></name> <name><surname>Wang</surname><given-names>L.</given-names></name> <name><surname>Kim</surname><given-names>H.</given-names></name></person-group> (<year>2024</year>). <article-title>Human-centred evaluation of explainable AI</article-title>. <source>IEEE Trans. Med. Imaging</source> <volume>43</volume>, <fpage>334</fpage>&#x2013;<lpage>348</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TMI.2024.3345678</pub-id></mixed-citation></ref>
<ref id="ref60"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname><given-names>L.</given-names></name> <name><surname>Chen</surname><given-names>Y.</given-names></name> <name><surname>Ortega</surname><given-names>M.</given-names></name></person-group> (<year>2024</year>). <article-title>Adversarial robustness of clinical multimodal systems</article-title>. <source>Comput. Biol. Med.</source> <volume>139</volume>:<fpage>108112</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.108112</pub-id></mixed-citation></ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0002">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2028804/overview">Biswaranjan Acharya</ext-link>, Marwadi University, India</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0003">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1204234/overview">Fernando Moreira</ext-link>, Portucalense University, Portugal</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3329915/overview">Sujala Shetty</ext-link>, Birla Institute of Technology and Science, United Arab Emirates</p>
</fn>
</fn-group>
<fn-group>
<fn id="fn0001"><label>1</label><p><ext-link xlink:href="https://www.cancerimagingarchive.net/collection/cbis-ddsm/" ext-link-type="uri">https://www.cancerimagingarchive.net/collection/cbis-ddsm/</ext-link></p></fn>
</fn-group>
</back>
</article>