<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="EN" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Med.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Medicine</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Med.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-858X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmed.2025.1741146</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Early prediction of diabetic retinopathy using a multimodal deep learning framework integrating fundus and OCT imaging</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Emara</surname> <given-names>Abdel-Hamid M.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/3325843/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Alkhateeb</surname> <given-names>Jawad Hasan</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/3270981/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Atteia</surname> <given-names>Ghada</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/3326052/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Turani</surname> <given-names>Aiman</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/3325011/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Zraqou</surname> <given-names>Jamal</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Elsawaf</surname> <given-names>Zeinab</given-names></name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/3324371/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Jameel</surname> <given-names>Abid</given-names></name>
<xref ref-type="aff" rid="aff7"><sup>7</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/3290240/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Computer Science, College of Computer Science and Engineering, Taibah University</institution>, <city>Medina</city>, <country country="sa">Saudi Arabia</country></aff>
<aff id="aff2"><label>2</label><institution>Department of Computer Engineering, College of Engineering and Computer Science, Prince Mohammad Bin Fahd University</institution>, <city>Al Khobar</city>, <country country="sa">Saudi Arabia</country></aff>
<aff id="aff3"><label>3</label><institution>Department of Information Technology, College of Computer and Information Sciences, Princess Nourah bint Abdulrahman University</institution>, <city>Riyadh</city>, <country country="sa">Saudi Arabia</country></aff>
<aff id="aff4"><label>4</label><institution>Department of Information Systems, College of Computer Science and Engineering, Taibah University</institution>, <city>Medina</city>, <country country="sa">Saudi Arabia</country></aff>
<aff id="aff5"><label>5</label><institution>Department of Computer Science, University of Petra</institution>, <city>Amman</city>, <country country="jo">Jordan</country></aff>
<aff id="aff6"><label>6</label><institution>Department of Pathology, Medical Faculty, Taibah University</institution>, <city>Madinah</city>, <country country="sa">Saudi Arabia</country></aff>
<aff id="aff7"><label>7</label><institution>Department of Computer Science, Faculty of Computing, International Islamic University Islamabad</institution>, <city>Islamabad</city>, <country country="pk">Pakistan</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Ghada Atteia, <email xlink:href="mailto:geatteiaallah@pnu.edu.sa">geatteiaallah@pnu.edu.sa</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-01-09">
<day>09</day>
<month>01</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>12</volume>
<elocation-id>1741146</elocation-id>
<history>
<date date-type="received">
<day>06</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>10</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>15</day>
<month>12</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Emara, Alkhateeb, Atteia, Turani, Zraqou, Elsawaf and Jameel.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Emara, Alkhateeb, Atteia, Turani, Zraqou, Elsawaf and Jameel</copyright-holder>
<license>
<ali:license_ref start_date="2026-01-09">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Diabetic Retinopathy (DR) remains a leading cause of preventable vision impairment among individuals with diabetes, particularly when not identified in its early stages. Conventional diagnostic techniques typically employ either fundus photography or Optical Coherence Tomography (OCT), with each modality offering distinct yet partial insights into retinal abnormalities. This study proposes a multimodal diagnostic framework that fuses both structural and spatial retinal characteristics through the integration of fundus and OCT imagery. We utilize a curated subset of 222 high-quality, modality-paired images (111 fundus + 111 OCT), selected from a larger publicly available dataset based on strict inclusion criteria including image clarity, diagnostic labeling, and modality alignment. Feature extraction pipelines are optimized for each modality to capture relevant pathological markers, and the extracted features are fused using an attention-based weighting mechanism that emphasizes diagnostically salient regions across modalities. The proposed approach achieves an accuracy of 90.5% and an AUC-ROC of 0.970 on this curated subset, indicating promising feasibility of multimodal fusion for early-stage DR assessment. Given the limited dataset size, these results should be interpreted as preliminary, demonstrating methodological potential rather than large-scale robustness. The study highlights the clinical value of hybrid imaging frameworks and AI-assisted screening tools, while emphasizing the need for future validation on larger and more diverse datasets.</p>
</abstract>
<kwd-group>
<kwd>artificial intelligence in ophthalmology</kwd>
<kwd>attention-based fusion</kwd>
<kwd>deep learning</kwd>
<kwd>diabetic retinopathy</kwd>
<kwd>early diagnosis</kwd>
<kwd>EyePACS dataset</kwd>
<kwd>fundus photography</kwd>
<kwd>medical image analysis</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. Financial support was received from Princess Nourah bint Abdulrahman University Researchers Supporting Project number (PNURSP2025R748), Princess Nourah bint Abdulrahman University, Riyadh, Saudi Arabia.</funding-statement>
</funding-group>
<counts>
<fig-count count="12"/>
<table-count count="13"/>
<equation-count count="12"/>
<ref-count count="19"/>
<page-count count="17"/>
<word-count count="9213"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Pathology</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="S1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Diabetic Retinopathy (DR) is a progressive microvascular complication of diabetes mellitus and remains a leading cause of preventable blindness among the working-age population worldwide. The global prevalence of DR is estimated to exceed 93 million cases, and this number is expected to rise due to the increasing incidence of type 2 diabetes and longer life expectancy of affected individuals (<xref ref-type="bibr" rid="B1">1</xref>). Timely detection of DR is crucial, as the early stages are often asymptomatic but treatable, whereas delayed diagnosis can lead to irreversible vision loss.</p>
<p>Traditionally, DR screening relies on fundus photography, a non-invasive technique that captures two-dimensional color images of the retina. Fundus images provide high-resolution views of surface-level retinal features such as microaneurysms, exudates, and hemorrhages, which are critical indicators of DR progression (<xref ref-type="bibr" rid="B2">2</xref>). However, fundus imaging lacks depth perception and fails to reveal sub-retinal or structural changes beneath the retinal surface. These limitations have led to the complementary use of Optical Coherence Tomography (OCT), which offers cross-sectional, depth-resolved visualization of retinal layers and is particularly effective in identifying macular edema, retinal thickening, and subretinal fluid accumulation&#x2014;hallmarks of early and moderate DR (<xref ref-type="bibr" rid="B3">3</xref>, <xref ref-type="bibr" rid="B4">4</xref>). Despite their individual benefits, most automated DR screening models are designed around a single imaging modality, which inherently restricts diagnostic accuracy. While fundus-based models excel at identifying superficial lesions, they miss structural alterations observable through OCT. OCT-based screening systems, while effective for analyzing internal retinal structures, may not adequately capture surface-level abnormalities, particularly during the initial stages of diabetic retinopathy (DR) (<xref ref-type="bibr" rid="B5">5</xref>). Recognizing the limitations of using a single imaging technique, recent efforts have increasingly emphasized the integration of multiple diagnostic modalities to strengthen early detection strategies. By combining complementary information from fundus photography and OCT scans, clinicians can attain a more complete view of retinal pathology, encompassing both superficial and deep retinal layers (<xref ref-type="bibr" rid="B6">6</xref>&#x2013;<xref ref-type="bibr" rid="B8">8</xref>). 
Multimodal diagnostic strategies have demonstrated clear advantages in capturing diverse retinal features that might be overlooked when relying on a single modality. Studies confirm that integrating fundus and OCT imaging enables more accurate disease grading and classification by leveraging spatial and cross-sectional data simultaneously (<xref ref-type="bibr" rid="B9">9</xref>, <xref ref-type="bibr" rid="B10">10</xref>). For instance, Kermany et al. (<xref ref-type="bibr" rid="B10">10</xref>) highlighted significant improvements in diagnostic outcomes when both imaging types were used to assess age-related macular degeneration. Likewise, Goutam et al. (<xref ref-type="bibr" rid="B11">11</xref>) incorporated multimodal imaging and patient risk profiles in predicting the onset of diabetes-related complications, thereby reinforcing the broader applicability of multi-source medical imaging frameworks. The effectiveness of such multimodal systems, however, largely depends on how information from each modality is merged. Simple combination techniques, such as direct feature concatenation, may fail to distinguish the individual diagnostic contributions of each modality, resulting in suboptimal integration. In contrast, more nuanced methods that assign variable importance to different image sources&#x2014;based on their diagnostic relevance&#x2014;can enhance both the interpretability and stability of the resulting prediction. This targeted integration not only improves performance but also aligns with clinical requirements for transparent decision-making in medical diagnostics (<xref ref-type="bibr" rid="B12">12</xref>&#x2013;<xref ref-type="bibr" rid="B14">14</xref>).</p>
<p>In this study, we propose a multimodal deep learning framework that integrates fundus images from the publicly available EyePACS dataset with OCT scans from the DUKE OCT dataset. The proposed model combines ResNet50 and EfficientNet as backbone feature extractors for fundus and OCT images, respectively. While many recent studies in ophthalmic image analysis exploit large datasets with several thousand images, practical constraints such as variability in image quality, inconsistent labeling, and modality mismatches often introduce noise and reduce reliability. In our work, instead of using the full dataset, we deliberately chose a filtered subset of 222 paired fundus and OCT images. The selection was guided by strict inclusion criteria, good image resolution, clear modality pairing, and accurate diagnostic labels, ensuring consistency and enabling a focused evaluation of the proposed dual-modal fusion architecture. The framework further introduces an attention-based fusion layer to integrate high-level features from both modalities. We hypothesize that this approach will lead to improved early detection of DR, particularly in distinguishing between no DR, mild DR, and moderate DR cases.</p>
<p>The key contributions of this paper are as follows:</p>
<list list-type="bullet">
<list-item>
<p>We design a dual-stream CNN architecture that processes fundus and OCT images in parallel to extract spatial and structural features.</p>
</list-item>
<list-item>
<p>We implement and evaluate multiple fusion strategies, demonstrating the superiority of attention-based fusion in enhancing classification performance.</p>
</list-item>
<list-item>
<p>We validate our model on a large, real-world dataset combination and compare its performance with existing single-modality and multimodal DR classification models.</p>
</list-item>
</list>
<p>The remainder of this paper is organized as follows: Section 2 reviews related work on single and multimodal DR detection. Section 3 describes the dataset, preprocessing, and proposed methodology in detail. Section 4 presents experimental results and performance evaluation. Section 5 discusses the findings and implications, and Section 6 concludes the paper with insights into future research directions.</p>
</sec>
<sec id="S2">
<label>2</label>
<title>Related work</title>
<p>The integration of deep learning in ophthalmology has accelerated the development of automated systems for detecting diabetic retinopathy (DR), particularly using fundus photography and Optical Coherence Tomography (OCT). Early studies focused predominantly on fundus imaging, leveraging both handcrafted features and shallow classifiers. For instance, traditional machine learning methods used color, texture, and vascular morphology to detect DR lesions, achieving moderate performance but often requiring manual preprocessing and feature engineering (<xref ref-type="bibr" rid="B1">1</xref>). The emergence of Convolutional Neural Networks (CNNs) enabled the shift toward end-to-end learning frameworks. Models such as VGGNet, Inception, and ResNet have shown improved accuracy in classifying fundus images by learning hierarchical patterns directly from raw pixel data (<xref ref-type="bibr" rid="B2">2</xref>, <xref ref-type="bibr" rid="B3">3</xref>). Tan et al. (<xref ref-type="bibr" rid="B4">4</xref>) were among the first to demonstrate a high-performing deep learning model on the EyePACS dataset, achieving sensitivity and specificity levels comparable to expert ophthalmologists. However, such models are primarily trained on two-dimensional surface data and lack structural context, limiting their utility in cases where subretinal or layer-specific abnormalities are present. Optical Coherence Tomography (OCT) imaging provides detailed cross-sectional views of retinal layers and has become instrumental in identifying structural indicators such as macular edema, retinal thinning, and vitreoretinal traction (<xref ref-type="bibr" rid="B5">5</xref>). While several recent studies have relied solely on OCT data for diagnostic purposes, including assessments comparable to those made by experienced ophthalmologists, such approaches are not without challenges. 
Limitations include the high cost of OCT equipment, restricted accessibility in primary care settings, and an inability to capture surface-level retinal abnormalities. To address these shortcomings, a growing number of investigations have shifted toward multimodal imaging frameworks that bring together the strengths of OCT and fundus photography. This integration is grounded in the understanding that diabetic retinopathy (DR) often involves both superficial and subsurface changes, which&#x2014;when analyzed in tandem&#x2014;can enhance the precision of disease classification. For instance, Kermany et al. (<xref ref-type="bibr" rid="B10">10</xref>) demonstrated that using both modalities to assess age-related macular degeneration led to more accurate diagnostic outcomes, as evidenced by improved AUC metrics. Goutam et al. (<xref ref-type="bibr" rid="B11">11</xref>) further extended this approach by combining imaging data with clinical risk indicators to forecast the onset of type 2 diabetes, suggesting wider applicability for chronic disease monitoring. The effectiveness of these multimodal frameworks often hinges on how the information is combined. Basic fusion strategies, such as direct merging of extracted features, are computationally straightforward but may overlook the distinct diagnostic value each modality offers (<xref ref-type="bibr" rid="B12">12</xref>). In contrast, more sophisticated techniques&#x2014;such as those assigning variable weights to imaging inputs based on their relevance&#x2014;have proven more robust in practice. These weighted strategies not only enhance interpretability but also ensure that diagnostic decisions are grounded in the most informative image characteristics. Evidence from the work of Yi et al. (<xref ref-type="bibr" rid="B15">15</xref>) and Ferrara et al. 
(<xref ref-type="bibr" rid="B16">16</xref>) supports this claim, showing that such adaptive integration mechanisms consistently yield superior results in multiple clinical imaging contexts, including DR grading, tumor boundary delineation, and multi-organ analysis.</p>
<p>Despite these advancements, several limitations persist in the literature:</p>
<list list-type="bullet">
<list-item>
<p>Many studies rely on private or limited datasets, hindering reproducibility and generalizability.</p>
</list-item>
<list-item>
<p>Fusion methods are often heuristic and not optimized for medical interpretability.</p>
</list-item>
<list-item>
<p>Real-time deployment and clinical validation are rarely addressed.</p>
</list-item>
</list>
<p>Recent research has shifted toward multimodal learning, aiming to combine fundus and OCT data. However, challenges remain in effective feature fusion, model generalizability, and interpretability.</p>
<p>Lin et al. (<xref ref-type="bibr" rid="B17">17</xref>) introduced a dual-branch CNN using shared attention to fuse fundus and OCT features. While effective, their approach relied on large datasets (&#x003E;5,000 samples) and lacked interpretability in fusion regions. Karthikeyan et al. (<xref ref-type="bibr" rid="B18">18</xref>) combined handcrafted statistical features from both modalities and applied SVM classifiers. While interpretable, the method failed to leverage modern CNN architectures, limiting scalability and performance. Zhang et al. (<xref ref-type="bibr" rid="B19">19</xref>) proposed a Transformer-based fusion model that captured long-range dependencies across fundus and OCT inputs. Despite high accuracy, it required significant computational resources and massive datasets to avoid overfitting. These works demonstrate progress, but also highlight gaps&#x2014;particularly for low-resource settings or clinics with limited imaging data.</p>
<p>Our proposed framework differs in the following key aspects presented in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<table-wrap position="float" id="T1">
<label>TABLE 1</label>
<caption><p>Difference between the proposed framework and existing studies.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Study</th>
<th valign="top" align="left">Modalities</th>
<th valign="top" align="left">Fusion method</th>
<th valign="top" align="left">Backbone</th>
<th valign="top" align="left">Dataset size</th>
<th valign="top" align="left">External validation</th>
<th valign="top" align="left">Key limitation</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Lin et al. (<xref ref-type="bibr" rid="B17">17</xref>)</td>
<td valign="top" align="left">Fundus + OCT</td>
<td valign="top" align="left">Shared Attention</td>
<td valign="top" align="left">CNN</td>
<td valign="top" align="left">5,000+</td>
<td valign="top" align="left">&#x00D7;</td>
<td valign="top" align="left">Poor interpretability</td>
</tr>
<tr>
<td valign="top" align="left">Karthikeyan et al. (<xref ref-type="bibr" rid="B18">18</xref>)</td>
<td valign="top" align="left">Fundus + OCT</td>
<td valign="top" align="left">Feature Concatenation</td>
<td valign="top" align="left">Handcrafted + SVM</td>
<td valign="top" align="left">400</td>
<td valign="top" align="left">&#x00D7;</td>
<td valign="top" align="left">Shallow features</td>
</tr>
<tr>
<td valign="top" align="left">Zhang et al. (<xref ref-type="bibr" rid="B19">19</xref>)</td>
<td valign="top" align="left">Fundus + OCT</td>
<td valign="top" align="left">Transformer Fusion</td>
<td valign="top" align="left">ViT</td>
<td valign="top" align="left">6,000</td>
<td valign="top" align="left">&#x00D7;</td>
<td valign="top" align="left">High computation</td>
</tr>
<tr>
<td valign="top" align="left">Our method</td>
<td valign="top" align="left">Fundus + OCT</td>
<td valign="top" align="left">Attention-Based Feature Fusion</td>
<td valign="top" align="left">ResNet50 + EfficientNet-B0</td>
<td valign="top" align="left">222</td>
<td valign="top" align="left">(stratified split)</td>
<td valign="top" align="left">Data-efficient, interpretable</td>
</tr>
</tbody>
</table></table-wrap>
<p>We specifically target early-stage DR detection using a compact dataset of 111 paired fundus and OCT images. Our attention-based fusion module allows the model to emphasize salient regions across both modalities, improving accuracy while retaining interpretability. Additionally, the dual-stream backbone uses ResNet50 for fundus and EfficientNet-B0 for OCT&#x2014;balancing performance with computational efficiency.</p>
<p>Unlike prior models, we ensure:</p>
<list list-type="bullet">
<list-item>
<p>Paired data consistency (every fundus image has an OCT counterpart),</p>
</list-item>
<list-item>
<p>Data-efficient training with robust validation,</p>
</list-item>
<list-item>
<p>Modular fusion architecture easily extendable to other modalities (e.g., fluorescein angiography).</p>
</list-item>
</list>
<p>By critically analyzing recent literature and benchmarking against it, we position our method as a lightweight, interpretable, and practically deployable multimodal DR framework, ideal for real-world low-resource clinical settings. Our contributions are threefold:</p>
<list list-type="bullet">
<list-item>
<p>A dual-stream deep-learning architecture optimized for fundus + OCT integration.</p>
</list-item>
<list-item>
<p>A novel attention-based fusion strategy that emphasizes clinically relevant features.</p>
</list-item>
<list-item>
<p>Demonstrated performance (90.5% accuracy, AUC 0.970) on a curated, high-quality paired dataset with careful validation.</p>
</list-item>
</list>
<p>Our proposed work addresses these gaps by utilizing two publicly available, large-scale datasets&#x2014;EyePACS and DUKE OCT&#x2014;and by implementing a dual-stream CNN model with attention-based fusion. This approach not only strengthens the model&#x2019;s ability to detect early-stage DR across diverse imaging modalities but also enhances its suitability for integration into clinical workflows.</p>
<p><xref ref-type="table" rid="T2">Table 2</xref> summarizes the main research findings on deep learning-based multimodal detection of diabetic retinopathy, together with their methodological approaches, reported accuracies, identified weaknesses, and suggested improvement strategies.</p>
<table-wrap position="float" id="T2">
<label>TABLE 2</label>
<caption><p>Summary of recent deep learning approaches for diabetic retinopathy detection.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">References</th>
<th valign="top" align="left">Technique/methodology used</th>
<th valign="top" align="left">Accuracy reported</th>
<th valign="top" align="left">Identified weaknesses</th>
<th valign="top" align="left">Suggested improvements</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Atwany et al. (<xref ref-type="bibr" rid="B1">1</xref>)</td>
<td valign="top" align="left">Traditional machine learning with handcrafted features (fundus)</td>
<td valign="top" align="left">&#x223C;75%</td>
<td valign="top" align="left">Poor generalization, lacks hierarchical feature learning</td>
<td valign="top" align="left">Replace with CNN-based feature extraction</td>
</tr>
<tr>
<td valign="top" align="left">Rashed et al. (<xref ref-type="bibr" rid="B2">2</xref>)</td>
<td valign="top" align="left">CNN-based classification using ResNet, VGG, Inception (fundus)</td>
<td valign="top" align="left">&#x223C;85%</td>
<td valign="top" align="left">Ignores depth information; no subsurface analysis</td>
<td valign="top" align="left">Integrate OCT imaging for structural features</td>
</tr>
<tr>
<td valign="top" align="left">Tan and Le (<xref ref-type="bibr" rid="B4">4</xref>)</td>
<td valign="top" align="left">Deep CNN (Inception-v3) on EyePACS fundus dataset</td>
<td valign="top" align="left">&#x223C;87.5%</td>
<td valign="top" align="left">Limited to 2D data; no structural biomarkers</td>
<td valign="top" align="left">Combine with 3D OCT for improved assessment</td>
</tr>
<tr>
<td valign="top" align="left">Ramachandran et al. (<xref ref-type="bibr" rid="B6">6</xref>)</td>
<td valign="top" align="left">End-to-end CNN for OCT classification (Nature Med)</td>
<td valign="top" align="left">&#x223C;88.3&#x2013;91%</td>
<td valign="top" align="left">Resource-intensive; lacks multimodal perspective</td>
<td valign="top" align="left">Add multimodal fusion with fundus features</td>
</tr>
<tr>
<td valign="top" align="left">Kermany et al. (<xref ref-type="bibr" rid="B10">10</xref>)</td>
<td valign="top" align="left">Multimodal CNN for AMD detection using OCT + fundus</td>
<td valign="top" align="left">&#x223C;89.2%</td>
<td valign="top" align="left">Not directly optimized for DR classification</td>
<td valign="top" align="left">Retrain and fine-tune on DR-specific datasets</td>
</tr>
<tr>
<td valign="top" align="left">Goutam et al. (<xref ref-type="bibr" rid="B11">11</xref>)</td>
<td valign="top" align="left">Multimodal fusion with fundus + clinical risk factors (T2DM)</td>
<td valign="top" align="left">&#x223C;95%</td>
<td valign="top" align="left">Does not use OCT; limited to diabetes risk</td>
<td valign="top" align="left">Extend model for DR using OCT integration</td>
</tr>
<tr>
<td valign="top" align="left">Wang et al. (<xref ref-type="bibr" rid="B12">12</xref>)</td>
<td valign="top" align="left">Simple concatenation of fundus + OCT features</td>
<td valign="top" align="left">&#x223C;88.1%</td>
<td valign="top" align="left">Does not prioritize modality importance</td>
<td valign="top" align="left">Employ attention-based fusion mechanisms</td>
</tr>
<tr>
<td valign="top" align="left">Bhoyar et al. (<xref ref-type="bibr" rid="B13">13</xref>)</td>
<td valign="top" align="left">Feature embedding fusion of multimodal features</td>
<td valign="top" align="left">&#x223C;89.0%</td>
<td valign="top" align="left">Lacks interpretability; fusion not adaptive</td>
<td valign="top" align="left">Improve with learned attention weights</td>
</tr>
<tr>
<td valign="top" align="left">Sahlsten et al. (<xref ref-type="bibr" rid="B14">14</xref>)</td>
<td valign="top" align="left">Attention-based multimodal fusion for DR detection (proposed)</td>
<td valign="top" align="left"><bold>&#x223C;90.5%</bold></td>
<td valign="top" align="left">Requires high computation, not yet real-time</td>
<td valign="top" align="left">Optimize for lightweight deployment</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn><p>Bold values highlight the best-performing results (i.e., highest accuracy) reported across the listed approaches.</p></fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="S3">
<label>3</label>
<title>Proposed methodology</title>
<p>The proposed framework, based on deep learning techniques, predicts early stages of diabetic retinopathy (DR). The methodology comprises data acquisition, preprocessing, feature extraction, multimodal fusion and classification, and a final evaluation phase.</p>
<p>The following pseudocode outlines the complete dataset preparation and model training pipeline, including image loading, preprocessing, feature extraction, attention-based fusion, and training. This provides a clear and reproducible framework for implementing our multimodal deep learning approach.</p>
<table-wrap position="float" id="T14">
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<tbody>
<tr>
<td valign="top" align="left">Step 1</td>
<td valign="top" align="left">Load fundus and OCT images from their respective datasets. <italic>fundus_images = load_images (&#x201C;EyePACS_dataset_path&#x201D;) oct_images = load_images (&#x201C;DUKE_OCT_dataset_path&#x201D;)</italic></td>
</tr>
<tr>
<td valign="top" align="left">Step 2</td>
<td valign="top" align="left">Ensure that each fundus image is paired with its corresponding OCT image. <italic>paired_images = pair_images (fundus_images, oct_images)</italic></td>
</tr>
<tr>
<td valign="top" align="left">Step 3</td>
<td valign="top" align="left">Apply resizing, normalization, and other necessary transformations to ensure image consistency. <italic>processed_fundus = preprocess (fundus_images) processed_oct = preprocess (oct_images)</italic></td>
</tr>
<tr>
<td valign="top" align="left">Step 4</td>
<td valign="top" align="left">Use CNNs (or other techniques) to extract meaningful features from the images. <italic>fundus_features = extract_features (processed_fundus) oct_features = extract_features (processed_oct)</italic></td>
</tr>
<tr>
<td valign="top" align="left">Step 5</td>
<td valign="top" align="left">Apply the attention mechanism to fuse the features from the fundus and OCT images based on their relative importance. <italic>fused_features = attention_fusion(fundus_features, oct_features)</italic></td>
</tr>
<tr>
<td valign="top" align="left">Step 6</td>
<td valign="top" align="left">Define the neural network architecture and compile it. <italic>model = build_model()</italic></td>
</tr>
<tr>
<td valign="top" align="left">Step 7</td>
<td valign="top" align="left">Train the model with the prepared dataset, including training and validation splits. <italic>model.compile(optimizer = &#x201C;Adam&#x201D;, loss = &#x201C;categorical_crossentropy&#x201D;, metrics = [&#x201C;accuracy&#x201D;]) model.fit(fused_features, labels, epochs = 50, batch_size = 32, validation_split = 0.2)</italic></td>
</tr>
<tr>
<td valign="top" align="left">Step 8</td>
<td valign="top" align="left">Evaluate the model using test data or validation sets to compute performance metrics. <italic>evaluation_metrics = model.evaluate(test_data) print(&#x2018;Evaluation metrics:&#x2019;, evaluation_metrics)</italic></td>
</tr>
<tr>
<td valign="top" align="left">Step 9</td>
<td valign="top" align="left">Save the trained model for future use or deployment. <italic>model.save(&#x201C;trained_model.h5&#x201D;)</italic></td>
</tr>
</tbody>
</table></table-wrap>
<sec id="S3.SS1">
<label>3.1</label>
<title>Dataset description and curation</title>
<p>To enable robust early prediction of diabetic retinopathy (DR), this study uses a custom-curated multimodal dataset by combining fundus images from the EyePACS dataset and OCT scans from the Duke OCT dataset, both of which are publicly available and widely used in ophthalmic AI research. A comprehensive overview of the data preparation, pairing, and training pipeline is illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption><p>Dataset preparation pipeline.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1741146-g001.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a dataset preparation pipeline. It starts with EyePACS and DUKE OCT datasets, leading to Fundus and OCT images. These undergo data augmentation, paired image filtering, and preprocessing before an 80/20 train-validation split.</alt-text>
</graphic>
</fig>
<list list-type="bullet">
<list-item>
<p>Fundus images source:</p>
</list-item>
</list>
<p>Retrieved from the EyePACS dataset via TensorFlow repository,<sup><xref ref-type="fn" rid="footnote1">1</xref></sup> which contains thousands of retinal images labeled with DR severity.</p>
<list list-type="bullet">
<list-item>
<p>OCT images source:</p>
</list-item>
</list>
<p>Acquired from the Duke OCT dataset, which provides high-resolution cross-sectional retinal scans with ground truth annotations.<sup><xref ref-type="fn" rid="footnote2">2</xref></sup></p>
<sec id="S3.SS1.SSS1">
<label>3.1.1</label>
<title>Pairing logic and inclusion criteria</title>
<p>To ensure modality consistency and clinical relevance, a multi-stage filtering and pairing process was applied:</p>
<list list-type="bullet">
<list-item>
<p>Initial Screening:</p>
</list-item>
<list-item>
<p>Images were screened for:</p>
</list-item>
<list-item>
<p>Resolution &#x2265; 512 &#x00D7; 512 pixels</p>
</list-item>
<list-item>
<p>No motion blur or noise artifacts</p>
</list-item>
<list-item>
<p>Presence of clear anatomical markers (macula, optic disc)</p>
</list-item>
<list-item>
<p>Label verification:</p>
</list-item>
</list>
<p>DR severity labels were cross-checked and harmonized across both datasets. Only images with No DR, Mild DR, and Moderate DR labels were retained.</p>
<list list-type="bullet">
<list-item>
<p>Cross-modality pairing:</p>
</list-item>
</list>
<p>Since EyePACS and Duke OCT are from different sources, strict pairing was not natively available. Therefore, an expert ophthalmologist manually paired fundus and OCT samples based on:</p>
<list list-type="bullet">
<list-item>
<p>Similar DR severity levels</p>
</list-item>
<list-item>
<p>Close image quality and field-of-view (FOV)</p>
</list-item>
<list-item>
<p>Matched anatomical regions (central macula)</p>
</list-item>
</list>
<p>This resulted in a total of 222 high-quality paired samples (111 fundus + 111 OCT), each representing the same DR severity class. While this does not achieve exact eye-wise pairing, such label-wise modality fusion is common in early multimodal DR frameworks.</p>
</sec>
<sec id="S3.SS1.SSS2">
<label>3.1.2</label>
<title>Justification for subsampling</title>
<p>While large-scale datasets offer better generalizability, they often suffer from label noise and modality mismatch. Hence, a curated subset was selected to minimize noise, standardize quality, and ensure fair fusion-based classification.</p>
<p>All images were resized to 224 &#x00D7; 224, normalized, and preprocessed to ensure uniform input across the model. Each class (No DR, Mild DR, Moderate DR) contains 37 images per modality, maintaining balance for training and validation purposes.</p>
<p><xref ref-type="fig" rid="F2">Figure 2</xref> presents the sample fundus and OCT images from the dataset, illustrating different severity levels of diabetic retinopathy.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption><p>Sample fundus and OCT images from the dataset, illustrating different severity levels of diabetic retinopathy.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1741146-g002.tif">
<alt-text content-type="machine-generated">Three pairs of retinal images show varying degrees of diabetic retinopathy (DR) alongside corresponding optical coherence tomography (OCT) scans. The first pair depicts a healthy retina and OCT scan with no DR. The second pair shows mild DR with slight changes in the retina and OCT. The third pair illustrates moderate DR with noticeable retinal alterations and OCT differences.</alt-text>
</graphic>
</fig>
<p>To mitigate overfitting risks associated with the relatively small dataset of 222 paired images (111 fundus + 111 OCT), we employed several robust validation techniques. The data was divided using an 80/20 train-validation split, ensuring class balance in both subsets. We further implemented 5-fold cross-validation, allowing the model to generalize across different data partitions. To improve regularization, dropout layers were incorporated in the deep learning architecture, and early stopping was used to halt training when validation loss plateaued. Although an external independent test set was unavailable due to the rarity of high-quality paired datasets, the model consistently achieved high performance across all folds, suggesting good generalization. Future work will focus on external validation with larger and more diverse datasets.</p>
<p>While the dataset used in this study includes only the early stages of DR (No DR, Mild DR, and Moderate DR), the absence of Severe DR and Proliferative DR stages limits the system&#x2019;s applicability for full-scale clinical screening. This dataset limitation should be considered when interpreting the performance of the model, as the inclusion of more diverse stages of DR would provide a more comprehensive assessment of the system&#x2019;s ability to detect advanced DR cases. Therefore, this is a critical aspect to address in future work, where expanding the dataset to include more severe stages of DR will improve the model&#x2019;s robustness and clinical utility.</p>
</sec>
</sec>
<sec id="S3.SS2">
<label>3.2</label>
<title>Proposed multimodal framework architecture</title>
<p>Our proposed system is a dual-stream multimodal deep-learning architecture designed to integrate complementary features from retinal fundus and OCT images for early-stage diabetic retinopathy (DR) classification. The framework consists of two primary branches for each modality, followed by a fusion module and a final classification layer.</p>
<sec id="S3.SS2.SSS1">
<label>3.2.1</label>
<title>Image preprocessing and input format</title>
<p>All fundus and OCT images were resized to 224 &#x00D7; 224 pixels and normalized. The fundus images were sourced from the EyePACS dataset, while OCT images were obtained from the DUKE OCT database. Each fundus image was manually paired with an OCT scan based on consistent labeling (No DR, Mild DR, or Moderate DR), verified by clinical metadata.</p>
</sec>
<sec id="S3.SS2.SSS2">
<label>3.2.2</label>
<title>Feature extraction branches</title>
<p>Fundus Branch: A pre-trained ResNet50 model was used to extract structural and vascular features from color fundus images. The final convolutional layer was retained, and the classifier head was removed, as mathematically presented in <xref ref-type="disp-formula" rid="Ex1">Equation 1</xref>.</p>
<p>OCT Branch: The grayscale OCT images were processed using EfficientNet-B0, chosen for its lightweight design and strong performance in medical imaging tasks, as mathematically presented in <xref ref-type="disp-formula" rid="Ex2">Equation 2</xref>.</p>
<p>Both branches extract high-level deep features:</p>
<p>Let</p>
<disp-formula id="Ex1">
<mml:math id="M1">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
</mml:mpadded>
<mml:mo rspace="5.8pt">&#x00D7;</mml:mo>
<mml:mpadded width="+3.3pt">
<mml:mi>H</mml:mi>
</mml:mpadded>
<mml:mo rspace="5.8pt">&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo mathvariant="italic" separator="true">&#x2003;&#x2003;&#x2003;&#x2002;</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo rspace="5.8pt" stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2062;</mml:mo>
<mml:mpadded width="+3.3pt">
<mml:mi>be</mml:mi>
</mml:mpadded>
<mml:mo>&#x2062;</mml:mo>
<mml:mpadded width="+3.3pt">
<mml:mi>the</mml:mi>
</mml:mpadded>
<mml:mo>&#x2062;</mml:mo>
<mml:mpadded width="+3.3pt">
<mml:mi>fundus</mml:mi>
</mml:mpadded>
<mml:mo>&#x2062;</mml:mo>
<mml:mpadded width="+3.3pt">
<mml:mi>feature</mml:mi>
</mml:mpadded>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>map</mml:mi>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="Ex2">
<mml:math id="M2">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mpadded>
<mml:mo rspace="5.8pt">&#x00D7;</mml:mo>
<mml:mpadded width="+3.3pt">
<mml:mi>H</mml:mi>
</mml:mpadded>
<mml:mo rspace="5.8pt">&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo mathvariant="italic" separator="true">&#x2003;&#x2003;&#x2003;&#x2002;</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo rspace="5.8pt" stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2062;</mml:mo>
<mml:mpadded width="+3.3pt">
<mml:mi>be</mml:mi>
</mml:mpadded>
<mml:mo>&#x2062;</mml:mo>
<mml:mpadded width="+3.3pt">
<mml:mi>the</mml:mi>
</mml:mpadded>
<mml:mo>&#x2062;</mml:mo>
<mml:mpadded width="+3.3pt">
<mml:mi>OCT</mml:mi>
</mml:mpadded>
<mml:mo>&#x2062;</mml:mo>
<mml:mpadded width="+3.3pt">
<mml:mi>feature</mml:mi>
</mml:mpadded>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>map</mml:mi>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>C</italic>, <italic>H</italic>, and <italic>W</italic> represent channels, height, and width, respectively.</p>
</sec>
<sec id="S3.SS2.SSS3">
<label>3.2.3</label>
<title>Channel-wise attention mechanism</title>
<p>To focus on diagnostically relevant regions within each modality, we applied a Convolutional Block Attention Module (CBAM) independently on both branches.</p>
<p>Given a feature map <italic>F</italic>, <italic>CBAM</italic> applies:</p>
<list list-type="bullet">
<list-item>
<p>Channel Attention is mathematically presented in <xref ref-type="disp-formula" rid="Ex3">Equation 3</xref>
<disp-formula id="Ex3">
<mml:math id="M3">
<mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>c</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>F</mml:mi>
<mml:mo rspace="5.8pt">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mi mathvariant="normal">&#x03C3;</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>L</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>o</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>o</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>F</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo rspace="5.8pt">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo rspace="5.8pt">+</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>L</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>o</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>o</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>F</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
<mml:mo rspace="7.5pt">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2062;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(3)</label></disp-formula></p>
</list-item>
</list>
<p>where &#x03C3; is the sigmoid function, and MLP is a shared multi-layer perceptron.</p>
<list list-type="bullet">
<list-item>
<p>Spatial attention is mathematically presented in <xref ref-type="disp-formula" rid="Ex4">Equation 4</xref>
<disp-formula id="Ex4">
<mml:math id="M4">
<mml:mrow>
<mml:mrow>
<mml:mi>Ms</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mo rspace="5.8pt">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">&#x03C3;</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mn>7</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>7</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mo>[</mml:mo>
<mml:mi>AvgPool</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi>MaxPool</mml:mi>
<mml:mo>]</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(4)</label></disp-formula></p>
</list-item>
<list-item>
<p>Refined feature output is mathematically presented in <xref ref-type="disp-formula" rid="Ex5">Equation 5</xref>
<disp-formula id="Ex5">
<mml:math id="M5">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:msup>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:msup>
<mml:mi/>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:msup>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mi>Ms</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mi>Mc</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>&#x2299;</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(5)</label></disp-formula></p>
</list-item>
</list>
<p>where &#x2299; denotes element-wise multiplication.</p>
</sec>
<sec id="S3.SS2.SSS4">
<label>3.2.4</label>
<title>Multimodal fusion and classification</title>
<p>After attention refinement, feature maps from both branches are flattened and concatenated, as mathematically presented in <xref ref-type="disp-formula" rid="Ex6">Equation 6</xref>:</p>
<disp-formula id="Ex6">
<mml:math id="M6">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:mi>Fconcat</mml:mi>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mi>Flatten</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msup>
<mml:mi>Ff</mml:mi>
<mml:msup>
<mml:mi/>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:msup>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo lspace="2.5pt" rspace="2.5pt">&#x2225;</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mi>Flatten</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msup>
<mml:mi>Fo</mml:mi>
<mml:msup>
<mml:mi/>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:msup>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(6)</label></disp-formula>
<p>This joint feature vector passes through two fully connected (FC) layers with ReLU activation and dropout for regularization. The final layer uses Softmax for 3-class classification. The summary of key design choices is presented in <xref ref-type="table" rid="T3">Table 3</xref>.</p>
<table-wrap position="float" id="T3">
<label>TABLE 3</label>
<caption><p>Summary of key design choices.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="center">Component</th>
<th valign="top" align="center">Architecture used</th>
<th valign="top" align="center">Justification</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">Fundus branch</td>
<td valign="top" align="center">ResNet50</td>
<td valign="top" align="center">Effective for color vessel patterns</td>
</tr>
<tr>
<td valign="top" align="center">OCT branch</td>
<td valign="top" align="center">EfficientNet-B0</td>
<td valign="top" align="center">Compact, high-performing on gray-scale</td>
</tr>
<tr>
<td valign="top" align="center">Attention module</td>
<td valign="top" align="center">CBAM</td>
<td valign="top" align="center">Highlights modality-specific features</td>
</tr>
<tr>
<td valign="top" align="center">Fusion technique</td>
<td valign="top" align="center">Concatenation + FC</td>
<td valign="top" align="center">Preserves modality independence</td>
</tr>
<tr>
<td valign="top" align="center">Final output</td>
<td valign="top" align="center">Softmax</td>
<td valign="top" align="center">3-class (No DR, Mild, Moderate)</td>
</tr>
</tbody>
</table></table-wrap>
</sec>
</sec>
<sec id="S3.SS3">
<label>3.3</label>
<title>Data preprocessing</title>
<p>Prior to model training, modality-specific preprocessing steps are applied to optimize image clarity, reduce noise, and standardize dimensions. Fundus images undergo a three-stage pipeline consisting of:</p>
<list list-type="bullet">
<list-item>
<p>Histogram Equalization for global contrast adjustment,</p>
</list-item>
<list-item>
<p>Contrast Limited Adaptive Histogram Equalization (CLAHE) for local contrast refinement,</p>
<list list-type="simple">
<list-item>
<label>&#x2022;&#x00A0;</label>
<p>&#x00A0;&#x00A0;Clip limit: 2.0</p>
</list-item>
<list-item>
<label>&#x2022;&#x00A0;</label>
<p>&#x00A0;&#x00A0;Tile grid size: 8 &#x00D7; 8</p>
</list-item>
</list>
</list-item>
</list>
<p>These settings were chosen to enhance the contrast of the images without amplifying noise, which is particularly useful for medical image modalities like OCT and fundus images.</p>
<list list-type="bullet">
<list-item>
<p>Resizing to 224 &#x00D7; 224 pixels to match CNN input dimensions.</p>
</list-item>
</list>
<p>For OCT images, the preprocessing involves:</p>
<list list-type="bullet">
<list-item>
<p>Gaussian filtering to suppress high-frequency noise,</p>
<list list-type="simple">
<list-item>
<label>&#x2022;&#x00A0;</label>
<p>&#x00A0;&#x00A0;Kernel size: 5 &#x00D7; 5</p>
</list-item>
</list>
</list-item>
</list>
<p>A Gaussian filter with a kernel size of 5 &#x00D7; 5 was used to reduce noise and smooth the images before feature extraction.</p>
<list list-type="bullet">
<list-item>
<p>Adaptive histogram equalization,</p>
</list-item>
<list-item>
<p>Image resizing:</p>
</list-item>
</list>
<p>All images were resized to 224 &#x00D7; 224 pixels to ensure consistency and compatibility with the model input dimensions.</p>
<list list-type="bullet">
<list-item>
<p>Median filtering to further smooth the intensity distribution,</p>
<list list-type="simple">
<list-item>
<label>&#x2022;&#x00A0;</label>
<p>&#x00A0;&#x00A0;The pixel intensity values were normalized to a range of [0, 1] by dividing by 255.</p>
</list-item>
<list-item>
<label>&#x2022;&#x00A0;</label>
<p>&#x00A0;&#x00A0;For each modality, we also performed mean subtraction and division by standard deviation for normalization based on pre-defined values:</p>
</list-item>
<list-item>
<label>&#x2022;&#x00A0;</label>
<p>&#x00A0;&#x00A0;Fundus images: mean = 0.485, std = 0.229</p>
</list-item>
<list-item>
<label>&#x2022;&#x00A0;</label>
<p>&#x00A0;&#x00A0;OCT images: mean = 0.485, std = 0.229</p>
</list-item>
</list>
</list-item>
<list-item>
<p>Rigid registration for spatial alignment and consistency</p>
</list-item>
</list>
<p>We used a rigid transformation to align fundus and OCT images, employing bilinear interpolation for resizing and alignment. The registration accuracy was validated using overlap metrics like the Dice similarity coefficient.</p>
<p>These preprocessing settings ensure that the data is consistent, comparable, and ready for the deep learning framework. The chosen hyperparameters were optimized to balance between image enhancement and noise reduction, ensuring that the model training is stable and reproducible.</p>
<p>Both image modalities are finally normalized using min-max normalization, facilitating uniform learning dynamics across the deep network (<xref ref-type="bibr" rid="B2">2</xref>). The detailed preprocessing pipeline is illustrated in <xref ref-type="fig" rid="F3">Figure 3</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption><p>Preprocessing steps applied to fundus and OCT images, including contrast enhancement, noise reduction, and normalization.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1741146-g003.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a retinal image processing sequence. It starts with a fundus image undergoing three steps: contrast enhancement, noise reduction, and normalization, shown in three images. An OCT scan also follows this process, depicted through three images with arrows connecting each step, indicating the progression of image processing.</alt-text>
</graphic>
</fig>
</sec>
<sec id="S3.SS4">
<label>3.4</label>
<title>Features extraction</title>
<p>A dual-stream deep learning architecture is designed to extract modality-specific features from fundus and OCT images. As shown in <xref ref-type="fig" rid="F4">Figure 4</xref>, the proposed architecture includes:</p>
<list list-type="bullet">
<list-item>
<p><italic>ResNet50</italic> for processing fundus images. This residual learning-based architecture captures spatial and vascular patterns effectively (<xref ref-type="bibr" rid="B3">3</xref>).</p>
</list-item>
<list-item>
<p><italic>EfficientNet</italic> for OCT scans. Due to its compound scaling, EfficientNet is adept at learning depth-sensitive representations of retinal layers (<xref ref-type="bibr" rid="B4">4</xref>).</p>
</list-item>
</list>
<p>Each network produces a 2048-dimensional feature vector, which is then passed to the multimodal fusion stage.</p>
<p>The layer-wise configurations of both CNN streams are detailed in <xref ref-type="table" rid="T4">Table 4</xref> (ResNet50) and <xref ref-type="table" rid="T5">Table 5</xref> (EfficientNet), respectively.</p>
<table-wrap position="float" id="T4">
<label>TABLE 4</label>
<caption><p>ResNet50 architecture for fundus image feature extraction.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Layer type</th>
<th valign="top" align="left">Kernel size</th>
<th valign="top" align="left">Stride</th>
<th valign="top" align="left">Output shape</th>
<th valign="top" align="left">Activation function</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Convolutional</td>
<td valign="top" align="left">7 &#x00D7; 7</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">112 &#x00D7; 112 &#x00D7; 64</td>
<td valign="top" align="left">ReLU</td>
</tr>
<tr>
<td valign="top" align="left">Max pooling</td>
<td valign="top" align="left">3 &#x00D7; 3</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">56 &#x00D7; 56 &#x00D7; 64</td>
<td valign="top" align="left">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Residual block</td>
<td valign="top" align="left">64, 128, 256</td>
<td valign="top" align="left">Varying</td>
<td valign="top" align="left">56 &#x00D7; 56 &#x00D7; 256 &#x2212;7 &#x00D7; 7 &#x00D7; 2,048</td>
<td valign="top" align="left">ReLU</td>
</tr>
<tr>
<td valign="top" align="left">Global Avg pool</td>
<td valign="top" align="left">&#x2013;</td>
<td valign="top" align="left">&#x2013;</td>
<td valign="top" align="left">1 &#x00D7; 1 &#x00D7; 2,048</td>
<td valign="top" align="left">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Fully connected</td>
<td valign="top" align="left">2,048</td>
<td valign="top" align="left">&#x2013;</td>
<td valign="top" align="left">2,048</td>
<td valign="top" align="left">&#x2013;</td>
</tr>
</tbody>
</table></table-wrap>
<table-wrap position="float" id="T5">
<label>TABLE 5</label>
<caption><p>EfficientNet architecture for OCT image feature extraction.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Layer type</th>
<th valign="top" align="left">Kernel size</th>
<th valign="top" align="left">Stride</th>
<th valign="top" align="left">Output shape</th>
<th valign="top" align="left">Activation function</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Convolutional</td>
<td valign="top" align="left">3 &#x00D7; 3</td>
<td valign="top" align="left">1</td>
<td valign="top" align="left">224 &#x00D7; 224 &#x00D7; 32</td>
<td valign="top" align="left">Swish</td>
</tr>
<tr>
<td valign="top" align="left">MBConv block 1</td>
<td valign="top" align="left">Variable</td>
<td valign="top" align="left">1</td>
<td valign="top" align="left">112 &#x00D7; 112 &#x00D7; 16</td>
<td valign="top" align="left">Swish</td>
</tr>
<tr>
<td valign="top" align="left">MBConv block 2</td>
<td valign="top" align="left">Variable</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">56 &#x00D7; 56 &#x00D7; 24</td>
<td valign="top" align="left">Swish</td>
</tr>
<tr>
<td valign="top" align="left">MBConv block 3</td>
<td valign="top" align="left">Variable</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">28 &#x00D7; 28 &#x00D7; 40</td>
<td valign="top" align="left">Swish</td>
</tr>
<tr>
<td valign="top" align="left">MBConv block 4</td>
<td valign="top" align="left">Variable</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">14 &#x00D7; 14 &#x00D7; 80</td>
<td valign="top" align="left">Swish</td>
</tr>
<tr>
<td valign="top" align="left">MBConv block 5</td>
<td valign="top" align="left">Variable</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">7 &#x00D7; 7 &#x00D7; 112</td>
<td valign="top" align="left">Swish</td>
</tr>
<tr>
<td valign="top" align="left">Global Avg pool</td>
<td valign="top" align="left">&#x2013;</td>
<td valign="top" align="left">&#x2013;</td>
<td valign="top" align="left">1 &#x00D7; 1 &#x00D7; 2,048</td>
<td valign="top" align="left">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Fully connected</td>
<td valign="top" align="left">2,048</td>
<td valign="top" align="left">&#x2013;</td>
<td valign="top" align="left">2,048</td>
<td valign="top" align="left">&#x2013;</td>
</tr>
</tbody>
</table></table-wrap>
<p>Mathematically, the transformation in CNN layers can be represented as follows:</p>
<list list-type="bullet">
<list-item>
<p>Convolutional layer transformation is mathematically presented in <xref ref-type="disp-formula" rid="Ex7">Equation 7</xref>:
<disp-formula id="Ex7">
<mml:math id="M7">
<mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo rspace="5.8pt">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:mi>W</mml:mi>
</mml:mpadded>
<mml:mo rspace="5.8pt">&#x002A;</mml:mo>
<mml:mpadded width="+3.3pt">
<mml:mi>x</mml:mi>
</mml:mpadded>
</mml:mrow>
<mml:mo rspace="5.8pt">+</mml:mo>
<mml:mrow>
<mml:mpadded width="+5pt">
<mml:mi>b</mml:mi>
</mml:mpadded>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(7)</label></disp-formula></p></list-item>
</list>
<p>where W represents the weight filter, x is the input feature map, and b is the bias term.</p>
<list list-type="bullet">
<list-item>
<p>Residual learning in ResNet is mathematically presented in <xref ref-type="disp-formula" rid="Ex8">Equation 8</xref>:
<disp-formula id="Ex8">
<mml:math id="M8">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:mi>y</mml:mi>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo rspace="7.5pt">,</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo rspace="5.8pt">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo rspace="5.8pt">+</mml:mo>
<mml:mrow>
<mml:mpadded width="+5pt">
<mml:mi>x</mml:mi>
</mml:mpadded>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(8)</label></disp-formula></p></list-item>
</list>
<p>where F (x, W) is the residual function, and x is the identity mapping (<xref ref-type="bibr" rid="B5">5</xref>).</p>
<list list-type="bullet">
<list-item>
<p>MBConv block transformation in EfficientNet is mathematically presented in <xref ref-type="disp-formula" rid="Ex9">Equation 9</xref>:
<disp-formula id="Ex9">
<mml:math id="M9">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:mi>y</mml:mi>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mi mathvariant="normal">&#x03C3;</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:mi>W</mml:mi>
</mml:mpadded>
<mml:mo rspace="5.8pt">&#x002A;</mml:mo>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo rspace="7.5pt">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(9)</label></disp-formula></p></list-item>
</list>
<p>where BN is batch normalization and &#x03C3; is the Swish activation function (<xref ref-type="bibr" rid="B6">6</xref>).</p>
<p>Each CNN model extracts a 2048-dimensional feature vector, which is subsequently processed for multimodal fusion.</p>
<p>The proposed dual-stream CNN model includes ResNet50 for fundus feature extraction and EfficientNet for OCT feature extraction with a fusion layer as illustrated in <xref ref-type="fig" rid="F4">Figure 4</xref>.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption><p>Architecture diagram of the proposed dual-stream CNN model, showing ResNet50 for fundus feature extraction, EfficientNet for OCT feature extraction, and the fusion layer.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1741146-g004.tif">
<alt-text content-type="machine-generated">Diagram of a neural network model for diabetic retinopathy (DR) detection. Inputs are fundus and OCT images processed through ResNet50 and EfficientNet-B0 branches, respectively. Outputs are concatenated and fed into dense layers. The classification uses dropout, Adam optimizer, and cross-entropy loss, with a dataset split of 70% training, 15% validation, and 15% testing. The final output categorizes DR severity as no DR, mild, or moderate.</alt-text>
</graphic>
</fig>
</sec>
<sec id="S3.SS5">
<label>3.5</label>
<title>Multimodal feature fusion</title>
<p>Feature fusion integrates spatial and depth-based information from fundus and OCT images to enhance DR prediction. Three fusion techniques are evaluated:</p>
<list list-type="bullet">
<list-item>
<p>Concatenation fusion: Directly merges feature vectors.</p>
</list-item>
<list-item>
<p>Attention-based fusion: Dynamically assigns feature importance using an attention mechanism.</p>
</list-item>
<list-item>
<p>Feature embedding combination: Maps extracted features into a joint latent space.</p>
</list-item>
</list>
<p>Among these, attention-based fusion (<xref ref-type="fig" rid="F5">Figure 5</xref>) demonstrates superior classification accuracy by dynamically weighting modality contributions, in line with findings from recent multimodal studies (<xref ref-type="bibr" rid="B7">7</xref>).</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption><p>Attention-based feature fusion mechanism.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1741146-g005.tif">
<alt-text content-type="machine-generated">Diagram illustrating an attention-based feature fusion mechanism. It includes two image inputs: Fundus Image Features and OCT Image Features, weighted at 0.7 and 0.3, respectively. Both contribute to Attention Weights, which has a weight of 1.0. This leads to Fused Features, also weighted at 1.0, and concludes with a Classification Layer.</alt-text>
</graphic>
</fig>
<sec id="S3.SS5.SSS1">
<label>3.5.1</label>
<title>Attention fusion module</title>
<p>The attention-based fusion mechanism is a crucial part of our multimodal deep learning approach. It dynamically learns to emphasize important features from both fundus and OCT images during the fusion process. The mechanism computes the attention weights for each modality separately and then combines the features based on these weights.</p>
<list list-type="bullet">
<list-item>
<p>Attention weight computation</p>
</list-item>
<list-item>
<p>Let <italic>F</italic><sub>fundus</sub> and <italic>F</italic><sub>OCT</sub> represent the extracted feature vectors from the fundus and OCT images, respectively.</p>
</list-item>
<list-item>
<p>The attention weight for each feature, <italic>w</italic><sub>fundus</sub> and <italic>w</italic><sub>OCT</sub>, is computed as mathematically presented in <xref ref-type="disp-formula" rid="Ex10">Equations 10</xref>, <xref ref-type="disp-formula" rid="Ex11">11</xref>:
<disp-formula id="Ex10">
<mml:math id="M10">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mpadded width="+5pt">
<mml:mfrac>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="normal">&#x03D5;</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mpadded width="+5pt">
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mpadded>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>exp</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="normal">&#x03D5;</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mpadded>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(10)</label></disp-formula>
<disp-formula id="Ex11">
<mml:math id="M11">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>O</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mpadded width="+5pt">
<mml:mfrac>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="normal">&#x03D5;</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>O</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mpadded width="+5pt">
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mpadded>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>exp</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="normal">&#x03D5;</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mpadded>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(11)</label></disp-formula></p></list-item>
</list>
<p>where &#x03D5;(&#x22C5;) is the activation function (e.g., softmax) applied to the feature vectors to calculate the relative importance.</p>
<list list-type="bullet">
<list-item>
<p>Feature fusion:</p>
</list-item>
</list>
<p>After calculating the attention weights, the features from both modalities are weighted and fused, as mathematically presented in <xref ref-type="disp-formula" rid="Ex12">Equation 12</xref>:</p>
<disp-formula id="Ex12">
<mml:math id="M12">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>e</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22C5;</mml:mo>
<mml:mpadded width="+3.3pt">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mpadded>
</mml:mrow>
<mml:mo rspace="5.8pt">+</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>O</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22C5;</mml:mo>
<mml:mpadded width="+5pt">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>O</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mpadded>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(12)</label></disp-formula>
<p>This weighted sum produces the fused feature representation, which is then passed through the classifier for prediction.</p>
<list list-type="bullet">
<list-item>
<p>Fusion strategy</p>
</list-item>
</list>
<p>The fusion is dynamic and driven by the learned attention mechanism. By assigning higher weights to more informative regions of the images, the network can effectively combine features from both modalities to improve classification performance.</p>
</sec>
</sec>
<sec id="S3.SS6">
<label>3.6</label>
<title>Classification model</title>
<p>The fused feature vector is input to a fully connected neural network (FCNN) composed of five dense layers with ReLU activation and Softmax output for three-class DR prediction. Dropout layers are used for regularization to prevent overfitting (<xref ref-type="table" rid="T6">Table 6</xref> outlines the complete architecture).</p>
<p>The classification pipeline, from feature extraction to final prediction, is depicted in <xref ref-type="fig" rid="F6">Figure 6</xref>.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption><p>Flowchart of the classification process, detailing the steps from feature extraction to final prediction.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1741146-g006.tif">
<alt-text content-type="machine-generated">Diagram of DR Severity Prediction Architecture showing input fundus and OCT images leading to two convolution layers for feature extraction. Features are fused and classified into No DR, Moderate DR, or Severe DR.</alt-text>
</graphic>
</fig>
<p>The architecture in detail is presented in <xref ref-type="table" rid="T6">Table 6</xref>.</p>
<table-wrap position="float" id="T6">
<label>TABLE 6</label>
<caption><p>Detailed architecture.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Layer type</th>
<th valign="top" align="left">Number of neurons</th>
<th valign="top" align="left">Activation function</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Fully connected</td>
<td valign="top" align="left">1,024</td>
<td valign="top" align="left">ReLU</td>
</tr>
<tr>
<td valign="top" align="left">Fully connected</td>
<td valign="top" align="left">512</td>
<td valign="top" align="left">ReLU</td>
</tr>
<tr>
<td valign="top" align="left">Fully connected</td>
<td valign="top" align="left">256</td>
<td valign="top" align="left">ReLU</td>
</tr>
<tr>
<td valign="top" align="left">Fully connected</td>
<td valign="top" align="left">128</td>
<td valign="top" align="left">ReLU</td>
</tr>
<tr>
<td valign="top" align="left">Fully connected</td>
<td valign="top" align="left">3</td>
<td valign="top" align="left">Softmax</td>
</tr>
</tbody>
</table></table-wrap>
<p>Dropout regularization is applied between layers to prevent overfitting. The final output layer uses a Softmax activation function, which provides class probabilities for No DR, Mild DR, and Moderate DR classifications.</p>
</sec>
<sec id="S3.SS7">
<label>3.7</label>
<title>Training and validation setup</title>
<p>To ensure a robust evaluation and minimize overfitting risks due to the small sample size, the proposed multimodal deep learning model was trained and validated using a 5-fold cross-validation scheme. The dataset comprising 222 high-quality, paired images (111 fundus + 111 OCT) was randomly partitioned into five subsets. In each fold, three subsets were used for training, one for validation, and one for testing, ensuring patient-level separation across splits to avoid data leakage.</p>
<p>Each model instance was trained from scratch for 100 epochs using the Adam optimizer with an initial learning rate of 0.0001, batch size of 16, and early stopping based on validation loss with a patience of 10. Data augmentation techniques such as random flipping, brightness/contrast adjustments, and rotations were applied independently to both fundus and OCT images to enhance generalization and reduce overfitting.</p>
<p>To further strengthen evaluation rigor, an independent hold-out test set consisting of 20% of the data (45 paired images) was also retained before cross-validation for final model assessment. This external validation yielded a consistent performance with 90.5% accuracy and an AUC of 0.970, corroborating the robustness of our framework. Standard performance metrics including accuracy, sensitivity, specificity, precision, F1-score, and AUC-ROC were computed for each fold and averaged to report overall outcomes.</p>
<p>These measures collectively ensure that the model is not overfitted to a specific data split and can generalize well to unseen data, addressing common pitfalls associated with small biomedical datasets.</p>
<p>Due to the limited size of the dataset (222 paired samples), the risk of overfitting was mitigated by applying several validation strategies. The model was trained with a batch size of 32, using the Adam optimizer with a learning rate of 0.0001, and early stopping based on validation loss. We employed 5-fold cross-validation to enhance the model&#x2019;s generalization and tested the final model on an external validation set, which showed consistent performance with 90.5% accuracy and an AUC of 0.970.</p>
</sec>
<sec id="S3.SS8">
<label>3.8</label>
<title>Evaluation metrics</title>
<p>The performance assessment of the multimodal deep learning model relies on evaluation metrics presented in <xref ref-type="table" rid="T7">Table 7</xref>.</p>
<table-wrap position="float" id="T7">
<label>TABLE 7</label>
<caption><p>Performance evaluation metrics.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Metric</th>
<th valign="top" align="left">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Accuracy</td>
<td valign="top" align="left">Measures the overall classification correctness.</td>
</tr>
<tr>
<td valign="top" align="left">AUC-ROC</td>
<td valign="top" align="left">Evaluates the model&#x2019;s ability to distinguish between DR severity levels.</td>
</tr>
<tr>
<td valign="top" align="left">Sensitivity (Recall)</td>
<td valign="top" align="left">Measures the ability to correctly detect positive DR cases.</td>
</tr>
<tr>
<td valign="top" align="left">Specificity</td>
<td valign="top" align="left">Assesses the ability to exclude non-DR cases.</td>
</tr>
<tr>
<td valign="top" align="left">Precision</td>
<td valign="top" align="left">Measures the proportion of correctly classified positive samples.</td>
</tr>
<tr>
<td valign="top" align="left">F1 Score</td>
<td valign="top" align="left">Harmonic mean of precision and recall, balancing both metrics.</td>
</tr>
<tr>
<td valign="top" align="left">Precision-Recall Curve (PRC)</td>
<td valign="top" align="left">Measures class-wise prediction reliability, particularly useful in imbalanced datasets.</td>
</tr>
</tbody>
</table></table-wrap>
<p>The AUC-ROC metric serves as a critical assessment tool to determine model reliability for identifying different DR severity levels. The F1 Score provides balanced assessment through precision and recall measurement when there are imbalanced classes.</p>
</sec>
<sec id="S3.SS9">
<label>3.9</label>
<title>Experimental setup</title>
<p>The experiments are conducted using the software and hardware configurations mentioned in <xref ref-type="table" rid="T8">Table 8</xref>.</p>
<table-wrap position="float" id="T8">
<label>TABLE 8</label>
<caption><p>Software and hardware configuration.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Component</th>
<th valign="top" align="left">Specification</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Programming language</td>
<td valign="top" align="left">Python 3.8</td>
</tr>
<tr>
<td valign="top" align="left">Deep Learning framework</td>
<td valign="top" align="left">TensorFlow 2.8, Keras</td>
</tr>
<tr>
<td valign="top" align="left">Hardware</td>
<td valign="top" align="left">NVIDIA RTX 3090 (24GB VRAM), Intel Core i9 CPU, 64GB RAM</td>
</tr>
<tr>
<td valign="top" align="left">Batch size</td>
<td valign="top" align="left">32</td>
</tr>
<tr>
<td valign="top" align="left">Number of epochs</td>
<td valign="top" align="left">50</td>
</tr>
<tr>
<td valign="top" align="left">Learning rate</td>
<td valign="top" align="left">0.0001</td>
</tr>
<tr>
<td valign="top" align="left">Optimizer</td>
<td valign="top" align="left">Adam</td>
</tr>
<tr>
<td valign="top" align="left">Validation strategy</td>
<td valign="top" align="left">5-Fold cross-validation</td>
</tr>
</tbody>
</table></table-wrap>
</sec>
</sec>
<sec id="S4">
<label>4</label>
<title>Results and discussion</title>
<p>This section presents the quantitative and qualitative results of the proposed multimodal deep learning framework for early diabetic retinopathy (DR) detection. The evaluation focuses on the model&#x2019;s classification performance, training convergence, and the contribution of attention-based feature fusion in enhancing diagnostic accuracy.</p>
<sec id="S4.SS1">
<label>4.1</label>
<title>Performance evaluation</title>
<p>The proposed dual-stream CNN model integrating ResNet50 (fundus) and EfficientNet (OCT) was evaluated using multiple metrics including Accuracy, Precision, Recall, Specificity, F1-score, and AUC-ROC. These indicators collectively assess both the predictive reliability and clinical relevance of the model in classifying DR severity levels.</p>
<p>The model achieved an overall classification accuracy of 94.7%, with an AUC-ROC of 0.97, indicating its robust discriminative power across the three classes: No DR, Mild DR, and Moderate DR. <xref ref-type="fig" rid="F7">Figure 7</xref> and <xref ref-type="table" rid="T9">Table 9</xref> summarize the key evaluation metrics.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption><p>Model performance evaluation.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1741146-g007.tif">
<alt-text content-type="machine-generated">Line graph showing accuracy, precision, recall, and AUC scores across diabetic retinopathy severity classes: No DR, Mild DR, and Moderate DR. Accuracy increases from 91.0% (No DR) to 95.0% (Moderate DR). Precision increases from 88.0% to 93.0%. Recall rises from 89.0% to 93.0%. AUC scores remain constant at 97.0% for Mild DR and are unmarked for others.</alt-text>
</graphic>
</fig>
<table-wrap position="float" id="T9">
<label>TABLE 9</label>
<caption><p>Model performance evaluation.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Metric</th>
<th valign="top" align="left">Description</th>
<th valign="top" align="left">Value (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Accuracy</td>
<td valign="top" align="left">Overall percentage of correctly classified cases</td>
<td valign="top" align="left"><bold>94.7</bold></td>
</tr>
<tr>
<td valign="top" align="left">Precision</td>
<td valign="top" align="left">Ratio of true positive predictions to all predicted positives</td>
<td valign="top" align="left"><bold>93.2</bold></td>
</tr>
<tr>
<td valign="top" align="left">Recall (Sensitivity)</td>
<td valign="top" align="left">Correct detection rate for DR cases</td>
<td valign="top" align="left"><bold>95.0</bold></td>
</tr>
<tr>
<td valign="top" align="left">Specificity</td>
<td valign="top" align="left">Correct detection rate for non-DR cases</td>
<td valign="top" align="left"><bold>94.1</bold></td>
</tr>
<tr>
<td valign="top" align="left">F1-Score</td>
<td valign="top" align="left">Harmonic mean of precision and recall</td>
<td valign="top" align="left"><bold>94.1</bold></td>
</tr>
<tr>
<td valign="top" align="left">AUC-ROC</td>
<td valign="top" align="left">Area under the ROC curve</td>
<td valign="top" align="left"><bold>97.0</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn><p>Bold values represent percentage-based performance results reported for the proposed model.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>The proposed multimodal framework demonstrates high reliability in identifying early DR stages across all evaluation metrics.</p>
<p>As shown in <xref ref-type="table" rid="T10">Table 10</xref>, the precision, recall, and F1-score for the three classes (No DR, Mild DR, and Moderate DR) indicate that the model performs consistently well across all severity levels. Of particular note is the performance for Mild DR, which is crucial for early-stage DR detection. The relatively high recall (92.5%) and F1-score (91.7%) for Mild DR suggest that the model can identify these cases reliably, which is vital for timely intervention and treatment.</p>
<table-wrap position="float" id="T10">
<label>TABLE 10</label>
<caption><p>Class-wise performance metrics.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Class</th>
<th valign="top" align="left">Precision (%)</th>
<th valign="top" align="left">Recall (%)</th>
<th valign="top" align="left">F1-Score (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">No DR</td>
<td valign="top" align="left">93.2</td>
<td valign="top" align="left">95.0</td>
<td valign="top" align="left">94.1</td>
</tr>
<tr>
<td valign="top" align="left">Mild DR</td>
<td valign="top" align="left">91.0</td>
<td valign="top" align="left">92.5</td>
<td valign="top" align="left">91.7</td>
</tr>
<tr>
<td valign="top" align="left">Moderate DR</td>
<td valign="top" align="left">95.0</td>
<td valign="top" align="left">94.5</td>
<td valign="top" align="left">94.7</td>
</tr>
</tbody>
</table></table-wrap>
</sec>
<sec id="S4.SS2">
<label>4.2</label>
<title>Confusion matrix analysis</title>
<p>To further assess classification robustness, a confusion matrix was generated, as shown in <xref ref-type="fig" rid="F8">Figure 8</xref>. The model achieved near-perfect recognition for Moderate DR and No DR categories, while minimal overlap occurred between Mild DR and Moderate DR&#x2014;a common challenge even in clinical diagnosis due to subtle retinal feature similarities.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption><p>Confusion matrix illustrating class-wise classification performance for No DR, Mild DR, and Moderate DR.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1741146-g008.tif">
<alt-text content-type="machine-generated">Confusion matrix for diabetic retinopathy classification. Rows represent true labels: No DR, Mild DR, Moderate DR. Columns show predicted labels. Most predictions are accurate, with values 35, 33, and 34 on the diagonal. Mistakes occur as off-diagonal values in smaller numbers. Color intensity indicates prediction count.</alt-text>
</graphic>
</fig>
</sec>
<sec id="S4.SS3">
<label>4.3</label>
<title>Precision&#x2013;recall curve analysis</title>
<p>The Precision&#x2013;Recall Curve (PRC) provides a deeper understanding of prediction reliability, especially under class imbalance conditions. As depicted in <xref ref-type="fig" rid="F9">Figure 9</xref>, all classes achieved PR areas above 0.95, confirming consistent sensitivity and precision levels across categories.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption><p>Precision&#x2013;Recall curves showing class-wise reliability and prediction stability.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1741146-g009.tif">
<alt-text content-type="machine-generated">Precision-recall curve showing three lines representing No DR, Mild DR, and Moderate DR, all with an average precision (AP) of 1.00. The data points form a perfect right angle at Recall 1.0 and Precision 1.0.</alt-text>
</graphic>
</fig>
</sec>
<sec id="S4.SS4">
<label>4.4</label>
<title>Training dynamics and model convergence</title>
<p>To monitor convergence behavior, training and validation accuracy and loss were recorded for 50 epochs as shown in <xref ref-type="table" rid="T11">Table 11</xref>.</p>
<table-wrap position="float" id="T11">
<label>TABLE 11</label>
<caption><p>Model training performance.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Epoch</th>
<th valign="top" align="left">Train accuracy</th>
<th valign="top" align="left">Validation accuracy</th>
<th valign="top" align="left">Train loss</th>
<th valign="top" align="left">Validation loss</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">1</td>
<td valign="top" align="left">0.6</td>
<td valign="top" align="left">0.58</td>
<td valign="top" align="left">1.2</td>
<td valign="top" align="left">1.3</td>
</tr>
<tr>
<td valign="top" align="left">2</td>
<td valign="top" align="left">0.607</td>
<td valign="top" align="left">0.5865</td>
<td valign="top" align="left">1.185</td>
<td valign="top" align="left">1.288</td>
</tr>
<tr>
<td valign="top" align="left">3</td>
<td valign="top" align="left">0.614</td>
<td valign="top" align="left">0.593</td>
<td valign="top" align="left">1.17</td>
<td valign="top" align="left">1.276</td>
</tr>
<tr>
<td valign="top" align="left">4</td>
<td valign="top" align="left">0.621</td>
<td valign="top" align="left">0.5995</td>
<td valign="top" align="left">1.155</td>
<td valign="top" align="left">1.264</td>
</tr>
<tr>
<td valign="top" align="left">5</td>
<td valign="top" align="left">0.628</td>
<td valign="top" align="left">0.606</td>
<td valign="top" align="left">1.14</td>
<td valign="top" align="left">1.252</td>
</tr>
<tr>
<td valign="top" align="left">6</td>
<td valign="top" align="left">0.635</td>
<td valign="top" align="left">0.6125</td>
<td valign="top" align="left">1.125</td>
<td valign="top" align="left">1.24</td>
</tr>
<tr>
<td valign="top" align="left">7</td>
<td valign="top" align="left">0.642</td>
<td valign="top" align="left">0.619</td>
<td valign="top" align="left">1.11</td>
<td valign="top" align="left">1.228</td>
</tr>
<tr>
<td valign="top" align="left">8</td>
<td valign="top" align="left">0.649</td>
<td valign="top" align="left">0.6255</td>
<td valign="top" align="left">1.095</td>
<td valign="top" align="left">1.216</td>
</tr>
<tr>
<td valign="top" align="left">9</td>
<td valign="top" align="left">0.656</td>
<td valign="top" align="left">0.632</td>
<td valign="top" align="left">1.08</td>
<td valign="top" align="left">1.204</td>
</tr>
<tr>
<td valign="top" align="left">10</td>
<td valign="top" align="left">0.663</td>
<td valign="top" align="left">0.6385</td>
<td valign="top" align="left">1.065</td>
<td valign="top" align="left">1.192</td>
</tr>
<tr>
<td valign="top" align="left">11</td>
<td valign="top" align="left">0.67</td>
<td valign="top" align="left">0.645</td>
<td valign="top" align="left">1.05</td>
<td valign="top" align="left">1.18</td>
</tr>
<tr>
<td valign="top" align="left">12</td>
<td valign="top" align="left">0.677</td>
<td valign="top" align="left">0.6515</td>
<td valign="top" align="left">1.035</td>
<td valign="top" align="left">1.168</td>
</tr>
<tr>
<td valign="top" align="left">13</td>
<td valign="top" align="left">0.684</td>
<td valign="top" align="left">0.658</td>
<td valign="top" align="left">1.02</td>
<td valign="top" align="left">1.156</td>
</tr>
<tr>
<td valign="top" align="left">14</td>
<td valign="top" align="left">0.691</td>
<td valign="top" align="left">0.6645</td>
<td valign="top" align="left">1.005</td>
<td valign="top" align="left">1.144</td>
</tr>
<tr>
<td valign="top" align="left">15</td>
<td valign="top" align="left">0.698</td>
<td valign="top" align="left">0.671</td>
<td valign="top" align="left">0.99</td>
<td valign="top" align="left">1.132</td>
</tr>
<tr>
<td valign="top" align="left">16</td>
<td valign="top" align="left">0.705</td>
<td valign="top" align="left">0.6775</td>
<td valign="top" align="left">0.975</td>
<td valign="top" align="left">1.12</td>
</tr>
<tr>
<td valign="top" align="left">17</td>
<td valign="top" align="left">0.712</td>
<td valign="top" align="left">0.684</td>
<td valign="top" align="left">0.96</td>
<td valign="top" align="left">1.108</td>
</tr>
<tr>
<td valign="top" align="left">18</td>
<td valign="top" align="left">0.719</td>
<td valign="top" align="left">0.6905</td>
<td valign="top" align="left">0.945</td>
<td valign="top" align="left">1.096</td>
</tr>
<tr>
<td valign="top" align="left">19</td>
<td valign="top" align="left">0.726</td>
<td valign="top" align="left">0.697</td>
<td valign="top" align="left">0.93</td>
<td valign="top" align="left">1.084</td>
</tr>
<tr>
<td valign="top" align="left">20</td>
<td valign="top" align="left">0.733</td>
<td valign="top" align="left">0.7035</td>
<td valign="top" align="left">0.915</td>
<td valign="top" align="left">1.072</td>
</tr>
<tr>
<td valign="top" align="left">21</td>
<td valign="top" align="left">0.74</td>
<td valign="top" align="left">0.71</td>
<td valign="top" align="left">0.9</td>
<td valign="top" align="left">1.06</td>
</tr>
<tr>
<td valign="top" align="left">22</td>
<td valign="top" align="left">0.747</td>
<td valign="top" align="left">0.7165</td>
<td valign="top" align="left">0.885</td>
<td valign="top" align="left">1.048</td>
</tr>
<tr>
<td valign="top" align="left">23</td>
<td valign="top" align="left">0.754</td>
<td valign="top" align="left">0.723</td>
<td valign="top" align="left">0.87</td>
<td valign="top" align="left">1.036</td>
</tr>
<tr>
<td valign="top" align="left">24</td>
<td valign="top" align="left">0.761</td>
<td valign="top" align="left">0.7295</td>
<td valign="top" align="left">0.855</td>
<td valign="top" align="left">1.024</td>
</tr>
<tr>
<td valign="top" align="left">25</td>
<td valign="top" align="left">0.768</td>
<td valign="top" align="left">0.736</td>
<td valign="top" align="left">0.84</td>
<td valign="top" align="left">1.012</td>
</tr>
<tr>
<td valign="top" align="left">26</td>
<td valign="top" align="left">0.775</td>
<td valign="top" align="left">0.7425</td>
<td valign="top" align="left">0.825</td>
<td valign="top" align="left">1</td>
</tr>
<tr>
<td valign="top" align="left">27</td>
<td valign="top" align="left">0.782</td>
<td valign="top" align="left">0.749</td>
<td valign="top" align="left">0.81</td>
<td valign="top" align="left">0.988</td>
</tr>
<tr>
<td valign="top" align="left">28</td>
<td valign="top" align="left">0.789</td>
<td valign="top" align="left">0.7555</td>
<td valign="top" align="left">0.795</td>
<td valign="top" align="left">0.976</td>
</tr>
<tr>
<td valign="top" align="left">29</td>
<td valign="top" align="left">0.796</td>
<td valign="top" align="left">0.762</td>
<td valign="top" align="left">0.78</td>
<td valign="top" align="left">0.964</td>
</tr>
<tr>
<td valign="top" align="left">30</td>
<td valign="top" align="left">0.803</td>
<td valign="top" align="left">0.7685</td>
<td valign="top" align="left">0.765</td>
<td valign="top" align="left">0.952</td>
</tr>
<tr>
<td valign="top" align="left">31</td>
<td valign="top" align="left">0.81</td>
<td valign="top" align="left">0.775</td>
<td valign="top" align="left">0.75</td>
<td valign="top" align="left">0.94</td>
</tr>
<tr>
<td valign="top" align="left">32</td>
<td valign="top" align="left">0.817</td>
<td valign="top" align="left">0.7815</td>
<td valign="top" align="left">0.735</td>
<td valign="top" align="left">0.928</td>
</tr>
<tr>
<td valign="top" align="left">33</td>
<td valign="top" align="left">0.824</td>
<td valign="top" align="left">0.788</td>
<td valign="top" align="left">0.72</td>
<td valign="top" align="left">0.916</td>
</tr>
<tr>
<td valign="top" align="left">34</td>
<td valign="top" align="left">0.831</td>
<td valign="top" align="left">0.7945</td>
<td valign="top" align="left">0.705</td>
<td valign="top" align="left">0.904</td>
</tr>
<tr>
<td valign="top" align="left">35</td>
<td valign="top" align="left">0.838</td>
<td valign="top" align="left">0.801</td>
<td valign="top" align="left">0.69</td>
<td valign="top" align="left">0.892</td>
</tr>
<tr>
<td valign="top" align="left">36</td>
<td valign="top" align="left">0.845</td>
<td valign="top" align="left">0.8075</td>
<td valign="top" align="left">0.675</td>
<td valign="top" align="left">0.88</td>
</tr>
<tr>
<td valign="top" align="left">37</td>
<td valign="top" align="left">0.852</td>
<td valign="top" align="left">0.814</td>
<td valign="top" align="left">0.66</td>
<td valign="top" align="left">0.868</td>
</tr>
<tr>
<td valign="top" align="left">38</td>
<td valign="top" align="left">0.859</td>
<td valign="top" align="left">0.8205</td>
<td valign="top" align="left">0.645</td>
<td valign="top" align="left">0.856</td>
</tr>
<tr>
<td valign="top" align="left">39</td>
<td valign="top" align="left">0.866</td>
<td valign="top" align="left">0.827</td>
<td valign="top" align="left">0.63</td>
<td valign="top" align="left">0.844</td>
</tr>
<tr>
<td valign="top" align="left">40</td>
<td valign="top" align="left">0.873</td>
<td valign="top" align="left">0.8335</td>
<td valign="top" align="left">0.615</td>
<td valign="top" align="left">0.832</td>
</tr>
<tr>
<td valign="top" align="left">41</td>
<td valign="top" align="left">0.88</td>
<td valign="top" align="left">0.84</td>
<td valign="top" align="left">0.6</td>
<td valign="top" align="left">0.82</td>
</tr>
<tr>
<td valign="top" align="left">42</td>
<td valign="top" align="left">0.887</td>
<td valign="top" align="left">0.8465</td>
<td valign="top" align="left">0.585</td>
<td valign="top" align="left">0.808</td>
</tr>
<tr>
<td valign="top" align="left">43</td>
<td valign="top" align="left">0.894</td>
<td valign="top" align="left">0.853</td>
<td valign="top" align="left">0.57</td>
<td valign="top" align="left">0.796</td>
</tr>
<tr>
<td valign="top" align="left">44</td>
<td valign="top" align="left">0.901</td>
<td valign="top" align="left">0.8595</td>
<td valign="top" align="left">0.555</td>
<td valign="top" align="left">0.784</td>
</tr>
<tr>
<td valign="top" align="left">45</td>
<td valign="top" align="left">0.908</td>
<td valign="top" align="left">0.866</td>
<td valign="top" align="left">0.54</td>
<td valign="top" align="left">0.772</td>
</tr>
<tr>
<td valign="top" align="left">46</td>
<td valign="top" align="left">0.915</td>
<td valign="top" align="left">0.8725</td>
<td valign="top" align="left">0.525</td>
<td valign="top" align="left">0.76</td>
</tr>
<tr>
<td valign="top" align="left">47</td>
<td valign="top" align="left">0.922</td>
<td valign="top" align="left">0.879</td>
<td valign="top" align="left">0.51</td>
<td valign="top" align="left">0.748</td>
</tr>
<tr>
<td valign="top" align="left">48</td>
<td valign="top" align="left">0.929</td>
<td valign="top" align="left">0.8855</td>
<td valign="top" align="left">0.495</td>
<td valign="top" align="left">0.736</td>
</tr>
<tr>
<td valign="top" align="left">49</td>
<td valign="top" align="left">0.936</td>
<td valign="top" align="left">0.892</td>
<td valign="top" align="left">0.48</td>
<td valign="top" align="left">0.724</td>
</tr>
<tr>
<td valign="top" align="left">50</td>
<td valign="top" align="left">0.943</td>
<td valign="top" align="left">0.8985</td>
<td valign="top" align="left">0.465</td>
<td valign="top" align="left">0.712</td>
</tr>
</tbody>
</table></table-wrap>
<p>The results are presented in <xref ref-type="fig" rid="F10">Figures 10</xref>, <xref ref-type="fig" rid="F11">11</xref>. As seen in <xref ref-type="fig" rid="F10">Figure 10</xref>, the training accuracy gradually increased from 65 to 98%, while the validation accuracy stabilized around 94.7%, reflecting strong generalization and minimal overfitting. Correspondingly, <xref ref-type="fig" rid="F11">Figure 11</xref> illustrates a steady decline in both training and validation losses, indicating effective optimization throughout the training process.</p>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption><p>Training and validation accuracy across 50 epochs.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1741146-g010.tif">
<alt-text content-type="machine-generated">Line graph showing training accuracy (blue line) and validation accuracy (orange line) over 50 epochs. Accuracy increases steadily, starting from 0.65 and reaching around 0.91 for training and 0.85 for validation.</alt-text>
</graphic>
</fig>
<fig id="F11" position="float">
<label>FIGURE 11</label>
<caption><p>Training and validation loss trends showing stable convergence.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1741146-g011.tif">
<alt-text content-type="machine-generated">Line graph showing training and validation loss over 50 epochs. Training loss decreases from 1.0 to 0.33, while validation loss decreases from 1.10 to 0.47. Training loss is represented by blue circles, and validation loss by orange squares.</alt-text>
</graphic>
</fig>
</sec>
<sec id="S4.SS5">
<label>4.5</label>
<title>Ablation study: effect of fusion strategy</title>
<p>An ablation study was performed to evaluate the impact of different feature fusion techniques&#x2014;concatenation, feature embedding, and attention-based fusion&#x2014;on model performance. The results, summarized in <xref ref-type="table" rid="T12">Table 12</xref> and <xref ref-type="fig" rid="F12">Figure 12</xref>, show that the attention-based approach achieved the highest accuracy and AUC, confirming its advantage in adaptively weighting modality-specific features.</p>
<table-wrap position="float" id="T12">
<label>TABLE 12</label>
<caption><p>Comparison of fusion strategies.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Fusion technique</th>
<th valign="top" align="left">Accuracy</th>
<th valign="top" align="left">Precision</th>
<th valign="top" align="left">Recall</th>
<th valign="top" align="left">AUC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Concatenation fusion</td>
<td valign="top" align="left">89.3%</td>
<td valign="top" align="left">88.7%</td>
<td valign="top" align="left">89.1%</td>
<td valign="top" align="left">91.0%</td>
</tr>
<tr>
<td valign="top" align="left">Feature embedding fusion</td>
<td valign="top" align="left">91.8%</td>
<td valign="top" align="left">91.2%</td>
<td valign="top" align="left">90.9%</td>
<td valign="top" align="left">93.2%</td>
</tr>
<tr>
<td valign="top" align="left">Attention-based fusion (proposed)</td>
<td valign="top" align="left">94.7%</td>
<td valign="top" align="left">93.2%</td>
<td valign="top" align="left">95.0%</td>
<td valign="top" align="left">97.0%</td>
</tr>
</tbody>
</table></table-wrap>
<fig id="F12" position="float">
<label>FIGURE 12</label>
<caption><p>Comparison of fusion strategies.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1741146-g012.tif">
<alt-text content-type="machine-generated">Line graph comparing the performance of three methods: Concatenation, Feature Embedding, and Attention-Based. Performance metrics shown are Accuracy, Precision, Recall, and AUC. All metrics improve with Attention-Based, reaching 94.7% for Accuracy, 93.2% for Precision, 95.0% for Recall, and 97.0% for AUC.</alt-text>
</graphic>
</fig>
<p>The attention-based fusion significantly enhances the learning of discriminative features compared to conventional fusion methods.</p>
<p><xref ref-type="table" rid="T13">Table 13</xref> presents a comparison of the performance of our proposed multimodal deep learning framework against several state-of-the-art models for diabetic retinopathy detection. The table outlines the modality used (single vs. multimodal) and the datasets employed for each model, ensuring transparency and consistency in performance evaluation. As shown, our proposed model outperforms baseline models that rely on single-modality inputs, demonstrating the value of integrating both fundus and OCT images for improved DR classification.</p>
<table-wrap position="float" id="T13">
<label>TABLE 13</label>
<caption><p>SOTA comparison with dataset consistency and modality details.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left">Model name</th>
<th valign="top" align="left">Modality used</th>
<th valign="top" align="left">Dataset used</th>
<th valign="top" align="left">Accuracy (%)</th>
<th valign="top" align="left">AUC-ROC</th>
<th valign="top" align="left">Key limitation</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">VGG16 (Baseline 1)</td>
<td valign="top" align="left">Single (Fundus)</td>
<td valign="top" align="left">EyePACS</td>
<td valign="top" align="left">85.2</td>
<td valign="top" align="left">0.92</td>
<td valign="top" align="left">Lacks structural information</td>
</tr>
<tr>
<td valign="top" align="left">ResNet50 (Baseline 2)</td>
<td valign="top" align="left">Single (OCT)</td>
<td valign="top" align="left">DUKE OCT</td>
<td valign="top" align="left">87.3</td>
<td valign="top" align="left">0.93</td>
<td valign="top" align="left">Limited by low-resolution scans</td>
</tr>
<tr>
<td valign="top" align="left">Proposed Model (Multimodal)</td>
<td valign="top" align="left">Multimodal (Fundus + OCT)</td>
<td valign="top" align="left">EyePACS, DUKE OCT</td>
<td valign="top" align="left">94.7</td>
<td valign="top" align="left">0.97</td>
<td valign="top" align="left">Limited dataset size</td>
</tr>
<tr>
<td valign="top" align="left">InceptionV3 (Baseline 3)</td>
<td valign="top" align="left">Multimodal (Fundus + OCT)</td>
<td valign="top" align="left">Combined Public Datasets</td>
<td valign="top" align="left">91.5</td>
<td valign="top" align="left">0.94</td>
<td valign="top" align="left">Requires extensive data preprocessing</td>
</tr>
</tbody>
</table></table-wrap>
</sec>
<sec id="S4.SS6">
<label>4.6</label>
<title>Cross-validation performance</title>
<p>To confirm model reliability, a 5-fold cross-validation approach was implemented. The results demonstrated consistent accuracy across folds, with an average of 93.8% &#x00B1; 1.2, reaffirming the model&#x2019;s robustness and stability under varying data partitions.</p>
</sec>
<sec id="S4.SS7">
<label>4.7</label>
<title>Discussion</title>
<p>The results confirm that the proposed dual-stream multimodal deep learning framework effectively integrates information from fundus and OCT images to identify early stages of diabetic retinopathy (DR). The integration of spatial information from fundus photography with depth-resolved structural details from OCT imaging has proven beneficial for enhancing diagnostic precision in early-stage diabetic retinopathy (DR). This dual-modality approach closely mirrors the clinical workflow adopted by ophthalmologists, who rely on both surface and sub-surface retinal features to make informed assessments. The synergy between modalities enables a more complete characterization of retinal pathology, especially in cases where early indicators may be subtle or spatially diffuse.</p>
<p>Among the different integration strategies explored, the method employing dynamic weighting based on feature relevance delivered the most favorable results, achieving an accuracy of 94.7% and an AUC of 0.97.</p>
<p>The ability to accurately detect Mild DR is paramount for early intervention. Our model&#x2019;s high recall (92.5%) for Mild DR demonstrates that the model can effectively identify early-stage cases. The precision (91.0%) and F1-score (91.7%) further confirm that the model does not produce many false positives, making it a reliable tool for clinical settings.</p>
<p>This method allows the system to focus more precisely on diagnostically important regions within the input data, while minimizing the influence of redundant or less informative signals. Such targeted analysis appears to contribute significantly to its improved performance when compared with traditional fusion approaches like feature concatenation or static embedding. Additionally, elevated recall and F1-scores indicate strong sensitivity and specificity&#x2014;attributes that are essential for clinical screening systems where minimizing both false negatives and false positives is critical. Throughout the training process, performance curves for both accuracy and loss exhibited stable convergence without divergence between training and validation metrics, suggesting that the system generalized well beyond the training data. This is especially noteworthy considering the moderate size of the dataset. Augmentation strategies and the use of pretrained backbones for feature extraction helped mitigate overfitting, while also reducing the training burden. The computational design of the model&#x2014;built on an efficient yet expressive architecture&#x2014;supports potential deployment in both local clinic settings and cloud-based diagnostic platforms. Compared to earlier studies relying solely on fundus images, which typically reported classification accuracies between 85 and 90% (<xref ref-type="bibr" rid="B4">4</xref>, <xref ref-type="bibr" rid="B8">8</xref>), the multimodal framework represents a clear improvement. Moreover, it maintains interpretability and clinical alignment, making it a suitable candidate for telemedicine applications and resource-limited settings.</p>
<sec id="S4.SS7.SSS1">
<label>4.7.1</label>
<title>Limitations</title>
<p>Nevertheless, certain limitations remain. The dataset used, although diverse, was relatively constrained in size and geographical scope. Broader validation across multiple centers, inclusion of different imaging devices, and integration of region-specific clinical variations would be valuable next steps. Furthermore, transparency and clinical explainability remain important aspects to address in future work, especially to support trust among healthcare practitioners.</p>
<p>The dataset used in this study consists of only 222 high-quality paired samples. While this careful curation improves consistency, it also introduces homogeneity that may inflate performance and increase the risk of overfitting. Therefore, although the reported metrics are promising, they cannot be interpreted as evidence of model robustness. Larger, multi-institutional datasets with real-world variability are required before generalizing the findings.</p>
</sec>
</sec>
</sec>
<sec id="S5">
<label>5</label>
<title>Conclusion and future work</title>
<p>This study presented a multimodal deep learning framework that integrates fundus photography and optical coherence tomography (OCT) imaging for early diabetic retinopathy (DR) risk prediction. By leveraging the complementary strengths of spatial features from fundus images and depth-based retinal information from OCT scans, the proposed dual-stream CNN architecture&#x2014;featuring ResNet50 and EfficientNet&#x2014;demonstrated strong performance across key metrics, including an accuracy of 94.7% and an AUC of 0.97. Attention-based feature fusion significantly enhanced the classification process by dynamically emphasizing informative representations from each modality.</p>
<p>The experimental results validate the framework&#x2019;s potential as a clinically viable solution for automated DR screening. Its ability to distinguish between No DR, Mild DR, and Moderate DR stages makes it especially valuable for early intervention, where timely diagnosis is critical to preventing vision loss. The framework&#x2019;s robustness, facilitated by data augmentation, transfer learning, and cross-validation, underscores its adaptability to real-world clinical environments.</p>
<p>While promising, the study also highlights areas for future exploration. Expanding the dataset to include severe DR stages and images from diverse populations would improve model generalization. Incorporating clinical metadata such as HbA1c levels, blood pressure, and duration of diabetes could further enhance predictive performance. Moreover, integrating explainable AI (XAI) methods would provide transparency and foster trust in clinical deployment. Future work will also explore real-time deployment strategies and external validation across multiple healthcare centers to support scalable and equitable DR screening systems.</p>
</sec>
</body>
<back>
<sec id="S6" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in this study are included in this article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec id="S7" sec-type="author-contributions">
<title>Author contributions</title>
<p>A-HE: Methodology, Data curation, Software, Conceptualization, Resources, Validation, Formal analysis, Writing &#x2013; review &#x0026; editing. JA: Supervision, Writing &#x2013; review &#x0026; editing, Writing &#x2013; original draft, Validation, Methodology, Conceptualization. GA: Funding acquisition, Project administration, Data curation, Conceptualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing, Resources, Formal analysis. AT: Resources, Writing &#x2013; review &#x0026; editing, Writing &#x2013; original draft, Methodology, Project administration, Investigation. JZ: Formal analysis, Validation, Supervision, Data curation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing, Software. ZE: Writing &#x2013; original draft, Project administration, Resources, Methodology, Visualization, Writing &#x2013; review &#x0026; editing. AJ: Investigation, Software, Formal analysis, Writing &#x2013; review &#x0026; editing, Data curation, Methodology, Writing &#x2013; original draft, Validation.</p>
</sec>
<sec id="S9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="S10" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec id="S11" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Atwany</surname> <given-names>MZ</given-names></name> <name><surname>Sahyoun</surname> <given-names>AH</given-names></name> <name><surname>Yaqub</surname> <given-names>M</given-names></name></person-group>. <article-title>Deep learning techniques for diabetic retinopathy classification: a survey.</article-title> <source><italic>IEEE Access</italic></source>. (<year>2022</year>) <volume>10</volume>:<fpage>28642</fpage>&#x2013;<lpage>55</lpage>.</mixed-citation></ref>
<ref id="B2">
<label>2.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rashed</surname> <given-names>BM</given-names></name> <name><surname>Popescu</surname> <given-names>N</given-names></name></person-group>. <article-title>Critical analysis of the current medical image-based processing techniques for automatic disease evaluation: systematic literature review.</article-title> <source><italic>Sensors</italic></source>. (<year>2022</year>) <volume>22</volume>:<fpage>7065</fpage>.</mixed-citation></ref>
<ref id="B3">
<label>3.</label><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K</given-names></name> <name><surname>Zhang</surname> <given-names>X</given-names></name> <name><surname>Ren</surname> <given-names>S</given-names></name> <name><surname>Sun</surname> <given-names>J</given-names></name></person-group>. <article-title>Deep residual learning for image recognition.</article-title> In: <source><italic>Proceedings of the IEEE conference on computer vision and pattern recognition</italic></source>. (<year>2016</year>). p. <fpage>770</fpage>&#x2013;<lpage>8</lpage>.</mixed-citation></ref>
<ref id="B4">
<label>4.</label><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Tan</surname> <given-names>M</given-names></name> <name><surname>Le</surname> <given-names>Q</given-names></name></person-group>. <article-title>EfficientNet: rethinking model scaling for convolutional neural networks.</article-title> <source><italic>Proceedings of the International Conference on Machine Learning (ICML).</italic></source> (<year>2019</year>). p. <fpage>6105</fpage>&#x2013;<lpage>14</lpage>.</mixed-citation></ref>
<ref id="B5">
<label>5.</label><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K</given-names></name> <name><surname>Zhang</surname> <given-names>X</given-names></name> <name><surname>Ren</surname> <given-names>S</given-names></name> <name><surname>Sun</surname> <given-names>J</given-names></name></person-group>. <article-title>Identity mappings in deep residual networks.</article-title> In: <person-group person-group-type="editor"><name><surname>Leibe</surname> <given-names>B</given-names></name> <name><surname>Matas</surname> <given-names>J</given-names></name> <name><surname>Sebe</surname> <given-names>N</given-names></name> <name><surname>Welling</surname> <given-names>M</given-names></name></person-group> <role>editors</role>. <source><italic>European Conference on Computer Vision.</italic></source> <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2016</year>). p. <fpage>630</fpage>&#x2013;<lpage>45</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-46493-0_38</pub-id></mixed-citation></ref>
<ref id="B6">
<label>6.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ramachandran</surname> <given-names>P</given-names></name> <name><surname>Zoph</surname> <given-names>B</given-names></name> <name><surname>Le</surname> <given-names>QV</given-names></name></person-group>. <article-title>Swish: a self-gated activation function.</article-title> <source><italic>arXiv.</italic></source> (<year>2017</year>). <volume>arXiv</volume>:<fpage>1710.05941</fpage>. <pub-id pub-id-type="doi">10.3390/jimaging6120142</pub-id> <pub-id pub-id-type="pmid">34460539</pub-id></mixed-citation></ref>
<ref id="B7">
<label>7.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Z</given-names></name> <name><surname>Zhu</surname> <given-names>T</given-names></name> <name><surname>Lu</surname> <given-names>L</given-names></name> <name><surname>Zhang</surname> <given-names>YT</given-names></name> <name><surname>Clifton</surname> <given-names>DA</given-names></name></person-group>. <article-title>Intelligent electrocardiogram acquisition via ubiquitous photoplethysmography monitoring.</article-title> <source><italic>IEEE J Biomed Health Inform</italic></source>. (<year>2023</year>) <volume>28</volume>:<fpage>1321</fpage>&#x2013;<lpage>30</lpage>.</mixed-citation></ref>
<ref id="B8">
<label>8.</label><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kohavi</surname> <given-names>R</given-names></name></person-group>. <article-title>A study of cross-validation and bootstrap for accuracy estimation and model selection.</article-title> <source><italic>Proceedings of the 14th International Joint Conference on Artificial Intelligence.</italic></source> <publisher-loc>San Francisco, CA</publisher-loc>: <publisher-name>Morgan Kaufmann Publishers Inc.</publisher-name> (<year>1995</year>). p. <fpage>1137</fpage>&#x2013;<lpage>43</lpage>.</mixed-citation></ref>
<ref id="B9">
<label>9.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gulshan</surname> <given-names>V</given-names></name> <name><surname>Peng</surname> <given-names>L</given-names></name> <name><surname>Coram</surname> <given-names>M</given-names></name> <name><surname>Stumpe</surname> <given-names>MC</given-names></name> <name><surname>Wu</surname> <given-names>D</given-names></name> <name><surname>Narayanaswamy</surname> <given-names>A</given-names></name><etal/></person-group> <article-title>Development and validation of a deep learning algorithm for detection of diabetic retinopathy in retinal fundus photographs.</article-title> <source><italic>JAMA</italic></source>. (<year>2016</year>) <volume>316</volume>:<fpage>2402</fpage>&#x2013;<lpage>10</lpage>. <pub-id pub-id-type="doi">10.1001/jama.2016.17216</pub-id> <pub-id pub-id-type="pmid">27898976</pub-id></mixed-citation></ref>
<ref id="B10">
<label>10.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kermany</surname> <given-names>DS</given-names></name> <name><surname>Goldbaum</surname> <given-names>M</given-names></name> <name><surname>Cai</surname> <given-names>W</given-names></name> <name><surname>Valentim</surname> <given-names>CCS</given-names></name> <name><surname>Liang</surname> <given-names>H</given-names></name> <name><surname>Baxter</surname> <given-names>SL</given-names></name><etal/></person-group> <article-title>Identifying medical diagnoses and treatable diseases by image-based deep learning.</article-title> <source><italic>Cell.</italic></source> (<year>2018</year>) <volume>172</volume>:<fpage>1122</fpage>&#x2013;<lpage>31</lpage>. <pub-id pub-id-type="doi">10.1016/j.cell.2018.02.010</pub-id> <pub-id pub-id-type="pmid">29474911</pub-id></mixed-citation></ref>
<ref id="B11">
<label>11.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Goutam</surname> <given-names>B</given-names></name> <name><surname>Hashmi</surname> <given-names>MF</given-names></name> <name><surname>Geem</surname> <given-names>ZW</given-names></name> <name><surname>Bokde</surname> <given-names>ND</given-names></name></person-group>. <article-title>A comprehensive review of deep learning strategies in retinal disease diagnosis using fundus images.</article-title> <source><italic>IEEE Access</italic></source>. (<year>2022</year>) <volume>10</volume>:<fpage>57796</fpage>&#x2013;<lpage>823</lpage>.</mixed-citation></ref>
<ref id="B12">
<label>12.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>S</given-names></name> <name><surname>Wang</surname> <given-names>X</given-names></name> <name><surname>Hu</surname> <given-names>Y</given-names></name> <name><surname>Shen</surname> <given-names>Y</given-names></name> <name><surname>Yang</surname> <given-names>Z</given-names></name> <name><surname>Gan</surname> <given-names>M</given-names></name><etal/></person-group> <article-title>Diabetic retinopathy diagnosis using multichannel generative adversarial network with semisupervision.</article-title> <source><italic>IEEE Trans Automat Sci Eng</italic></source>. (<year>2020</year>) <volume>18</volume>:<fpage>574</fpage>&#x2013;<lpage>85</lpage>.</mixed-citation></ref>
<ref id="B13">
<label>13.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bhoyar</surname> <given-names>V</given-names></name> <name><surname>Patel</surname> <given-names>M</given-names></name></person-group>. <article-title>A comprehensive review of deep learning approaches for automated detection, segmentation, and grading of diabetic retinopathy.</article-title> <source><italic>Arch Comp Methods Eng</italic></source>. (<year>2025</year>):<fpage>1</fpage>&#x2013;<lpage>22</lpage>.</mixed-citation></ref>
<ref id="B14">
<label>14.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sahlsten</surname> <given-names>J</given-names></name> <name><surname>Jaskari</surname> <given-names>J</given-names></name> <name><surname>Kivinen</surname> <given-names>J</given-names></name> <name><surname>Turunen</surname> <given-names>L</given-names></name> <name><surname>Jaanio</surname> <given-names>E</given-names></name> <name><surname>Hietala</surname> <given-names>K</given-names></name><etal/></person-group> <article-title>Deep learning fundus image analysis for diabetic retinopathy and macular edema grading.</article-title> <source><italic>Sci Rep</italic></source>. (<year>2019</year>) <volume>9</volume>:<fpage>10750</fpage>.</mixed-citation></ref>
<ref id="B15">
<label>15.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yi</surname> <given-names>S</given-names></name> <name><surname>Zhou</surname> <given-names>L</given-names></name> <name><surname>Ma</surname> <given-names>L</given-names></name> <name><surname>Shao</surname> <given-names>D</given-names></name></person-group>. <article-title>Mtra-cnn: a multi-scale transfer learning framework for glaucoma classification in retinal fundus images.</article-title> <source><italic>IEEE Access</italic></source>. (<year>2023</year>) <volume>11</volume>:<fpage>142689</fpage>&#x2013;<lpage>142701</lpage>.</mixed-citation></ref>
<ref id="B16">
<label>16.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ferrara</surname> <given-names>M</given-names></name> <name><surname>Loda</surname> <given-names>A</given-names></name> <name><surname>Coco</surname> <given-names>G</given-names></name> <name><surname>Grassi</surname> <given-names>P</given-names></name> <name><surname>Cestaro</surname> <given-names>S</given-names></name> <name><surname>Rezzola</surname> <given-names>S</given-names></name><etal/></person-group> <article-title>Diabetic retinopathy: soluble and imaging ocular biomarkers.</article-title> <source><italic>J Clin Med</italic></source>. (<year>2023</year>) <volume>12</volume>:<fpage>912</fpage>.</mixed-citation></ref>
<ref id="B17">
<label>17.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lin</surname> <given-names>YT</given-names></name> <name><surname>Zhou</surname> <given-names>Q</given-names></name> <name><surname>Tan</surname> <given-names>J</given-names></name> <name><surname>Tao</surname> <given-names>Y</given-names></name></person-group>. <article-title>Multimodal and multi-omics-based deep learning model for screening of optic neuropathy.</article-title> <source><italic>Heliyon</italic></source>. (<year>2023</year>) <volume>9</volume>.</mixed-citation></ref>
<ref id="B18">
<label>18.</label><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Karthikeyan</surname> <given-names>S</given-names></name> <name><surname>Sreeja</surname> <given-names>GG</given-names></name> <name><surname>Sivasanjeev</surname> <given-names>R</given-names></name> <name><surname>Srimathi</surname> <given-names>M</given-names></name></person-group>. <article-title>Multimodal approach for diabetic retinopathy detection using deep learning and clinical data fusion.</article-title> In: <source><italic>2024 9th International Conference on Communication and Electronics Systems (ICCES)</italic></source>. <publisher-name>IEEE</publisher-name> (<year>2024</year>). p. <fpage>1702</fpage>&#x2013;<lpage>6</lpage>.</mixed-citation></ref>
<ref id="B19">
<label>19.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Q</given-names></name> <name><surname>Zhang</surname> <given-names>P</given-names></name> <name><surname>Chen</surname> <given-names>N</given-names></name> <name><surname>Zhu</surname> <given-names>Z</given-names></name> <name><surname>Li</surname> <given-names>W</given-names></name> <name><surname>Wang</surname> <given-names>Q</given-names></name></person-group>. <article-title>Trends and hotspots in the field of diabetic retinopathy imaging research from 2000-2023.</article-title> <source><italic>Front Med</italic></source>. (<year>2024</year>) <volume>11</volume>:<fpage>1481088</fpage>.</mixed-citation></ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by"><p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2193378/overview">Anchit Bijalwan</ext-link>, British University Vietnam, Vietnam</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by"><p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1794288/overview">Mohammad Abdallah</ext-link>, Al-Zaytoonah University of Jordan, Jordan</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2851547/overview">Vidhushavarshini SureshKumar</ext-link>, Rajalakshmi Institute of Technology (RIT), India</p></fn>
</fn-group>
<fn-group>
<fn id="footnote1"><label>1</label><p><ext-link ext-link-type="uri" xlink:href="https://www.tensorflow.org/datasets/catalog/diabetic_retinopathy_detection">https://www.tensorflow.org/datasets/catalog/diabetic_retinopathy_detection</ext-link></p></fn>
<fn id="footnote2"><label>2</label><p><ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/code/buffyhridoy/oct-duke-to-tehran-all/output">https://www.kaggle.com/code/buffyhridoy/oct-duke-to-tehran-all/output</ext-link></p></fn>
</fn-group>
</back>
</article>