<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2025.1731633</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Graph-enhanced multimodal fusion of vascular biomarkers and deep features for diabetic retinopathy detection</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Deepsahith</surname> <given-names>K. V.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/3341110"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Shashank</surname> <given-names>Basineni</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/3341117"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Kumar</surname> <given-names>Bangipavan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/3341122"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Alphonse</surname> <given-names>Sherly</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<uri xlink:href="https://loop.frontiersin.org/people/3254184"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Subburaj</surname> <given-names>Brindha</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/2801924"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Subramanian</surname> <given-names>Girish</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/3288493"/>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>School of Computer Science and Engineering, Vellore Institute of Technology</institution>, <city>Chennai</city>, <country country="IN">India</country></aff>
<aff id="aff2"><label>2</label><institution>School of Business Administration, Penn State Harrisburg</institution>, <city>Middletown, PA</city>, <country country="US">United States</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Sherly Alphonse, <email xlink:href="mailto:sherly.a@vit.ac.in">sherly.a@vit.ac.in</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-02">
<day>02</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>8</volume>
<elocation-id>1731633</elocation-id>
<history>
<date date-type="received">
<day>30</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>22</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>25</day>
<month>12</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Deepsahith, Shashank, Kumar, Alphonse, Subburaj and Subramanian.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Deepsahith, Shashank, Kumar, Alphonse, Subburaj and Subramanian</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-02">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Diabetic retinopathy (DR) detection can be performed through both deep retinal representations and vascular biomarkers. This proposed work suggests a multimodal framework that combines deep features with vascular descriptors in a transformer fusion architecture. Fundus images are preprocessed using CLAHE, Canny edge detection, Top-hat transformation, and U-Net vessel segmentation. Then, the images are passed through a convolutional block attention module (CBAM)-fused enhanced MobileNetV3 backbone for deep spatial feature extraction. In parallel, the segmented vasculature is skeletonized to create a vascular graph, and the descriptors are computed using fractal dimension analysis (FDA), artery-to-vein ratio (AVR), and gray level co-occurrence matrix (GLCM) texture. A graph neural network (GNN) then generates a global topology-aware embedding using this information. The different modalities are integrated using a transformer-based cross-modal fusion, where the feature vectors from MobileNet and GNN-based vascular embeddings interact using multi-head cross-attention. The fused representation is then given to a Softmax classifier for DR prediction. The model demonstrates superior performance compared to traditional deep learning baselines, achieving an accuracy of 93.8%, a precision of 92.1%, a recall of 92.8%, and an AUC-ROC of 0.96 for the DR prediction in the Messidor-2 dataset. The proposed approach also achieves above 98% accuracy for the EyePACS and APTOS 2019 datasets for DR detection. The findings demonstrate that the proposed system provides a reliable framework compared with the existing state-of-the-art methods.</p></abstract>
<kwd-group>
<kwd>contrast limited adaptive histogram equalization (CLAHE)</kwd>
<kwd>Convolutional Neural Networks (CNNs)</kwd>
<kwd>deep learning</kwd>
<kwd>MobileNetV3</kwd>
<kwd>retinal images</kwd>
</kwd-group>
<funding-group>
 <funding-statement>The author(s) declared that financial support was received for this work and/or its publication. The APC charge was supported by Vellore Institute of Technology, Chennai.</funding-statement>
</funding-group>
<counts>
<fig-count count="10"/>
<table-count count="10"/>
<equation-count count="35"/>
<ref-count count="58"/>
<page-count count="18"/>
<word-count count="9844"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Medicine and Public Health</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Diabetic Retinopathy (DR) is a microvascular complication of diabetes and affects the retinal vasculature. This also alters the retinal characteristics, like microaneurysms and hemorrhages, which are important biomarkers for early-stage detection. Therefore, quantifiable retinal features and vascular patterns are observable in fundus images. Artificial intelligence (AI) and deep learning approaches have significantly enhanced automated screening and diagnosis, and also led to accurate DR detection (<xref ref-type="bibr" rid="B1">Aljohani and Aburasain, 2024</xref>). The retinal characteristics also work as an effective biomarker for systemic pathologies like hypertension, diabetes, and cardiovascular disease (CVD) (<xref ref-type="bibr" rid="B34">Poplin et al., 2018</xref>; <xref ref-type="bibr" rid="B18">Ikram et al., 2006</xref>). Based on the retinal vascular alterations, it is possible to forecast these diseases and enable interventions on time. This resulted in incorporating AI and deep learning algorithms for machine-based analysis that improved the efficacy and accuracy of retinal image-based diagnosis (<xref ref-type="bibr" rid="B10">French et al., 2022</xref>). Yet, manual analysis is time-consuming, subject to inter-observer variation, and not feasible in large-scale screening.</p>
<p>Despite the improvement in medical technology, traditional diagnostic methods are still mainly invasive, costly, and unavailable to some countries (<xref ref-type="bibr" rid="B4">Chang et al., 2020</xref>). Sophisticated medical facilities and professional expertise are needed for established methods such as coronary angiography, echocardiography, and cardiac MRI, thus restricting their applicability. Consequently, researchers have been driven to create new alternatives that exploit low-cost and noninvasive strategies for detecting early disease (<xref ref-type="bibr" rid="B37">Rim et al., 2021</xref>).</p>
<p>Deep learning allows for automatic feature extraction and classification, lowering the reliance on manual interpretation. Convolutional Neural Networks (CNNs) have proved enormously successful in detecting and classifying abnormalities on medical images, from tumor detection in radiology to retinal pathology detection in ophthalmology (<xref ref-type="bibr" rid="B22">Kermany et al., 2018</xref>).</p>
<p>In this proposed work, to improve classification accuracy, various image preprocessing methods, such as contrast limited adaptive histogram equalization (CLAHE), Canny edge detection, Top-hat transformation, and U-Net for vessel segmentation, are employed. A vascular graph is created from the segmented and then skeletonized images. Then the gray-level co-occurrence matrix (GLCM) is used for regional feature extraction. GLCM offers texture-based features that assist in distinguishing normal and abnormal retinal patterns. Further, the fractal dimension analysis (FDA) is integrated to measure vascular complexity and structural abnormalities for the early detection of DR. Artery-to-vein ratio (AVR) is also an important biomarker to indicate DR severity. A graph neural network (GNN) embeds the vascular graph with other feature descriptors like GLCM, FDA, and AVR to create the graph-embedded features.</p>
<p>The segmented images are also given as input to a lightweight CNN model, MobileNetV3, which is optimized for high efficiency and low computational overhead, as the basis for an efficient and scalable automated DR detection. In contrast to traditional CNN models that require heavy computational resources, MobileNetV3 uses depth-wise separable convolutions, which greatly minimize the number of parameters without compromising accuracy. MobileNetV3 is highly suitable for real-time applications like mobile health systems and telemedicine platforms. Components like squeeze-and-excitation (SE), block attention mechanism, and dilated convolutions are also added to enhance it further in the proposed work. The SE attention mechanism recalibrates the feature maps dynamically, and hence the model focuses on critical vascular areas (<xref ref-type="bibr" rid="B46">Tseng et al., 2023</xref>). The dilated convolutions enhance the receptive field, which helps the model to detect the fine-grained vascular patterns, which are highly useful for identifying early disease. The transformer-based cross-modal fusion used in the proposed system helps in the fusion of deep features from MobileNetV3 and graph-embedded features. The contributions of the proposed work are listed as follows:</p>
<list list-type="bullet">
<list-item><p><bold>Preprocessing techniques:</bold> CLAHE, Canny, and Top-hat transformation help in enhanced visibility of vessels.</p></list-item>
<list-item><p><bold>Vessel segmentation:</bold> U-Net helps in vessel segmentation, thereby boosting overall accuracy while extracting global and local features.</p></list-item>
<list-item><p><bold>Local features extraction:</bold> GLCM, FDA, and AVR calibration help in extracting local features.</p></list-item>
<list-item><p><bold>Global features extraction:</bold> MobileNetV3 and SE block attention mechanism enhance feature selection by dynamically recalibrating channel-wise feature responses, ensuring the model focuses on critical vascular regions such as microaneurysms, vessel narrowing, and tortuosity, which are the key indicators of DR.</p></list-item>
<list-item><p><bold>Dilated convolutions:</bold> Increases receptive field without elevating computational expense, allowing for the identification of fine retinal vascular abnormalities, including subtle vessel deformity and capillary dropout, that are frequently linked to DR prediction.</p></list-item>
<list-item><p><bold>Convolutional block attention module (CBAM):</bold> Helps in enhancing vessel structures while suppressing noise.</p></list-item>
<list-item><p><bold>Graph-based embedding:</bold> GNN helps in graph-enhanced feature embedding and also preserves the information about the vascular junctions and branches.</p></list-item>
<list-item><p><bold>Cross-modal fusion:</bold> The deep features from MobileNetV3 and graph-embedded features are fused using a transformer-based cross-modal fusion technique.</p></list-item>
</list>
<p>In the existing literature, several studies exist that focus on graph-based learning, multimodal fusion, and attention mechanisms for DR detection. However, most of the existing models use only feature-level fusion across CNN streams and not physiological structures. Also, the graph-based approaches mostly rely on handcrafted descriptors, without vascular biomarkers. Most of the existing works treat the modalities as independent channels, without any standardized method for cross-modal interactions.</p>
<p>The proposed framework helps in addressing these gaps. (i) This work proposes a vascular biomarker graph in which nodes encode the descriptors, and edges model the anatomical relationships. This representation helps in capturing the disease-relevant dependencies that are not seen in other conventional attention-based fusion models. (ii) A graph-enhanced multimodal fusion module is proposed that uses a relation-aware fusion mechanism. Thus, the model learns complementary interactions between learned deep features and structured biomarker information, which is better than the existing hybrid pipelines. The proposed system also uses vascular biomarkers, such as the FDA and the artery-to-vein ratio (AVR), that capture the earlier microvascular changes due to DR. The transformer-based cross-modal fusion module has better interaction modeling that improves the robustness.</p>
<p>Section 2 discusses other existing works in the literature. Section 3 outlines the methodology, wherein preprocessing improves vascular structures before feeding them into a MobileNetV3-based model with dilated convolutions and SE attention. It also explains the proposed integrated methodology used for DR detection. Section 4 reports experimental results on the different datasets (<xref ref-type="bibr" rid="B15">Herrerot, 2022</xref>), providing metrics such as accuracy, precision, recall, and AUC-ROC scores. Section 5 concludes the findings.</p></sec>
<sec id="s2">
<label>2</label>
<title>Related works</title>
<p>(<xref ref-type="bibr" rid="B11">Gulshan et al., 2016</xref>) constructed a deep learning model for diagnosing DR based on retinal fundus images. The system had high specificity and sensitivity, demonstrating the viability of CNNs in automating diagnosis. The work of Gulshan et al. emphasizes the benefits of non-invasive imaging methods for high-volume screening. Limitations lie in the need for large annotated datasets and computation for training and deployment. Solutions to these might make it more viable in low-resource settings.</p>
<p>(<xref ref-type="bibr" rid="B6">Das and Pumrin 2024</xref>) investigated the application of MobileNet in the classification of retinal images to diagnose DR. MobileNet&#x00027;s thin model supports low-cost computation, which is useful for low-resource environments. Data preprocessing methods, such as resizing and augmentation, were demonstrated in the study to significantly enhance model performance. The study, however, did not conduct an exhaustive examination of the effects of varying preprocessing approaches on prediction accuracy, leaving it for future studies to further improve these techniques.</p>
<p>(<xref ref-type="bibr" rid="B14">He et al. 2015</xref>) presented the ResNet architecture that overcomes the problem of vanishing gradients in deep networks using residual connections. (<xref ref-type="bibr" rid="B26">Litjens et al. 2017</xref>) used CNN-based architectures as a building block for processing challenging medical images such as retinal scans. (<xref ref-type="bibr" rid="B55">Zhang et al. 2019</xref>) suggested the use of attention mechanisms with deep learning frameworks. The application of attention mechanisms improves the diagnostic performance and interpretability. (<xref ref-type="bibr" rid="B27">Liu et al. 2024</xref>) proposed an adversarial learning-based framework for the segmentation that leads to better feature representation and edge detection. The model was good for noisy and complicated datasets. (<xref ref-type="bibr" rid="B16">Huang et al. 2023</xref>) explored the contrastive learning methods to classify retinal images. It lowers the dependency on expert-annotated examples. (<xref ref-type="bibr" rid="B1">Aljohani and Aburasain 2024</xref>) suggested a hybrid glaucoma detection system with Random Forest and CNNs (ResNet50, VGG-16) for glaucoma detection. (<xref ref-type="bibr" rid="B45">Ting et al. 2017</xref>) considered the effects of automated deep learning models on early disease identification, workflow performance, and diagnostic accuracy.</p>
<p>Shipra et al. (<xref ref-type="bibr" rid="B43">2024</xref>) used explainable AI (XAI) in medical imaging. The work used Grad-CAM and SHAP values to visualize outputs that also helped the clinicians to understand and believe AI-derived predictions. The incorporation of XAI into CNN models enhanced confidence in automated diagnostic systems. Nonetheless, there were issues raised in terms of balancing explainability and predictive performance, as a few interpretable models had a slightly lower accuracy compared to their black-box variants. Future research should investigate how to improve interpretability without losing classification accuracy, perhaps through hybrid AI-human decision-making systems.</p>
<p>(<xref ref-type="bibr" rid="B38">Ronneberger et al. 2015</xref>) proposed the U-Net architecture, which has been well used in medical image segmentation, including retinal vessel extraction. The experiment proved that skip connections and upsampling policies of U-Net were better at maintaining spatial details than standard CNNs, leading to better segmentation accuracy. The model&#x00027;s capability of performing well on small datasets was especially useful in medical applications. Nevertheless, the experiment showed a reliance on the quality of the datasets and domain-specific fine-tuning. <xref ref-type="table" rid="T1">Table 1</xref> gives a detailed review of some of the existing works.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Summary of the DR detection methods in the literature.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>No</bold>.</th>
<th valign="top" align="left"><bold>References</bold></th>
<th valign="top" align="left"><bold>Focus</bold></th>
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="left"><bold>Source</bold></th>
<th valign="top" align="left"><bold>Methodology</bold></th>
<th valign="top" align="left"><bold>Findings</bold></th>
<th valign="top" align="left"><bold>Keywords</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">1</td>
<td valign="top" align="left"><xref ref-type="bibr" rid="B35">Pratt et al., 2016</xref></td>
<td valign="top" align="left">DR Classification</td>
<td valign="top" align="left">EyePACS</td>
<td valign="top" align="left">Variable quality</td>
<td valign="top" align="left">CNN &#x0002B; data augmentation</td>
<td valign="top" align="left">Robust</td>
<td valign="top" align="left">Retinal images, classification, CNN</td>
</tr>
<tr>
<td valign="top" align="left">2</td>
<td valign="top" align="left"><xref ref-type="bibr" rid="B11">Gulshan et al., 2016</xref></td>
<td valign="top" align="left">DR Detection</td>
<td valign="top" align="left">EyePACS, Messidor-2</td>
<td valign="top" align="left">High-resolution fundus images</td>
<td valign="top" align="left">Inception-V3 CNN</td>
<td valign="top" align="left">High sensitivity</td>
<td valign="top" align="left">DR, deep learning, screening</td>
</tr>
<tr>
<td valign="top" align="left">3</td>
<td valign="top" align="left"><xref ref-type="bibr" rid="B47">Voets et al., 2019</xref></td>
<td valign="top" align="left">Cross-domain DR performance</td>
<td valign="top" align="left">EyePACS, Messidor</td>
<td valign="top" align="left">Mixed clinical</td>
<td valign="top" align="left">Comparative CNN analysis</td>
<td valign="top" align="left">Performance drop in domain shift</td>
<td valign="top" align="left">Domain adaptation</td>
</tr>
<tr>
<td valign="top" align="left">4</td>
<td valign="top" align="left"><xref ref-type="bibr" rid="B24">Lam et al., 2018</xref></td>
<td valign="top" align="left">DR Lesion Detection</td>
<td valign="top" align="left">Messidor</td>
<td valign="top" align="left">Good quality images</td>
<td valign="top" align="left">Transfer learning (ResNet)</td>
<td valign="top" align="left">Enhanced microaneurysm detection</td>
<td valign="top" align="left">Transfer learning</td>
</tr>
<tr>
<td valign="top" align="left">5</td>
<td valign="top" align="left"><xref ref-type="bibr" rid="B25">Li et al., 2019</xref></td>
<td valign="top" align="left">DR Grading</td>
<td valign="top" align="left">DDR Dataset</td>
<td valign="top" align="left">Clinical dataset</td>
<td valign="top" align="left">Attention-based CNN</td>
<td valign="top" align="left">Attention maps</td>
<td valign="top" align="left">DR grading</td>
</tr>
<tr>
<td valign="top" align="left">6</td>
<td valign="top" align="left">Fang and Qiao, <xref ref-type="bibr" rid="B9">2022</xref></td>
<td valign="top" align="left">Early DR Detection</td>
<td valign="top" align="left">DIARETDB1</td>
<td valign="top" align="left">Medium-quality</td>
<td valign="top" align="left">Hybrid DL &#x0002B; handcrafted features</td>
<td valign="top" align="left">Improved early lesion detection</td>
<td valign="top" align="left">Hybrid ML</td>
</tr>
<tr>
<td valign="top" align="left">7</td>
<td valign="top" align="left"><xref ref-type="bibr" rid="B7">Dixit and Jha, 2025</xref></td>
<td valign="top" align="left">DR Staging</td>
<td valign="top" align="left">APTOS, Messidor</td>
<td valign="top" align="left">High-quality image</td>
<td valign="top" align="left">EfficientNet classifier</td>
<td valign="top" align="left">Lightweight model</td>
<td valign="top" align="left">EfficientNet</td>
</tr>
<tr>
<td valign="top" align="left">8</td>
<td valign="top" align="left"><xref ref-type="bibr" rid="B21">Keel et al., 2019</xref></td>
<td valign="top" align="left">DR Screening</td>
<td valign="top" align="left">Primary dataset</td>
<td valign="top" align="left">Low and variable real-world images</td>
<td valign="top" align="left">DL-based clinical screening system</td>
<td valign="top" align="left">Real-world clinical workflows</td>
<td valign="top" align="left">Screening system</td>
</tr></tbody>
</table>
</table-wrap>
<p>The transformer architectures have recently enhanced multimodal learning approaches. (<xref ref-type="bibr" rid="B42">Shamshad et al., 2023</xref>) in their survey have highlighted the ability to model the cross-modal interactions better than CNNs. (<xref ref-type="bibr" rid="B56">Zhou et al., 2023</xref>) introduced a transformer-based model that processes radiographs, text, and laboratory data using intra- and inter-modal attention, which performed better than image-only pipelines. (<xref ref-type="bibr" rid="B50">Warner et al., 2024</xref>) examined multimodal machine learning in clinical biomedicine, indicating the fusion and alignment problems that motivate graph-aware and transformer-based models. (<xref ref-type="bibr" rid="B8">Dong et al., 2025</xref>) proposed a multimodal transformer system that combines fundus images with clinical data for DR diagnosis, showing the importance of cross-attention across retinal and systemic features to improve the performance.</p>
<p>Haq et al. (<xref ref-type="bibr" rid="B12">2024</xref>) reviewed the DR detection models, indicating the vision transformers&#x00027; good performance. (<xref ref-type="bibr" rid="B3">Bhoopalan et al. 2025</xref>) proposed a task-optimized vision transformer (TOViT) for DR detection. (<xref ref-type="bibr" rid="B32">Mutawa et al. 2024</xref>) designed a CNN-based DR staging model with CLAHE and discrete wavelet transform to pre-process the images. (<xref ref-type="bibr" rid="B41">Senapati et al. 2024</xref>) reviewed CNNs, hybrid, and transformer-based methods, which support the use of transformer-based multimodal fusion for DR detection. The deep learning-based models have displayed potential in classifying retinal disease (<xref ref-type="bibr" rid="B11">Gulshan et al., 2016</xref>; <xref ref-type="bibr" rid="B22">Kermany et al., 2018</xref>), they tend to be based on large annotated images and are less interpretable (<xref ref-type="bibr" rid="B49">Wang et al., 2020</xref>; <xref ref-type="bibr" rid="B26">Litjens et al., 2017</xref>). General models such as EfficientNet and ResNet need domain-level fine-tuning (<xref ref-type="bibr" rid="B14">He et al., 2015</xref>), while attention-based algorithms are computationally expensive. To overcome such limitations, the proposed method promotes contrast with CLAHE (<xref ref-type="bibr" rid="B58">Zuiderveld, 1994</xref>) and obtains clinically meaningful biomarkers, such as AVR (<xref ref-type="bibr" rid="B40">Seidelmann et al., 2016</xref>), FDA, and GLCM-based texture features (<xref ref-type="bibr" rid="B13">Haralick et al., 1973</xref>), while maintaining physiological relevance.</p></sec>
<sec id="s3">
<label>3</label>
<title>Proposed methodology</title>
<p>The goal of the proposed system is to enable an integrated system for the efficient and interpretable diagnosis of DR detection by analyzing retinal fundus images. The system, illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref>, is designed around deep learning and traditional image processing techniques to capture both macro-level and micro-level features in retinal vasculature. Initially, the system acquires high-resolution retinal images, which are subjected to a series of preprocessing steps aimed at enhancing visual quality and suppressing noise. The use of CLAHE and Canny edge detection helps in improving the vessel contrast and delineation.</p>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>Overview of the proposed multi-modal framework with preprocessing pipeline, deep-features, graph-embedded features, and transformer-based cross-modal fusion.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1731633-g0001.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a process for diabetic retinopathy detection. It begins with a retinal fundus image, followed by preprocessing steps: CLAHE, Canny, and TopHat. The image undergoes vessel segmentation using U-Net, followed by stages using Enhanced MobileNet V3. Outputs are processed through FDA, GLCM, and AVR biomarkers. A transformer-based cross-modal fusion integrates results before classification with a SoftMax classifier, concluding with diabetic retinopathy detection.</alt-text>
</graphic>
</fig>
<p>After preprocessing, the U-Net creates a segmented image that is passed into an enhanced MobileNetV3 network. To complement the learned representations, handcrafted features are extracted from the same preprocessed and segmented images. These include GLCM descriptors that capture vascular texture properties and FDA, which quantifies the complexity of vessel branching. The AVR is also computed, and a vascular embedding is created using a GNN. This embedding is then fused with deep features from MobileNetV3, using transformer-based cross-modal fusion that enhances the model&#x00027;s interpretability and robustness.</p>
<sec>
<label>3.1</label>
<title>Dataset</title>
<p>Each of the DR examinations in the Messidor-2 dataset consists of two macula-centered eye fundus images, one for each eye; the dataset contains only macula-centered photographs. There are 874 examinations (1,748 images) in Messidor-2. In the preprocessed version of the dataset used here, the excess black background has been removed. The DR grades are obtained from the MESSIDOR-2 DR Grades set (<ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/datasets/mariaherrerot/messidor2preprocess">https://www.kaggle.com/datasets/mariaherrerot/messidor2preprocess</ext-link>).</p>
<p>The APTOS 2019 dataset for blindness detection was separated into training, validation, and testing groups. The Asia Pacific Tele-Ophthalmology Society 2019 Blindness Detection (APTOS 2019 BD) collection contains 3662 samples collected from numerous individuals in rural India. The Aravind Eye Hospital in India organized the dataset. The fundus images were collected from a number of locations and conditions over a long period of time. The samples were then analyzed and categorized by a group of trained medical experts using the International Clinical DR Disease Severity Scale (ICDRSS) as a reference. According to the scale system, the APTOS 2019 BD samples are divided into five groups: proliferative DR, mild DR, moderate DR, severe DR, and no DR (<ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/datasets/mariaherrerot/aptos2019">https://www.kaggle.com/datasets/mariaherrerot/aptos2019</ext-link>).</p>
<p>The International Clinical Diabetic Retinopathy (ICDR) grading scale, which divides retinal fundus pictures into five DR severity categories, is used in the EyePACS dataset. A healthy retina with no discernible microaneurysms or lesions is represented by class 0 (No DR). Only microaneurysms, which manifest as tiny red spots on the retina, are seen in Class 1 (Mild DR). Microaneurysms are included in Class 2 (moderate DR), which also includes moderate vascular anomalies or other hemorrhages. Intra-retinal microvascular abnormalities (IRMA) and multiple hemorrhages are characteristics of class 3 (severe DR); however, proliferative DR is not present. The most advanced stage, known as Class 4 (Proliferative DR), is characterized by neovascularization and vitreous or preretinal hemorrhages, increasing the risk of visual loss (<ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/competitions/diabetic-retinopathy-detection">https://www.kaggle.com/competitions/diabetic-retinopathy-detection</ext-link>). In the experiments, a five-fold cross-validation is used.</p>
<p>To further support vessel segmentation and feature validation, an additional publicly available dataset, the retina blood vessel dataset (<xref ref-type="bibr" rid="B48">Wagih, 2023</xref>), is incorporated. These datasets provide a broader spectrum of retinal characteristics and enhance model generalization (<ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/datasets/abdallahwagih/retina-blood-vessel">https://www.kaggle.com/datasets/abdallahwagih/retina-blood-vessel</ext-link>).</p></sec>
<sec>
<label>3.2</label>
<title>Preprocessing techniques</title>
<p>Preprocessing improves image quality and emphasizes diagnostically relevant structures. This study employs a series of transformations to highlight blood vessels, reduce image noise, and extract spatial texture information.</p>
<sec>
<label>3.2.1</label>
<title>Vessel visibility enhancement using CLAHE</title>
<p>CLAHE improves local contrast by equalizing intensity values in small image tiles, avoiding over-enhancement and preserving fine details as in <xref ref-type="fig" rid="F2">Figure 2</xref>. The transformation is computed using:</p>
<disp-formula id="EQ1"><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>T</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>C</mml:mi><mml:mi>D</mml:mi><mml:mi>F</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mi>C</mml:mi><mml:mi>D</mml:mi><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mo class="qopname">min</mml:mo></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>M</mml:mi><mml:mo>-</mml:mo><mml:mi>C</mml:mi><mml:mi>D</mml:mi><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mo class="qopname">min</mml:mo></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mo>&#x000D7;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>L</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(1)</label></disp-formula>
<p>where:</p>
<list list-type="bullet">
<list-item><p><italic>CDF</italic>(<italic>x</italic>) is the cumulative histogram value at intensity <italic>x</italic>,</p></list-item>
<list-item><p><italic>CDF</italic><sub>min</sub> is the minimum histogram value in the tile,</p></list-item>
<list-item><p><italic>M</italic> is the number of pixels per tile,</p></list-item>
<list-item><p><italic>L</italic> is the maximum pixel intensity.</p></list-item>
</list>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>Illustration of vessel visibility enhancement in retinal fundus images using CLAHE.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1731633-g0002.tif">
<alt-text content-type="machine-generated">A grayscale retinal scan showing the intricate network of blood vessels radiating from the optic disc on the right side. The macula appears as a darker area in the center.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<label>3.2.2</label>
<title>Highlighting vessel boundaries using Canny algorithm</title>
<p>The Canny algorithm identifies edges by detecting gradients and applying non-maximum suppression. The steps include Gaussian smoothing and gradient estimation as in <xref ref-type="fig" rid="F3">Figure 3</xref>:</p>
<disp-formula id="EQ2"><mml:math id="M2"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>G</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:mi>&#x003C0;</mml:mi><mml:msup><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:msup><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:msup><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mfrac></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(2)</label></disp-formula>
<disp-formula id="EQ3"><mml:math id="M3"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>G</mml:mi><mml:mo>=</mml:mo><mml:msqrt><mml:mrow><mml:msubsup><mml:mrow><mml:mi>G</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>&#x0002B;</mml:mo><mml:msubsup><mml:mrow><mml:mi>G</mml:mi></mml:mrow><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:msqrt></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(3)</label></disp-formula>
<p>where <italic>G</italic><sub><italic>x</italic></sub> and <italic>G</italic><sub><italic>y</italic></sub> are the gradients in the <italic>x</italic>&#x02212; and <italic>y</italic>&#x02212; directions.</p>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>Result of Canny edge detection and highlighting vessel boundaries in retinal fundus images.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1731633-g0003.tif">
<alt-text content-type="machine-generated">Outline of a human eye showing the retina with branching blood vessels, depicted in white lines on a black background. The optic disc is visible on the right.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<label>3.2.3</label>
<title>Morphological vessel enhancement (Top-hat transform)</title>
<p>The Top-hat transform isolates small, bright objects such as vessels. The mathematical formula is given as:</p>
<disp-formula id="EQ4"><mml:math id="M4"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">top-hat</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>I</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>I</mml:mi><mml:mo>-</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>I</mml:mi><mml:mo>&#x000B0;</mml:mo><mml:mi>B</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(4)</label></disp-formula>
<p>where &#x000B0; denotes morphological opening.</p></sec>
<sec>
<label>3.2.4</label>
<title>U-Net for segmentation</title>
<p>The combination of preprocessing techniques leads to:</p>
<list list-type="bullet">
<list-item><p>Enhanced visibility of fine vascular patterns.</p></list-item>
<list-item><p>Suppression of imaging artifacts and irrelevant background.</p></list-item>
<list-item><p>Improved feature extraction by the deep learning backbone.</p></list-item>
</list>
<p>These effects collectively improve the system&#x00027;s diagnostic accuracy, specificity, and generalizability for real-world screening applications in DR detection. Here, the U-Net is used for segmentation purposes before extracting the features using MobileNet and handcrafted features. It also removes the noisy background and improves accuracy and interpretability (<xref ref-type="bibr" rid="B38">Ronneberger et al., 2015</xref>). The segmented vasculature is then converted into a skeleton representing the branching topology. Here, the nodes represent the anatomical points, the edges represent the vessel segments, and the attributes are the features. The graph representation helps to preserve the local and global properties.</p></sec></sec>
<sec>
<label>3.3</label>
<title>Feature extraction blocks</title>
<p>The feature extraction blocks used help in the extraction of both the semantic features and fine-grained statistical cues from retinal images. After preprocessing, the segmented image from the U-Net is given to the MobileNetV3, which helps in capturing vessel tortuosity, branching, and lesion features. An SE attention block with the dilated convolution layer improves the focus on relevant areas within the image. Simultaneously, the segmented image is skeletonized into a vascular graph, and the features are extracted using GLCM and FDA. GLCM extracts the second-order texture information, such as contrast, correlation, and homogeneity, and FDA computes the complexity and self-similarity of the vascular structures. AVR, which is a vital biomarker used in the proposed approach, is also computed. The features are embedded in a graph-based representation using GNN, along with the topology information about the junctions and branches. The deep feature and the graph-embedded features are then fused using a transform-based cross-modal fusion, which is then passed to a classification head that performs the final prediction.</p>
<p>This proposed approach has both the strength of deep features and handcrafted features that improve the sensitivity even to subtle vascular variations.</p></sec>
<sec>
<label>3.4</label>
<title>Enhanced MobileNetV3</title>
<p>The enhanced MobileNetV3 extracts the deep features regarding microaneurysms, hemorrhages, exudates, and vessel abnormalities, which are the primary indicators of DR.</p>
<sec>
<label>3.4.1</label>
<title>Dilated convolution</title>
<p>Pooling is typically performed after a primary convolution operation to reduce dimensionality and strengthen the local features. Pooling also enhances the receptive field, and more global features can be extracted. However, the fine-grained information is lost in the feature maps, which can reduce image recognition accuracy. Without pooling, the receptive field may still be too limited, as it would prevent the extraction of larger spatial relations. With pooling included, the receptive field of the convolutional kernel is larger, allowing for broader feature extraction. To overcome the disadvantage of pooling, dilated convolution was introduced as shown in <xref ref-type="fig" rid="F4">Figure 4</xref>. This technique modifies the convolution process by introducing gaps (or dilation) among kernel elements, increasing the receptive field without losing the resolution of the feature maps. Unlike pooling, dilated convolution does not alter the sizes of input and output feature maps; therefore, no spatial information is lost.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>Dilated convolution used in MobileNetV3.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1731633-g0004.tif">
<alt-text content-type="machine-generated">Diagram showing the process of transforming an original enhanced retinal image into a concatenated feature map using 1-dilated, 2-dilated, and 3-dilated three-by-three convolution layers.</alt-text>
</graphic>
</fig>
<p>Dilated convolution has several advantages. One, through the addition of a dilation rate, the receptive field is widened without sacrificing resolution, with the relative spatial relation between pixels remaining intact. Two, through the addition of more dilated convolutions with varied rates, multiscale contextual features are obtained. Three, computational cost is relieved because the receptive field is widened without new parameters added (<xref ref-type="bibr" rid="B54">Yu and Koltun, 2015</xref>). Algebraically, dilated convolution is written as:</p>
<disp-formula id="EQ5"><mml:math id="M5"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>p</mml:mi><mml:mo>,</mml:mo><mml:mi>q</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo>&#x02211;</mml:mo><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>d</mml:mi><mml:mo>&#x000B7;</mml:mo><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>q</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>d</mml:mi><mml:mo>&#x000B7;</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000B7;</mml:mo><mml:mi>g</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(5)</label></disp-formula>
<p>where:</p>
<list list-type="bullet">
<list-item><p><italic>p, q</italic> are the horizontal and vertical coordinates in the feature map.</p></list-item>
<list-item><p><italic>h, j</italic> are the coordinates in the convolution kernel.</p></list-item>
<list-item><p><italic>f</italic> represents the feature map values.</p></list-item>
<list-item><p><italic>g</italic> represents the convolution kernel values.</p></list-item>
<list-item><p><italic>d</italic> is the dilation rate, determining the spacing between kernel elements.</p></list-item>
</list></sec>
<sec>
<label>3.4.2</label>
<title>Squeeze-and-Excitation block (SE) attention mechanism</title>
<p>The SE block improves the performance of MobileNetV3 by adaptively recalibrating channel-wise feature responses. SE blocks, as in <xref ref-type="fig" rid="F5">Figure 5</xref>, enhance the informative ones while suppressing less relevant channels. This helps in detecting the vascular abnormalities in retinal images, like vessel narrowing, tortuosity, and microaneurysms, better.</p>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>Squeeze-and-Excitation Block used in MobileNetV3.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1731633-g0005.tif">
<alt-text content-type="machine-generated">Retinal image on the left. Top row shows three graphs: Global Pooling Output (blue), ReLU Activation (yellow), and Sigmoid Activation (green bars). Bottom row has three purple graphs labeled SEB Output Parts 1 to 3.</alt-text>
</graphic>
</fig>
<p>The squeeze step compresses the spatial features (of size <italic>H</italic>&#x000D7;<italic>W</italic>) in each channel using Global Average Pooling (GAP) (<xref ref-type="bibr" rid="B20">Jin et al., 2022</xref>), resulting in a single descriptor per channel:</p>
<disp-formula id="EQ6"><mml:math id="M6"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>H</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>H</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>W</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(6)</label></disp-formula>
<p>Here, <italic>X</italic><sub><italic>c</italic></sub>(<italic>i, j</italic>) is the activation at pixel (<italic>i, j</italic>) in channel <italic>c</italic>.</p>
<p>The channel-wise descriptors are given to a bottleneck consisting of two fully connected (FC) layers with non-linear activations (ReLU and sigmoid) that result in a learned attention weight for each channel:</p>
<disp-formula id="EQ7"><mml:math id="M7"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000B7;</mml:mo><mml:mi>&#x003B4;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000B7;</mml:mo><mml:mi>z</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(7)</label></disp-formula>
<p>Where:</p>
<list list-type="bullet">
<list-item><p><italic>W</italic><sub>1</sub> and <italic>W</italic><sub>2</sub> are the weight matrices,</p></list-item>
<list-item><p>&#x003B4; is the ReLU activation function,</p></list-item>
<list-item><p>&#x003C3; is the sigmoid activation function where output is in the range [0, 1].</p></list-item>
</list>
<p>The original feature maps are then scaled by the learned weights.</p>
<disp-formula id="EQ8"><mml:math id="M8"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000B7;</mml:mo><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(8)</label></disp-formula>
<p>where <italic>s</italic><sub><italic>c</italic></sub> is the learned attention weight, and <italic>X</italic><sub><italic>c</italic></sub> is the original feature map.</p>
<p>The dilated convolutions capture the multiscale spatial context without affecting the resolution. The global average pooling used in SE blocks aggregates global channel-wise statistics. This ensures that high-level contextual cues are significantly enhanced without compromising spatial details. This highlights the diagnostic features and suppresses the noisy channels.</p>
<p>The MobileNetV3 used in this framework has dilated convolutions and SE-block attention mechanisms. The use of SE blocks enhances feature selection. The non-linear (NL) functions, such as Hard-Swish (HS) and ReLU activation functions, enhance the efficiency. The Convolutional Block Attention Module (CBAM) helps in focusing better on relevant features.</p></sec></sec>
<sec>
<label>3.5</label>
<title>Vascular graph construction</title>
<p>The U-Net model segments the vessel structures, and the binary vessel map is obtained using morphological thinning. Bifurcation points and crossovers are identified using connectivity analysis. Each identified location in the retinal vasculature is a node, and the edges represent vessel continuity. Artery-vein (A/V) classification is identified using discriminative descriptors, local intensity statistics, and vessel width. GLCM-based texture descriptors are computed along each segment, and a lightweight classifier assists this identification. Each node is encoded with FDA, AVR, and GLCM-derived texture measures. Vessel segments belonging to the same branch are assigned a consistent A/V label, which is later used to augment the node attributes and other features. The graph is then processed using a GNN to create a global embedding summarizing morphology, topology, and descriptors.</p>
<p>The preprocessed retinal fundus image is defined as</p>
<disp-formula id="EQ9"><mml:math id="M9"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>I</mml:mi><mml:mo>:</mml:mo><mml:mtext>&#x003A9;</mml:mtext><mml:mo>&#x02282;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>&#x02192;</mml:mo><mml:mi>&#x0211D;</mml:mi><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(9)</label></disp-formula>
<p>where &#x003A9; is the retinal image domain. After vessel segmentation, the vessel set is obtained as:</p>
<disp-formula id="EQ10"><mml:math id="M10"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>S</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mtext>&#x003A9;</mml:mtext><mml:mo>:</mml:mo><mml:mtext class="textrm" mathvariant="normal">vessel</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(10)</label></disp-formula>
<p>The operator <sans-serif>Skel</sans-serif>(&#x000B7;) produces a reduced skeleton structure using:</p>
<disp-formula id="EQ11"><mml:math id="M11"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mrow><mml:mi mathvariant="script">K</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mstyle mathvariant="sans-serif"><mml:mi>S</mml:mi><mml:mi>k</mml:mi><mml:mi>e</mml:mi><mml:mi>l</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02282;</mml:mo><mml:mtext>&#x003A9;</mml:mtext><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(11)</label></disp-formula>
<p>which preserves the vascular topology.</p>
<p>For each skeleton pixel <inline-formula><mml:math id="M12"><mml:mi>p</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="script">K</mml:mi></mml:mrow></mml:math></inline-formula>, 8-connected neighborhood is defined as</p>
<disp-formula id="EQ12"><mml:math id="M13"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mrow><mml:mi mathvariant="script">N</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">{</mml:mo><mml:mrow><mml:mi>q</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="script">K</mml:mi></mml:mrow><mml:mo>:</mml:mo><mml:mo>||</mml:mo><mml:mi>p</mml:mi><mml:mo>-</mml:mo><mml:mi>q</mml:mi><mml:msub><mml:mrow><mml:mo>||</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x0221E;</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">}</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(12)</label></disp-formula>
<p>Nodes <italic>V</italic> are indicated as</p>
<disp-formula id="EQ13"><mml:math id="M14"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:mtext>&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;</mml:mtext><mml:mi>V</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mi mathvariant='script'>K</mml:mi><mml:mo>:</mml:mo><mml:mo>&#x0007C;</mml:mo><mml:mi mathvariant='script'>N</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x0007C;</mml:mo><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mtext>&#x000A0;&#x000A0;(endpoints)&#x000A0;&#x000A0;&#x000A0;or</mml:mtext></mml:mrow></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>&#x0007C;</mml:mo><mml:mi mathvariant='script'>N</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x0007C;</mml:mo><mml:mo>&#x02265;</mml:mo><mml:mn>3</mml:mn><mml:mrow><mml:mrow><mml:mtext>&#x000A0;(junctions)</mml:mtext></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(13)</label></disp-formula>
<p>The edges <italic>E</italic> are the maximal simple paths in <inline-formula><mml:math id="M16"><mml:mrow><mml:mi mathvariant="script">K</mml:mi></mml:mrow></mml:math></inline-formula> between two nodes, and all intermediate pixels have a degree <inline-formula><mml:math id="M17"><mml:mo>|</mml:mo><mml:mrow><mml:mi mathvariant="script">N</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>|</mml:mo><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:math></inline-formula>.</p>
<p>The retinal vasculature is represented as a graph as:</p>
<disp-formula id="EQ14"><mml:math id="M18"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mrow><mml:mi mathvariant="script">G</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>V</mml:mi><mml:mo>,</mml:mo><mml:mi>E</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(14)</label></disp-formula>
<p>where the nodes are the endpoints/junctions, and edges are the vessel segments.</p>
<p>Let <italic>A</italic> &#x02208; {0, 1}<sup>|<italic>V</italic>| &#x000D7; |<italic>V</italic>|</sup> represent the adjacency matrix with</p>
<disp-formula id="EQ15"><mml:math id="M19"><mml:mrow><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable columnalign='left'><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mn>1</mml:mn></mml:mtd><mml:mtd columnalign='left'><mml:mrow><mml:mtext>if&#x000A0;</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x02208;</mml:mo><mml:mi>E</mml:mi><mml:mo>,</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mn>0</mml:mn></mml:mtd><mml:mtd columnalign='left'><mml:mrow><mml:mtext>otherwise</mml:mtext><mml:mo>,</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:mrow></mml:math><label>(15)</label></disp-formula>
<p>and <italic>D</italic> &#x0003D; diag(<italic>d</italic><sub><italic>i</italic></sub>) be the degree matrix with <inline-formula><mml:math id="M20"><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:munder><mml:msub><mml:mrow><mml:mi>A</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. The normalized adjacency is defined as:</p>
<disp-formula id="EQ16"><mml:math id="M21"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>&#x000C3;</mml:mi><mml:mo>=</mml:mo><mml:mi>A</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>I</mml:mi><mml:mo>,</mml:mo><mml:mtext>&#x02003;&#x000A0;</mml:mtext><mml:mover accent="true"><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">diag</mml:mtext><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:msub><mml:mrow><mml:mi>&#x000C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(16)</label></disp-formula>
<p>which is used in graph neural network (GNN) processing (<xref ref-type="bibr" rid="B23">Kipf and Welling, 2017</xref>).</p>
<p>Also, deep retinal features are extracted using a MobileNet backbone enhanced with CBAM. To integrate the information from deep features and vascular-graph embeddings, a transformer-based cross-modal fusion was used. The MobileNet-CBAM feature vector and the GNN-derived vascular embedding are different modalities, and multi-head cross-attention helps in modeling the interactions. The final representation has both structural vascular biomarkers and appearance-based cues. This final fused feature vector is given to a Softmax classification layer to predict DR severity.</p></sec>
<sec>
<label>3.6</label>
<title>GLCM and FDA</title>
<p>Gray-Level Co-occurrence Matrix (GLCM) texture descriptors are calculated as:</p>
<disp-formula id="EQ17"><mml:math id="M22"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">Contrast</mml:mtext><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mi>G</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(17)</label></disp-formula>
<disp-formula id="EQ18"><mml:math id="M23"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">Correlation</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mstyle displaystyle="true"><mml:msub><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>G</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(18)</label></disp-formula>
<disp-formula id="EQ19"><mml:math id="M24"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">Energy</mml:mtext><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mi>G</mml:mi><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(19)</label></disp-formula>
<disp-formula id="EQ20"><mml:math id="M25"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">Homogeneity</mml:mtext><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mfrac><mml:mrow><mml:mi>G</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x0002B;</mml:mo><mml:mo>|</mml:mo><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:mi>j</mml:mi><mml:mo>|</mml:mo></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(20)</label></disp-formula>
<p>where <italic>G</italic>(<italic>i, j</italic>) is the normalized GLCM, and &#x003BC;<sub><italic>i</italic></sub>, &#x003C3;<sub><italic>i</italic></sub> are the mean and standard deviation of row <italic>i</italic>. The RGB histogram is calculated as:</p>
<disp-formula id="EQ21"><mml:math id="M26"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mi>&#x003B4;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>I</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:mi>c</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mi>R</mml:mi><mml:mo>,</mml:mo><mml:mi>G</mml:mi><mml:mo>,</mml:mo><mml:mi>B</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(21)</label></disp-formula>
<p>where &#x003B4; is the Kronecker delta, and <italic>I</italic><sub><italic>c</italic></sub>(<italic>x, y</italic>) is the pixel intensity at (<italic>x, y</italic>) for channel <italic>c</italic>. GLCM captures the texture patterns regarding microaneurysms and hemorrhages effectively. Here, FDA is a non-invasive biomarker capturing retinal abnormalities. The box-counting technique is used to compute the fractal dimension in FDA as:</p>
<disp-formula id="EQ22"><mml:math id="M27"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>F</mml:mi><mml:mi>D</mml:mi><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">lim</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x003F5;</mml:mi><mml:mo>&#x02192;</mml:mo><mml:mn>0</mml:mn></mml:mrow></mml:munder></mml:mstyle><mml:mfrac><mml:mrow><mml:mo class="qopname">log</mml:mo><mml:mi>N</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003F5;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mo class="qopname">log</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>/</mml:mo><mml:mi>&#x003F5;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(22)</label></disp-formula>
<p>where <italic>N</italic>(&#x003F5;) is the number of boxes of size &#x003F5; used to cover the vessel&#x00027;s structure, capturing the complexity of the vascular branching.</p></sec>
<sec>
<label>3.7</label>
<title>AVR calibration</title>
<p>The vessel caliber is computed using the arteries and veins identified in the zone of 0.5&#x02013;1.0 optic disc diameters from the disc margin. The central retinal arteriolar equivalent (CRAE) and central retinal venular equivalent (CRVE) are estimated using the Parr-Hubbard formulas:</p>
<disp-formula id="EQ23"><mml:math id="M28"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>C</mml:mi><mml:mi>R</mml:mi><mml:mi>A</mml:mi><mml:mi>E</mml:mi><mml:mo>=</mml:mo><mml:msqrt><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>87</mml:mn><mml:mo>&#x000B7;</mml:mo><mml:msubsup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn><mml:mo>.</mml:mo><mml:mn>01</mml:mn><mml:mo>&#x000B7;</mml:mo><mml:msubsup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msqrt><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(23)</label></disp-formula>
<disp-formula id="EQ24"><mml:math id="M29"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>C</mml:mi><mml:mi>R</mml:mi><mml:mi>V</mml:mi><mml:mi>E</mml:mi><mml:mo>=</mml:mo><mml:msqrt><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>72</mml:mn><mml:mo>&#x000B7;</mml:mo><mml:msubsup><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>&#x0002B;</mml:mo><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>91</mml:mn><mml:mo>&#x000B7;</mml:mo><mml:msubsup><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msqrt><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(24)</label></disp-formula>
<p>where <italic>D</italic><sub>1</sub>, <italic>D</italic><sub>2</sub> are the largest arteriolar diameters, and <italic>d</italic><sub>1</sub>, <italic>d</italic><sub>2</sub> are the largest venular diameters. A lower AVR reflects narrower arterioles associated with significantly increased risk (<xref ref-type="bibr" rid="B18">Ikram et al., 2006</xref>; <xref ref-type="bibr" rid="B10">French et al., 2022</xref>). The formula for computing the AVR is given as:</p>
<disp-formula id="EQ25"><mml:math id="M30"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>A</mml:mi><mml:mi>V</mml:mi><mml:mi>R</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>C</mml:mi><mml:mi>R</mml:mi><mml:mi>A</mml:mi><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi><mml:mi>R</mml:mi><mml:mi>V</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:mfrac><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(25)</label></disp-formula>
<p>Canny edge detection enhances the accuracy of delineating vessel boundaries, especially in low-contrast or noisy fundus images, by finding the edges accurately. Hence, a more reliable segmentation of arterioles and venules is possible, which results in accurate CRAE/CRVE calculation (<xref ref-type="bibr" rid="B40">Seidelmann et al., 2016</xref>; <xref ref-type="bibr" rid="B30">McGeechan et al., 2008</xref>).</p>
<p>The AVR is a crucial retinal biomarker for DR detection. Dilated convolution modules in the architecture expand the receptive field without losing spatial resolution, thereby capturing multiscale vessel structures like fine capillaries and larger branches needed for robust vessel segmentation and AVR estimation. The inclusion of Frangi filters helps identify broken or small arterioles. DR results in lower AVR values, due to venular widening (<xref ref-type="bibr" rid="B19">Islam et al., 2009</xref>; <xref ref-type="bibr" rid="B2">Ashraf et al., 2021</xref>). The arteriolar narrowing is seen in regions of retinal non-perfusion and increased DR severity. Wider retinal venules predict the progression of DR over time (<xref ref-type="bibr" rid="B28">Liu et al., 2022</xref>). AVR is a helpful quantitative indicator of microvascular alterations in DR. Fused with other features, it is more effective (<xref ref-type="bibr" rid="B36">Quellec et al., 2017</xref>).</p></sec>
<sec>
<label>3.8</label>
<title>Graph neural network (GNN) encoder</title>
<p>The vascular graph <inline-formula><mml:math id="M31"><mml:mrow><mml:mi mathvariant="script">G</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>V</mml:mi><mml:mo>,</mml:mo><mml:mi>E</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> is embedded with regional features, such as GLCM descriptors, and global vascular biomarkers such as AVR, and FDA features (<xref ref-type="bibr" rid="B23">Kipf and Welling, 2017</xref>).</p>
<p>Let <inline-formula><mml:math id="M32"><mml:mi>X</mml:mi><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mi>V</mml:mi><mml:mo>|</mml:mo><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula> denote the node feature matrix, where each node feature vector <italic>x</italic><sub><italic>i</italic></sub> includes:</p>
<disp-formula id="EQ26"><mml:math id="M33"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>g</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">con</mml:mtext></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>g</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">ent</mml:mtext></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">]</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(26)</label></disp-formula>
<p>with <italic>c</italic><sub><italic>i</italic></sub> the vessel caliber, <inline-formula><mml:math id="M34"><mml:msubsup><mml:mrow><mml:mi>g</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">con</mml:mtext></mml:mstyle></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>g</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">ent</mml:mtext></mml:mstyle></mml:mrow></mml:msubsup></mml:math></inline-formula> GLCM contrast/entropy, and <italic>t</italic><sub><italic>i</italic></sub> the artery/vein label.</p>
<p>Each GNN layer embeds the node information across vessel connections:</p>
<disp-formula id="EQ27"><mml:math id="M35"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:mtext>&#x02003;&#x000A0;</mml:mtext><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x02113;</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac></mml:mrow></mml:msup><mml:mi>&#x000C3;</mml:mi><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac></mml:mrow></mml:msup><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x02113;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:msup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x02113;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(27)</label></disp-formula>
<p>where &#x000C3; is the adjacency with self-loops, <inline-formula><mml:math id="M36"><mml:mover accent="true"><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:math></inline-formula> the degree matrix, <italic>W</italic><sup>(&#x02113;)</sup> trainable parameters, and &#x003C3;(&#x000B7;) a non-linearity.</p>
<p>Along with the local encoding, this model also incorporates the global vascular biomarkers:</p>
<disp-formula id="EQ28"><mml:math id="M37"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:mtext class="textrm" mathvariant="normal">AVR</mml:mtext><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">FD</mml:mtext></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>&#x01E21;</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">con</mml:mtext></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>&#x01E21;</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">ent</mml:mtext></mml:mrow></mml:msup><mml:mo stretchy="false">]</mml:mo><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(28)</label></disp-formula>
<p>where AVR is the arteriovenous ratio in the optic disc annulus, <italic>D</italic><sub>FD</sub> the fractal dimension of the vascular tree, and &#x01E21;<sup>con</sup>, &#x01E21;<sup>ent</sup> are mean GLCM descriptors computed over the vasculature.</p>
<p>After <italic>L</italic> GNN layers, the node embeddings <inline-formula><mml:math id="M38"><mml:msub><mml:mrow><mml:mrow><mml:mo stretchy="false">{</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mi>V</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> are pooled to form a graph-level representation:</p>
<disp-formula id="EQ29"><mml:math id="M39"><mml:mrow><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="script">G</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x003C1;</mml:mi><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>:</mml:mo><mml:mi>i</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mi>V</mml:mi></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow><mml:mo stretchy="true">)</mml:mo><mml:mo>&#x02225;</mml:mo><mml:mi>s</mml:mi><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<p>where &#x003C1;(&#x000B7;) is an attention pooling, and &#x02225; denotes aggregation with the handcrafted global biomarker vector <italic>s</italic>.</p>
<p>Thus, the final embedding <inline-formula><mml:math id="M40"><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="script">G</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> has both the vascular structural information acquired using GNN and clinically interpretable global biomarkers (AVR, FDA, and GLCM).</p></sec>
<sec>
<label>3.9</label>
<title>Transformer-based cross-modal fusion</title>
<p>To aggregate all the descriptors, a cross-modal fusion is done using a transformer encoder (<xref ref-type="bibr" rid="B29">Lu et al., 2019</xref>).</p>
<list list-type="bullet">
<list-item><p><inline-formula><mml:math id="M41"><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">img</mml:mtext></mml:mstyle></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">img</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula>: fundus image embedding from MobileNet.</p></list-item>
<list-item><p><inline-formula><mml:math id="M42"><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="script">G</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="script">G</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula>: vascular graph embedding from the GNN encoder.</p></list-item>
<list-item><p><inline-formula><mml:math id="M43"><mml:mi>s</mml:mi><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula>: handcrafted vascular descriptors (AVR, fractal dimension, GLCM features).</p></list-item>
</list>
<p>All features are projected into a manifold of dimension <italic>d</italic>:</p>
<disp-formula id="EQ30"><mml:math id="M44"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">img</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">img</mml:mtext></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">img</mml:mtext></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="script">G</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="script">G</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="script">G</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">stat</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">stat</mml:mtext></mml:mrow></mml:msub><mml:mi>s</mml:mi><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(29)</label></disp-formula>
<p>where <inline-formula><mml:math id="M45"><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">img</mml:mtext></mml:mstyle></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="script">G</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mtext>&#x000A0;and&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">stat</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula> are the trainable projection matrices.</p>
<p>The input token sequence is constructed as</p>
<disp-formula id="EQ31"><mml:math id="M46"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">cls</mml:mtext></mml:mrow></mml:msub><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">img</mml:mtext></mml:mrow></mml:msub><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="script">G</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">stat</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">]</mml:mo><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(30)</label></disp-formula>
<p>where <italic>t</italic><sub>cls</sub> is a learnable classification token.</p>
<p>Each transformer block uses multi-head self-attention (MHSA) succeeded by feed-forward layers:</p>
<disp-formula id="EQ32"><mml:math id="M47"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">MHSA</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Concat</mml:mtext></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>H</mml:mi></mml:mrow></mml:msubsup><mml:mo stretchy="true">(</mml:mo><mml:mtext class="textrm" mathvariant="normal">softmax</mml:mtext><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mi>Q</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:msubsup><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:mrow></mml:msubsup></mml:mrow><mml:mrow><mml:msqrt><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msqrt></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="true">)</mml:mo><mml:msup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>O</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(31)</label></disp-formula>
<p>where <inline-formula><mml:math id="M48"><mml:msub><mml:mrow><mml:mi>Q</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>T</mml:mi><mml:msubsup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, <inline-formula><mml:math id="M49"><mml:msub><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>T</mml:mi><mml:msubsup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, <inline-formula><mml:math id="M50"><mml:msub><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>T</mml:mi><mml:msubsup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>V</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>.</p>
<p>After <italic>B</italic> transformer layers, the fused representation is obtained from the classification token:</p>
<disp-formula id="EQ33"><mml:math id="M51"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">fused</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">cls</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>B</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(32)</label></disp-formula></sec>
<sec>
<label>3.10</label>
<title>Prediction</title>
<p>DR is predicted using the classifier as:</p>
<disp-formula id="EQ34"><mml:math id="M52"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>&#x00177;</mml:mi><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">softmax</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">fused</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(33)</label></disp-formula>
<p>This architecture enables joint reasoning across image-level features, vascular topology, and handcrafted descriptors, improving robustness and interpretability.</p></sec></sec>
<sec id="s4">
<label>4</label>
<title>Experimental results and analysis</title>
<p>The datasets used in the experiments, such as APTOS 2019, EyePACS, and Messidor-2, have different grading protocols, image quality, and image acquisition techniques. Class-balanced augmentation is used to manage the class imbalance problem. Messidor-2, APTOS2019, and EyePACS use different DR grading schemes, and therefore, the labels were standardized to a unified 5-class International Clinical Diabetic Retinopathy (ICDR) scale to ensure consistency. To handle dataset heterogeneity, preprocessing and normalization techniques are applied to all three datasets. The preprocessing pipeline, which includes CLAHE, Canny edge detection, and Top-Hat filtering, is applied to all datasets. The parameter values can be adjusted to handle the variations in illumination, resolution, and image quality across datasets. All experiments employ 5-fold cross-validation, with patient-level splitting applied for Messidor-2 and EyePACS. All images from a single patient stay in the same fold, which prevents cross-patient data leakage. For APTOS2019, stratified 5-fold image-level splitting is used while maintaining class balance.</p>
<sec>
<label>4.1</label>
<title>Dataset preprocessing and augmentation</title>
<p>To guarantee high-quality input data, CLAHE was utilized for contrast improvement to highlight fine retinal blood vessel pathology. Canny edge detection was used for accurate vessel segmentation, and morphological Top-hat filtering was employed to improve the measured vessel morphology. GLCM texture features were also used to determine spatial relationships among retinal microstructures. The FDA was utilized to estimate vascular complexity to provide a more quantitative measure of structural pathology.</p>
<p><xref ref-type="table" rid="T2">Table 2</xref> shows a comparison of three retinal image datasets. Messidor-2, the main dataset used in this work, achieved an accuracy of 93.8%, while EyePACS and APTOS 2019 reached 98.2% and 99.2%, respectively.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Accuracy obtained by the proposed model on different datasets.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="center"><bold>Classes</bold></th>
<th valign="top" align="center"><bold>Images</bold></th>
<th valign="top" align="center"><bold>Accuracy</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Messidor-2</td>
<td valign="top" align="center">5</td>
<td valign="top" align="center">1,748</td>
<td valign="top" align="center">93.8%</td>
</tr>
<tr>
<td valign="top" align="left">EyePACS</td>
<td valign="top" align="center">5</td>
<td valign="top" align="center">88,700</td>
<td valign="top" align="center">98.2%</td>
</tr>
<tr>
<td valign="top" align="left">APTOS 2019</td>
<td valign="top" align="center">5</td>
<td valign="top" align="center">3,662</td>
<td valign="top" align="center">99.2%</td>
</tr></tbody>
</table>
</table-wrap></sec>
<sec>
<label>4.2</label>
<title>Model training and optimization</title>
<p>Training was carried out with categorical cross-entropy loss and Adam optimizer with the initial learning rate of 0.0001, which was reduced step-by-step using the ReduceLROnPlateau scheduler to avoid overfitting. Early stopping criterion tracked validation loss and stopped the training process if performance was satisfactory, avoiding repeated computation for the best convergence.</p>
<p>To test each component&#x00027;s contribution, several experiments were conducted with and without key enhancements such as SE attention, dilated convolutions, and improved preprocessing. The performance contribution of each component is detailed in the ablation studies (described in Section 4.5).</p></sec>
<sec>
<label>4.3</label>
<title>Performance metrics and evaluation</title>
<p>To empirically assess the performance of the aforementioned model, a set of evaluation performance measures was used, such as accuracy, precision, recall, specificity, F1-score, and AUC-ROC. Accuracy is one of the main measures of whether the model classifies DR classes correctly, but it alone cannot fully characterize classification performance; the complementary measures are essential for assessing the balance between false positives and false negatives. The hyperparameters used are given in <xref ref-type="table" rid="T3">Table 3</xref>.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Hyperparameters used in the proposed model.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Parameter</bold></th>
<th valign="top" align="left"><bold>Setting</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Batch size</td>
<td valign="top" align="left">32</td>
</tr>
<tr>
<td valign="top" align="left">Epochs</td>
<td valign="top" align="left">100</td>
</tr>
<tr>
<td valign="top" align="left">Optimizer</td>
<td valign="top" align="left">Adam</td>
</tr>
<tr>
<td valign="top" align="left">Initial learning rate</td>
<td valign="top" align="left">1 &#x000D7; 10<sup>&#x02212;4</sup></td>
</tr>
<tr>
<td valign="top" align="left">Weight decay</td>
<td valign="top" align="left">1 &#x000D7; 10<sup>&#x02212;5</sup></td>
</tr>
<tr>
<td valign="top" align="left">Hidden dimension of transformer</td>
<td valign="top" align="left">256</td>
</tr>
<tr>
<td valign="top" align="left">Attention heads</td>
<td valign="top" align="left">4</td>
</tr>
<tr>
<td valign="top" align="left">Adam &#x003B2;<sub>1</sub>, &#x003B2;<sub>2</sub></td>
<td valign="top" align="left">0.9, 0.999</td>
</tr>
<tr>
<td valign="top" align="left">Hardware</td>
<td valign="top" align="left">NVIDIA RTX 3090 GPU (24 GB), 64 GB RAM</td>
</tr>
<tr>
<td valign="top" align="left">Framework</td>
<td valign="top" align="left">PyTorch 2.1.0 with CUDA 12.2</td>
</tr></tbody>
</table>
</table-wrap>
<p>Precision computes the number of true positives divided by all cases labeled as DR, which reflects the reduction of false positives, as illustrated in <xref ref-type="fig" rid="F6">Figure 6</xref>.</p>
<fig position="float" id="F6">
<label>Figure 6</label>
<caption><p>Performance analysis of the proposed model using different metrics on Messidor-2 dataset.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1731633-g0006.tif">
<alt-text content-type="machine-generated">Bar chart titled &#x0201C;Performance Metrics and Evaluation&#x0201D; showing six metrics: Accuracy (93.8%), Precision (92.1%), Recall (92.8%), Specificity (94.2%), F1-Score (92.4%), and AUC-ROC (96.0%). Each bar varies in color.</alt-text>
</graphic>
</fig>
<p><xref ref-type="table" rid="T4">Table 4</xref> presents the performance metrics of the proposed model, achieving 93.8% accuracy, ensuring reliable classification. The recall of 92.8% indicates strong detection of DR cases, while the specificity of 94.2% minimizes false positives. The AUC-ROC of 0.96 highlights its excellent discriminatory power, confirming the model&#x00027;s effectiveness in automated DR detection.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Performance analysis of the proposed model for Messidor-2 using different metrics.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Metric</bold></th>
<th valign="top" align="center"><bold>Value (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Accuracy</td>
<td valign="top" align="center">93.8</td>
</tr>
<tr>
<td valign="top" align="left">Precision</td>
<td valign="top" align="center">92.1</td>
</tr>
<tr>
<td valign="top" align="left">Recall</td>
<td valign="top" align="center">92.8</td>
</tr>
<tr>
<td valign="top" align="left">Specificity</td>
<td valign="top" align="center">94.2</td>
</tr>
<tr>
<td valign="top" align="left">F1-Score</td>
<td valign="top" align="center">92.4</td>
</tr>
<tr>
<td valign="top" align="left">AUC-ROC</td>
<td valign="top" align="center">0.96</td>
</tr></tbody>
</table>
</table-wrap>
<sec>
<label>4.3.1</label>
<title>Output visualizations and explainability using grad-CAM heatmap</title>
<p>As depicted in <xref ref-type="fig" rid="F7">Figure 7</xref>, the figure depicts the visualization of outputs after preprocessing, after applying FDA and GLCM.</p>
<fig position="float" id="F7">
<label>Figure 7</label>
<caption><p>Visualization of outputs obtained during preprocessing, FDA, and GLCM.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1731633-g0007.tif">
<alt-text content-type="machine-generated">Grid of retinal images displaying various stages of processing. Each row starts with an original retinal scan, followed by grayscale versions, edge-detected variants, topographic views, and colored data visualizations. Each transformation highlights different details such as blood vessels and nerve structures.</alt-text>
</graphic>
</fig>
<p>Gradient-weighted class activation mapping (Grad-CAM) was used to visualize the key retinal regions impacting the predictions to improve the interpretability of the suggested deep learning model. Grad-CAM generates class-discriminative heat maps that highlight the spatial regions that have the most effects on model confidence, providing insights into the convolutional layers&#x00027; decision-making process.</p>
<p>The model mainly targets areas of high vascular complexity and optic disk boundaries. These highlighted regions are clinically relevant, as microvascular irregularities in these regions are strongly correlated with DR. The use of Grad-CAM ensures that the model&#x00027;s predictions align with clinically interpretable biomarkers, thereby enhancing trustworthiness for potential integration into real-world diagnostic systems.</p>
<p>DR is an eye disease, and therefore, models are typically interpreted through heat maps. The Grad-CAM heat map provides a pixel-level visualization of the regions that influence the decision of the model. The green, orange, and yellow regions indicate the areas of close attention. The yellow color indicates the areas that strongly contribute to the DR class. These regions coincide with hemorrhages, microaneurysms, exudates, and areas of vascular leakage. In addition, purple/blue represents areas (outer retinal periphery in which fewer lesions are visible) with less influence on the prediction. The color distribution in <xref ref-type="fig" rid="F8">Figure 8</xref> indicates that the proposed model always attends to the regions rich in lesions, which makes predictions driven by clinically important retinal features. The green zones indicate the boundaries of the vessel, the perivascular regions, and the first lesions.</p>
<fig position="float" id="F8">
<label>Figure 8</label>
<caption><p>Grad-CAM heatmaps obtained using the proposed model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1731633-g0008.tif">
<alt-text content-type="machine-generated">Three retinal images with heat maps highlighting areas related to diabetic retinopathy in green and blue shades. Above each image, numerical values represent different stages: No DR, Mild, Moderate, Severe, and Proliferative, indicating varying levels of disease progression.</alt-text>
</graphic>
</fig></sec></sec>
<sec>
<label>4.4</label>
<title>Comparative analysis with baseline models</title>
<p>For comparison purposes, the performance of the proposed model was also compared to the existing models in the literature, including MobileNetV3 without SE augmentation, ResNet50, EfficientNet-B0, DenseNet-121, and Vision Transformer. The suggested model surpassed all the rest, with 93.8% accuracy and 0.96 AUC-ROC as illustrated in <xref ref-type="fig" rid="F9">Figure 9</xref>, proving that SE is effective in recalibrating features and dilated convolutions help increase the detection of vessel pathology. <xref ref-type="table" rid="T5">Table 5</xref> depicts that the extraction of the vessel segmentation masks by the proposed approach is good when applying it on the retina blood vessel dataset and analyzing the performance before applying it on Messidor-2. The AVR annulus map with arteries and veins identified is given in <xref ref-type="fig" rid="F10">Figure 10</xref>.</p>
<fig position="float" id="F9">
<label>Figure 9</label>
<caption><p>Comparative analysis with different existing models for Messidor-2.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1731633-g0009.tif">
<alt-text content-type="machine-generated">Bar chart comparing model performance across various metrics: Accuracy, Precision, Recall, F1-Score, and AUC-ROC. MobileNetV3 scores lowest overall, while the proposed model consistently achieves the highest scores, peaking at 96 percent for AUC-ROC. Other models include ResNet-50, EfficientNet-B0, DenseNet-121, and ViT, each with varying performance scores generally between 85 and 94 percent.</alt-text>
</graphic>
</fig>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Performance analysis of vessel segmentation masks extraction by CLAHE &#x0002B; Canny &#x0002B; Top-hat &#x0002B; U-Net on retina blood vessel dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Metric</bold></th>
<th valign="top" align="center"><bold>Value</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Accuracy (%)</td>
<td valign="top" align="center">95.85</td>
</tr>
<tr>
<td valign="top" align="left">Sensitivity (%)</td>
<td valign="top" align="center">92.12</td>
</tr>
<tr>
<td valign="top" align="left">Specificity (%)</td>
<td valign="top" align="center">90.92</td>
</tr>
<tr>
<td valign="top" align="left">Precision (%)</td>
<td valign="top" align="center">91.45</td>
</tr>
<tr>
<td valign="top" align="left">F1-Score (%)</td>
<td valign="top" align="center">93.27</td>
</tr>
<tr>
<td valign="top" align="left">Dice Coefficient</td>
<td valign="top" align="center">0.873</td>
</tr>
<tr>
<td valign="top" align="left">IoU</td>
<td valign="top" align="center">0.872</td>
</tr>
<tr>
<td valign="top" align="left">AUC</td>
<td valign="top" align="center">0.937</td>
</tr></tbody>
</table>
</table-wrap>
<fig position="float" id="F10">
<label>Figure 10</label>
<caption><p>Arteries and veins for AVR biomarker.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1731633-g0010.tif">
<alt-text content-type="machine-generated">Two side-by-side retinal images. Left image shows the original retina with a yellow circle highlighting the optic disc and measurement points in red and blue. Right image depicts segmented arteries in red and veins in blue overlaying the retina.</alt-text>
</graphic>
</fig></sec>
<sec>
<label>4.5</label>
<title>Ablation study</title>
<p>Ablation experiments play an important role in estimating the contribution of different elements in deep learning models. In this work, the impact of the SE-Block attention mechanism, dilated convolutions, and preprocessing techniques on the accuracy of the proposed MobileNetV3-based retinal image classifier for DR detection is thoroughly analyzed. This analysis is achieved by the stepwise addition or elimination of significant components through an ablation study.</p>
<list list-type="bullet">
<list-item><p><bold>Baseline Model (MobileNetV3 Only)</bold> It performs no fine-grained vessel detection and, as a result, has only moderate accuracy.</p></list-item>
<list-item><p><bold>&#x0002B;</bold> <bold>SE-Block Attention</bold> It has improved feature representation and thereby enhances sensitivity.</p></list-item>
<list-item><p><bold>&#x0002B;</bold> <bold>SE-Block</bold> <bold>&#x0002B;</bold> <bold>Dilated Convolutions</bold> Expands the receptive field, and there is better detection of fine detailed patterns.</p></list-item>
<list-item><p><bold>&#x0002B;</bold> <bold>SE-Block</bold> <bold>&#x0002B;</bold> <bold>Dilated Convolutions</bold> <bold>&#x0002B;</bold> <bold>CBAM</bold> Improves the vessel visibility for mild abnormalities.</p></list-item>
<list-item><p><bold>GLCM</bold> Helps in capturing statistical and structural details.</p></list-item>
<list-item><p><bold>FDA</bold> Helps in measuring the irregularity in retinal structures.</p></list-item>
<list-item><p><bold>AVR</bold> The decreased AVR value helps in flagging the high-risk patients by identifying the DR severity.</p></list-item>
<list-item><p><bold>Proposed full model</bold> Combines all enhancements, achieving the highest accuracy, proving that each added feature significantly contributes to overall performance.</p></list-item>
</list>
<p><xref ref-type="table" rid="T6">Table 6</xref> shows the results of the ablation experiment. The baseline MobileNetV3-only model achieves 86.5% accuracy, but has low sensitivity to fine vascular pathology. The SE-Block attention mechanism improves the accuracy to 88.3%. Dilated convolutions boost the accuracy to 88.8% by capturing the fine retinal details. Preprocessing techniques such as CLAHE and edge detection significantly enhance vessel visibility, particularly in mild DR cases, boosting contrast and structural definition. Stepwise performance improvement is evidence of the necessity for combining spatial attention, multiscale feature extraction, and advanced preprocessing techniques to reach peak classification accuracy. AVR boosts the accuracy to 93%. The entire model achieves 93.8% accuracy, indicating the overall effect of feature extraction and classification improvement. These results verify the necessity of a hybrid domain-specific and deep learning technique for medical image analysis. Also, cross-domain experiments, as in <xref ref-type="table" rid="T7">Table 7</xref>, are conducted to analyze the effect of domain shift due to variations in illumination, resolutions, and grading. The results show competitive performance across datasets and the model&#x00027;s ability to perform well in real-world environments. The present work focuses on publicly available datasets, but the methodology can be extended naturally to hospital-based environments, which is a crucial direction for future validation.</p>
<table-wrap position="float" id="T6">
<label>Table 6</label>
<caption><p>Ablation study of the proposed model using Messidor-2 dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Configuration</bold></th>
<th valign="top" align="center"><bold>Acc</bold>.</th>
<th valign="top" align="center"><bold>Sens</bold>.</th>
<th valign="top" align="center"><bold>Spec</bold>.</th>
<th valign="top" align="center"><bold>AUC</bold></th>
<th valign="top" align="center"><bold>F1</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">MobileNetV3 baseline model</td>
<td valign="top" align="center">86.5</td>
<td valign="top" align="center">81.0</td>
<td valign="top" align="center">91.5</td>
<td valign="top" align="center">0.910</td>
<td valign="top" align="center">81.3</td>
</tr>
<tr>
<td valign="top" align="left">&#x0002B; SE</td>
<td valign="top" align="center">88.3</td>
<td valign="top" align="center">83.8</td>
<td valign="top" align="center">91.1</td>
<td valign="top" align="center">0.926</td>
<td valign="top" align="center">82.1</td>
</tr>
<tr>
<td valign="top" align="left">&#x0002B; CBAM</td>
<td valign="top" align="center">88.7</td>
<td valign="top" align="center">84.1</td>
<td valign="top" align="center">91.4</td>
<td valign="top" align="center">0.919</td>
<td valign="top" align="center">82.5</td>
</tr>
<tr>
<td valign="top" align="left">&#x0002B; Dilated Convolution</td>
<td valign="top" align="center">88.8</td>
<td valign="top" align="center">86.3</td>
<td valign="top" align="center">91.5</td>
<td valign="top" align="center">0.920</td>
<td valign="top" align="center">82.6</td>
</tr>
<tr>
<td valign="top" align="left">&#x0002B; SE &#x0002B; CBAM</td>
<td valign="top" align="center">91.2</td>
<td valign="top" align="center">90.6</td>
<td valign="top" align="center">93.0</td>
<td valign="top" align="center">0.935</td>
<td valign="top" align="center">87.1</td>
</tr>
<tr>
<td valign="top" align="left">&#x0002B; SE &#x0002B; Dilated Convolution</td>
<td valign="top" align="center">91.0</td>
<td valign="top" align="center">90.8</td>
<td valign="top" align="center">91.8</td>
<td valign="top" align="center">0.933</td>
<td valign="top" align="center">87.9</td>
</tr>
<tr>
<td valign="top" align="left">&#x0002B; CBAM &#x0002B; Dilated Convolution</td>
<td valign="top" align="center">91.4</td>
<td valign="top" align="center">91.0</td>
<td valign="top" align="center">92.1</td>
<td valign="top" align="center">0.937</td>
<td valign="top" align="center">88.4</td>
</tr>
<tr>
<td valign="top" align="left">&#x0002B; SE &#x0002B; CBAM &#x0002B; Dilated Convolution</td>
<td valign="top" align="center">91.8</td>
<td valign="top" align="center">91.1</td>
<td valign="top" align="center">92.5</td>
<td valign="top" align="center">0.941</td>
<td valign="top" align="center">88.9</td>
</tr>
<tr>
<td valign="top" align="left">(above) &#x0002B; CLAHE</td>
<td valign="top" align="center">91.2</td>
<td valign="top" align="center">91.1</td>
<td valign="top" align="center">93.9</td>
<td valign="top" align="center">0.935</td>
<td valign="top" align="center">89.6</td>
</tr>
<tr>
<td valign="top" align="left">&#x0002B; Canny</td>
<td valign="top" align="center">91.5</td>
<td valign="top" align="center">91.4</td>
<td valign="top" align="center">92.3</td>
<td valign="top" align="center">0.920</td>
<td valign="top" align="center">90.2</td>
</tr>
<tr>
<td valign="top" align="left">&#x0002B; U-Net</td>
<td valign="top" align="center">91.7</td>
<td valign="top" align="center">91.5</td>
<td valign="top" align="center">91.6</td>
<td valign="top" align="center">0.923</td>
<td valign="top" align="center">90.6</td>
</tr>
<tr>
<td valign="top" align="left">&#x0002B; GLCM</td>
<td valign="top" align="center">92.0</td>
<td valign="top" align="center">92.1</td>
<td valign="top" align="center">92.1</td>
<td valign="top" align="center">0.929</td>
<td valign="top" align="center">90.6</td>
</tr>
<tr>
<td valign="top" align="left">&#x0002B; FDA</td>
<td valign="top" align="center">92.1</td>
<td valign="top" align="center">92.3</td>
<td valign="top" align="center">92.6</td>
<td valign="top" align="center">0.924</td>
<td valign="top" align="center">91.1</td>
</tr>
<tr>
<td valign="top" align="left">&#x0002B; AVR</td>
<td valign="top" align="center">93.0</td>
<td valign="top" align="center">92.9</td>
<td valign="top" align="center">92.8</td>
<td valign="top" align="center">0.937</td>
<td valign="top" align="center">92.5</td>
</tr>
<tr>
<td valign="top" align="left">Full proposed model</td>
<td valign="top" align="center"><bold>93.8</bold></td>
<td valign="top" align="center"><bold>94.2</bold></td>
<td valign="top" align="center"><bold>94.2</bold></td>
<td valign="top" align="center"><bold>0.960</bold></td>
<td valign="top" align="center"><bold>92.4</bold></td>
</tr></tbody>
</table>
</table-wrap>
<table-wrap position="float" id="T7">
<label>Table 7</label>
<caption><p>Cross-domain evaluation of the proposed model.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Training dataset</bold></th>
<th valign="top" align="left"><bold>Testing Dataset</bold></th>
<th valign="top" align="center"><bold>Accuracy (%)</bold></th>
<th valign="top" align="center"><bold>Precision (%)</bold></th>
<th valign="top" align="center"><bold>AUC</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Messidor-2</td>
<td valign="top" align="left">EyePACS</td>
<td valign="top" align="center">98.2</td>
<td valign="top" align="center">97.8</td>
<td valign="top" align="center">0.98</td>
</tr>
<tr>
<td valign="top" align="left">APTOS 2019</td>
<td valign="top" align="left">Messidor-2</td>
<td valign="top" align="center">93.5</td>
<td valign="top" align="center">92.0</td>
<td valign="top" align="center">0.95</td>
</tr>
<tr>
<td valign="top" align="left">EyePACS</td>
<td valign="top" align="left">Messidor-2</td>
<td valign="top" align="center">93.5</td>
<td valign="top" align="center">92.1</td>
<td valign="top" align="center">0.95</td>
</tr>
<tr>
<td valign="top" align="left">Messidor-2</td>
<td valign="top" align="left">APTOS 2019</td>
<td valign="top" align="center">98.0</td>
<td valign="top" align="center">97.5</td>
<td valign="top" align="center">0.98</td>
</tr></tbody>
</table>
</table-wrap>
<p>The results in <xref ref-type="table" rid="T7">Table 7</xref> indicate the generalization capability across datasets that have different imaging characteristics. The results of Messidor-2 show some variation because of domain shift, but the model maintains high accuracy and AUC even after transfer to APTOS 2019 and EyePACS, demonstrating robustness. These results indicate its suitability for deployment in heterogeneous environments.</p>
<p>A five-fold cross-validation is performed on all three datasets. <xref ref-type="table" rid="T8">Table 8</xref> shows the results and their 95% confidence intervals, indicating stability across folds. A paired t-test conducted on the 5 folds resulted in a statistically significant performance improvement when compared to the best baseline vision transformer, as <italic>p</italic> &#x0003C; 0.05. The proposed model also achieves greater than 98% accuracy on APTOS and EyePACS, with the best performance of 93.8% on Messidor-2, indicating robustness. Also, the baseline models were trained with the same preprocessing pipeline, data splits, and configuration, and the experiments are done and reported in <xref ref-type="table" rid="T9">Table 9</xref>. It indicates that the proposed work performs better than the standard CNN and transformer-based architectures for all datasets. These results also indicate the advantages of combining deep representations with vascular morphology and other descriptors.</p>
<table-wrap position="float" id="T8">
<label>Table 8</label>
<caption><p>Five-fold cross-validation performance of the proposed model.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="center"><bold>Accuracy (%)</bold></th>
<th valign="top" align="center"><bold>Precision (%)</bold></th>
<th valign="top" align="center"><bold>Recall (%)</bold></th>
<th valign="top" align="center"><bold>AUC</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Messidor-2</td>
<td valign="top" align="center">93.8 &#x000B1; 0.7</td>
<td valign="top" align="center">92.1 &#x000B1; 0.5</td>
<td valign="top" align="center">92.8 &#x000B1; 0.6</td>
<td valign="top" align="center">0.960 &#x000B1; 0.008</td>
</tr>
<tr>
<td valign="top" align="left">APTOS 2019</td>
<td valign="top" align="center">99.2 &#x000B1; 0.5</td>
<td valign="top" align="center">98.8 &#x000B1; 0.4</td>
<td valign="top" align="center">99.0 &#x000B1; 0.4</td>
<td valign="top" align="center">0.990 &#x000B1; 0.004</td>
</tr>
<tr>
<td valign="top" align="left">EyePACS</td>
<td valign="top" align="center">98.2 &#x000B1; 0.4</td>
<td valign="top" align="center">97.3 &#x000B1; 0.3</td>
<td valign="top" align="center">97.5 &#x000B1; 0.4</td>
<td valign="top" align="center">0.982 &#x000B1; 0.005</td>
</tr></tbody>
</table>
</table-wrap>
<table-wrap position="float" id="T9">
<label>Table 9</label>
<caption><p>Comparison with baseline models under identical conditions.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="center"><bold>Messidor-2 Acc (%)</bold></th>
<th valign="top" align="center"><bold>APTOS Acc (%)</bold></th>
<th valign="top" align="center"><bold>EyePACS Acc (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">EfficientNet-B0</td>
<td valign="top" align="center">92.3</td>
<td valign="top" align="center">97.2</td>
<td valign="top" align="center">97.1</td>
</tr>
<tr>
<td valign="top" align="left">ResNet50</td>
<td valign="top" align="center">92.3</td>
<td valign="top" align="center">96.54</td>
<td valign="top" align="center">96.0</td>
</tr>
<tr>
<td valign="top" align="left">ViT-Base</td>
<td valign="top" align="center">92.7</td>
<td valign="top" align="center">97.3</td>
<td valign="top" align="center">97.1</td>
</tr>
<tr>
<td valign="top" align="left">Proposed model</td>
<td valign="top" align="center">93.8</td>
<td valign="top" align="center">99.2</td>
<td valign="top" align="center">98.2</td>
</tr></tbody>
</table>
</table-wrap></sec>
<sec>
<label>4.6</label>
<title>State-of-the-art</title>
<p>In medical image analysis, deep learning has been a significant advancement, especially in retinal imaging, where it allows for automated evaluation of ocular diseases such as DR. Because of the Messidor dataset&#x00027;s high-resolution and detailed retinal images, this work investigates the use of retinal fundus photographs.</p>
<p>As summarized in <xref ref-type="table" rid="T10">Table 10</xref>, the proposed approach is compared to the state-of-the-art models using Messidor-2, EyePACS, and APTOS-2019 datasets for DR detection. Performance metrics, such as accuracy, sensitivity, specificity, and AUC, are used for comparison. The proposed method achieves good performance for all the datasets. The existing methods, such as ConvNeXt, EfficientNet, and vision transformer variants, are used for the comparison. The proposed approach achieves the best performance when compared to other existing works. There will be challenges due to poor illumination, demographic bias, and the presence of artifacts. In the proposed work, CLAHE eliminates poor illumination by improving the local contrast. Canny &#x0002B; Top-hat suppresses artifacts and highlights the vessel and lesions. GLCM and FDA quantify vascular complexity and are robust to noise. MobileNetV3 also learns discriminative features, eliminating demographic/device bias while enhancing generalization. AVR helps in normalizing vessel caliber, eliminating the demographic bias due to age, sex, and ethnicity. SE and CBAM adaptively re-weight spatial regions, eliminating the artifacts and only focusing on lesions. Dilated convolutions magnify the receptive field, maintaining good resolution, thus helping MobileNetV3 to capture information under poor illumination and varying image quality.</p>
<table-wrap position="float" id="T10">
<label>Table 10</label>
<caption><p>Comparison with the state-of-the-art models for DR detection.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="left"><bold>Model used</bold></th>
<th valign="top" align="center"><bold>Acc. (%)</bold></th>
<th valign="top" align="center"><bold>Sens. (%)</bold></th>
<th valign="top" align="center"><bold>Spec. (%)</bold></th>
<th valign="top" align="center"><bold>AUC</bold></th>
<th valign="top" align="left"><bold>References</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Messidor-2</td>
<td valign="top" align="left"><bold>Proposed</bold></td>
<td valign="top" align="center">93.8</td>
<td valign="top" align="center">94.2</td>
<td valign="top" align="center">94.2</td>
<td valign="top" align="center">0.96</td>
<td valign="top" align="left">This work</td>
</tr>
 <tr>
<td/>
<td valign="top" align="left">DR-ConvNeXt</td>
<td valign="top" align="center">83.6</td>
<td valign="top" align="center">74.0</td>
<td valign="top" align="center">94.6</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="left"><xref ref-type="bibr" rid="B44">Song and Wu, 2025</xref></td>
</tr>
 <tr>
<td/>
<td valign="top" align="left">DRStageNet</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">0.96</td>
<td valign="top" align="left"><xref ref-type="bibr" rid="B31">Men et al., 2023</xref></td>
</tr>
 <tr>
<td/>
<td valign="top" align="left">Swin Transformer var.</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="left"><xref ref-type="bibr" rid="B52">Yao et al., 2022</xref>; Saadna et al., <xref ref-type="bibr" rid="B39">2025</xref></td>
</tr>
<tr>
<td valign="top" align="left">EyePACS</td>
<td valign="top" align="left"><bold>Proposed</bold></td>
<td valign="top" align="center">98.2</td>
<td valign="top" align="center">98.1</td>
<td valign="top" align="center">98.2</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="left">This work</td>
</tr>
 <tr>
<td/>
<td valign="top" align="left">EfficientNet</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">0.90</td>
<td valign="top" align="left"><xref ref-type="bibr" rid="B5">Chetoui and Akhloufi, 2020</xref>; <xref ref-type="bibr" rid="B53">Yi et al., 2021</xref></td>
</tr>
 <tr>
<td/>
<td valign="top" align="left">ViT / Swin</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="left"><xref ref-type="bibr" rid="B17">Huang et al., 2024</xref>; <xref ref-type="bibr" rid="B51">Yang et al., 2024</xref></td>
</tr>
<tr>
<td valign="top" align="left">APTOS 2019</td>
<td valign="top" align="left"><bold>Proposed</bold></td>
<td valign="top" align="center">99.2</td>
<td valign="top" align="center">99.1</td>
<td valign="top" align="center">99.3</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="left">This work</td>
</tr>
 <tr>
<td/>
<td valign="top" align="left">GPMKLE-Net</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="left"><xref ref-type="bibr" rid="B57">Zhou et al., 2023</xref></td>
</tr>
 <tr>
<td/>
<td valign="top" align="left">ConvNeXt</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">&#x02014;</td>
<td valign="top" align="center">0.90</td>
<td valign="top" align="left"><xref ref-type="bibr" rid="B44">Song and Wu, 2025</xref>; <xref ref-type="bibr" rid="B33">Nadeem et al., 2022</xref></td>
</tr></tbody>
</table>
</table-wrap></sec>
<sec>
<label>4.7</label>
<title>Limitations</title>
<p>There is a relatively high computational cost involved both during training and inference. This will slightly affect the deployment on low-resource systems or edge devices without hardware acceleration. Also, scalability requires further optimization strategies such as model compression. The proposed work also needs further evaluation on multi-center and handheld screening devices to verify its applicability in real-world deployment scenarios. Experiments on hospital-based data are also planned as future work, as they require additional steps for domain adaptation.</p></sec></sec>
<sec sec-type="conclusions" id="s5">
<label>5</label>
<title>Conclusion</title>
<p>This study proposes a novel and effective deep learning framework for DR prediction from retinal fundus images. The proposed architecture gathers local features from retinal fundus images using MobileNetV3, incorporating SE attention blocks and dilated convolutions to better capture fine-grained vascular features indicative of ocular diseases such as DR. Through comprehensive experiments and ablation studies, it is demonstrated that the inclusion of preprocessing techniques such as CLAHE-based contrast enhancement, Canny edge detection, and Top-hat transformation, together with segmentation using U-Net, improves the performance. Also, the regional features captured using GLCM, the global biomarker features captured using AVR, and the FDA contribute significantly to improving model sensitivity, specificity, and overall robustness. The features are embedded in a graph-based representation using a GNN that preserves vascular topology. The transformer-based cross-modal fusion integrates the multi-modal features effectively. The model achieved an AUC-ROC of 0.96 on the Messidor dataset&#x02014;outperforming conventional risk scoring systems and previously published deep learning benchmarks. Moreover, the model ensures feasibility for real-time screening in both hospital and remote settings. The AVR biomarker individually contributes to DR detection when fused with MobileNet, GLCM, and FDA features.</p>
<p>In future, it is aimed to expand the model&#x00027;s utility through multi-modal learning by integrating retinal image data with electronic health records, demographic information, and lifestyle factors to improve DR detection. Additionally, prospective validation in real-world clinical environments will be explored in collaboration with healthcare institutions to assess its diagnostic impact, usability, and integration into clinical workflows.</p></sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. The datasets are available at the following links: <ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/datasets/mariaherrerot/messidor2preprocess">https://www.kaggle.com/datasets/mariaherrerot/messidor2preprocess</ext-link>, <ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/datasets/mariaherrerot/aptos2019">https://www.kaggle.com/datasets/mariaherrerot/aptos2019</ext-link>, and <ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/competitions/diabetic-retinopathy-detection">https://www.kaggle.com/competitions/diabetic-retinopathy-detection</ext-link>.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>KD: Supervision, Writing &#x02013; review &#x00026; editing. BS: Conceptualization, Supervision, Writing &#x02013; review &#x00026; editing. BK: Supervision, Writing &#x02013; review &#x00026; editing. SA: Formal analysis, Visualization, Writing &#x02013; original draft. BS: Conceptualization, Supervision, Writing &#x02013; review &#x00026; editing. GS: Supervision, Writing &#x02013; review &#x00026; editing.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Aljohani</surname> <given-names>A.</given-names></name> <name><surname>Aburasain</surname> <given-names>R. Y.</given-names></name></person-group> (<year>2024</year>). <article-title>A hybrid framework for glaucoma detection through federated machine learning and deep learning models</article-title>. <source>BMC Med. Inform. Decis. Mak</source>. <volume>24</volume>:<fpage>115</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12911-024-02518-y</pub-id><pub-id pub-id-type="pmid">38698412</pub-id></mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ashraf</surname> <given-names>M.</given-names></name> <name><surname>Shokrollahi</surname> <given-names>S.</given-names></name> <name><surname>Pisig</surname> <given-names>A. U.</given-names></name> <name><surname>Sampani</surname> <given-names>K.</given-names></name> <name><surname>Abdelal</surname> <given-names>O.</given-names></name> <name><surname>Cavallerano</surname> <given-names>J. D.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Retinal vascular caliber association with nonperfusion and diabetic retinopathy severity depends on vascular caliber measurement location</article-title>. <source>Ophthalmol. Retina</source> <volume>5</volume>, <fpage>571</fpage>&#x02013;<lpage>579</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.oret.2020.09.003</pub-id><pub-id pub-id-type="pmid">32927151</pub-id></mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bhoopalan</surname> <given-names>R.</given-names></name> <name><surname>Sekar</surname> <given-names>P.</given-names></name> <name><surname>Nagaprasad</surname> <given-names>N.</given-names></name> <name><surname>Mamo</surname> <given-names>T. R.</given-names></name> <name><surname>Krishnaraj</surname> <given-names>R.</given-names></name></person-group> (<year>2025</year>). <article-title>Task-optimized vision transformer for diabetic retinopathy detection and classification in resource-constrained early diagnosis settings</article-title>. <source>Sci. Rep</source>. <volume>15</volume>:<fpage>39047</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-025-25399-1</pub-id><pub-id pub-id-type="pmid">41203681</pub-id></mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chang</surname> <given-names>J.</given-names></name> <name><surname>Ko</surname> <given-names>A.</given-names></name> <name><surname>Park</surname> <given-names>S. M.</given-names></name> <name><surname>Choi</surname> <given-names>S.</given-names></name> <name><surname>Kim</surname> <given-names>K.</given-names></name> <name><surname>Kim</surname> <given-names>S. M.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Association of cardiovascular mortality and deep learning-funduscopic atherosclerosis score derived from retinal fundus images</article-title>. <source>Am. J. Ophthalmol</source>. <volume>217</volume>, <fpage>121</fpage>&#x02013;<lpage>130</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ajo.2020.03.027</pub-id><pub-id pub-id-type="pmid">32222370</pub-id></mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Chetoui</surname> <given-names>M.</given-names></name> <name><surname>Akhloufi</surname> <given-names>M. A.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Explainable diabetic retinopathy using EfficientNET,&#x0201D;</article-title> in <source>2020 42nd Annual International Conference of the IEEE Engineering in Medicine &#x00026; Biology Society (EMBC)</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>1966</fpage>&#x02013;<lpage>1969</lpage>. doi: <pub-id pub-id-type="doi">10.1109/EMBC44109.2020.9175664</pub-id><pub-id pub-id-type="pmid">33018388</pub-id></mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Das</surname> <given-names>P. K.</given-names></name> <name><surname>Pumrin</surname> <given-names>S.</given-names></name></person-group> (<year>2024</year>). <article-title>Diabetic retinopathy classification: performance evaluation of pre-trained lightweight cnn using imbalance dataset</article-title>. <source>Eng. J</source>. <volume>28</volume>, <fpage>13</fpage>&#x02013;<lpage>25</lpage>. doi: <pub-id pub-id-type="doi">10.4186/ej.2024.28.7.13</pub-id></mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dixit</surname> <given-names>R. B.</given-names></name> <name><surname>Jha</surname> <given-names>C. K.</given-names></name></person-group> (<year>2025</year>). <article-title>Fundus image based diabetic retinopathy detection using EfficientNetB3 with squeeze and excitation block</article-title>. <source>Med. Eng. Phys.</source> 104350. doi: <pub-id pub-id-type="doi">10.1016/j.medengphy.2025.104350</pub-id><pub-id pub-id-type="pmid">40436513</pub-id></mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dong</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Pan</surname> <given-names>S.</given-names></name> <name><surname>Weng</surname> <given-names>T.</given-names></name> <name><surname>Chen</surname> <given-names>X.</given-names></name> <name><surname>Jiang</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>A multimodal transformer system for non-invasive diabetic nephropathy diagnosis via retinal imaging</article-title>. <source>NPJ Digit. Med</source>. <volume>8</volume>:<fpage>50</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41746-024-01393-1</pub-id></mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Fang</surname> <given-names>L.</given-names></name> <name><surname>Qiao</surname> <given-names>H.</given-names></name></person-group> (<year>2022</year>). <article-title>Diabetic retinopathy classification using a novel DAG network based on multi-feature of fundus images</article-title>. <source>Biomed. Signal Process. Control</source> <volume>77</volume>:<fpage>103810</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bspc.2022.103810</pub-id></mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>French</surname> <given-names>C.</given-names></name> <name><surname>Cubbidge</surname> <given-names>R. P.</given-names></name> <name><surname>Heitmar</surname> <given-names>R.</given-names></name></person-group> (<year>2022</year>). <article-title>The application of arterio-venous ratio (AVR) cut-off values in clinic to stratify cardiovascular risk in patients</article-title>. <source>Ophthalmic Physiol. Optics</source> <volume>42</volume>, <fpage>666</fpage>&#x02013;<lpage>674</lpage>. doi: <pub-id pub-id-type="doi">10.1111/opo.12967</pub-id><pub-id pub-id-type="pmid">35257402</pub-id></mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gulshan</surname> <given-names>V.</given-names></name> <name><surname>Peng</surname> <given-names>L.</given-names></name> <name><surname>Coram</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>Development and validation of a deep learning algorithm for detection of diabetic retinopathy in retinal fundus photographs</article-title>. <source>JAMA</source> <volume>316</volume>, <fpage>2402</fpage>&#x02013;<lpage>2410</lpage>. doi: <pub-id pub-id-type="doi">10.1001/jama.2016.17216</pub-id><pub-id pub-id-type="pmid">27898976</pub-id></mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Haq</surname> <given-names>N.</given-names></name> <name><surname>Waheed</surname> <given-names>M.</given-names></name> <etal/></person-group> (<year>2024</year>). <article-title>Computationally efficient deep learning models for diabetic retinopathy detection: a systematic review</article-title>. <source>Artif. Intellig. Rev</source>. <volume>57</volume>, <fpage>1</fpage>&#x02013;<lpage>34</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10462-024-10942-9</pub-id></mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Haralick</surname> <given-names>R. M.</given-names></name> <name><surname>Shanmugam</surname> <given-names>K.</given-names></name> <name><surname>Dinstein</surname> <given-names>I.</given-names></name></person-group> (<year>1973</year>). <article-title>Textural features for image classification</article-title>. <source>IEEE Trans. Syst. Man Cybern</source>. <volume>3</volume>, <fpage>610</fpage>&#x02013;<lpage>621</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TSMC.1973.4309314</pub-id></mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Ren</surname> <given-names>S.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name></person-group> (<year>2015</year>). <article-title>Deep residual learning for image recognition</article-title>. <source>arXiv</source> preprint arXiv:1512.03385. doi: <pub-id pub-id-type="doi">10.1109/CVPR.2016.90</pub-id></mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Herrerot</surname> <given-names>M.</given-names></name></person-group> (<year>2022</year>). <source>Messidor-2 Preprocessed Dataset</source>.</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>S.-C.</given-names></name> <name><surname>Pareek</surname> <given-names>A.</given-names></name> <name><surname>Jensen</surname> <given-names>M.</given-names></name> <name><surname>Lungren</surname> <given-names>M. P.</given-names></name> <name><surname>Yeung</surname> <given-names>S.</given-names></name> <name><surname>Chaudhari</surname> <given-names>A. S.</given-names></name></person-group> (<year>2023</year>). <article-title>Self-supervised learning for medical image classification: a systematic review and implementation guidelines</article-title>. <source>NPJ Digit. Med</source>. <volume>6</volume>:<fpage>74</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41746-023-00811-0</pub-id><pub-id pub-id-type="pmid">37100953</pub-id></mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>Y.</given-names></name> <name><surname>Lyu</surname> <given-names>J.</given-names></name> <name><surname>Cheng</surname> <given-names>P.</given-names></name> <name><surname>Tam</surname> <given-names>R.</given-names></name> <name><surname>Tang</surname> <given-names>X.</given-names></name></person-group> (<year>2024</year>). <article-title>SSiT: saliency-guided self-supervised image transformer for diabetic retinopathy grading</article-title>. <source>IEEE J. Biomed. Health Inform.</source> <volume>28</volume>, <fpage>2806</fpage>&#x02013;<lpage>2817</lpage>. <pub-id pub-id-type="pmid">38319784</pub-id></mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ikram</surname> <given-names>M. K.</given-names></name> <name><surname>de Jong</surname> <given-names>F. J.</given-names></name> <name><surname>Bos</surname> <given-names>M. J.</given-names></name> <name><surname>Vingerling</surname> <given-names>J. R.</given-names></name> <name><surname>Hofman</surname> <given-names>A.</given-names></name> <name><surname>Koudstaal</surname> <given-names>P. J.</given-names></name> <etal/></person-group>. (<year>2006</year>). <article-title>Retinal vessel diameters and risk of stroke: the rotterdam study</article-title>. <source>Neurology</source> <volume>66</volume>, <fpage>1339</fpage>&#x02013;<lpage>1343</lpage>. doi: <pub-id pub-id-type="doi">10.1212/01.wnl.0000210533.24338.ea</pub-id><pub-id pub-id-type="pmid">16682664</pub-id></mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Islam</surname> <given-names>F. M. A.</given-names></name> <name><surname>Nguyen</surname> <given-names>T. T.</given-names></name> <name><surname>Wang</surname> <given-names>J. J.</given-names></name> <name><surname>Tai</surname> <given-names>E. S.</given-names></name> <name><surname>Shankar</surname> <given-names>A.</given-names></name> <name><surname>Saw</surname> <given-names>S. M.</given-names></name> <etal/></person-group>. (<year>2009</year>). <article-title>Quantitative retinal vascular calibre changes in diabetes and retinopathy: the Singapore Malay eye study</article-title>. <source>Eye</source> <volume>23</volume>, <fpage>1719</fpage>&#x02013;<lpage>1724</lpage>. doi: <pub-id pub-id-type="doi">10.1038/eye.2008.362</pub-id><pub-id pub-id-type="pmid">19079148</pub-id></mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jin</surname> <given-names>X.</given-names></name> <name><surname>Xie</surname> <given-names>Y.</given-names></name> <name><surname>Wei</surname> <given-names>X.-S.</given-names></name> <name><surname>Zhao</surname> <given-names>B.-R.</given-names></name> <name><surname>Chen</surname> <given-names>Z.-M.</given-names></name> <name><surname>Tan</surname> <given-names>X.</given-names></name></person-group> (<year>2022</year>). <article-title>Delving deep into spatial pooling for squeeze-and-excitation networks</article-title>. <source>Pattern Recognit</source>. <volume>121</volume>:<fpage>108159</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.patcog.2021.108159</pub-id></mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Keel</surname> <given-names>S.</given-names></name> <name><surname>Lee</surname> <given-names>P.</given-names></name> <name><surname>Scheetz</surname> <given-names>J.</given-names></name> <name><surname>He</surname> <given-names>M.</given-names></name></person-group> (<year>2019</year>). <article-title>Visualizing deep learning models for the detection of referable diabetic retinopathy and glaucoma</article-title>. <source>JAMA Ophthalmol.</source> <volume>137</volume>, <fpage>288</fpage>&#x02013;<lpage>292</lpage>. <pub-id pub-id-type="pmid">30570648</pub-id></mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kermany</surname> <given-names>D. S.</given-names></name> <name><surname>Goldbaum</surname> <given-names>M.</given-names></name> <name><surname>Cai</surname> <given-names>W.</given-names></name> <name><surname>Valentim</surname> <given-names>C. C.</given-names></name> <name><surname>Liang</surname> <given-names>H.</given-names></name> <name><surname>Baxter</surname> <given-names>S. L.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>Identifying medical diagnoses and treatable diseases by image-based deep learning</article-title>. <source>Cell</source> <volume>172</volume>, <fpage>1122</fpage>&#x02013;<lpage>1131</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cell.2018.02.010</pub-id><pub-id pub-id-type="pmid">29474911</pub-id></mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kipf</surname> <given-names>T. N.</given-names></name> <name><surname>Welling</surname> <given-names>M.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Semi-supervised classification with graph convolutional networks,&#x0201D;</article-title> in <source>International Conference on Learning Representations (ICLR)</source> (<publisher-loc>Toulon</publisher-loc>: <publisher-name>ICLR Conference/OpenReview</publisher-name>).</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lam</surname> <given-names>C.</given-names></name> <name><surname>Yi</surname> <given-names>D.</given-names></name> <name><surname>Guo</surname> <given-names>M.</given-names></name> <name><surname>Lindsey</surname> <given-names>T.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Automated detection of diabetic retinopathy using deep learning,&#x0201D;</article-title> in <source>AMIA Summits on Translational Science Proceedings</source>, 147. <pub-id pub-id-type="pmid">29888061</pub-id></mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>T.</given-names></name> <name><surname>Gao</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>K.</given-names></name> <name><surname>Guo</surname> <given-names>S.</given-names></name> <name><surname>Liu</surname> <given-names>H.</given-names></name> <name><surname>Kang</surname> <given-names>H.</given-names></name></person-group> (<year>2019</year>). <article-title>Diagnostic assessment of deep learning algorithms for diabetic retinopathy screening</article-title>. <source>Inf. Sci</source>. <volume>501</volume>, <fpage>511</fpage>&#x02013;<lpage>522</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ins.2019.06.011</pub-id></mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Litjens</surname> <given-names>G.</given-names></name> <name><surname>Kooi</surname> <given-names>T.</given-names></name> <name><surname>Bejnordi</surname> <given-names>B. E.</given-names></name> <name><surname>Setio</surname> <given-names>A. A. A.</given-names></name> <name><surname>Ciompi</surname> <given-names>F.</given-names></name> <name><surname>Ghafoorian</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>A survey on deep learning in medical image analysis</article-title>. <source>Med. Image Anal</source>. <volume>42</volume>, <fpage>60</fpage>&#x02013;<lpage>88</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2017.07.005</pub-id><pub-id pub-id-type="pmid">28778026</pub-id></mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>J.</given-names></name> <name><surname>Zhao</surname> <given-names>J.</given-names></name> <name><surname>Xiao</surname> <given-names>J.</given-names></name> <name><surname>Zhao</surname> <given-names>G.</given-names></name> <name><surname>Xu</surname> <given-names>P.</given-names></name> <name><surname>Yang</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Unsupervised domain adaptation multi-level adversarial learning-based crossing-domain retinal vessel segmentation</article-title>. <source>Comput. Biol. Med</source>. <volume>178</volume>:<fpage>108759</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.108759</pub-id><pub-id pub-id-type="pmid">38917530</pub-id></mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>T.</given-names></name> <name><surname>Lin</surname> <given-names>W.</given-names></name> <name><surname>Shi</surname> <given-names>G.</given-names></name> <name><surname>Wang</surname> <given-names>W.</given-names></name> <name><surname>Feng</surname> <given-names>M.</given-names></name> <name><surname>Xie</surname> <given-names>X.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Retinal and choroidal vascular perfusion and thickness measurement in diabetic retinopathy patients by the swept-source optical coherence tomography angiography</article-title>. <source>Front. Med.</source> <volume>9</volume>:<fpage>786708</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fmed.2022.786708</pub-id><pub-id pub-id-type="pmid">35372401</pub-id></mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Lu</surname> <given-names>J.</given-names></name> <name><surname>Batra</surname> <given-names>D.</given-names></name> <name><surname>Parikh</surname> <given-names>D.</given-names></name> <name><surname>Lee</surname> <given-names>S.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems (NeurIPS), Volume 32</source> (<publisher-loc>Red Hook</publisher-loc>: <publisher-name>Curran Associates, Inc.</publisher-name>).</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>McGeechan</surname> <given-names>K.</given-names></name> <name><surname>Liew</surname> <given-names>G.</given-names></name> <name><surname>Macaskill</surname> <given-names>P.</given-names></name> <name><surname>Irwig</surname> <given-names>L.</given-names></name> <name><surname>Klein</surname> <given-names>R.</given-names></name> <name><surname>Klein</surname> <given-names>B.</given-names></name> <etal/></person-group>. (<year>2008</year>). <article-title>Meta-analysis: retinal vessel caliber and risk for coronary heart disease</article-title>. <source>Ann. Intern. Med</source>. <volume>149</volume>, <fpage>404</fpage>&#x02013;<lpage>413</lpage>. doi: <pub-id pub-id-type="doi">10.7326/0003-4819-151-6-200909150-00005</pub-id><pub-id pub-id-type="pmid">19755365</pub-id></mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Men</surname> <given-names>Y.</given-names></name> <name><surname>Fhima</surname> <given-names>J.</given-names></name> <name><surname>Celi</surname> <given-names>L. A.</given-names></name> <name><surname>Ribeiro</surname> <given-names>L. Z.</given-names></name> <name><surname>Nakayama</surname> <given-names>L. F.</given-names></name> <name><surname>Behar</surname> <given-names>J. A.</given-names></name></person-group> (<year>2023</year>). <article-title>DRStageNet: deep learning for diabetic retinopathy staging from fundus images</article-title>. <source>arXiv</source> [preprint] arXiv:2312.14891. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2312.14891</pub-id></mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mutawa</surname> <given-names>A. M.</given-names></name> <name><surname>Al-Sabti</surname> <given-names>K.</given-names></name> <name><surname>Raizada</surname> <given-names>S.</given-names></name> <name><surname>Sruthi</surname> <given-names>S.</given-names></name></person-group> (<year>2024</year>). <article-title>A deep learning model for detecting diabetic retinopathy stages with discrete wavelet transform</article-title>. <source>Appl. Sci</source>. <volume>14</volume>:<fpage>4428</fpage>. doi: <pub-id pub-id-type="doi">10.3390/app14114428</pub-id></mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nadeem</surname> <given-names>M. W.</given-names></name> <name><surname>Goh</surname> <given-names>H. G.</given-names></name> <name><surname>Hussain</surname> <given-names>M.</given-names></name> <name><surname>Liew</surname> <given-names>S. Y.</given-names></name> <name><surname>Andonovic</surname> <given-names>I.</given-names></name> <name><surname>Khan</surname> <given-names>M. A.</given-names></name></person-group> (<year>2022</year>). <article-title>Deep learning for diabetic retinopathy analysis: a review, research challenges, and future directions</article-title>. <source>Sensors</source> <volume>22</volume>:<fpage>6780</fpage>. doi: <pub-id pub-id-type="doi">10.3390/s22186780</pub-id><pub-id pub-id-type="pmid">36146130</pub-id></mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Poplin</surname> <given-names>R.</given-names></name> <name><surname>Varadarajan</surname> <given-names>A. V.</given-names></name> <name><surname>Blumer</surname> <given-names>K.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>McConnell</surname> <given-names>M. V.</given-names></name> <name><surname>Corrado</surname> <given-names>G. S.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>Prediction of cardiovascular risk factors from retinal fundus photographs via deep learning</article-title>. <source>Nat. Biomed. Eng</source>. <volume>2</volume>, <fpage>158</fpage>&#x02013;<lpage>164</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41551-018-0195-0</pub-id><pub-id pub-id-type="pmid">31015713</pub-id></mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Pratt</surname> <given-names>H.</given-names></name> <name><surname>Coenen</surname> <given-names>F.</given-names></name> <name><surname>Broadbent</surname> <given-names>D. M.</given-names></name> <name><surname>Harding</surname> <given-names>S. P.</given-names></name> <name><surname>Zheng</surname> <given-names>Y.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;Convolutional neural networks for diabetic retinopathy,&#x0201D;</article-title> in <source>Procedia Computer Science, 20th Conference on Medical Image Understanding and Analysis (MIUA 2016), Vol. 90</source>, 200&#x02013;205. doi: <pub-id pub-id-type="doi">10.1016/j.procs.2016.07.014</pub-id></mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Quellec</surname> <given-names>G.</given-names></name> <name><surname>Charriere</surname> <given-names>K.</given-names></name> <name><surname>Boudi</surname> <given-names>Y.</given-names></name> <name><surname>Cochener</surname> <given-names>B.</given-names></name> <name><surname>Lamard</surname> <given-names>M.</given-names></name></person-group> (<year>2017</year>). <article-title>Deep image mining for diabetic retinopathy screening</article-title>. <source>Med. Image Anal</source>. <volume>39</volume>, <fpage>178</fpage>&#x02013;<lpage>193</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2017.04.012</pub-id><pub-id pub-id-type="pmid">28511066</pub-id></mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rim</surname> <given-names>T. H.</given-names></name> <name><surname>Lee</surname> <given-names>C. J.</given-names></name> <name><surname>Tham</surname> <given-names>Y. C.</given-names></name> <name><surname>Cheung</surname> <given-names>N.</given-names></name> <name><surname>Yu</surname> <given-names>M.</given-names></name> <name><surname>Lee</surname> <given-names>G.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Deep-learning-based cardiovascular risk stratification using coronary artery calcium scores predicted from retinal photographs</article-title>. <source>Lancet Digit. Health</source> <volume>3</volume>, <fpage>e306</fpage>&#x02013;<lpage>e316</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S2589-7500(21)00043-1</pub-id><pub-id pub-id-type="pmid">33890578</pub-id></mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Ronneberger</surname> <given-names>O.</given-names></name> <name><surname>Fischer</surname> <given-names>P.</given-names></name> <name><surname>Brox</surname> <given-names>T.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;U-Net: Convolutional networks for biomedical image segmentation,&#x0201D;</article-title> in <source>International Conference on Medical Image Computing and Computer-Assisted Intervention (MICCAI)</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>234</fpage>&#x02013;<lpage>241</lpage>.</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Saadna</surname> <given-names>Y.</given-names></name> <name><surname>Mezzoudj</surname> <given-names>S.</given-names></name> <name><surname>Khelifa</surname> <given-names>M.</given-names></name></person-group> (<year>2025</year>). <article-title>Efficient transformer architectures for diabetic retinopathy classification from fundus images: DR-MobileViT, DR-EfficientFormer, DR-SwinTiny</article-title>. <source>Informatica</source> <volume>49</volume>.</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Seidelmann</surname> <given-names>S. B.</given-names></name> <name><surname>Claggett</surname> <given-names>B.</given-names></name> <name><surname>Bravo</surname> <given-names>P. E.</given-names></name> <name><surname>Gupta</surname> <given-names>A.</given-names></name> <name><surname>Farhad</surname> <given-names>H.</given-names></name> <name><surname>Klein</surname> <given-names>B. E.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>Retinal vessel calibers in predicting long-term cardiovascular outcomes: the atherosclerosis risk in communities study</article-title>. <source>Circulation</source> <volume>134</volume>, <fpage>1328</fpage>&#x02013;<lpage>1338</lpage>. doi: <pub-id pub-id-type="doi">10.1161/CIRCULATIONAHA.116.023425</pub-id><pub-id pub-id-type="pmid">27682886</pub-id></mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Senapati</surname> <given-names>S.</given-names></name> <name><surname>Tripathy</surname> <given-names>H. K.</given-names></name> <name><surname>Sharma</surname> <given-names>V.</given-names></name> <name><surname>Gandomi</surname> <given-names>A. H.</given-names></name></person-group> (<year>2024</year>). <article-title>Artificial intelligence for diabetic retinopathy detection: a systematic review</article-title>. <source>Inform. Med. Unlocked</source>. <volume>45</volume>:<fpage>101445</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.imu.2024.101445</pub-id></mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shamshad</surname> <given-names>F.</given-names></name> <name><surname>Khan</surname> <given-names>S. H.</given-names></name> <name><surname>Zamir</surname> <given-names>S. W.</given-names></name> <name><surname>Khan</surname> <given-names>M. H.</given-names></name> <name><surname>Hayat</surname> <given-names>M.</given-names></name> <name><surname>Khan</surname> <given-names>F. S.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Transformers in medical imaging: a survey</article-title>. <source>Med. Image Anal</source>. <volume>88</volume>:<fpage>102802</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2023.102802</pub-id><pub-id pub-id-type="pmid">37315483</pub-id></mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Shipra</surname> <given-names>H. E.</given-names></name> <name><surname>Rahman</surname> <given-names>M. S.</given-names></name></person-group> (<year>2024</year>). <article-title>&#x0201C;An explainable artificial intelligence strategy for transparent deep learning in the classification of eye diseases,&#x0201D;</article-title> in <source>2024 IEEE International Conference on Computing, Applications and Systems (COMPAS)</source> (<publisher-loc>Cox&#x00027;s Bazar</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x02013;<lpage>6</lpage>.</mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Song</surname> <given-names>P.</given-names></name> <name><surname>Wu</surname> <given-names>Y.</given-names></name></person-group> (<year>2025</year>). <article-title>DR-ConvNeXt: DR classification method for reconstructing ConvNeXt model structure</article-title>. <source>J. X-ray Sci. Technol.</source> <volume>33</volume>, <fpage>448</fpage>&#x02013;<lpage>460</lpage>. <pub-id pub-id-type="pmid">39973787</pub-id></mixed-citation>
</ref>
<ref id="B45">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ting</surname> <given-names>D. S. W.</given-names></name> <name><surname>Cheung</surname> <given-names>C. Y.-L.</given-names></name> <name><surname>Lim</surname> <given-names>G.</given-names></name> <name><surname>Tan</surname> <given-names>G. S. W.</given-names></name> <name><surname>Quang</surname> <given-names>N. D.</given-names></name> <name><surname>Gan</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Development and validation of a deep learning system for diabetic retinopathy and related eye diseases using retinal images from multiethnic populations with diabetes</article-title>. <source>JAMA</source> <volume>318</volume>, <fpage>2211</fpage>&#x02013;<lpage>2223</lpage>. doi: <pub-id pub-id-type="doi">10.1001/jama.2017.18152</pub-id><pub-id pub-id-type="pmid">29234807</pub-id></mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tseng</surname> <given-names>R. M. W. W.</given-names></name> <name><surname>Rim</surname> <given-names>T. H.</given-names></name> <name><surname>Shantsila</surname> <given-names>E.</given-names></name> <name><surname>Yi</surname> <given-names>J. K.</given-names></name> <name><surname>Park</surname> <given-names>S.</given-names></name> <name><surname>Kim</surname> <given-names>S. S.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Validation of a deep-learning-based retinal biomarker (reti-CVD) in the prediction of cardiovascular disease: data from uk biobank</article-title>. <source>BMC Med</source>. <volume>21</volume>:<fpage>28</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12916-022-02684-8</pub-id><pub-id pub-id-type="pmid">36691041</pub-id></mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Voets</surname> <given-names>M.</given-names></name> <name><surname>M&#x000F8;llersen</surname> <given-names>K.</given-names></name> <name><surname>Bongo</surname> <given-names>L. A.</given-names></name></person-group> (<year>2019</year>). <article-title>Replication study: development and validation of a deep learning algorithm for detection of diabetic retinopathy in retinal fundus photographs</article-title>. <source>PLoS ONE</source> <volume>14</volume>:<fpage>e0217541</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pone.0217541</pub-id></mixed-citation>
</ref>
<ref id="B48">
<mixed-citation publication-type="dataset"><person-group person-group-type="author"><name><surname>Wagih</surname> <given-names>A.</given-names></name></person-group> (<year>2023</year>). <source>Retina Blood Vessel Dataset</source>.</mixed-citation>
</ref>
<ref id="B49">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>W.</given-names></name> <name><surname>Kong</surname> <given-names>W.</given-names></name> <name><surname>He</surname> <given-names>Y.</given-names></name> <name><surname>Jiang</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>&#x0201C;Domain adaptation model for retinopathy detection from cross-domain oct images,&#x0201D;</article-title> in <source>Proceedings of the Third Conference on Medical Imaging with Deep Learning (MIDL)</source>, eds. T. Arbel, I. B. Ayed, M. de Bruijne, M. Descoteaux, H. Lombaert, and C. Pal (New York: PMLR), <fpage>795</fpage>&#x02013;<lpage>810</lpage>.</mixed-citation>
</ref>
<ref id="B50">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Warner</surname> <given-names>A.</given-names></name> <name><surname>Lee</surname> <given-names>J.</given-names></name> <name><surname>Hsu</surname> <given-names>W.</given-names></name> <name><surname>Syeda-Mahmood</surname> <given-names>T.</given-names></name> <name><surname>Kahn</surname> <given-names>C. E.</given-names></name> <name><surname>Gevaert</surname> <given-names>O.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Multimodal machine learning in image-based and clinical biomedicine: survey and prospects</article-title>. <source>Int. J. Comp. Vision</source>. <volume>132</volume>, <fpage>3753</fpage>&#x02013;<lpage>3769</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11263-024-02032-8</pub-id><pub-id pub-id-type="pmid">39211895</pub-id></mixed-citation>
</ref>
<ref id="B51">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>Y.</given-names></name> <name><surname>Cai</surname> <given-names>Z.</given-names></name> <name><surname>Qiu</surname> <given-names>S.</given-names></name> <name><surname>Xu</surname> <given-names>P.</given-names></name></person-group> (<year>2024</year>). <article-title>Vision transformer with masked autoencoders for referable diabetic retinopathy classification based on large-size retina image</article-title>. <source>PLoS One</source> <volume>19</volume>:<fpage>e0299265</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pone.0299265</pub-id><pub-id pub-id-type="pmid">38446810</pub-id></mixed-citation>
</ref>
<ref id="B52">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yao</surname> <given-names>Z.</given-names></name> <name><surname>Yuan</surname> <given-names>Y.</given-names></name> <name><surname>Shi</surname> <given-names>Z.</given-names></name> <name><surname>Mao</surname> <given-names>W.</given-names></name> <name><surname>Zhu</surname> <given-names>G.</given-names></name> <name><surname>Zhang</surname> <given-names>G.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>FunSwin: A deep learning method to analysis diabetic retinopathy grade and macular edema risk based on fundus images</article-title>. <source>Front. Physiol.</source> <volume>13</volume>:<fpage>961386</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fphys.2022.961386</pub-id><pub-id pub-id-type="pmid">35957992</pub-id></mixed-citation>
</ref>
<ref id="B53">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yi</surname> <given-names>S. L.</given-names></name> <name><surname>Yang</surname> <given-names>X. L.</given-names></name> <name><surname>Wang</surname> <given-names>T. W.</given-names></name> <name><surname>She</surname> <given-names>F. R.</given-names></name> <name><surname>Xiong</surname> <given-names>X.</given-names></name> <name><surname>He</surname> <given-names>J. F.</given-names></name></person-group> (<year>2021</year>). <article-title>Diabetic retinopathy diagnosis based on RA-EfficientNet</article-title>. <source>Appl. Sci.</source> <volume>11</volume>:<fpage>11035</fpage>. doi: <pub-id pub-id-type="doi">10.3390/app112211035</pub-id></mixed-citation>
</ref>
<ref id="B54">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>F.</given-names></name> <name><surname>Koltun</surname> <given-names>V.</given-names></name></person-group> (<year>2015</year>). <article-title>Multi-scale context aggregation by dilated convolutions</article-title>. <source>arXiv</source> preprint arXiv:1511.07122. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1511.07122</pub-id></mixed-citation>
</ref>
<ref id="B55">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>S.</given-names></name> <name><surname>Fu</surname> <given-names>H.</given-names></name> <name><surname>Yan</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Wu</surname> <given-names>Q.</given-names></name> <name><surname>Yang</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>&#x0201C;Attention guided network for retinal image segmentation,&#x0201D;</article-title> in <source>Proceedings of Medical Image Computing and Computer-Assisted Intervention (MICCAI)</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer Science and Business Media Deutschland GmbH</publisher-name>).</mixed-citation>
</ref>
<ref id="B56">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>H.-Y.</given-names></name> <name><surname>Yu</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>C.</given-names></name> <name><surname>Zhang</surname> <given-names>S.</given-names></name> <name><surname>Gao</surname> <given-names>Y.</given-names></name> <name><surname>Pan</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>A transformer-based representation-learning model with unified processing of multimodal input for clinical diagnostics</article-title>. <source>Nat. Biomed. Eng</source>. <volume>7</volume>, <fpage>743</fpage>&#x02013;<lpage>755</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41551-023-01045-x</pub-id><pub-id pub-id-type="pmid">37308585</pub-id></mixed-citation>
</ref>
<ref id="B57">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>Q.</given-names></name> <name><surname>Guo</surname> <given-names>Y.</given-names></name> <name><surname>Liu</surname> <given-names>W.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Lin</surname> <given-names>Y.</given-names></name></person-group> (<year>2025</year>). <article-title>Enhancing pathological feature discrimination in diabetic retinopathy multi-classification with self-paced progressive multi-scale training</article-title>. <source>Sci. Rep.</source> <volume>15</volume>:<fpage>25705</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-025-07050-1</pub-id><pub-id pub-id-type="pmid">40670454</pub-id></mixed-citation>
</ref>
<ref id="B58">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zuiderveld</surname> <given-names>K.</given-names></name></person-group> (<year>1994</year>). <source>Contrast Limited Adaptive Histogram Equalization</source>. <publisher-loc>London</publisher-loc>: <publisher-name>Academic Press</publisher-name>.</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3089114/overview">Vinitra Swamy</ext-link>, Swiss Federal Institute of Technology Lausanne, Switzerland</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3264871/overview">Biswadip Basu Mallik</ext-link>, Institute of Engineering and Management (IEM), India</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3288670/overview">Jinyuan Wang</ext-link>, Tsinghua University, China</p>
</fn>
</fn-group>
</back>
</article>