<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Med.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Medicine</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Med.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-858X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmed.2026.1732109</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Enhancing fundus image analysis for diabetic retinopathy using CheXNet with CBAM and Grad-CAM visualization</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Al-Dolat</surname> <given-names>Wedad</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Alhatamleh</surname> <given-names>Salem</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<uri xlink:href="https://loop.frontiersin.org/people/3093003"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Alqudah</surname> <given-names>Noor</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/1485699"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Alhazimi</surname> <given-names>Amro</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/3255319"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Amin</surname> <given-names>Mohammad</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<uri xlink:href="https://loop.frontiersin.org/people/2898364"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Daamseh</surname> <given-names>Aseel</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Madain</surname> <given-names>Rola</given-names></name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/3168219"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Malkawi</surname> <given-names>Raghad</given-names></name>
<xref ref-type="aff" rid="aff7"><sup>7</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Al-Omari</surname> <given-names>Rami</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/3255151"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Almarek</surname> <given-names>Faisal</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Aljefri</surname> <given-names>Sarah Husam</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Ophthalmology, Faculty of Medicine, Yarmouk University</institution>, <city>Irbid</city>, <country country="JO">Jordan</country></aff>
<aff id="aff2"><label>2</label><institution>Computer Science Department, Faculty of Information Technology and Computer Sciences, Yarmouk University</institution>, <city>Irbid</city>, <country country="JO">Jordan</country></aff>
<aff id="aff3"><label>3</label><institution>Department of Ophthalmology, Faculty of Medicine, Jordan University of Science and Technology</institution>, <city>Irbid</city>, <country country="JO">Jordan</country></aff>
<aff id="aff4"><label>4</label><institution>Department of Ophthalmology, College of Medicine, Imam Mohammad Ibn Saud Islamic University (IMSIU)</institution>, <city>Riyadh</city>, <country country="SA">Saudi Arabia</country></aff>
<aff id="aff5"><label>5</label><institution>Faculty of Medicine, Jordan University of Science and Technology</institution>, <city>Irbid</city>, <country country="JO">Jordan</country></aff>
<aff id="aff6"><label>6</label><institution>Department of Obstetrics and Gynecology, Faculty of Medicine, Jordan University of Science and Technology</institution>, <city>Irbid</city>, <country country="JO">Jordan</country></aff>
<aff id="aff7"><label>7</label><institution>Department of Pharmacy, Jordan University of Science and Technology</institution>, <city>Irbid</city>, <country country="JO">Jordan</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Amro Alhazimi, <email xlink:href="mailto:ayalhazimi@imamu.edu.sa">ayalhazimi@imamu.edu.sa</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-25">
<day>25</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>13</volume>
<elocation-id>1732109</elocation-id>
<history>
<date date-type="received">
<day>25</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>06</day>
<month>02</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>06</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Al-Dolat, Alhatamleh, Alqudah, Alhazimi, Amin, Daamseh, Madain, Malkawi, Al-Omari, Almarek and Aljefri.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Al-Dolat, Alhatamleh, Alqudah, Alhazimi, Amin, Daamseh, Madain, Malkawi, Al-Omari, Almarek and Aljefri</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-25">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Diabetic retinopathy (DR) is a leading cause of vision impairment among individuals with diabetes. Early detection and accurate grading are essential for timely clinical management. However, developing robust models for automated interpretation and grading of fundus images remains challenging due to variability in lesion appearance and image quality.</p></sec>
<sec>
<title>Methods</title>
<p>This study proposes a deep learning framework for DR classification from fundus images based on a DenseNet121 backbone initialized with CheXNet weights. A Convolutional Block Attention Module (CBAM) is integrated to enhance feature representation through channel and spatial attention mechanisms in a data-driven manner. In addition, Gradient&#x02013;weighted Class Activation Mapping (Grad&#x02013;CAM) is employed to provide post hoc visual explanations of model predictions. The proposed CheXNet_CBAM model is evaluated against several convolutional neural network architectures, including CheXNet, DenseNet121, MobileNetV2, VGG19, and ResNet50, using the APTOS 2019 and DDR datasets.</p></sec>
<sec>
<title>Results</title>
<p>On the APTOS 2019 dataset, the proposed model achieves an accuracy of 96.12%, while on the DDR dataset it attains 96.33%, outperforming the compared architectures on both benchmarks.</p></sec>
<sec>
<title>Discussion</title>
<p>The results indicate that incorporating CBAM improves discriminative feature learning within a DenseNet121&#x02013;based framework. While the model demonstrates strong performance across two public datasets, further prospective evaluation and external validation are required to assess its clinical applicability in real&#x02013;world settings.</p></sec></abstract>
<kwd-group>
<kwd>deep learning</kwd>
<kwd>diabetic retinopathy</kwd>
<kwd>fundus imaging</kwd>
<kwd>Grad-CAM</kwd>
<kwd>image classification</kwd>
</kwd-group>
<funding-group>
  <funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was supported and funded by the Deanship of Scientific Research at Imam Mohammad Ibn Saud Islamic University (IMSIU) (grant number IMSIU-DDRSP2601).</funding-statement>
</funding-group>
<counts>
<fig-count count="9"/>
<table-count count="10"/>
<equation-count count="25"/>
<ref-count count="53"/>
<page-count count="19"/>
<word-count count="12604"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Ophthalmology</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Diabetes mellitus (DM) describes a group of chronic endocrinological disorders characterized by sustained hyperglycemia in untreated individuals (<xref ref-type="bibr" rid="B1">1</xref>). A total of 415 million people between the ages of 20 and 79 were estimated to be suffering from the disease in 2015, 5 million of whom are estimated to have lost their lives from diabetic complications in that same year, and historical data show a steady increase in incidence rate as the years pass by and the global population increases (<xref ref-type="bibr" rid="B2">2</xref>).</p>
<p>Diabetic retinopathy (DR) is a neurovascular complication of diabetes that damages the microvasculature in the retina. Endothelial cells, gliocytes, pericytes, and white blood cells are among the various cell types affected by sustained high blood glucose levels. Therefore, leading to changes in the permeability of small blood vessels and blood perfusion levels to the retina, causing retinal leakage and retinal ischemia (<xref ref-type="bibr" rid="B3">3</xref>).</p>
<p>DR remains the primary cause of vision loss among the working-age population. Worldwide, DR ranked as the fifth most common cause of both preventable blindness and moderate to severe visual impairment between 1990 and 2010 (<xref ref-type="bibr" rid="B4">4</xref>). It&#x00027;s diagnosed based on characteristic vascular abnormalities seen during clinical examination of the eye. In 1968, the Airlie House classification was first introduced and is considered the basis of all modern DR classification systems. This classification categorizes DR into non-proliferative diabetic retinopathy (NPDR) and proliferative diabetic retinopathy (PDR) (<xref ref-type="bibr" rid="B5">5</xref>). NPDR features two main vascular changes in the retina: elevated permeability and capillary occlusion. During this phase, microaneurysms, hemorrhages, and hard exudates are observed by fundus photography (<xref ref-type="bibr" rid="B6">6</xref>). Proliferative diabetic retinopathy, the more advanced phase, is characterized by abnormal growth of new retinal vessels. Most patients during this stage experience vision impairment due to retinal detachment or vitreous hemorrhage (<xref ref-type="bibr" rid="B7">7</xref>).</p>
<p>Slight modifications were made to the Airlie House classification in 1991 for the Early Treatment of Diabetic Retinopathy Study (ETDRS). The ETDRS severity scale is widely used in clinical research and trials to predict progression and evaluate treatment plans (<xref ref-type="bibr" rid="B8">8</xref>). The retinopathy severity level was assessed at the eye level, with 14 levels ranging from level 10 (No DR) to level 85 (PDR) (<xref ref-type="bibr" rid="B5">5</xref>). Starting with (level 10&#x02013;13), known as no diabetic retinopathy, the retina is healthy with no evidence of microaneurysms or hemorrhages. Followed by mild non-proliferative retinopathy (level 14&#x02013;20) that is characterized by microaneurysms only. On the other hand, moderate non-proliferative retinopathy (levels 35&#x02013;53) is characterized by the presence of two or more of the following features, such as microaneurysms, hemorrhages, and hard exudates. Severe non-proliferative retinopathy (level 53&#x02013;61) is marked by 20 hemorrhages in each of the four quadrants, venous beading in two quadrants, or intraretinal microvascular abnormalities in one quadrant. The most advanced stage is proliferative diabetic retinopathy (level 61&#x02013;85), where the vasoproliferative factors produced by the retina trigger neovascularization, often leading to vitreous hemorrhage (<xref ref-type="bibr" rid="B9">9</xref>).</p>
<p>The type of DM influences the occurrence and progression of DR. 10%&#x2013;15% of diabetics are of type 1, whereas the remaining patients are type 2 diabetics. Within 10 years, 71%&#x2013;90% of patients with type 1 diabetes and 67% of type 2 diabetes will develop DR. Thus, screening is highly significant throughout the disease&#x00027;s long latent phase to prevent vision loss (<xref ref-type="bibr" rid="B10">10</xref>).</p>
<p>Color fundus photography (CFP) has been acknowledged as the gold standard for the screening and analysis of diabetic retinopathy (DR), while dilated fundus examination is the mainstay of early diagnosis. Several major organizations, including the American Academy of Ophthalmology, the American Diabetes Association, and the Canadian Ophthalmological Society, released screening guidelines stating that patients with type 2 DM should have a dilated fundus examination at the time of diagnosis, while those with type 1 DM should have their first examination 5 years after diagnosis. In both cases, yearly follow-up examinations are suggested (<xref ref-type="bibr" rid="B11">11</xref>). Although measures are still taken to inform patients and doctors about the situation, a significant portion of the patients who are supposed to be treated medically remain untreated, for reasons such as poor compliance or lack of access to screening for retinal diseases (<xref ref-type="bibr" rid="B12">12</xref>). Traditional techniques for determining DR are often slow, require specific clinical inpatient appointments, and make the patient wait for a prolonged period to get prepared, which is why such diagnoses are sometimes delayed or patients have no access to screenings (<xref ref-type="bibr" rid="B13">13</xref>).</p>
<p>As diabetes incidence approaches pandemic levels, implementing advanced cost-effective methods for early detection of DR is essential (<xref ref-type="bibr" rid="B13">13</xref>). One promising approach is the use of AI models. AI has evolved over the years from basic experimental models to systems used daily in healthcare settings (<xref ref-type="bibr" rid="B14">14</xref>). Processing and evaluating large quantities of retinal images is difficult due to early detection concerns; these activities usually require higher accuracy and robustness. According to studies, these disadvantages have shifted the focus of research toward deep learning-based automated detection systems. Deep learning is increasingly being used to solve problems with medical image classification. Deep convolutional neural networks (DNNs) outperform other computer vision techniques. Over the last decade, numerous scholars have developed unique DNN architectures for picture categorization. MobileNet (<xref ref-type="bibr" rid="B15">15</xref>), VGGNet, ResNet50, InceptionNet, and XceptionNet (<xref ref-type="bibr" rid="B16">16</xref>), among many other models, are popular.</p>
<p>The goal of this study is to use deep learning techniques based on the APTOS 2019 and DDR datasets to create a fully automated system for diagnosing the severity of diabetic retinopathy. The primary contributions of the suggested model are as follows:</p>
<list list-type="bullet">
<list-item><p>This study introduces a new hybrid model, CheXNet_CBAM, for detecting and distinguishing diabetic retinopathy from normal fundus images.</p></list-item>
<list-item><p>Use of Gradient Maps (Grad-CAM) to illustrate the regions most influential in the classification process, enhancing the interpretability of the model.</p></list-item>
<list-item><p>Achieving high performance on diverse datasets (96.12% on APTOS-2019 and 96.33% on DDR) demonstrates the effectiveness of the model in supporting early diagnosis and clinical practice.</p></list-item>
<list-item><p>This approach achieves superior diagnosis accuracy compared to other state-of-the-art methods, including CheXNet, DenseNet121, MobileNetV2, VGG19, and ResNet50 models.</p></list-item>
<list-item><p>The study showed that the proposed model outperforms most previous models and methods in the field of diabetic retinopathy diagnosis, in terms of accuracy and ability to focus on clinically important areas.</p></list-item>
</list>
<p>In Section 2, the methods used are discussed, followed by an extensive discussion on the dataset, and the last part is a sketch of the proposed approach and training techniques. Section 3 performs data analysis and rates the proposed model as to its effectiveness in the diabetic retinopathy diagnostic tests. Section 4 highlights the most significant research in the field of diabetic retinopathy diagnosis and contrasts it with the proposed model. Finally, Section 5 wraps up the paper with conclusions and suggestions for future research.</p></sec>
<sec id="s2">
<label>2</label>
<title>Methodology</title>
<p>A deep learning framework is put forth in this research, which classifies diabetic retinopathy (DR) automatically into five levels of severity, namely: No DR, mild, moderate, severe, and proliferative. The model proposed, CheXNet_CBAM, depicted in <xref ref-type="fig" rid="F1">Figure 1</xref> is a modification of the architecture of DenseNet121. It imitates the configuration of CheXNet for examining chest X-ray images and modifies it for retinal fundus imaging. To improve the representation of features, we add a convolutional block attention module (CBAM), which integrates channel and spatial attention, thus allowing the network to concentrate more on the retinal regions that are clinically important. To make the model more robust and at the same time to further reduce the likelihood of overfitting, the DropBlock algorithm is used, which is a technique to drop contiguous patches of feature maps instead of just isolated pixels. The Transformer encoder layers are placed after the convolutional backbone to effectively model long-range dependencies, thus capturing both local textures and global structural patterns. Class-weighted loss functions counteract existing imbalances in the dataset, working effectively with the aggressive data augmentation techniques such as rotation, brightness changes, and zooming employed. Among the key hyperparameters are the learning rate, dropout value, and block size, all thoroughly tuned to guarantee stable, effective training. For interpretation, we use an improved Grad-CAM visualization to highlight retinal regions used in the model prediction. These improvements &#x2013; CBAM, DropBlock, the hybrid CNN-Transformer architecture, and explainable visualizations &#x2013; contribute to a substantial improvement in accuracy, generalization, and clinical dependability over and above the baseline CheXNet model.</p>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>The complete architecture of the CheXNet_CBAM model for diagnosing diabetic retinopathy.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1732109-g0001.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a computer vision pipeline for retinal image analysis, including steps for preprocessing, data augmentation, model input, and neural network architecture. Boxes detail processes like resizing, normalization, augmentation, and layers such as Dense Block, CBAM, Batch Norm, and Grad-CAM output.</alt-text>
</graphic>
</fig>
<sec>
<label>2.1</label>
<title>Dataset acquisition</title>
<p>The dataset considered in this study consists of retinal fundus images, which were pre-processed with an isotropic Gaussian filter to remove noise and improve clarity. The original dataset was obtained during the APTOS 2019 Blindness Detection challenge (<xref ref-type="bibr" rid="B17">17</xref>). Each image was labeled with the severity of diabetic retinopathy (DR) via the train.csv file that came along with the original dataset. Images are arranged into five separate folders, each corresponding to one DR stage. The class-wise distribution is given in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Distribution of retinal images across diabetic retinopathy severity levels for the APTOS 2019 dataset.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Class (Severity)</bold></th>
<th valign="top" align="center"><bold>Number of images</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">No_DR (No Diabetic Retinopathy)</td>
<td valign="top" align="center">1,805</td>
</tr>
<tr>
<td valign="top" align="left">Mild</td>
<td valign="top" align="center">370</td>
</tr>
<tr>
<td valign="top" align="left">Moderate</td>
<td valign="top" align="center">999</td>
</tr>
<tr>
<td valign="top" align="left">Severe</td>
<td valign="top" align="center">193</td>
</tr>
<tr>
<td valign="top" align="left">Proliferate_DR (Proliferative DR)</td>
<td valign="top" align="center">295</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Total</bold></td>
<td valign="top" align="center"><bold>3,662</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold value (Total = 3,662) indicates the aggregate sum of the dataset distribution.</p>
</table-wrap-foot>
</table-wrap>
<p>The Diabetic Retinopathy (DDR) dataset comprises 13,673 retinal fundus images collected from 147 hospitals across 23 provinces in China (<xref ref-type="bibr" rid="B18">18</xref>). Each image is assigned one of five classes depicting diabetic retinopathy (DR) severity: No_DR, Mild, Moderate, Severe, and Proliferative_DR. The original dataset included a sixth category corresponding to poor-quality images, which were excluded in this study to ensure reliable training. This brings the total number to 12,522 images. Also, all of the images were preprocessed to remove the black background, since this boosted the visibility of retinal structures and removed irrelevant information. The distribution of images in the five DR severity levels is given in <xref ref-type="table" rid="T2">Table 2</xref>. <xref ref-type="fig" rid="F2">Figure 2</xref> shows fundus images from the dataset.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Distribution of images in the DDR dataset.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Class (Severity)</bold></th>
<th valign="top" align="center"><bold>Number of images</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">No_DR (No Diabetic Retinopathy)</td>
<td valign="top" align="center">6,266</td>
</tr>
<tr>
<td valign="top" align="left">Mild</td>
<td valign="top" align="center">630</td>
</tr>
<tr>
<td valign="top" align="left">Moderate</td>
<td valign="top" align="center">4,477</td>
</tr>
<tr>
<td valign="top" align="left">Severe</td>
<td valign="top" align="center">236</td>
</tr>
<tr>
<td valign="top" align="left">Proliferate_DR (Proliferative DR)</td>
<td valign="top" align="center">913</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Total</bold></td>
<td valign="top" align="center"><bold>12,522</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold value (Total = 12,522) indicates the aggregate sum of the dataset distribution.</p>
</table-wrap-foot>
</table-wrap>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>Example of each class of the data set.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1732109-g0002.tif">
<alt-text content-type="machine-generated">Five retinal fundus photographs are arranged left to right, each labeled below: No_DR (clear retina), Mild (slight brightness), Moderate (darker with subtle changes), Severe (more visible lesions), and Proliferate_DR (intense red-orange with abnormal vessels).</alt-text>
</graphic>
</fig>
<p>Potential confounding factors such as patient age, gender, geographic region, and imaging device variability were addressed at the dataset acquisition level. The DDR dataset was collected from 147 hospitals across 23 provinces using standardized acquisition principles and diverse fundus camera types, with balanced demographic representation. This multicenter and multi-device design reduces systematic bias and improves generalizability. In both the DDR and APTOS 2019 datasets, diabetic retinopathy severity labels serve as the primary stratification variable, reflecting clinically meaningful disease progression. As the study is image-centered and does not involve patient-level intervention or outcome modeling, statistical confounding adjustment methods such as propensity score matching are not directly applicable.</p>
<p>The sample size used in this study was determined by the availability of publicly accessible benchmark datasets, namely APTOS-2019 and DDR, which are widely adopted in diabetic retinopathy research. As this work follows a data-driven deep learning framework rather than a hypothesis-testing statistical design, conventional sample size calculation, effect size estimation, and power analysis are not directly applicable. Model validity is instead assessed through generalization performance on independent test sets and consistent comparative evaluation across multiple architectures and datasets. This approach is standard practice in contemporary medical image analysis research.</p>
</sec>
<sec>
<label>2.2</label>
<title>CheXNet model structure</title>
<p>CheXNet refers to a deep CNN model designed specifically for the automated identification of pneumonia and thoracic illnesses from chest X-ray images (<xref ref-type="bibr" rid="B19">19</xref>). The model is based on DenseNet121, a 121-layer deep architecture, which introduces dense connections between all the layers and their preceding layers. This direct linkage of layers enables a layer to access the feature maps of all previous layers at once, which improves the gradient flow, promotes reuse of features and has fewer parameters than a conventional deep CNN of the same depth. This architecture helps a lot with medical imaging since it recognizes very subtle patterns at multiple scales. DenseNet121 is a construction of dense blocks with transition layers, such that in a dense block, each layer takes the concatenation of the feature maps of all preceding layers as input. Formally, for a layer <italic>l</italic> with input feature maps <italic>x</italic><sub>0</sub>, <italic>x</italic><sub>1</sub>, &#x02026;, <italic>x</italic><sub><italic>l</italic>&#x02212;1</sub>, the output <italic>x</italic><sub><italic>l</italic></sub> is:</p>
<disp-formula id="EQ1"><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(1)</label></disp-formula>
<p>Where ([<italic>x</italic><sub>0</sub>, <italic>x</italic><sub>1</sub>, &#x02026;, <italic>x</italic><sub><italic>l</italic>&#x02212;1</sub>]) is the concatenation operator and <italic>H</italic><sub><italic>l</italic></sub>(&#x000B7;) is a composite function of Batch Normalization, ReLU activation, and a 3 &#x000D7; 3 convolution. Thus, this setup allows the network to blend low- and high-level features efficiently to pick up subtle patterns in medical images. Transition layers lie between dense blocks to shrink the size of feature maps and reduce channel numbers to increase computational efficiency and counter overfitting. Each transition layer carries out 1 &#x000D7; 1 convolution with 2 &#x000D7; 2 average pooling as follows:</p>
<disp-formula id="EQ2"><mml:math id="M2"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mtext>transition</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>A</mml:mi><mml:mi>v</mml:mi><mml:mi>g</mml:mi><mml:mi>P</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:msub><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mrow><mml:mi>v</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(2)</label></disp-formula>
<p>After the final dense block, CheXNet applies Global Average Pooling (GAP) to reduce each feature map to a single value (<xref ref-type="bibr" rid="B20">20</xref>). The resulting vector is then passed through a fully connected layer with sigmoid activation for multi-label classification:</p>
<disp-formula id="EQ3"><mml:math id="M3"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>&#x00177;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>G</mml:mi><mml:mi>A</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext>&#x000A0;</mml:mtext><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x022EF;</mml:mo><mml:mspace width="0.3em" class="thinspace"/><mml:mo>,</mml:mo><mml:mi>C</mml:mi></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(3)</label></disp-formula>
<p>where <italic>C</italic> is the number of disease classes, <inline-formula><mml:math id="M4"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> and <italic>b</italic><sub><italic>i</italic></sub> are learnable weights and biases, respectively, and &#x003C3;(&#x000B7;) gives the probability of each class between 0 and 1. CheXNet is trained by means of binary cross-entropy loss so that the network can output probabilities closely reflecting actual presence and absence of each disease. There are a number of limitations of CheXNet, though. It belongs to the class of domain-specific algorithms because it was originally trained on chest X-ray images; thus, direct application to other modalities, such as retinal images, is suboptimal. DenseNet (<xref ref-type="bibr" rid="B21">21</xref>), on the other hand, mainly extracts local features and thus it cannot maximally utilize the long-range spatial relation between features. Nevertheless, CheXNet, with its dense connectivity, multi-scale feature reuse, and deep design, provides a very good backbone for analysis of medical images. Nonetheless, applying it to a new domain usually requires modifications to the architecture in terms of attention modules, stronger regularization, and improved interpretability techniques. In short, to make CheXNet work for retinal fundus images, it must be adapted to emphasize certain clinically relevant eye regions and to capture long-range dependencies. Hence, attention modules, hybrid CNN-transformer architectures, and other specialized training techniques are some of the improvements that must be considered for reliable classification of eye diseases. These improvements and architectural changes are described in detail in the next section. <xref ref-type="fig" rid="F3">Figure 3</xref> shows the hierarchical structure of the CheXNet model.</p>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>Hierarchical structure of the CheXNet model with details of condensed blocks and processing layers.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1732109-g0003.tif">
<alt-text content-type="machine-generated">Schematic diagram of a deep learning pipeline for image analysis, showing fundus images from a real dataset processed through convolutional, pooling, dense, transition, and DropBlock layers, ending with an output classification stage.</alt-text>
</graphic>
</fig>
<p>Although CheXNet was originally proposed for chest X-ray image analysis, in this work, it is adapted to the retinal fundus image domain through transfer learning and architectural modification. The DenseNet121 backbone of CheXNet is initialized using pretrained weights, while the original classification layer&#x02014;designed for thoracic disease detection&#x02014;is removed and replaced with a task-specific fully connected layer tailored to diabetic retinopathy classification. The network is then fine-tuned on retinal fundus datasets, allowing the learned low-level and mid-level visual representations to be adjusted to the distinct texture, color distribution, and lesion patterns of fundus images. Furthermore, to mitigate the domain gap between chest X-ray and retinal imaging modalities, a Convolutional Block Attention Module (CBAM) is incorporated into the architecture. This enables the model to emphasize clinically relevant retinal structures such as microaneurysms, hemorrhages, and exudates, while suppressing irrelevant background information. Through this adaptation strategy, CheXNet serves as a robust and transferable backbone rather than a domain-restricted model, making it suitable for accurate diabetic retinopathy diagnosis.</p>
<p>Unlike existing diabetic retinopathy studies that primarily apply attention mechanisms directly to generic DenseNet-based architectures or design task-specific CNNs from scratch, this work introduces a novel adaptation of the CheXNet architecture&#x02014;originally developed and validated for chest X-ray analysis&#x02014;into the retinal fundus imaging domain. The novelty of the proposed CheXNet_CBAM model lies in three key aspects. First, it repurposes CheXNet as a medical-image backbone beyond thoracic imaging and systematically enhances it with a convolutional block attention module (CBAM) to address the unique spatial and pathological characteristics of retinal fundus images. Second, the integration of CBAM enables joint channel-wise and spatial attention within a densely connected architecture, allowing the model to emphasize clinically relevant retinal regions (e.g., microaneurysms, hemorrhages, and exudates) without introducing additional supervision or lesion annotations. Third, the proposed framework is extensively validated across two heterogeneous public datasets (APTOS-2019 and DDR), demonstrating strong cross-dataset generalization, which is rarely examined in prior attention-augmented DenseNet or CheXNet-based DR studies. Furthermore, the combined use of CBAM with Grad-CAM visualization enhances model interpretability, providing clinically meaningful explanations that support practical deployment. Together, these contributions distinguish the proposed CheXNet_CBAM model from existing attention-based DR approaches and highlight its incremental methodological and clinical value.</p>
</sec>
<sec>
<label>2.3</label>
<title>Enhancements on CheXNet</title>
<p>In the proposed model, we highlight a number of modifications that have to do with CheXNet in such a way that it becomes a suitable tool for retinal fundus image analysis. As a first step, various data preprocessing and augmentation techniques have been performed to alleviate the problems caused by the variability of images and the imbalance of classes. After that, a Convolutional Block Attention Module (CBAM) is added to the model to inform the network of clinically significant retinal areas. To conclude, DropBlock regularization is introduced as a means of preventing overfitting and improving the model&#x00027;s capability of generalization. Additionally, Transformer encoder layers are added after the feature extraction from convolutional networks to allow the model to capture long-range dependencies and global structural patterns. The learning rate, dropout, and block size are among the hyperparameters whose careful tuning ensures the training stability. Lastly, for the sake of model interpretability, enhanced Grad-CAM visualization is employed to delineate regions that are specific to the lesion, and this, in turn, provides clinical transparency. In total, these changes raise the baseline CheXNet applied to retinal disease classification in terms of accuracy, robustness, and reliability to a considerable extent.</p>
<sec>
<label>2.3.1</label>
<title>Data preprocessing and augmentation for imbalanced datasets</title>
<p>Images in the retinal fundus dataset were resized to 224 &#x000D7; 224 pixels in order to get them ready for model training. The input resolution needed by DenseNet121 (<xref ref-type="bibr" rid="B22">22</xref>) was fulfilled this way. Normalization of pixel intensities was performed so that all the pixel values fell within the range of 0&#x02013;1; then, they were standardized to have zero mean and unit variance. This was expressed as:</p>
<disp-formula id="EQ4"><mml:math id="M5"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mtext>x</mml:mtext></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext>x</mml:mtext><mml:mo>-</mml:mo><mml:mtext>&#x003BC;</mml:mtext></mml:mrow><mml:mrow><mml:mtext>&#x003C3;</mml:mtext></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(4)</label></disp-formula>
<p>x stands for the original pixel value, &#x003BC; stands for the mean of the dataset, and &#x003C3; stands for the standard deviation. This preprocessing step not only reduces light variability but also improves contrast and hence, more stable and effective feature extraction is possible (<xref ref-type="bibr" rid="B23">23</xref>). To deal with the issue of class imbalance in the dataset, extensive augmentation techniques were used. Random geometric transformations, including rotation, scaling, and horizontal flipping, mimicked changes in patient positioning, meanwhile photometric adjustments, like brightness and contrast jittering, took care of the variations in imaging conditions. Brightness adjustment can be defined mathematically as:</p>
<disp-formula id="EQ5"><mml:math id="M6"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mtext>I</mml:mtext></mml:mrow><mml:mrow><mml:mtext>&#x02032;</mml:mtext></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mtext>&#x003B1;I</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x003B2;</mml:mtext></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(5)</label></disp-formula>
<p><italic>I</italic> stands for the original image, &#x003B1; is the parameter for controlling contrast and &#x003B2; is for adjusting brightness. Implementation details (as used in this study): in practice, augmentation was implemented using the Keras ImageDataGenerator to improve robustness to acquisition and positioning variability. The applied transformations were random rotations (up to 40&#x000B0;), width and height shifts (up to 0.2), shear (up to 0.2), zoom (up to 0.2), horizontal flipping, and nearest-neighbor filling. These geometric transformations enhance generalization by simulating plausible variations in imaging conditions and patient positioning. In addition, photometric variations (e.g., brightness/contrast changes) are conceptually captured by <xref ref-type="disp-formula" rid="EQ5">Equation 5</xref> and can be applied through brightness/contrast jittering within the augmentation policy when needed.</p>
<p>Besides the augmentation, the classes&#x00027; sample sizes were equalized by means of specific oversampling and dataset balancing, which made it possible for all classes to have almost the same number of images (<xref ref-type="bibr" rid="B24">24</xref>). Concretely, we constructed a balanced dataset using random oversampling with replacement, where minority-class images were duplicated until each class matched the number of samples in the majority class. This method of balancing has the effect of giving the model the same opportunity to experience all the severity levels thus reducing the bias toward the majority classes and enhancing the classification performance on the underrepresented categories.</p>
<sec>
<label>2.3.1.1</label>
<title>Balancing statistics (APTOS 2019)</title>
<p>The original APTOS dataset was imbalanced; therefore, we balanced it by oversampling all classes to match the majority class size. After balancing, each class contained 1,805 images, yielding a total of 9,025 images (five classes). The balanced dataset was then split into 80% training, 10% validation, and 10% testing, resulting in Train = 7,220, Validation = 902, and Test = 903 samples.</p></sec>
<sec>
<label>2.3.1.2</label>
<title>Balancing statistics (DDR)</title>
<p>The original DDR class counts were: No_DR = 6,266, Mild = 630, Moderate = 4,477, Severe = 236, and Proliferate_DR = 913 (total = 12,522). After oversampling, each class contained 6,266 images, yielding a balanced dataset of 31,330 images. The balanced dataset was then split into 80% training, 10% validation, and 10% testing, resulting in Train = 25,064, Validation = 3,133, and Test = 3,133 samples. Evaluation consistency (confusion matrix). All reported confusion matrices were computed exclusively from the held-out test split of the balanced dataset (i.e., the 10% test partition after balancing), and their row/column totals match the corresponding test split class counts. This ensures internal consistency between the dataset split used for evaluation and the reported confusion matrix values. Furthermore, class-weighted loss was incorporated during training to penalize misclassification of minority classes more heavily.</p>
<disp-formula id="EQ6"><mml:math id="M7"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#x000B7;</mml:mo><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(6)</label></disp-formula>
<p><italic>N</italic> stands for the entire training sample count, <italic>C</italic> denotes the class quantity, and <italic>n</italic><sub><italic>i</italic></sub> refers to the sample count of class <italic>i</italic>. This method not only balances the learning process throughout all the severity levels but also reduces the negative effect of the dataset imbalance on the model&#x00027;s performance (<xref ref-type="bibr" rid="B23">23</xref>). All these steps, together with the preprocessing and augmentation strategies&#x02014;image resizing, normalization, photometric and geometric transformations, class balancing, and class-weighted loss&#x02014;support the model&#x00027;s ability to learn strong feature representations, generalize to unseen data, and precisely classify diabetic retinopathy severity levels.</p>
<p>The datasets used in this study contain complete retinal fundus images with corresponding labels, and no missing pixel-level or annotation data were encountered. Consequently, no image exclusion or imputation procedures were applied. Variability in image quality and acquisition conditions was addressed implicitly through normalization, standardization, and extensive photometric and geometric augmentation, enabling the model to learn robust feature representations without introducing imputation-related bias.</p></sec></sec>
<sec>
<label>2.3.2</label>
<title>CBAM attention module</title>
<p>CBAM was integrated after convolutional feature extraction to improve the model&#x00027;s ability to focus on clinically relevant regions of retinal images (<xref ref-type="bibr" rid="B25">25</xref>). Attention is applied sequentially in CBAM; thus, channel attention and then spatial attention are applied. The informative regions are emphasized while the uninformative ones are suppressed, thereby making the features more representative.</p>
<sec>
<label>2.3.2.1</label>
<title>Channel attention</title>
<p>Based on channel dimension, the channel attention modulates each feature map&#x00027;s importance. Given an input feature map <italic>F</italic>&#x02208;&#x0211D;<sup><italic>H</italic>&#x000D7;<italic>W</italic>&#x000D7;<italic>C</italic></sup>, the channel attention <italic>M</italic><sub><italic>c</italic></sub>(<italic>F</italic>) is computed, where <italic>AvgPool</italic>(<italic>F</italic>) and <italic>MaxPool</italic>(<italic>F</italic>) denote global average pooling and global max pooling across spatial dimensions, respectively (<xref ref-type="bibr" rid="B26">26</xref>), with <italic>MLP</italic>(&#x000B7;) representing a shared multi-layer perceptron with a hidden layer reduction ratio of <italic>r</italic>, and &#x003C3;(&#x000B7;) is the sigmoid activation function. The refined feature map is acquired by multiplying the channel attention back to the original feature map:</p>
<disp-formula id="EQ7"><mml:math id="M8"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>M</mml:mi><mml:mi>L</mml:mi><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>A</mml:mi><mml:mi>v</mml:mi><mml:mi>g</mml:mi><mml:mi>P</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>M</mml:mi><mml:mi>L</mml:mi><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>M</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mi>P</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(7)</label></disp-formula>
<disp-formula id="EQ8"><mml:math id="M9"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02299;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>F</mml:mi></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(8)</label></disp-formula>
<p>where &#x02299; denotes element-wise multiplication (<xref ref-type="bibr" rid="B27">27</xref>). This operation allows the network to focus on channels that carry discriminative information for diabetic retinopathy classification.</p></sec>
<sec>
<label>2.3.2.2</label>
<title>Spatial attention</title>
<p>Following the channel attention phase, the spatial attention module highlights crucial regions in the feature maps (<xref ref-type="bibr" rid="B28">28</xref>). The spatial attention works by pooling the channel information via average and max pooling. The results are concatenated and passed through a convolution followed by activation via a sigmoid function, where <italic>f</italic><sup>7 &#x000D7; 7</sup> represents the convolution operation with a kernel of size 7 &#x000D7; 7, and [&#x000B7;; &#x000B7;] denotes concatenation along the channel dimension. Thus, the final output of CBAM is:</p>
<disp-formula id="EQ9"><mml:math id="M10"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>7</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>7</mml:mn></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>A</mml:mi><mml:mi>v</mml:mi><mml:mi>g</mml:mi><mml:mi>P</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>M</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mi>P</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(9)</label></disp-formula>
<disp-formula id="EQ10"><mml:math id="M11"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x02033;</mml:mo></mml:mrow></mml:msup><mml:mtext>&#x000A0;</mml:mtext><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x02299;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(10)</label></disp-formula>
<p>By applying channel and spatial attention sequentially, the network is directed toward both informative feature channels and salient spatial regions (such as lesions or microaneurysms), enhancing the discriminative power of features across the different levels of DR severity. By combining DenseNet121 with CBAM (<xref ref-type="bibr" rid="B29">29</xref>), the model can learn where to look in the retinal fundus images, improving sensitivity to subtle pathological patterns while suppressing irrelevant background information. This structure, along with the data preprocessing and augmentation strategies, strengthens the representational power and clinical reliability of the model. <xref ref-type="fig" rid="F4">Figure 4</xref> shows the architecture of the CBAM unit.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>CBAM Module Architecture: Combining Channel Attention and Spatial Attention for Optimization.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1732109-g0004.tif">
<alt-text content-type="machine-generated">Diagram illustrating a neural network attention mechanism with a channel attention module using global max and average pooling, multilayer perceptron, fully connected layers, ReLU, summation, followed by a spatial attention module applying max and average pooling, concatenation, and a seven-by-seven convolution.</alt-text>
</graphic>
</fig>
</sec></sec>
<sec>
<label>2.3.3</label>
<title>DropBlock regularization</title>
<p>To enhance model robustness and avoid overfitting, DropBlock regularization was applied to the convolutional feature maps (<xref ref-type="bibr" rid="B30">30</xref>). In contrast to dropout, which zeroes out randomly chosen neurons, DropBlock sets to zero contiguous regions in the feature maps, thus forcing the network to learn robust features using distributed representations. For a feature map <italic>F</italic>&#x02208;&#x0211D;<sup><italic>H</italic>&#x000D7;<italic>W</italic>&#x000D7;<italic>C</italic></sup>, DropBlock generates a binary mask <italic>M</italic>&#x02208;{0, 1}<sup><italic>H</italic>&#x000D7;<italic>W</italic>&#x000D7;<italic>C</italic></sup> in which contiguous blocks of size <italic>s</italic>&#x000D7;<italic>s</italic> are set to zero with probability &#x003B3;. The modified feature map is computed as:</p>
<disp-formula id="EQ11"><mml:math id="M12"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x02299;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>M</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(11)</label></disp-formula>
<p>Where &#x02299; denotes element-wise multiplication, and division by mean (M) normalizes the expected activation magnitude across layers (<xref ref-type="bibr" rid="B31">31</xref>), while the block drop probability &#x003B3; is calculated so that the expected number of dropped activations matches the target drop rate, according to the size of the feature map.</p>
<disp-formula id="EQ12"><mml:math id="M13"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>&#x003B3;</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mi>&#x003C1;</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x000B7;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>H</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x000B7;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>&#x000B7;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>H</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>-</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>s</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>W</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>-</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>s</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(12)</label></disp-formula>
<p>H and W are the height and width of the feature map, respectively, and s is the block size. This formulation ensures that the dropout probability scales with the spatial dimensions and block size. By dropping out contiguous regions, DropBlock encourages the network to extract redundant and distributed feature representations instead of focusing specifically on certain localized features. This is especially helpful in retinal fundus images since lesions and pathological patterns can vary in their location, size, and intensity. Therefore, beyond DropBlock&#x00027;s work of enhancing a model&#x00027;s ability to generalize from one image to another, it works to curb over-sensitivity to noise or irrelevant patterns (<xref ref-type="bibr" rid="B32">32</xref>). Together with CBAM and the data augmentation strategy, the DropBlock method builds a robust pathway for accurately classifying all levels of diabetic retinopathy with improved generalization to unseen data and thus increased clinical reliability.</p></sec>
<sec>
<label>2.3.4</label>
<title>Transformer encoder for global context</title>
<p>After extracting features from convolution for this purpose, the Transformer encoder layers were added to capture long-range dependency structures and permit global contextual information in retinal fundus images (<xref ref-type="bibr" rid="B33">33</xref>). Convolutional layers, as mentioned, are good at local texture extraction and relatively poor at relationship modeling over distances. This is where Transformers come in handy because they apply self-attention to the whole feature sequence, thereby making it possible for the network to learn local features and global ones as well. Given a reshaped feature map <italic>F</italic> &#x02208; &#x0211D;<sup><italic>N</italic> &#x000D7; <italic>C</italic></sup>, where <italic>N</italic> &#x0003D; <italic>H</italic> &#x000D7; <italic>W</italic> is the flattened spatial dimension and <italic>C</italic> is the number of channels, the multi-head self-attention (MHSA) computes attention across all spatial locations. For each attention head (<xref ref-type="bibr" rid="B34">34</xref>), the query <italic>Q</italic>, key <italic>K</italic>, and value <italic>V</italic> matrices are obtained via learned linear projections:</p>
<disp-formula id="EQ13"><mml:math id="M14"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>Q</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>F</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mtext>&#x000A0;&#x000A0;</mml:mtext><mml:mi>K</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>F</mml:mi><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mtext>&#x000A0;&#x000A0;</mml:mtext><mml:mi>V</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>F</mml:mi><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>V</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(13)</label></disp-formula>
<disp-formula id="EQ14"><mml:math id="M15"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>Q</mml:mi><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>K</mml:mi><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>V</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mi>Q</mml:mi><mml:msup><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:msqrt><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msqrt></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>V</mml:mi></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(14)</label></disp-formula>
<p>Where <italic>d</italic><sub><italic>k</italic></sub> is the dimension of the key vectors. The multi-head attention concatenates the outputs of all the heads and applies a final linear transformation, allowing the model to focus on information from different representation subspaces simultaneously. The operation of multi-head self-attention (MHSA) is followed by a feed-forward network (FFN) that is applied independently to each position and consists of two fully connected layers with a ReLU activation in between (<xref ref-type="bibr" rid="B35">35</xref>), and residual connections and layer normalization are employed after both the attention block and the FFN to stabilize training and improve gradient flow:</p>
<disp-formula id="EQ15"><mml:math id="M16"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>F</mml:mi><mml:mi>F</mml:mi><mml:mi>N</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>L</mml:mi><mml:mi>U</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mi>x</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(15)</label></disp-formula>
<disp-formula id="EQ16"><mml:math id="M17"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mtext>&#x000A0;</mml:mtext><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>L</mml:mi><mml:mi>a</mml:mi><mml:mi>y</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>N</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>m</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>M</mml:mi><mml:mi>H</mml:mi><mml:mi>S</mml:mi><mml:mi>A</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext>&#x000A0;</mml:mtext><mml:mo>,</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mtext>&#x000A0;</mml:mtext><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x02033;</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>L</mml:mi><mml:mi>a</mml:mi><mml:mi>y</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>N</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>m</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>F</mml:mi><mml:mi>F</mml:mi><mml:mi>N</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(16)</label></disp-formula>
<p>By incorporating Transformer encoders (<xref ref-type="bibr" rid="B34">34</xref>), the model will learn the interaction between remote retinal regions, as in the spatial relation between the microaneurysms, hemorrhages, and exudates. The combination of the local convolutional features and global attention improves the detection of complex patterns across the retina by the network, which leads to better classification across all the severities of diabetic retinopathy. The Transformer encoder forms a hybrid CNN-Transformer architecture together with CBAM and DropBlock, which are complementary methods of fine-grained local feature extraction and broad global context understanding, reinforcing generalization and robustness on retinal datasets.</p></sec>
<sec>
<label>2.3.5</label>
<title>Class balancing and weighted loss</title>
<p>The data for diabetic retinopathy can often be skewed, where the examples for some severity levels (e.g., Proliferative_DR or Mild) are far fewer than for the others (<xref ref-type="bibr" rid="B36">36</xref>). Training on these imbalanced data may result in skewed predictions toward the majority class. To rectify this, we computed the model loss such that a greater penalty was incurred for the wrong classification of the underrepresented classes to achieve a balanced learning for all severity levels. Given a dataset with <italic>C</italic> classes and <italic>N</italic> total training samples, let <italic>n</italic><sub><italic>c</italic></sub> denote the number of samples in class <italic>c</italic>; the class weight for class <italic>c</italic> is then computed as follows. The categorical cross-entropy loss during the training of a sample with a true label <italic>y</italic> &#x02208; {0, 1}<sup><italic>C</italic></sup> and predicted probability &#x00177; &#x02208; [0, 1]<sup><italic>C</italic></sup> is adjusted by the introduction of the class weight.</p>
<disp-formula id="EQ17"><mml:math id="M19"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext>&#x000A0;</mml:mtext><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>C</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>*</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(17)</label></disp-formula>
<disp-formula id="EQ18"><mml:math id="M20"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>L</mml:mi></mml:mstyle></mml:mrow><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mo>-</mml:mo><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo class="qopname">log</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mtext>&#x000A0;</mml:mtext><mml:mi>&#x00177;</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(18)</label></disp-formula>
<p>The application of this weighting has the effect that the mistakes of the minority classes will have a greater impact on the total loss, thus making the network learn the common features of all classes (<xref ref-type="bibr" rid="B37">37</xref>). As a part of the strategy to deal with the class imbalance problem, in addition to the weighted loss, data augmentation was also used rather extensively. Among the augmentation methods were random rotations, brightness adjustments, zooming, and horizontal flips, which together resulted in creating more samples for the minority classes effectively. The combination of class-weighted loss and augmentation allowed the model to spread the learning evenly over the severity levels, hence making it possible for each class to have a similar representation during training. This method not only leads to a better classification accuracy but also makes the convergence process more stable and the generalization on previously unseen retinal images better.</p></sec>
<sec>
<label>2.3.6</label>
<title>Grad-CAM for model interpretability</title>
<p>To provide clinical interpretability and visualize the retinal regions influencing model predictions, we incorporated Grad-CAM into our framework (<xref ref-type="bibr" rid="B38">38</xref>). By applying Grad-CAM to highlight the regions of the input image deemed most important for the predicted class, clinicians may obtain insight into the decision-making process of the model. Let <italic>A</italic><sup><italic>k</italic></sup>&#x02208; &#x0211D;<sup><italic>H</italic>&#x000D7; <italic>W</italic></sup> denote the <italic>k</italic>-th feature map of the last convolutional layer for a given input image, and let <italic>y</italic><sup><italic>c</italic></sup> be the score for class <italic>c</italic> before the softmax activation (<xref ref-type="bibr" rid="B39">39</xref>). The importance weight <inline-formula><mml:math id="M21"><mml:msubsup><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> for feature map <italic>k</italic> is computed by global average pooling of the gradients of <italic>y</italic><sup><italic>c</italic></sup> with respect to <italic>A</italic><sup><italic>k</italic></sup>. The Grad-CAM heatmap <inline-formula><mml:math id="M22"><mml:msubsup><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>G</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>d</mml:mi><mml:mo>-</mml:mo><mml:mi>C</mml:mi><mml:mi>A</mml:mi><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> is thereby obtained with a weighted combination of feature maps followed by a ReLU activation:</p>
<disp-formula id="EQ19"><mml:math id="M23"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>H</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x000B7;</mml:mo><mml:mi>W</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>H</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>W</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mfrac><mml:mrow><mml:mi>&#x02202;</mml:mi><mml:msup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mi>&#x02202;</mml:mi><mml:msubsup><mml:mrow><mml:mi>A</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(19)</label></disp-formula>
<disp-formula id="EQ20"><mml:math id="M24"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>G</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>d</mml:mi><mml:mo>-</mml:mo><mml:mi>C</mml:mi><mml:mi>A</mml:mi><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>L</mml:mi><mml:mi>U</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:msubsup><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msubsup><mml:msup><mml:mrow><mml:mi>A</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(20)</label></disp-formula>
<p>Features with a positive influence on class predictions are highlighted. The resultant heat map is then upscaled to the input image resolution and overlaid on the original image for interpretative purposes. Clinically relevant areas such as microaneurysms, hemorrhages, and exudates are then visualized through Grad-CAM as an explanation of the model&#x00027;s predictions, thereby supporting clinical validation and indicating possible failure cases where the network might depend on some irrelevant region (<xref ref-type="bibr" rid="B40">40</xref>). The application of Grad-CAM in conjunction with the CBAM and Transformer modules further enhances interpretability because the CBAMs supply attention maps, and the global context from Transformers reinforces the regions highlighted in the Grad-CAM visualizations.</p>
</sec>
</sec>
<sec>
<label>2.4</label>
<title>Evaluation model</title>
<p>In the grading of diabetic retinopathy (DR), images generally fall into one of five severity categories: no DR, Mild DR, Moderate DR, Severe DR, and Proliferative DR. No DR here indicates that the retina appears normal and there are no pathological changes, whereas Mild DR indicates early-stage changes such as few microaneurysms or just minor hemorrhages. Moderate DR indicates more widespread pathological changes such as multiple hemorrhages or small exudates. Severe DR indicates considerable retinal damage with diffuse hemorrhages and, of course, vascular lesions; Proliferative DR describes the final stage, which is marked by abnormal neovascularization that can potentially lead to the loss of vision.</p>
<p>Evaluation metrics quantify how well the model performs on the test classes. Accuracy refers to the ratio of correctly classified images over the total number of images. Precision for a class tells us how many of the predicted cases for that class were correct. Sensitivity (Recall) measures the ability of the model to detect all true cases of a class, while specificity measures how well the model can distinguish that class from others. F1-score provides a trade-off between precision and sensitivity and is specifically important for the classes that are least common, like Proliferative DR.</p>
<p>For instance, a model may have a high positive predictive value but a low sensitivity for Proliferative DR, meaning that when it predicts this class, it is usually correct but will miss many true cases. Conversely, Mild DR is often more difficult to detect; hence, the importance of the F1-score to verify that mild cases are indeed detected, while minimizing false positives. The relationship between the metrics and each DR severity class is summarized in the <xref ref-type="table" rid="T3">Table 3</xref> below.</p>
<disp-formula id="EQ21"><mml:math id="M25"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>A</mml:mi><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>T</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>T</mml:mi><mml:mi>N</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(21)</label></disp-formula>
<disp-formula id="EQ22"><mml:math id="M26"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(22)</label></disp-formula>
<disp-formula id="EQ23"><mml:math id="M27"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(23)</label></disp-formula>
<disp-formula id="EQ24"><mml:math id="M28"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:mi>S</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn><mml:mfrac><mml:mrow><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>S</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>S</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>y</mml:mi></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(24)</label></disp-formula>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Diabetic retinopathy levels and the importance of assessment measures.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>DR class</bold></th>
<th valign="top" align="left"><bold>Description</bold></th>
<th valign="top" align="left"><bold>Relevance of metrics</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">No_DR</td>
<td valign="top" align="left">Normal retina</td>
<td valign="top" align="left">Accuracy and specificity show how well normal cases are correctly identified</td>
</tr>
<tr>
<td valign="top" align="left">Mild DR</td>
<td valign="top" align="left">Early, subtle changes</td>
<td valign="top" align="left">Precision, sensitivity, and F1-score evaluate detection of mild cases</td>
</tr>
<tr>
<td valign="top" align="left">Moderate DR</td>
<td valign="top" align="left">Intermediate severity</td>
<td valign="top" align="left">Metrics assess balance of correct detection vs. misclassification</td>
</tr>
<tr>
<td valign="top" align="left">Severe DR</td>
<td valign="top" align="left">Extensive retinal damage</td>
<td valign="top" align="left">Sensitivity and F1-score ensure severe cases are captured</td>
</tr>
<tr>
<td valign="top" align="left">Proliferative DR</td>
<td valign="top" align="left">Advanced stage with neovascularization</td>
<td valign="top" align="left">Precision and sensitivity indicate ability to detect high-risk cases</td>
</tr></tbody>
</table>
</table-wrap>
<p>Finally, 95% confidence intervals for accuracy were estimated using the standard error of the proportion to quantify the statistical reliability of results:</p>
<disp-formula id="EQ25"><mml:math id="M29"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mo>&#x02003;</mml:mo><mml:mi>S</mml:mi><mml:mi>E</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>s</mml:mi><mml:mi>q</mml:mi><mml:mi>r</mml:mi><mml:mi>t</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>A</mml:mi><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>y</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>*</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mtext>&#x000A0;</mml:mtext><mml:mo>-</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>A</mml:mi><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>y</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mi>C</mml:mi><mml:msub><mml:mrow><mml:mi>I</mml:mi></mml:mrow><mml:mrow><mml:mn>95</mml:mn></mml:mrow></mml:msub><mml:mi>%</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>A</mml:mi><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>y</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x000B1;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mn>1</mml:mn><mml:mo>.</mml:mo><mml:mn>96</mml:mn><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x000B7;</mml:mo><mml:mi>S</mml:mi><mml:mi>E</mml:mi></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(25)</label></disp-formula>
</sec>
<sec>
<label>2.5</label>
<title>Computing environment</title>
<p>All the trials were conducted using Visual Studio Code (VS Code) on Windows 11 Pro. The system on which training and evaluation were performed had an Intel Core i7-12700K processor, 16GB RAM, and an NVIDIA RTX 4060 Ti GPU.</p></sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Experimental results</title>
<p>The proposed model was evaluated using fundus images from the APTOS 2019 and DDR datasets. This section provides a brief overview of the results obtained from the proposed CheXNet_CBAM model for classifying fundus images into five categories: no diabetic retinopathy (DR), mild DR, moderate DR, severe DR, and proliferative DR. Each training experiment included five different transfer learning models: CheXNet, DenseNet121, MobileNetV2, VGG19, and ResNet50.</p>
<p>The CheXNet_CBAM model was created based on the DenseNet121 backbone architecture integrated with a CBAM attention mechanism and a Transformer Encoder layer to enhance the model&#x00027;s attention toward critical features in medical images. To make them compatible with the pretrained models, images were resized to 224 &#x000D7; 224 pixels. The batch size was 16 and the number of epochs was set to 50 for efficient learning from the data. An 80-10-10 split for training, validation, and testing was then performed to provide a fair evaluation, with ImageDataGenerator used for image rescaling (rescale = 1/255) for training stability.</p>
<p>For comparison, several other deep learning models&#x02014;MobileNetV2, VGG19, ResNet50, and standard DenseNet121&#x02014;were trained with the same core settings (image size, batch size, and epochs), with techniques such as Dropout introduced to aid stability. However, CheXNet_CBAM incorporates the CBAM attention module, a Transformer layer, and DropBlock2D to enhance feature focus while reducing overfitting. Performance evaluation included test accuracy, confusion matrices, and 95% confidence intervals to conduct a reliability analysis of the results.</p>
<sec>
<label>3.1</label>
<title>Comparing the performance of the proposed model, CheXNet_CBAM, with baseline models in diabetic retinopathy classification (APTOS 2019)</title>
<p><xref ref-type="table" rid="T4">Table 4</xref> presents the performance of the proposed CheXNet_CBAM model compared to various baseline models when applied to fundus images from the APTOS 2019 dataset for diabetic retinopathy (DR). The CheXNet_CBAM model achieved an accuracy of 96.12%, a precision of 96.30%, and an F1 score of 96.08%, outperforming the other models. The CheXNet model without CBAM achieved an accuracy of 93.80%, demonstrating the effectiveness of the proposed model and the idea of adding a CBAM layer to the CheXNet model, which significantly increased the performance of the CheXNet_CBAM model in diabetic retinopathy detection. Meanwhile, the MobileNetV2 model ranked second in diabetic retinopathy detection with an accuracy of 95.02% and an F1 score of 95.01%. The DenseNet121 model also achieved an accuracy of 94.24%. While VGG19 and ResNet50 performed the worst among the models in their ability to detect diabetic retinopathy, with VGG19 achieving an accuracy of 69.99%, ResNet50 ranked last with an accuracy of 52.71%. <xref ref-type="fig" rid="F5">Figure 5</xref> illustrates the model&#x00027;s effectiveness.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Model&#x00027;s performance analysis using fundus images from the APTOS 2019 dataset.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="center"><bold>Accuracy (%)</bold></th>
<th valign="top" align="center"><bold>Precision (%)</bold></th>
<th valign="top" align="center"><bold>Recall (%)</bold></th>
<th valign="top" align="center"><bold>F1-Score (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><bold>CheXNet_CBAM</bold></td>
<td valign="top" align="center">96.12</td>
<td valign="top" align="center">96.30</td>
<td valign="top" align="center">96.12</td>
<td valign="top" align="center">96.08</td>
</tr>
<tr>
<td valign="top" align="left"><bold>CheXNet</bold></td>
<td valign="top" align="center">93.80</td>
<td valign="top" align="center">93.96</td>
<td valign="top" align="center">93.79</td>
<td valign="top" align="center">93.75</td>
</tr>
<tr>
<td valign="top" align="left"><bold>DenseNet121</bold></td>
<td valign="top" align="center">94.24</td>
<td valign="top" align="center">94.28</td>
<td valign="top" align="center">94.24</td>
<td valign="top" align="center">94.19</td>
</tr>
<tr>
<td valign="top" align="left"><bold>MobileNetV2</bold></td>
<td valign="top" align="center">95.02</td>
<td valign="top" align="center">95.03</td>
<td valign="top" align="center">95.02</td>
<td valign="top" align="center">95.01</td>
</tr>
<tr>
<td valign="top" align="left"><bold>VGG19</bold></td>
<td valign="top" align="center">69.99</td>
<td valign="top" align="center">73.37</td>
<td valign="top" align="center">69.96</td>
<td valign="top" align="center">70.00</td>
</tr>
<tr>
<td valign="top" align="left"><bold>ResNet50</bold></td>
<td valign="top" align="center">52.71</td>
<td valign="top" align="center">53.17</td>
<td valign="top" align="center">52.70</td>
<td valign="top" align="center">51.53</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values highlight the best-performing model (CheXNet_CBAM), achieving 96.12% accuracy on APTOS.</p>
</table-wrap-foot>
</table-wrap>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>Comparison of performance metrics of different models on the APTOS 2019 dataset.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1732109-g0005.tif">
<alt-text content-type="machine-generated">Bar chart compares the performance of six models&#x02014;CheXNet_CBAM, CheXNet, DenseNet121, MobileNetV2, VGG19, and ResNet50&#x02014;across four metrics: accuracy, precision, recall, and F1-Score. CheXNet_CBAM, CheXNet, DenseNet121, and MobileNetV2 show similarly high metrics around ninety-five percent, while VGG19 and ResNet50 display noticeably lower values, especially for ResNet50, which hovers close to fifty percent. A color-coded legend indicates metric assignment.</alt-text>
</graphic>
</fig>
<p>The confusion matrices in <xref ref-type="fig" rid="F6">Figure 6</xref> demonstrate the superior diagnostic accuracy of the proposed CheXNet_CBAM model compared to other models, achieving high diagnostic accuracy across all five disease categories (mild, moderate, non-diabetic retinopathy, proliferative, and severe). The model demonstrated exceptional ability to differentiate diabetic retinopathy cases, with significantly lower false positives and false negatives compared to CheXNet, DenseNet121, MobileNetV2, VGG19, and ResNet50. This significant performance improvement is attributable to the integration of the CBAM attention mechanism (convolutional block attention module) into the CheXNet architecture, which enables the model to more accurately focus on critical regions in retinal images containing signs of diabetic retinopathy.</p>
<fig position="float" id="F6">
<label>Figure 6</label>
<caption><p>Confusion matrix of learning models using the APTOS 2019 dataset for diabetic retinopathy detection.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1732109-g0006.tif">
<alt-text content-type="machine-generated">Six confusion matrix heatmaps compare predicted versus actual classifications across five diabetic retinopathy categories for CheXNet_CBAM, CheXNet, DenseNet121, MobileNet, VGG19, and ResNet50, with CheXNet_CBAM, CheXNet, and DenseNet121 showing higher diagonal values indicating better classification accuracy than VGG19 and ResNet50.</alt-text>
</graphic>
</fig>
<p>The evaluation results of the proposed CheXNet_CBAM model by category, shown in <xref ref-type="table" rid="T5">Table 5</xref>, demonstrated exceptional performance in diagnosing all stages of diabetic retinopathy. It achieved a perfect precision of 100% in identifying healthy cases (No_DR), with a high recall rate of 97.24%, demonstrating its excellent ability to avoid misdiagnosing benign cases. The model also demonstrated excellent balance in diagnosing different disease stages, achieving high F1 scores ranging from 92.04% for moderate cases to 97.51% for proliferative cases. It also achieved outstanding performance in diagnosing severe cases, with a perfect recall rate of 100% and a precision of 94.24%. These results confirm the model&#x00027;s high ability to accurately distinguish between different disease stages without missing any disease (especially severe and mild cases), while minimizing misdiagnosis rates.</p>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Performance evaluation of the CheXNet_CBAM model on the APTOS 2019 dataset, by category.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Class</bold></th>
<th valign="top" align="center"><bold>Precision (%)</bold></th>
<th valign="top" align="center"><bold>Recall (%)</bold></th>
<th valign="top" align="center"><bold>F1-Score (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><bold>Mild</bold></td>
<td valign="top" align="center">91.37</td>
<td valign="top" align="center">99.45</td>
<td valign="top" align="center">95.24</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Moderate</bold></td>
<td valign="top" align="center">98.11</td>
<td valign="top" align="center">86.67</td>
<td valign="top" align="center">92.04</td>
</tr>
<tr>
<td valign="top" align="left"><bold>No_DR</bold></td>
<td valign="top" align="center">100.00</td>
<td valign="top" align="center">97.24</td>
<td valign="top" align="center">98.60</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Proliferate_DR</bold></td>
<td valign="top" align="center">97.78</td>
<td valign="top" align="center">97.24</td>
<td valign="top" align="center">97.51</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Severe</bold></td>
<td valign="top" align="center">94.24</td>
<td valign="top" align="center">100.00</td>
<td valign="top" align="center">97.04</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate exceptional class-wise performance, including 100% precision for No_DR and 100% recall for Severe.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec>
<label>3.2</label>
<title>Comparing the performance of the proposed CheXNet_CBAM model with baseline models in detecting diabetic retinopathy using the DDR dataset</title>
<p><xref ref-type="table" rid="T6">Table 6</xref> and <xref ref-type="fig" rid="F7">Figure 7</xref> provides a comprehensive overview of the performance of the proposed CheXNet_CBAM model compared to other baseline models when analyzing fundus images for diabetic retinopathy detection using the DDR dataset. The proposed model outperformed other models in detecting diabetic retinopathy with an accuracy rate of 96.33%, a precision rate of 96.29%, and an F1 score of 96.30%. The CheXNet model without CBAM achieved an accuracy of 91.35% and a precision of 91.18%. MobileNetV2 ranked second with an accuracy of 91.51% for detecting diabetic retinopathy. DenseNet121 and VGG19 achieved an accuracy of 84.42 and 64.06%, respectively. ResNet50 performed the worst among the models in detecting diabetic retinopathy with an accuracy of 41.72%.</p>
<table-wrap position="float" id="T6">
<label>Table 6</label>
<caption><p>Model&#x00027;s performance analysis using fundus images from the DDR dataset.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="center"><bold>Accuracy (%)</bold></th>
<th valign="top" align="center"><bold>Precision (%)</bold></th>
<th valign="top" align="center"><bold>Recall (%)</bold></th>
<th valign="top" align="center"><bold>F1-Score (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><bold>CheXNet_CBAM</bold></td>
<td valign="top" align="center">96.33</td>
<td valign="top" align="center">96.29</td>
<td valign="top" align="center">96.33</td>
<td valign="top" align="center">96.30</td>
</tr>
<tr>
<td valign="top" align="left"><bold>CheXNet</bold></td>
<td valign="top" align="center">91.35</td>
<td valign="top" align="center">91.18</td>
<td valign="top" align="center">91.35</td>
<td valign="top" align="center">91.10</td>
</tr>
<tr>
<td valign="top" align="left"><bold>DenseNet121</bold></td>
<td valign="top" align="center">84.42</td>
<td valign="top" align="center">85.80</td>
<td valign="top" align="center">84.43</td>
<td valign="top" align="center">84.61</td>
</tr>
<tr>
<td valign="top" align="left"><bold>MobileNetV2</bold></td>
<td valign="top" align="center">91.51</td>
<td valign="top" align="center">91.27</td>
<td valign="top" align="center">91.51</td>
<td valign="top" align="center">91.35</td>
</tr>
<tr>
<td valign="top" align="left"><bold>VGG19</bold></td>
<td valign="top" align="center">64.06</td>
<td valign="top" align="center">70.85</td>
<td valign="top" align="center">64.05</td>
<td valign="top" align="center">63.74</td>
</tr>
<tr>
<td valign="top" align="left"><bold>ResNet50</bold></td>
<td valign="top" align="center">41.72</td>
<td valign="top" align="center">41.89</td>
<td valign="top" align="center">41.71</td>
<td valign="top" align="center">39.46</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values highlight the best-performing model (CheXNet_CBAM), achieving 96.33% accuracy on DDR.</p>
</table-wrap-foot>
</table-wrap>
<fig position="float" id="F7">
<label>Figure 7</label>
<caption><p>Comparison of performance metrics of different models on the DDR dataset.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1732109-g0007.tif">
<alt-text content-type="machine-generated">Bar chart comparing the performance of six models&#x02014;CheXNet_CBAM, CheXNet, DenseNet121, MobileNetV2, VGG19, and ResNet50&#x02014;based on accuracy, precision, recall, and F1-score, with CheXNet_CBAM scoring highest and ResNet50 lowest.</alt-text>
</graphic>
</fig>
<p>ResNet50 performed relatively poorly on both the APTOS-2019 and DDR datasets. Although ResNet-based models have been consistently strong in diabetic retinopathy studies, their performance depends heavily on hyperparameter tuning, training duration, and dataset-specific optimization. In our experiments, all baseline models were trained under the same conditions to maintain a fair and unbiased comparison. One reason for the poor performance of ResNet50 is its limited feature reuse and lack of explicit attention mechanisms, both of which are vital for detecting small and scattered retinal lesions. DenseNet-based architectures, by contrast, especially when enhanced with CBAM, permit richer aggregation of multi-scale features and greater attention to clinically significant areas. These factors, together with the proposed CheXNet_CBAM model&#x00027;s robustness and generalization capability across different fundus image datasets, further explain the observed performance gap.</p>
<p>Exceptional performance was achieved by the proposed CheXNet_CBAM model, which, as presented in <xref ref-type="fig" rid="F8">Figure 8</xref>, unambiguously surpassed all other models in diagnosing diabetic retinopathy using the DDR dataset. The confusion matrix showed a well-balanced distribution of values for each class, with the highest rate of correct classifications. The main diagonal of the confusion matrix had the following values: 626 for severe cases, 634 for proliferative cases, 583 for no-DR cases, 558 for moderate cases, and 628 for mild cases. The CheXNet model made more misclassifications than the proposed model, especially between adjacent classes. MobileNet and DenseNet121 performed relatively well, but their classifications overlapped to some extent. VGG19 and ResNet50, on the contrary, performed very poorly, with misclassifications widely spread across the classes. This clearly indicates the superior performance of the proposed CheXNet_CBAM model, which integrates the CBAM attention mechanism, in achieving high diagnostic accuracy and low error rates.</p>
<fig position="float" id="F8">
<label>Figure 8</label>
<caption><p>Confusion matrix of learning models using the DDR dataset for diabetic retinopathy detection.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1732109-g0008.tif">
<alt-text content-type="machine-generated">Six confusion matrix heatmaps for diabetic retinopathy classification are shown, each labeled with a different deep learning model: CheXNet_CBAM, CheXNet, DenseNet121, MobileNet, VGG19, and ResNet50. Actual classes and predicted classes are on the axes, with most matrices showing strong diagonal values indicating high accuracy except for VGG19 and ResNet50, which exhibit more dispersed values. Color intensity reflects the concentration of correct and incorrect predictions per class.</alt-text>
</graphic>
</fig>
<p>The CheXNet_CBAM model demonstrated exceptional ability to detect both mild and severe cases of diabetic retinopathy, achieving a perfect recall rate of 100% for both categories, as shown in <xref ref-type="table" rid="T7">Table 7</xref>. In severe cases requiring urgent medical intervention, it achieved an extremely high precision of 98.74% while detecting all cases, and in mild cases requiring early follow-up, its precision reached 97.21%. Proliferative cases were handled with exceptional accuracy, with all metrics at or above 99.36%. Even in more challenging categories, such as moderate and no-DR cases, the model maintained a strong performance exceeding 90%, confirming its ability to distinguish accurately between all stages of the disease. This balanced performance makes it a reliable medical tool that clinicians can rely on to make accurate and informed treatment decisions.</p>
<table-wrap position="float" id="T7">
<label>Table 7</label>
<caption><p>Performance evaluation of the CheXNet_CBAM model on the DDR dataset, by category.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Class</bold></th>
<th valign="top" align="center"><bold>Precision (%)</bold></th>
<th valign="top" align="center"><bold>Recall (%)</bold></th>
<th valign="top" align="center"><bold>F1-Score (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><bold>Mild</bold></td>
<td valign="top" align="center">97.21</td>
<td valign="top" align="center">100.00</td>
<td valign="top" align="center">98.58</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Moderate</bold></td>
<td valign="top" align="center">93.47</td>
<td valign="top" align="center">89.14</td>
<td valign="top" align="center">91.25</td>
</tr>
<tr>
<td valign="top" align="left"><bold>No_DR</bold></td>
<td valign="top" align="center">92.69</td>
<td valign="top" align="center">92.98</td>
<td valign="top" align="center">92.83</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Proliferate_DR</bold></td>
<td valign="top" align="center">99.36</td>
<td valign="top" align="center">99.52</td>
<td valign="top" align="center">99.44</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Severe</bold></td>
<td valign="top" align="center">98.74</td>
<td valign="top" align="center">100.00</td>
<td valign="top" align="center">99.37</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate perfect recall rates (100%) for Mild and Severe classes, as well as exceptional performance for Proliferate_DR.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec>
<label>3.3</label>
<title>Statistical analysis</title>
<p>The statistical analysis in <xref ref-type="table" rid="T8">Table 8</xref> reveals a clear performance hierarchy, with the CheXNet_CBAM model leading at an accuracy of 96.12%. Thanks to the integration of CBAM, CheXNet_CBAM shows a statistically significant improvement of 2.32 percentage points over the CheXNet model, with non-overlapping confidence intervals of (95.23, 96.98) vs. (92.75, 94.82). The 1.10 percentage point difference with respect to the MobileNetV2 model represents a smaller but still meaningful gap, with slightly overlapping confidence intervals indicating moderate statistical significance. The large performance gaps (26.13 percentage points vs. VGG19 and 43.41 percentage points vs. ResNet50) show highly significant differences with non-overlapping confidence intervals.</p>
<table-wrap position="float" id="T8">
<label>Table 8</label>
<caption><p>APTOS 2019 dataset&#x02014;statistical performance analysis.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="center"><bold>Accuracy (%)</bold></th>
<th valign="top" align="center"><bold>Difference from the CheXNet_CBAM</bold></th>
<th valign="top" align="center"><bold>95% confidence interval</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><bold>CheXNet_CBAM</bold></td>
<td valign="top" align="center">96.12</td>
<td valign="top" align="center">0.0</td>
<td valign="top" align="center">(95.23, 96.98)</td>
</tr>
<tr>
<td valign="top" align="left"><bold>MobileNetV2</bold></td>
<td valign="top" align="center">95.02</td>
<td valign="top" align="center">&#x02212;1.10</td>
<td valign="top" align="center">(94.05, 95.97)</td>
</tr>
<tr>
<td valign="top" align="left"><bold>DenseNet121</bold></td>
<td valign="top" align="center">94.24</td>
<td valign="top" align="center">&#x02212;1.88</td>
<td valign="top" align="center">(93.21, 95.25)</td>
</tr>
<tr>
<td valign="top" align="left"><bold>CheXNet</bold></td>
<td valign="top" align="center">93.80</td>
<td valign="top" align="center">&#x02212;2.32</td>
<td valign="top" align="center">(92.75, 94.82)</td>
</tr>
<tr>
<td valign="top" align="left"><bold>VGG19</bold></td>
<td valign="top" align="center">69.99</td>
<td valign="top" align="center">&#x02212;26.13</td>
<td valign="top" align="center">(67.89, 72.07)</td>
</tr>
<tr>
<td valign="top" align="left"><bold>ResNet50</bold></td>
<td valign="top" align="center">52.71</td>
<td valign="top" align="center">&#x02212;43.41</td>
<td valign="top" align="center">(50.12, 55.28)</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the CheXNet_CBAM model as the reference baseline with the best performance on APTOS.</p>
</table-wrap-foot>
</table-wrap>
<p>Improved performance gaps: the DDR dataset shows more pronounced performance differences, as shown in <xref ref-type="table" rid="T9">Table 9</xref>, with CheXNet_CBAM achieving 96.33% accuracy and larger gaps compared to the other models. The 4.98 percentage point improvement of CheXNet_CBAM over CheXNet is highly statistically significant, with completely non-overlapping confidence intervals of (95.46, 97.18) vs. (90.18, 92.50). The 4.82 percentage point difference between CheXNet_CBAM and MobileNetV2 also demonstrates clear statistical significance. Likewise, the 11.91 percentage point difference between CheXNet_CBAM and DenseNet121 is statistically significant, with non-overlapping confidence intervals.</p>
<table-wrap position="float" id="T9">
<label>Table 9</label>
<caption><p>DDR dataset&#x02014;statistical performance analysis.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="center"><bold>Accuracy (%)</bold></th>
<th valign="top" align="center"><bold>Difference from the CheXNet_CBAM</bold></th>
<th valign="top" align="center"><bold>95% confidence interval</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><bold>CheXNet_CBAM</bold></td>
<td valign="top" align="center">96.33</td>
<td valign="top" align="center">0.0</td>
<td valign="top" align="center">(95.46, 97.18)</td>
</tr>
<tr>
<td valign="top" align="left"><bold>MobileNetV2</bold></td>
<td valign="top" align="center">91.51</td>
<td valign="top" align="center">&#x02212;4.82</td>
<td valign="top" align="center">(90.35, 92.65)</td>
</tr>
<tr>
<td valign="top" align="left"><bold>CheXNet</bold></td>
<td valign="top" align="center">91.35</td>
<td valign="top" align="center">&#x02212;4.98</td>
<td valign="top" align="center">(90.18, 92.50)</td>
</tr>
<tr>
<td valign="top" align="left"><bold>DenseNet121</bold></td>
<td valign="top" align="center">84.42</td>
<td valign="top" align="center">&#x02212;11.91</td>
<td valign="top" align="center">(82.95, 85.87)</td>
</tr>
<tr>
<td valign="top" align="left"><bold>VGG19</bold></td>
<td valign="top" align="center">64.06</td>
<td valign="top" align="center">&#x02212;32.27</td>
<td valign="top" align="center">(61.78, 66.32)</td>
</tr>
<tr>
<td valign="top" align="left"><bold>ResNet50</bold></td>
<td valign="top" align="center">41.72</td>
<td valign="top" align="center">&#x02212;54.61</td>
<td valign="top" align="center">(38.65, 44.77)</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the CheXNet_CBAM model as the reference baseline with the best performance on DDR.</p>
</table-wrap-foot>
</table-wrap>
<p>The statistical analysis results provide strong evidence of the superiority of the proposed CheXNet_CBAM model, which achieved high performance on both datasets and demonstrated significant differences from all competing models. The precise confidence intervals support clinical reliability by ensuring predictable performance and demonstrating the model&#x00027;s generalizability. Overall, the inclusion of confidence intervals provides clinically meaningful effect size interpretation and supports the reliability and generalizability of the proposed CheXNet_CBAM framework.</p></sec>
</sec>
<sec sec-type="discussion" id="s4">
<label>4</label>
<title>Discussion</title>
<sec>
<label>4.1</label>
<title>Comparison with other studies</title>
<p>The prevailing trend in diabetic retinopathy classification has been the development of increasingly sophisticated methods, ranging from feature fusion to attention-based techniques. For instance, RT2Net integrated fundus and vascular branch networks (<xref ref-type="bibr" rid="B41">41</xref>) and achieved 88.2% on EyePACS and 85.4% on APTOS, while few-shot learning with Siamese neural networks and pre-trained models such as VGG16 and ResNet50 (<xref ref-type="bibr" rid="B42">42</xref>) reported more modest results (80%&#x02212;81%).</p>
<p>Comparable results were shown by EfficientNetB3-enhanced models with SE processing blocks (<xref ref-type="bibr" rid="B43">43</xref>), which achieved an 88.44% accuracy rate. Attention-driven methods, such as MSCAS-Net (<xref ref-type="bibr" rid="B44">44</xref>), have raised this performance level further, to 93.8% on the APTOS database, demonstrating how important multi-scale and attention fusion are for performance improvement. It is therefore evident that introducing attention mechanisms and feature-level fusion is more advantageous than standard transfer learning techniques. Other studies have concentrated on interpretability and hybrid learning to balance accuracy with clinical applicability. Achieving 94.64% on APTOS, an explainable AI approach based on ResNet-50 with SHAP (<xref ref-type="bibr" rid="B45">45</xref>) outperformed traditional CNN attention frameworks such as MSRAB &#x0002B; CrAB (<xref ref-type="bibr" rid="B46">46</xref>), which achieved 88.31% on APTOS. Bayesian deep learning approaches (<xref ref-type="bibr" rid="B47">47</xref>) also achieved strong performance, with 94.23% accuracy using MC Dropout, emphasizing their strength in uncertainty-aware classification. Hybrid CNN&#x02013;ViT models with interpretability tools such as LIME and Grad-CAM (<xref ref-type="bibr" rid="B48">48</xref>) likewise demonstrated high performance (93.01%), showing that combining convolutional and transformer-based features yields both strong results and interpretability.</p>
<p>Meanwhile, hybrid strategies that fuse deep features with classical machine learning performed comparatively less well; for example, an SVM classifier combined with EfficientNetV2-S (<xref ref-type="bibr" rid="B49">49</xref>) achieved 91%, while lightweight two-stage models (<xref ref-type="bibr" rid="B50">50</xref>) reached an accuracy of 90.75%&#x02014;only a marginal improvement. Swin-TransformerV2 with hybrid attention (<xref ref-type="bibr" rid="B51">51</xref>) showed average accuracy (85.5%&#x02212;87.9%), modest compared with CNN-driven models. Nonetheless, our proposed model, CheXNet_CBAM, which combines DenseNet121 with CBAM, DropBlock2D, and a Transformer Encoder, attained 96.12% accuracy on APTOS and 96.33% accuracy on DDR, surpassing even the best-performing Bayesian approaches.</p>
<p>The most salient points of improvement to CheXNet include the focus on clinically relevant retinal regions that was made possible with the incorporation of the convolutional block attention module (CBAM) within the model, DropBlock2D for greater regularization control and preventing any tendencies to overfit, and the use of a Transformer Encoder to capture contextual and long-range spatial dependencies in retinal images. Such a hybrid model stands to combine the strengths of contextualization with CNN-based feature extraction via attention mechanisms and transformers. Therefore, CheXNet_CBAM is not only better in terms of accuracy than most state-of-the-art methods, but also provides a sound and generalizable framework, making it a prime candidate for real clinical deployment. <xref ref-type="table" rid="T10">Table 10</xref> shows a comparison of previous studies on the diagnosis of diabetic retinopathy.</p>
<table-wrap position="float" id="T10">
<label>Table 10</label>
<caption><p>Comparison of the results of the proposed study with previous studies for the classification of diabetic retinopathy.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="center"><bold>Year</bold></th>
<th valign="top" align="center"><bold>Reference</bold></th>
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="left"><bold>Approach</bold></th>
<th valign="top" align="left"><bold>Model used</bold></th>
<th valign="top" align="left"><bold>Accuracy</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">2024</td>
<td valign="top" align="center">(<xref ref-type="bibr" rid="B41">41</xref>)</td>
<td valign="top" align="left">EyePACS, APTOS-2019</td>
<td valign="top" align="left">Multi-view Joint Learning &#x0002B; Feature Fusion</td>
<td valign="top" align="left">RT2Net (Fundus &#x0002B; Vascular Branch Networks)</td>
<td valign="top" align="left">88.2% (EyePACS), 85.4% (APTOS-2019), AUC: 0.98 (EyePACS), 0.96 (APTOS)</td>
</tr>
<tr>
<td valign="top" align="center">2024</td>
<td valign="top" align="center">(<xref ref-type="bibr" rid="B42">42</xref>)</td>
<td valign="top" align="left">FGADR, APTOS-2019</td>
<td valign="top" align="left">Few-shot learning using similarity-based classification</td>
<td valign="top" align="left">Siamese Neural Network &#x0002B; Pre-trained models (VGG16, ResNet50, DenseNet121)</td>
<td valign="top" align="left">FGADR: 80%, APTOS-2019: 81%</td>
</tr>
<tr>
<td valign="top" align="center">2025</td>
<td valign="top" align="center">(<xref ref-type="bibr" rid="B43">43</xref>)</td>
<td valign="top" align="left">APTOS-2019, IDRiD, Messidor-2</td>
<td valign="top" align="left">Transfer Learning &#x0002B; SE Block</td>
<td valign="top" align="left">EfficientNetB3 &#x0002B; SE</td>
<td valign="top" align="left">88.44%</td>
</tr>
<tr>
<td valign="top" align="center">2025</td>
<td valign="top" align="center">(<xref ref-type="bibr" rid="B44">44</xref>)</td>
<td valign="top" align="left">APTOS, DDR, IDRiD</td>
<td valign="top" align="left">Fine-grained &#x0002B; Attention Fusion</td>
<td valign="top" align="left">MSCAS-Net (Swin Transformer &#x0002B; Multi-Scale Attention)</td>
<td valign="top" align="left">93.8% (APTOS), 89.8% (DDR), 86.7% (IDRiD)</td>
</tr>
<tr>
<td valign="top" align="center">2025</td>
<td valign="top" align="center">(<xref ref-type="bibr" rid="B45">45</xref>)</td>
<td valign="top" align="left">APTOS-2019, EyePACS, DDR, IDRiD, SUSTech-SYSU</td>
<td valign="top" align="left">Transfer Learning &#x0002B; Explainable AI</td>
<td valign="top" align="left">ResNet-50 &#x0002B; SHAP</td>
<td valign="top" align="left">94.64% (APTOS), 86.36% (EyePACS), 84.23% (DDR), 82.79% (IDRiD), 85.65% (SUSTech-SYSU)</td>
</tr>
<tr>
<td valign="top" align="center">2025</td>
<td valign="top" align="center">(<xref ref-type="bibr" rid="B46">46</xref>)</td>
<td valign="top" align="left">APTOS-2019, DDR</td>
<td valign="top" align="left">Multi-scale Residual &#x0002B; Cross-Attention</td>
<td valign="top" align="left">CNN &#x0002B; MSRAB &#x0002B; CrAB</td>
<td valign="top" align="left">APTOS-2019: 88.31% DDR: 84.15%</td>
</tr>
<tr>
<td valign="top" align="center">2025</td>
<td valign="top" align="center">(<xref ref-type="bibr" rid="B47">47</xref>)</td>
<td valign="top" align="left">APTOS-2019</td>
<td valign="top" align="left">Bayesian Deep Learning</td>
<td valign="top" align="left">DenseNet-121 &#x0002B; Bayesian (MC Dropout, MFVI, Det.)</td>
<td valign="top" align="left">94.23%</td>
</tr>
<tr>
<td valign="top" align="center">2025</td>
<td valign="top" align="center">(<xref ref-type="bibr" rid="B52">52</xref>)</td>
<td valign="top" align="left">DDR, IDRiD</td>
<td valign="top" align="left">Dual-stage grading &#x0002B; Feature Collaboration</td>
<td valign="top" align="left">XE-Net &#x0002B; MFC-Net (lesion &#x0002B; vascular)</td>
<td valign="top" align="left">IDRiD: 91.26% DDR: 89.24%</td>
</tr>
<tr>
<td valign="top" align="center">2025</td>
<td valign="top" align="center">(<xref ref-type="bibr" rid="B53">53</xref>)</td>
<td valign="top" align="left">APTOS-2019, MosMedData CT</td>
<td valign="top" align="left">Privacy-Preserving Federated Learning &#x0002B; Homomorphic Encryption</td>
<td valign="top" align="left">EfficientNet-B0 &#x0002B; PPFLHE Framework</td>
<td valign="top" align="left">83.19% (APTOS), 81.27% (MosMedData)</td>
</tr>
<tr>
<td valign="top" align="center">2025</td>
<td valign="top" align="center">(<xref ref-type="bibr" rid="B48">48</xref>)</td>
<td valign="top" align="left">APTOS-2019</td>
<td valign="top" align="left">Hybrid CNN &#x0002B; ViT &#x0002B; Explainable AI</td>
<td valign="top" align="left">ResViT FusionNet (ResNet50 &#x0002B; ViT &#x0002B; LIME &#x0002B; Grad-CAM)</td>
<td valign="top" align="left">93.01%</td>
</tr>
<tr>
<td valign="top" align="center">2025</td>
<td valign="top" align="center">(<xref ref-type="bibr" rid="B51">51</xref>)</td>
<td valign="top" align="left">DDR, APTOS-2019, Clinical dataset</td>
<td valign="top" align="left">Multi-branch Fine-Grained Classification &#x0002B; Hybrid Attention</td>
<td valign="top" align="left">Swin-TransformerV2 &#x0002B; Multi-Branch &#x0002B; Category Attention</td>
<td valign="top" align="left">DDR: 87.9% APTOS-2019: 85.5% Clinical data: 77%</td>
</tr>
<tr>
<td valign="top" align="center">2025</td>
<td valign="top" align="center">(<xref ref-type="bibr" rid="B49">49</xref>)</td>
<td valign="top" align="left">APTOS 2019</td>
<td valign="top" align="left">Transfer learning with deep learning for multi-stage DR classification</td>
<td valign="top" align="left">ResNet_101, DenseNet_201, EfficientNet_b0 (best: EfficientNet_b0)</td>
<td valign="top" align="left">91%</td>
</tr>
<tr>
<td valign="top" align="center">2025</td>
<td valign="top" align="center">(<xref ref-type="bibr" rid="B50">50</xref>)</td>
<td valign="top" align="left">APTOS 2019</td>
<td valign="top" align="left">Two-stage deep learning: Stage 1 &#x02013; DR detection (healthy vs diseased), Stage 2 &#x02013; DR severity classification using transfer learning</td>
<td valign="top" align="left">Lightweight multi-deep learning framework (custom two-stage model)</td>
<td valign="top" align="left">90.75%</td>
</tr>
<tr>
<td valign="top" align="center"><bold>2025</bold></td>
<td valign="top" align="left"><bold>Proposed model</bold></td>
<td valign="top" align="left"><bold>APTOS 2019, DDR</bold></td>
<td valign="top" align="left"><bold>DenseNet121 integrated with CBAM Block, DropBlock2D Layer, Transformer Encoder</bold></td>
<td valign="top" align="left"><bold>CheXNet_CBAM</bold></td>
<td valign="top" align="left"><bold>APTOS: 96.12%, DDR: 96.33%</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values highlight the proposed model results (96.12% on APTOS and 96.33% on DDR), representing the novel contributions of this study.</p>
</table-wrap-foot>
</table-wrap>
<p>Recent multicenter studies have demonstrated the effectiveness of deep convolutional and attention-based models for diabetic retinopathy screening across heterogeneous populations and imaging devices. While these works primarily focus on large-scale performance benchmarking, limited attention has been given to integrating lightweight attention mechanisms with dense feature reuse and explicit visual interpretability. In contrast, the proposed CheXNet_CBAM framework emphasizes both classification performance and clinical transparency through Grad-CAM visualization, while maintaining robustness across multiple public datasets. This complementary focus distinguishes the present study and contributes incremental value to existing multicenter research.</p>
</sec>
<sec>
<label>4.2</label>
<title>Grad-CAM visualization of CheXNet_CBAM for diabetic retinopathy detection</title>
<p><xref ref-type="fig" rid="F9">Figure 9</xref> presents Grad-CAM visualizations of the proposed CheXNet_CBAM model when applied to fundus images from the DDR and APTOS 2019 datasets, respectively. These visualizations provide insights into the regions of interest emphasized by the model during diabetic retinopathy (DR) detection, highlighting how the network interprets subtle to severe pathological changes.</p>
<fig position="float" id="F9">
<label>Figure 9</label>
<caption><p>Grad-CAM Interpretive Maps of Diabetic Retinopathy Scores on <bold>(A)</bold> APTOS-2019 and <bold>(B)</bold> DDR Datasets.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1732109-g0009.tif">
<alt-text content-type="machine-generated">Figure containing two rows labeled A and B, each displaying eight panels of eye images corresponding to different stages of diabetic retinopathy: mild, moderate, no diabetic retinopathy, proliferative, and severe. Each pair shows an original retinal scan beside a Grad-CAM heatmap highlighting areas of interest for each condition. Panel A contains grayscale images, while panel B features color fundus images. All images are organized for comparison of original and model attention outputs across disease severity.</alt-text>
</graphic>
</fig>
<p>In <xref ref-type="fig" rid="F9">Figure 9A</xref> (APTOS 2019 dataset), the Grad-CAM output, compared with that of DDR, displays more pronounced and localized activations, which is probably due to the better image quality of the dataset. The network highlights the small changes around the macula in the case of mild DR, whereas in the case of moderate DR, it more accurately reveals the presence of small hemorrhages and vascular malformations. In No_DR, the activation distributions remain low and dispersed, which is an indication of a correct diagnosis. In proliferative DR and severe DR, the visualizations highlight pathological areas, including abnormal vascular growth and widespread hemorrhages, confirming the model&#x00027;s ability to identify important clinical features in advanced cases.</p>
<p>In the DDR dataset shown in <xref ref-type="fig" rid="F9">Figure 9B</xref>, the Grad-CAM maps indicate that in the case of Mild DR the model&#x00027;s attention is mostly directed to the macular area, where early retinal changes take place. For Moderate DR, the focus becomes progressively broader and includes microaneurysms and hemorrhages located across the retina. The No_DR condition is characterized by faint and weak activations, in line with the absence of pathological changes. In contrast, the network highlights areas of neovascularization very well for Proliferative DR, while for Severe DR it points to areas of extensive bleeding and exudates, producing intense activations over the affected zones. Overall, these Grad-CAM results indicate that the proposed CheXNet_CBAM model not only provides better classification accuracy but also offers clinically meaningful interpretation. The differences between the DDR and APTOS 2019 datasets reveal the impact of dataset quality, with less distinct model visualizations for DDR and very clear attention maps for APTOS. The interpretability of this model increases its acceptance in the field of eye care, since it allows ophthalmologists to follow and even trust the network&#x00027;s decision-making process.</p>
<p>The Grad-CAM visualizations consistently highlight anatomically and clinically meaningful regions, including areas around the macula, optic disc, and vascular structures where diabetic retinopathy lesions are known to occur. These activation patterns align with established ophthalmic diagnostic criteria reported in the literature, supporting the clinical plausibility of the learned representations. Although quantitative expert annotation-based validation was not available for the utilized public datasets, the observed consistency across datasets and severity levels provides indirect evidence of interpretability reliability.</p>
</sec>
<sec>
<label>4.3</label>
<title>Integration of the proposed method into the hospital</title>
<p>The proposed CheXNet_CBAM framework is intended as a clinical decision-support system (CDSS) for diabetic retinopathy (DR) screening/triage, rather than as an autonomous diagnostic tool. In a hospital setting, the workflow would be integrated into routine ophthalmic imaging processes. Retinal fundus images would be acquired using non-mydriatic or mydriatic fundus cameras according to local clinical protocols and transferred to an on-premises, access-controlled inference server over the hospital intranet in standard formats (e.g., JPEG or PNG). Prior to inference, images would undergo automated preprocessing consistent with the training pipeline, including resizing to the model input resolution, applying the predefined normalization procedure, and applying an image-quality control step to identify ungradable or non-conforming images (with explicit criteria to be reported). Inference would be executed within a version-controlled software environment (e.g., Python with TensorFlow or PyTorch) using pinned library versions to improve reproducibility across deployments. The system would output both the predicted DR grade and Grad-CAM heatmaps highlighting image regions that most strongly influence the model output, and these outputs would be presented to clinicians via a graphical user interface to support review and documentation. The predictions and justification would be reviewed by the clinicians and they would be able to confirm, override or ask for an image to be re-taken depending on the local practice; the clinical responsibility stays with the physician who finally decides about the patient&#x00027;s management. A structured onboarding would be provided to the users covering the intended use, the interpretation of outputs, and the limitations of AI-assisted screening. 
Quality assurance would include performance monitoring at periodic intervals using predefined validation cases and acceptance criteria, logging of predictions and model versions for auditability, and targeted review of attribution maps to flag implausible patterns while noting that saliency methods do not establish causal relevance. The appropriate response to suspected performance degradation (e.g., due to acquisition changes) would include manual review and, if justified, a governed model update process with re-validation prior to redeployment. Although evaluation through public datasets can demonstrate generalization in controlled conditions, device- and site-specific external validation is still required to ensure safe integration into heterogeneous real-world screening workflows.</p>
</sec>
<sec>
<label>4.4</label>
<title>Limitations of this study</title>
<p>Regardless of the stated effectiveness, the proposed CheXNet_CBAM model has limitations that the researchers consider very important. To begin with, the model was trained and tested on publicly available datasets, which may not fully reflect the heterogeneity and case-mix of real-world patient populations encountered in clinics. Differences in age distribution, regional disease prevalence, and other population characteristics may make the model less accurate and less reliable when used in hospital settings different from those of the assessed datasets. Second, the assessment was retrospective and based on pre-existing datasets; no prospective workflow integration or evaluation of decision impact was performed. Consequently, downstream clinical outcomes&#x02014;such as the appropriateness and timeliness of referral pathways&#x02014;could not be assessed. Prospective, site-specific external validation in real clinical environments is therefore necessary to characterize clinical utility and safety. Third, as with AI-assisted screening systems generally, potential safety risks remain when translating the approach into practice, including misclassification of disease severity and inappropriate triage of higher-risk cases. The proposed method is intended to function as a decision-support system rather than an autonomous diagnostic tool, and clinical responsibility remains with qualified clinicians who make final management decisions. Although Grad-CAM heatmaps can provide qualitative attribution cues that may support clinical review, saliency-based explanations do not guarantee faithfulness and should not be treated as a standalone indicator of prediction reliability. Finally, deployment depends on operational infrastructure and governance, including secure integration into clinical workflows, technical support, fallback procedures for manual screening, and ongoing user training. 
Because data-driven models may also reflect dataset and labeling biases, ongoing monitoring, periodic validation, and recalibration as needed are required to maintain performance across evolving imaging devices, acquisition protocols, and clinical practices. Collectively, these considerations may hinder translation unless addressed through rigorous external validation and deployment governance.</p></sec></sec>
<sec id="s5">
<label>5</label>
<title>Conclusion and further work</title>
<p>This study presents CheXNet_CBAM, a model for classifying retinal fundus images by diabetic retinopathy (DR) severity, including a No DR class. The model was trained and evaluated on the APTOS 2019 and DDR datasets. The performance was evaluated as per the reported experimental protocol against CheXNet/DenseNet121 and four more deep learning baselines. CheXNet_CBAM, with a test accuracy of 96.12% on APTOS 2019 and 96.33% on DDR, surpassed the accuracy of all evaluated baselines on the same splits. In the future, the use of other attention features and image processing techniques to enhance the system&#x00027;s resistance to low-grade or ungradable fundus images may be studied. Moreover, in the case of data availability and proper validation, a multimodal extension of CheXNet_CBAM might be investigated to add more clinical inputs for DR severity grading.</p>
<p>Despite these encouraging results, the findings should be interpreted within the context of dataset-centered evaluation. The model was trained and tested on publicly available datasets, which may not fully capture the heterogeneity of real-world clinical environments, including variations in patient demographics, imaging devices, and acquisition protocols. Therefore, while the results indicate strong potential, further validation is required before clinical deployment.</p>
<p>Future work will focus on several concrete and actionable directions. First, multicenter and multi-device validation studies will be conducted to assess generalizability across diverse populations and imaging conditions. Second, prospective clinical studies will be designed to evaluate real-world performance, safety, and workflow integration under physician supervision. Third, improvements to preprocessing and attention mechanisms will be explored to enhance robustness against low-quality fundus images, while minimizing the exclusion of samples. Finally, multimodal extensions incorporating complementary clinical information, such as patient demographics or optical coherence tomography data, will be investigated further to improve diabetic retinopathy severity grading and clinical decision support.</p></sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>WA-D: Methodology, Validation, Visualization, Writing &#x02013; original draft, Project administration, Supervision. SA: Methodology, Software, Supervision, Writing &#x02013; original draft. NA: Data curation, Formal analysis, Investigation, Resources, Writing &#x02013; review &#x00026; editing. AA: Conceptualization, Funding acquisition, Project administration, Resources, Supervision, Writing &#x02013; review &#x00026; editing. MA: Conceptualization, Methodology, Project administration, Software, Writing &#x02013; original draft. AD: Data curation, Formal analysis, Validation, Writing &#x02013; review &#x00026; editing. RoM: Data curation, Investigation, Resources, Visualization, Writing &#x02013; review &#x00026; editing. RaM: Conceptualization, Methodology, Visualization, Writing &#x02013; review &#x00026; editing. RA-O: Resources, Supervision, Validation, Writing &#x02013; review &#x00026; editing. FA: Conceptualization, Resources, Visualization, Writing &#x02013; review &#x00026; editing. SA: Conceptualization, Project administration, Visualization, Writing &#x02013; review &#x00026; editing.</p>
</sec>
<ack><title>Acknowledgments</title><p>This work was supported and funded by the Deanship of Scientific Research at Imam Mohammad Ibn Saud Islamic University (IMSIU) (grant number IMSIU-DDRSP2601).</p></ack>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Schmidt</surname> <given-names>AM</given-names></name></person-group>. <article-title>Highlighting diabetes mellitus: the epidemic continues</article-title>. <source>Arterioscler Thromb Vasc Biol.</source> (<year>2018</year>) <volume>38</volume>:<fpage>e1</fpage>&#x02013;<lpage>8</lpage>. doi: <pub-id pub-id-type="doi">10.1161/ATVBAHA.117.310221</pub-id><pub-id pub-id-type="pmid">29282247</pub-id></mixed-citation>
</ref>
<ref id="B2">
<label>2.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ogurtsova</surname> <given-names>K</given-names></name> <name><surname>da Rocha Fernandes</surname> <given-names>JD</given-names></name> <name><surname>Huang</surname> <given-names>Y</given-names></name> <name><surname>Linnenkamp</surname> <given-names>U</given-names></name> <name><surname>Guariguata</surname> <given-names>L</given-names></name> <name><surname>Cho</surname> <given-names>NH</given-names></name> <etal/></person-group>. <article-title>IDF diabetes atlas: global estimates for the prevalence of diabetes for 2015 and 2040</article-title>. <source>Diabetes Res Clin Pract</source>. (<year>2017</year>) <volume>128</volume>:<fpage>40</fpage>&#x02013;<lpage>50</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.diabres.2017.03.024</pub-id><pub-id pub-id-type="pmid">28437734</pub-id></mixed-citation>
</ref>
<ref id="B3">
<label>3.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kataoka</surname> <given-names>SY</given-names></name> <name><surname>Lois</surname> <given-names>N</given-names></name> <name><surname>Kawano</surname> <given-names>S</given-names></name> <name><surname>Kataoka</surname> <given-names>Y</given-names></name> <name><surname>Inoue</surname> <given-names>K</given-names></name> <name><surname>Watanabe</surname> <given-names>N</given-names></name></person-group>. <article-title>Fenofibrate for diabetic retinopathy</article-title>. <source>Cochrane Database Syst Rev</source>. (<year>2023</year>) <volume>6</volume>:<fpage>CD013318</fpage>. doi: <pub-id pub-id-type="doi">10.1002/14651858.CD013318.pub2</pub-id><pub-id pub-id-type="pmid">37310870</pub-id></mixed-citation>
</ref>
<ref id="B4">
<label>4.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bourne</surname> <given-names>RR</given-names></name> <name><surname>Stevens</surname> <given-names>GA</given-names></name> <name><surname>White</surname> <given-names>RA</given-names></name> <name><surname>Smith</surname> <given-names>JL</given-names></name> <name><surname>Flaxman</surname> <given-names>SR</given-names></name> <name><surname>Price</surname> <given-names>H</given-names></name> <etal/></person-group>. <article-title>Causes of vision loss worldwide, 1990&#x02013;2010: a systematic analysis</article-title>. <source>Lancet Glob Health</source>. (<year>2013</year>) <volume>1</volume>:<fpage>e339</fpage>&#x02013;<lpage>49</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S2214-109X(13)70113-X</pub-id><pub-id pub-id-type="pmid">25104599</pub-id></mixed-citation>
</ref>
<ref id="B5">
<label>5.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>Z</given-names></name> <name><surname>Tan</surname> <given-names>T-E</given-names></name> <name><surname>Shao</surname> <given-names>Y</given-names></name> <name><surname>Wong</surname> <given-names>TY</given-names></name> <name><surname>Li</surname> <given-names>X</given-names></name></person-group>. <article-title>Classification of diabetic retinopathy: past, present and future</article-title>. <source>Front Endocrinol</source>. (<year>2022</year>) <volume>13</volume>:<fpage>1079217</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fendo.2022.1079217</pub-id><pub-id pub-id-type="pmid">36589807</pub-id></mixed-citation>
</ref>
<ref id="B6">
<label>6.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>W</given-names></name> <name><surname>Lo</surname> <given-names>ACY</given-names></name></person-group>. <article-title>Diabetic retinopathy: pathophysiology and treatments</article-title>. <source>Int J Mol Sci.</source> (<year>2018</year>) <volume>19</volume>:<fpage>1816</fpage>. doi: <pub-id pub-id-type="doi">10.3390/ijms19061816</pub-id><pub-id pub-id-type="pmid">29925789</pub-id></mixed-citation>
</ref>
<ref id="B7">
<label>7.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kropp</surname> <given-names>M</given-names></name> <name><surname>Golubnitschaja</surname> <given-names>O</given-names></name> <name><surname>Mazurakova</surname> <given-names>A</given-names></name> <name><surname>Koklesova</surname> <given-names>L</given-names></name> <name><surname>Sargheini</surname> <given-names>N</given-names></name> <name><surname>Vo</surname> <given-names>TKS</given-names></name> <etal/></person-group>. <article-title>Diabetic retinopathy as the leading cause of blindness and early predictor of cascading complications&#x02014;risks and mitigation</article-title>. <source>EPMA J</source>. (<year>2023</year>) <volume>14</volume>:<fpage>21</fpage>&#x02013;<lpage>42</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s13167-023-00314-8</pub-id><pub-id pub-id-type="pmid">36866156</pub-id></mixed-citation>
</ref>
<ref id="B8">
<label>8.</label>
<mixed-citation publication-type="journal"><collab>Early treatment diabetic retinopathy study design and baseline patient characteristics: ETDRS report number 7</collab>. <source>Ophthalmology</source>. (<year>1991</year>) <volume>98</volume>:<fpage>741</fpage>&#x02013;<lpage>56</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S0161-6420(13)38009-9</pub-id></mixed-citation>
</ref>
<ref id="B9">
<label>9.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yun</surname> <given-names>WL</given-names></name> <name><surname>Acharya</surname> <given-names>UR</given-names></name> <name><surname>Venkatesh</surname> <given-names>YV</given-names></name> <name><surname>Chee</surname> <given-names>C</given-names></name> <name><surname>Min</surname> <given-names>LC</given-names></name> <name><surname>Ng</surname> <given-names>EYK</given-names></name></person-group>. <article-title>Identification of different stages of diabetic retinopathy using retinal optical images</article-title>. <source>Inf Sci.</source> (<year>2008</year>) <volume>178</volume>:<fpage>106</fpage>&#x02013;<lpage>21</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ins.2007.07.020</pub-id></mixed-citation>
</ref>
<ref id="B10">
<label>10.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Vashist</surname> <given-names>P</given-names></name> <name><surname>Singh</surname> <given-names>S</given-names></name> <name><surname>Gupta</surname> <given-names>N</given-names></name> <name><surname>Saxena</surname> <given-names>R</given-names></name></person-group>. <article-title>Role of early screening for diabetic retinopathy in patients with diabetes mellitus: an overview</article-title>. <source>Indian J Community Med.</source> (<year>2011</year>) <volume>36</volume>:<fpage>247</fpage>&#x02013;<lpage>52</lpage>. doi: <pub-id pub-id-type="doi">10.4103/0970-0218.91324</pub-id><pub-id pub-id-type="pmid">22279252</pub-id></mixed-citation>
</ref>
<ref id="B11">
<label>11.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Fung</surname> <given-names>TH</given-names></name> <name><surname>Patel</surname> <given-names>B</given-names></name> <name><surname>Wilmot</surname> <given-names>EG</given-names></name> <name><surname>Amoaku</surname> <given-names>WM</given-names></name></person-group>. <article-title>Diabetic retinopathy for the non-ophthalmologist</article-title>. <source>Clin Med.</source> (<year>2022</year>) <volume>22</volume>:<fpage>112</fpage>&#x02013;<lpage>6</lpage>. doi: <pub-id pub-id-type="doi">10.7861/clinmed.2021-0792</pub-id><pub-id pub-id-type="pmid">35304370</pub-id></mixed-citation>
</ref>
<ref id="B12">
<label>12.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nadeem</surname> <given-names>MW</given-names></name> <name><surname>Goh</surname> <given-names>HG</given-names></name> <name><surname>Hussain</surname> <given-names>M</given-names></name> <name><surname>Liew</surname> <given-names>S-Y</given-names></name> <name><surname>Andonovic</surname> <given-names>I</given-names></name> <name><surname>Khan</surname> <given-names>MA</given-names></name></person-group>. <article-title>Deep learning for diabetic retinopathy analysis: a review, research challenges, and future directions</article-title>. <source>Sensors</source>. (<year>2022</year>) <volume>22</volume>:<fpage>6780</fpage>. doi: <pub-id pub-id-type="doi">10.3390/s22186780</pub-id><pub-id pub-id-type="pmid">36146130</pub-id></mixed-citation>
</ref>
<ref id="B13">
<label>13.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gunasekeran</surname> <given-names>DV</given-names></name> <name><surname>Ting</surname> <given-names>DSW</given-names></name> <name><surname>Tan</surname> <given-names>GSW</given-names></name> <name><surname>Wong</surname> <given-names>TY</given-names></name></person-group>. <article-title>Artificial intelligence for diabetic retinopathy screening, prediction and management</article-title>. <source>Curr Opin Ophthalmol.</source> (<year>2020</year>) <volume>31</volume>:<fpage>357</fpage>&#x02013;<lpage>65</lpage>. doi: <pub-id pub-id-type="doi">10.1097/ICU.0000000000000693</pub-id><pub-id pub-id-type="pmid">32740069</pub-id></mixed-citation>
</ref>
<ref id="B14">
<label>14.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Davenport</surname> <given-names>T</given-names></name> <name><surname>Kalakota</surname> <given-names>R</given-names></name></person-group>. <article-title>The potential for artificial intelligence in healthcare</article-title>. <source>Future Healthc J</source>. (<year>2019</year>) <volume>6</volume>:<fpage>94</fpage>&#x02013;<lpage>8</lpage>. doi: <pub-id pub-id-type="doi">10.7861/futurehosp.6-2-94</pub-id></mixed-citation>
</ref>
<ref id="B15">
<label>15.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Altal</surname> <given-names>OF</given-names></name> <name><surname>Sindiani</surname> <given-names>AM</given-names></name> <name><surname>Amin</surname> <given-names>M</given-names></name> <name><surname>Mhanna</surname> <given-names>HYA</given-names></name> <name><surname>Hamad</surname> <given-names>R</given-names></name> <name><surname>Gharaibeh</surname> <given-names>H</given-names></name> <etal/></person-group>. <article-title>Hybrid attention-enhanced MobileNetV2 with particle swarm optimization for endometrial cancer classification in CT images</article-title>. <source>Inform Med Unlocked</source>. (<year>2025</year>) <volume>57</volume>:<fpage>101662</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.imu.2025.101662</pub-id></mixed-citation>
</ref>
<ref id="B16">
<label>16.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Alshdaifat</surname> <given-names>EH</given-names></name> <name><surname>Gharaibeh</surname> <given-names>H</given-names></name> <name><surname>Sindiani</surname> <given-names>AM</given-names></name> <name><surname>Madain</surname> <given-names>R</given-names></name> <name><surname>Al-Mnayyis</surname> <given-names>AM</given-names></name> <name><surname>Mhanna</surname> <given-names>HYA</given-names></name> <etal/></person-group>. <article-title>Hybrid vision transformer and Xception model for reliable CT-based ovarian neoplasms diagnosis</article-title>. <source>Intell Based Med</source>. (<year>2025</year>) <volume>11</volume>:<fpage>100227</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ibmed.2025.100227</pub-id></mixed-citation>
</ref>
<ref id="B17">
<label>17.</label>
<mixed-citation publication-type="web"><person-group person-group-type="author"><name><surname>Karthik</surname> <given-names>M</given-names></name> <name><surname>Dane</surname> <given-names>S</given-names></name></person-group>. <article-title>APTOS 2019 Blindness Detection</article-title>. Kaggle (<year>2019</year>). Available online at: <ext-link ext-link-type="uri" xlink:href="https://kaggle.com/competitions/aptos2019-blindness-detection">https://kaggle.com/competitions/aptos2019-blindness-detection</ext-link> (Accessed February 13, 2026).</mixed-citation>
</ref>
<ref id="B18">
<label>18.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>T</given-names></name> <name><surname>Gao</surname> <given-names>Y</given-names></name> <name><surname>Wang</surname> <given-names>K</given-names></name> <name><surname>Guo</surname> <given-names>S</given-names></name> <name><surname>Liu</surname> <given-names>H</given-names></name> <name><surname>Kang</surname> <given-names>H</given-names></name></person-group>. <article-title>Diagnostic assessment of deep learning algorithms for diabetic retinopathy screening</article-title>. <source>Inf Sci.</source> (<year>2019</year>) <volume>501</volume>:<fpage>511</fpage>&#x02013;<lpage>22</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ins.2019.06.011</pub-id></mixed-citation>
</ref>
<ref id="B19">
<label>19.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rajpurkar</surname> <given-names>P</given-names></name> <name><surname>Irvin</surname> <given-names>J</given-names></name> <name><surname>Zhu</surname> <given-names>K</given-names></name> <name><surname>Yang</surname> <given-names>B</given-names></name> <name><surname>Mehta</surname> <given-names>H</given-names></name> <name><surname>Duan</surname> <given-names>T</given-names></name> <etal/></person-group>. <article-title>CheXNet: radiologist-level pneumonia detection on chest X-rays with deep learning</article-title>. <source>arXiv</source>. [preprint] (<year>2017</year>). arXiv:1711.05225. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1711.05225</pub-id></mixed-citation>
</ref>
<ref id="B20">
<label>20.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Haritha</surname> <given-names>D</given-names></name> <name><surname>Pranathi</surname> <given-names>MK</given-names></name> <name><surname>Reethika</surname> <given-names>M</given-names></name></person-group>. <article-title>COVID detection from chest X-rays with DeepLearning: CheXNet</article-title>. in <source>2020 5th international conference on computing, communication and security (ICCCS)</source>. Patna: IEEE (<year>2020</year>). p. <fpage>1</fpage>&#x02013;<lpage>5</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ICCCS49678.2020.9277077</pub-id></mixed-citation>
</ref>
<ref id="B21">
<label>21.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nandhini</surname> <given-names>S</given-names></name> <name><surname>Ashokkumar</surname> <given-names>K</given-names></name></person-group>. <article-title>An automatic plant leaf disease identification using DenseNet-121 architecture with a mutation-based henry gas solubility optimization algorithm</article-title>. <source>Neural Comput Appl.</source> (<year>2022</year>) <volume>34</volume>:<fpage>5513</fpage>&#x02013;<lpage>34</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s00521-021-06714-z</pub-id></mixed-citation>
</ref>
<ref id="B22">
<label>22.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Maharana</surname> <given-names>K</given-names></name> <name><surname>Mondal</surname> <given-names>S</given-names></name> <name><surname>Nemade</surname> <given-names>B</given-names></name></person-group>. <article-title>A review: data pre-processing and data augmentation techniques</article-title>. <source>Global Transitions Proc.</source> (<year>2022</year>) <volume>3</volume>:<fpage>91</fpage>&#x02013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.gltp.2022.04.020</pub-id></mixed-citation>
</ref>
<ref id="B23">
<label>23.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rana</surname> <given-names>P</given-names></name> <name><surname>Sowmya</surname> <given-names>A</given-names></name> <name><surname>Meijering</surname> <given-names>E</given-names></name> <name><surname>Song</surname> <given-names>Y</given-names></name></person-group>. <article-title>Data augmentation with improved regularisation and sampling for imbalanced blood cell image classification</article-title>. <source>Sci Rep.</source> (<year>2022</year>) <volume>12</volume>:<fpage>18101</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-022-22882-x</pub-id><pub-id pub-id-type="pmid">36302948</pub-id></mixed-citation>
</ref>
<ref id="B24">
<label>24.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rochac</surname> <given-names>JFR</given-names></name> <name><surname>Zhang</surname> <given-names>N</given-names></name> <name><surname>Thompson</surname> <given-names>L</given-names></name> <name><surname>Oladunni</surname> <given-names>T</given-names></name></person-group>. <article-title>A data augmentation-assisted deep learning model for high dimensional and highly imbalanced hyperspectral imaging data</article-title>. in <source>2019 9th International Conference on Information Science and Technology (ICIST)</source>. Hulunbuir: IEEE (<year>2019</year>). p. <fpage>362</fpage>&#x02013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ICIST.2019.8836913</pub-id></mixed-citation>
</ref>
<ref id="B25">
<label>25.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Woo</surname> <given-names>S</given-names></name> <name><surname>Park</surname> <given-names>J</given-names></name> <name><surname>Lee</surname> <given-names>J-Y</given-names></name> <name><surname>Kweon</surname> <given-names>IS</given-names></name></person-group>. <article-title>CBAM: Convolutional block attention module</article-title>. in <source>Proceedings of the European Conference on Computer Vision (ECCV)</source>. Munich (<year>2018</year>). p. <fpage>3</fpage>&#x02013;<lpage>19</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-3-030-01234-2_1</pub-id></mixed-citation>
</ref>
<ref id="B26">
<label>26.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Su</surname> <given-names>H</given-names></name> <name><surname>Wang</surname> <given-names>X</given-names></name> <name><surname>Han</surname> <given-names>T</given-names></name> <name><surname>Wang</surname> <given-names>Z</given-names></name> <name><surname>Zhao</surname> <given-names>Z</given-names></name> <name><surname>Zhang</surname> <given-names>P</given-names></name></person-group>. <article-title>Research on a U-Net bridge crack identification and feature-calculation methods based on a CBAM attention mechanism</article-title>. <source>Buildings.</source> (<year>2022</year>) <volume>12</volume>:<fpage>1561</fpage>. doi: <pub-id pub-id-type="doi">10.3390/buildings12101561</pub-id></mixed-citation>
</ref>
<ref id="B27">
<label>27.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ma</surname> <given-names>R</given-names></name> <name><surname>Wang</surname> <given-names>J</given-names></name> <name><surname>Zhao</surname> <given-names>W</given-names></name> <name><surname>Guo</surname> <given-names>H</given-names></name> <name><surname>Dai</surname> <given-names>D</given-names></name> <name><surname>Ma</surname> <given-names>D</given-names></name> <etal/></person-group>. <article-title>Identification of maize seed varieties using MobileNetV2 with improved attention mechanism CBAM</article-title>. <source>Agriculture</source>. (<year>2022</year>) <volume>13</volume>:<fpage>11</fpage>. doi: <pub-id pub-id-type="doi">10.3390/agriculture13010011</pub-id></mixed-citation>
</ref>
<ref id="B28">
<label>28.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Pang</surname> <given-names>B</given-names></name></person-group>. <article-title>Classification of images using EfficientNet CNN model with convolutional block attention module (CBAM) and spatial group-wise enhance module (SGE)</article-title>. in <source>International Conference on Image, Signal Processing, and Pattern Recognition (ISPP 2022)</source>. SPIE (<year>2022</year>). p. <fpage>34</fpage>&#x02013;<lpage>41</lpage>. doi: <pub-id pub-id-type="doi">10.1117/12.2636811</pub-id></mixed-citation>
</ref>
<ref id="B29">
<label>29.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gupta</surname> <given-names>A</given-names></name> <name><surname>Kaur</surname> <given-names>A</given-names></name></person-group>. <article-title>Multi-crop disease prediction using attention-augmented deep learning: a DenseNet121-CBAM Model</article-title>. in <source>2025 International Conference on Electronics, AI and Computing (EAIC)</source>. Jalandhar: IEEE (<year>2025</year>). p. <fpage>1</fpage>&#x02013;<lpage>6</lpage>. doi: <pub-id pub-id-type="doi">10.1109/EAIC66483.2025.11101367</pub-id></mixed-citation>
</ref>
<ref id="B30">
<label>30.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ghiasi</surname> <given-names>G</given-names></name> <name><surname>Lin</surname> <given-names>T-Y</given-names></name> <name><surname>Le</surname> <given-names>QV</given-names></name></person-group>. <article-title>DropBlock: a regularization method for convolutional networks</article-title>. <source>Adv Neural Inf Process Syst.</source> (<year>2018</year>) <volume>31</volume>:<fpage>10750</fpage>&#x02013;<lpage>60</lpage>.</mixed-citation>
</ref>
<ref id="B31">
<label>31.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ding</surname> <given-names>Y</given-names></name> <name><surname>Dong</surname> <given-names>S</given-names></name> <name><surname>Tong</surname> <given-names>Y</given-names></name> <name><surname>Ma</surname> <given-names>Z</given-names></name> <name><surname>Xiao</surname> <given-names>B</given-names></name> <name><surname>Ling</surname> <given-names>H</given-names></name></person-group>. <article-title>Channel DropBlock: an improved regularization method for fine-grained visual classification</article-title>. <source>arXiv</source>. [preprint] (<year>2021</year>). arXiv:2106.03432. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2106.03432</pub-id></mixed-citation>
</ref>
<ref id="B32">
<label>32.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dai</surname> <given-names>Z</given-names></name> <name><surname>Chen</surname> <given-names>M</given-names></name> <name><surname>Gu</surname> <given-names>X</given-names></name> <name><surname>Zhu</surname> <given-names>S</given-names></name> <name><surname>Tan</surname> <given-names>P</given-names></name></person-group>. <article-title>Batch dropblock network for person re-identification and beyond</article-title>. in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source>. Seoul: IEEE (<year>2019</year>). p. <fpage>3691</fpage>&#x02013;<lpage>3701</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ICCV.2019.00379</pub-id></mixed-citation>
</ref>
<ref id="B33">
<label>33.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Hatamizadeh</surname> <given-names>A</given-names></name> <name><surname>Yin</surname> <given-names>H</given-names></name> <name><surname>Heinrich</surname> <given-names>G</given-names></name> <name><surname>Kautz</surname> <given-names>J</given-names></name> <name><surname>Molchanov</surname> <given-names>P</given-names></name></person-group>. <article-title>Global context vision transformers</article-title>. <source>In International Conference on Machine Learning.</source> <publisher-loc>Honolulu, HI</publisher-loc>: <publisher-name>PMLR</publisher-name> (<year>2023</year>). p. <fpage>12633</fpage>&#x02013;<lpage>46</lpage>.</mixed-citation>
</ref>
<ref id="B34">
<label>34.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Modarressi</surname> <given-names>A</given-names></name> <name><surname>Fayyaz</surname> <given-names>M</given-names></name> <name><surname>Yaghoobzadeh</surname> <given-names>Y</given-names></name> <name><surname>Pilehvar</surname> <given-names>MT</given-names></name></person-group>. <article-title>GlobEnc: quantifying global token attribution by incorporating the whole encoder layer in transformers</article-title>. <source>arXiv [preprint]</source> arXiv:2205.03286 (<year>2022</year>). doi: <pub-id pub-id-type="doi">10.48550/arXiv.2205.03286</pub-id></mixed-citation>
</ref>
<ref id="B35">
<label>35.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Geva</surname> <given-names>M</given-names></name> <name><surname>Caciularu</surname> <given-names>A</given-names></name> <name><surname>Wang</surname> <given-names>KR</given-names></name> <name><surname>Goldberg</surname> <given-names>Y</given-names></name></person-group>. <article-title>Transformer feed-forward layers build predictions by promoting concepts in the vocabulary space</article-title>. <source>arXiv</source> [preprint] (<year>2022</year>). arXiv:2203.14680. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2203.14680</pub-id></mixed-citation>
</ref>
<ref id="B36">
<label>36.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Fernando</surname> <given-names>KRM</given-names></name> <name><surname>Tsokos</surname> <given-names>CP</given-names></name></person-group>. <article-title>Dynamically weighted balanced loss: class imbalanced learning and confidence calibration of deep neural networks</article-title>. <source>IEEE Trans Neural Netw Learn Syst.</source> (<year>2021</year>) <volume>33</volume>:<fpage>2940</fpage>&#x02013;<lpage>51</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TNNLS.2020.3047335</pub-id><pub-id pub-id-type="pmid">33444149</pub-id></mixed-citation>
</ref>
<ref id="B37">
<label>37.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Alshammari</surname> <given-names>S</given-names></name> <name><surname>Wang</surname> <given-names>Y-X</given-names></name> <name><surname>Ramanan</surname> <given-names>D</given-names></name> <name><surname>Kong</surname> <given-names>S</given-names></name></person-group>. <article-title>Long-tailed recognition via weight balancing</article-title>. in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<year>2022</year>). p. <fpage>6897</fpage>&#x02013;<lpage>907</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR52688.2022.00677</pub-id></mixed-citation>
</ref>
<ref id="B38">
<label>38.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ennab</surname> <given-names>M</given-names></name> <name><surname>Mcheick</surname> <given-names>H</given-names></name></person-group>. <article-title>Advancing AI interpretability in medical imaging: a comparative analysis of pixel-level interpretability and Grad-CAM models</article-title>. <source>Mach Learn Knowl Extr</source>. (<year>2025</year>) <volume>7</volume>:<fpage>12</fpage>. doi: <pub-id pub-id-type="doi">10.3390/make7010012</pub-id></mixed-citation>
</ref>
<ref id="B39">
<label>39.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Selvaraju</surname> <given-names>RR</given-names></name> <name><surname>Das</surname> <given-names>A</given-names></name> <name><surname>Vedantam</surname> <given-names>R</given-names></name> <name><surname>Cogswell</surname> <given-names>M</given-names></name> <name><surname>Parikh</surname> <given-names>D</given-names></name> <name><surname>Batra</surname> <given-names>D</given-names></name></person-group>. <article-title>Grad-CAM: why did you say that?</article-title> <source>arXiv [preprint]</source> arXiv:1611.07450 (<year>2016</year>). doi: <pub-id pub-id-type="doi">10.48550/arXiv.1611.07450</pub-id></mixed-citation>
</ref>
<ref id="B40">
<label>40.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Abd El-Ghany</surname> <given-names>S</given-names></name> <name><surname>Mahmood</surname> <given-names>MA</given-names></name> <name><surname>Abd El-Aziz</surname> <given-names>AA</given-names></name></person-group>. <article-title>Automated eye disease diagnosis using a 2D CNN with Grad-CAM: high-accuracy detection of retinal asymmetries for multiclass classification</article-title>. <source>Symmetry.</source> (<year>2025</year>) <volume>17</volume>:<fpage>768</fpage>. doi: <pub-id pub-id-type="doi">10.3390/sym17050768</pub-id></mixed-citation>
</ref>
<ref id="B41">
<label>41.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>X</given-names></name> <name><surname>Liu</surname> <given-names>D</given-names></name> <name><surname>Huang</surname> <given-names>G</given-names></name> <name><surname>Wang</surname> <given-names>M</given-names></name> <name><surname>Lei</surname> <given-names>M</given-names></name> <name><surname>Jia</surname> <given-names>Y</given-names></name></person-group>. <article-title>Computer aided diagnosis of diabetic retinopathy based on multi-view joint learning</article-title>. <source>Comput Biol Med.</source> (<year>2024</year>) <volume>174</volume>:<fpage>108428</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.108428</pub-id><pub-id pub-id-type="pmid">38631117</pub-id></mixed-citation>
</ref>
<ref id="B42">
<label>42.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tariq</surname> <given-names>M</given-names></name> <name><surname>Palade</surname> <given-names>V</given-names></name> <name><surname>Ma</surname> <given-names>Y</given-names></name></person-group>. <article-title>Effective diabetic retinopathy classification with siamese neural network: a strategy for small dataset challenges</article-title>. <source>IEEE Access</source>. (<year>2024</year>). doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2024.3510556</pub-id></mixed-citation>
</ref>
<ref id="B43">
<label>43.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dixit</surname> <given-names>RB</given-names></name> <name><surname>Jha</surname> <given-names>CK</given-names></name></person-group>. <article-title>Fundus image based diabetic retinopathy detection using EfficientNetB3 with squeeze and excitation block</article-title>. <source>Med Eng Phys</source>. (<year>2025</year>) <fpage>104350</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.medengphy.2025.104350</pub-id><pub-id pub-id-type="pmid">40436513</pub-id></mixed-citation>
</ref>
<ref id="B44">
<label>44.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Badar</surname> <given-names>D</given-names></name> <name><surname>Abbas</surname> <given-names>J</given-names></name> <name><surname>Alsini</surname> <given-names>R</given-names></name> <name><surname>Abbas</surname> <given-names>T</given-names></name> <name><surname>ChengLiang</surname> <given-names>W</given-names></name> <name><surname>Daud</surname> <given-names>A</given-names></name></person-group>. <article-title>Transformer attention fusion for fine grained medical image classification</article-title>. <source>Sci Rep.</source> (<year>2025</year>) <volume>15</volume>:<fpage>20655</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-025-07561-x</pub-id><pub-id pub-id-type="pmid">40596233</pub-id></mixed-citation>
</ref>
<ref id="B45">
<label>45.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Herrero-Tudela</surname> <given-names>M</given-names></name> <name><surname>Romero-Oraa</surname> <given-names>R</given-names></name> <name><surname>Hornero</surname> <given-names>R</given-names></name> <name><surname>Tobal</surname> <given-names>GCG</given-names></name> <name><surname>Lopez</surname> <given-names>MI</given-names></name> <name><surname>Garcia</surname> <given-names>M</given-names></name></person-group>. <article-title>An explainable deep-learning model reveals clinical clues in diabetic retinopathy through SHAP</article-title>. <source>Biomed Signal Process Control.</source> (<year>2025</year>) <volume>102</volume>:<fpage>107328</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bspc.2024.107328</pub-id></mixed-citation>
</ref>
<ref id="B46">
<label>46.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Singh</surname> <given-names>AK</given-names></name> <name><surname>Madarapu</surname> <given-names>S</given-names></name> <name><surname>Ari</surname> <given-names>S</given-names></name></person-group>. <article-title>Diabetic retinopathy grading based on multi-scale residual network and cross-attention module</article-title>. <source>Digit Signal Process.</source> (<year>2025</year>) <volume>157</volume>:<fpage>104888</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.dsp.2024.104888</pub-id></mixed-citation>
</ref>
<ref id="B47">
<label>47.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Akram</surname> <given-names>M</given-names></name> <name><surname>Adnan</surname> <given-names>M</given-names></name> <name><surname>Ali</surname> <given-names>SF</given-names></name> <name><surname>Ahmad</surname> <given-names>J</given-names></name> <name><surname>Yousef</surname> <given-names>A</given-names></name> <name><surname>Alshalali</surname> <given-names>TAN</given-names></name> <etal/></person-group>. <article-title>Uncertainty-aware diabetic retinopathy detection using deep learning enhanced by Bayesian approaches</article-title>. <source>Sci Rep</source>. (<year>2025</year>) <volume>15</volume>:<fpage>1342</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-024-84478-x</pub-id><pub-id pub-id-type="pmid">39779778</pub-id></mixed-citation>
</ref>
<ref id="B48">
<label>48.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ikram</surname> <given-names>A</given-names></name> <name><surname>Imran</surname> <given-names>A</given-names></name></person-group>. <article-title>ResViT FusionNet Model: an explainable AI-driven approach for automated grading of diabetic retinopathy in retinal images</article-title>. <source>Comput Biol Med.</source> (<year>2025</year>) <volume>186</volume>:<fpage>109656</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2025.109656</pub-id><pub-id pub-id-type="pmid">39823821</pub-id></mixed-citation>
</ref>
<ref id="B49">
<label>49.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rupasinghe</surname> <given-names>ADP</given-names></name> <name><surname>Samarawickrama</surname> <given-names>KG</given-names></name></person-group>. <article-title>A Deep learning framework for the identification of distinct stages in diabetic retinopathy through retinal image analysis</article-title>. in <source>SETSCI-Conference Proceedings, SETSCI-Conference Proceedings</source> (<year>2025</year>). p. <fpage>34</fpage>&#x02013;<lpage>8</lpage>. doi: <pub-id pub-id-type="doi">10.36287/setsci.22.3.001</pub-id></mixed-citation>
</ref>
<ref id="B50">
<label>50.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zafar</surname> <given-names>A</given-names></name> <name><surname>Kim</surname> <given-names>KS</given-names></name> <name><surname>Ali</surname> <given-names>MU</given-names></name> <name><surname>Byun</surname> <given-names>JH</given-names></name> <name><surname>Kim</surname> <given-names>S-H</given-names></name></person-group>. <article-title>A lightweight multi-deep learning framework for accurate diabetic retinopathy detection and multi-level severity identification</article-title>. <source>Front Med</source>. (<year>2025</year>) <volume>12</volume>:<fpage>1551315</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fmed.2025.1551315</pub-id><pub-id pub-id-type="pmid">40241910</pub-id></mixed-citation>
</ref>
<ref id="B51">
<label>51.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Y</given-names></name> <name><surname>Yao</surname> <given-names>D</given-names></name> <name><surname>Ma</surname> <given-names>Y</given-names></name> <name><surname>Wang</surname> <given-names>H</given-names></name> <name><surname>Wang</surname> <given-names>J</given-names></name> <name><surname>Bai</surname> <given-names>X</given-names></name> <etal/></person-group>. <article-title>STMF-DRNet: a multi-branch fine-grained classification model for diabetic retinopathy using Swin-TransformerV2</article-title>. <source>Biomed Signal Process Control</source>. (<year>2025</year>) <volume>103</volume>:<fpage>107352</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bspc.2024.107352</pub-id></mixed-citation>
</ref>
<ref id="B52">
<label>52.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>X</given-names></name> <name><surname>Yang</surname> <given-names>G</given-names></name> <name><surname>Xu</surname> <given-names>C</given-names></name> <name><surname>Dong</surname> <given-names>H</given-names></name> <name><surname>Hu</surname> <given-names>X</given-names></name> <name><surname>Che</surname> <given-names>S</given-names></name></person-group>. <article-title>Prior-guided dual-stage diabetic retinopathy grading model based on feature collaboration of lesion and vascular structure</article-title>. <source>Expert Syst Appl.</source> (<year>2025</year>) <volume>285</volume>:<fpage>128052</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.eswa.2025.128052</pub-id></mixed-citation>
</ref>
<ref id="B53">
<label>53.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Adnan</surname> <given-names>A</given-names></name> <name><surname>Shoaib</surname> <given-names>M</given-names></name> <name><surname>Altaf</surname> <given-names>A</given-names></name> <name><surname>Kausar</surname> <given-names>F</given-names></name> <name><surname>Iqbal</surname> <given-names>F</given-names></name> <name><surname>Asif</surname> <given-names>HM</given-names></name></person-group>. <article-title>A secure and privacy-preserving approach to healthcare data collaboration</article-title>. <source>Symmetry.</source> (<year>2025</year>) <volume>17</volume>:<fpage>1139</fpage>. doi: <pub-id pub-id-type="doi">10.3390/sym17071139</pub-id></mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1321924/overview">Francesco Napolitano</ext-link>, University of Sannio, Italy</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2140935/overview">R. Geetha</ext-link>, Saveetha University, India</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3264871/overview">Biswadip Basu Mallik</ext-link>, Institute of Engineering and Management (IEM), India</p>
</fn>
</fn-group>
</back>
</article>