<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Cell Dev. Biol.</journal-id>
<journal-title>Frontiers in Cell and Developmental Biology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Cell Dev. Biol.</abbrev-journal-title>
<issn pub-type="epub">2296-634X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1484880</article-id>
<article-id pub-id-type="doi">10.3389/fcell.2024.1484880</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Cell and Developmental Biology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Multi-resolution visual Mamba with multi-directional selective mechanism for retinal disease detection</article-title>
<alt-title alt-title-type="left-running-head">Zuo et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fcell.2024.1484880">10.3389/fcell.2024.1484880</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Zuo</surname>
<given-names>Qiankun</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2267617/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Shi</surname>
<given-names>Zhengkun</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Bo</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ping</surname>
<given-names>Na</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Jiangtao</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Cheng</surname>
<given-names>Xi</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Kexin</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Guo</surname>
<given-names>Jia</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2668164/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wu</surname>
<given-names>Yixian</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Hong</surname>
<given-names>Jin</given-names>
</name>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2707232/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Hubei Key Laboratory of Digital Finance Innovation</institution>, <institution>Hubei University of Economics</institution>, <addr-line>Wuhan</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>School of Information Engineering</institution>, <institution>Hubei University of Economics</institution>, <addr-line>Wuhan</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Hubei Internet Finance Information Engineering Technology Research Center</institution>, <institution>Hubei University of Economics</institution>, <addr-line>Wuhan</addr-line>, <country>China</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>School of Mathematics and Computer Science</institution>, <institution>Nanchang University</institution>, <addr-line>Nanchang</addr-line>, <country>China</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>School of Mechanical Engineering</institution>, <institution>Beijing Institute of Petrochemical Technology</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<aff id="aff6">
<sup>6</sup>
<institution>School of Information Engineering</institution>, <institution>Nanchang University</institution>, <addr-line>Nanchang</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1491541/overview">Yanwu Xu</ext-link>, Baidu, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2824310/overview">Siyuan Lu</ext-link>, Nanjing University of Posts and Telecommunications, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2824322/overview">Rongli Zhang</ext-link>, The University of Hong Kong, Hong Kong, SAR China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/600132/overview">Cunjing Zheng</ext-link>, Sun Yat-sen University, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Jia Guo, <email>guojia@hbue.edu.cn</email>; Jin Hong, <email>hongjin@ncu.edu.cn</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>11</day>
<month>10</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>12</volume>
<elocation-id>1484880</elocation-id>
<history>
<date date-type="received">
<day>22</day>
<month>08</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>20</day>
<month>09</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Zuo, Shi, Liu, Ping, Wang, Cheng, Zhang, Guo, Wu and Hong.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Zuo, Shi, Liu, Ping, Wang, Cheng, Zhang, Guo, Wu and Hong</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Retinal diseases significantly impact patients&#x2019; quality of life and increase social medical costs. Optical coherence tomography (OCT) offers high-resolution imaging for precise detection and monitoring of these conditions. While deep learning techniques have been employed to extract features from OCT images for classification, convolutional neural networks (CNNs) often fail to capture global context due to their focus on local receptive fields. Transformer-based methods, on the other hand, suffer from quadratic complexity when handling long-range dependencies.</p>
</sec>
<sec>
<title>Methods</title>
<p>To overcome these limitations, we introduce the Multi-Resolution Visual Mamba (MRVM) model, which addresses long-range dependencies with linear computational complexity for OCT image classification. The MRVM model initially employs convolution to extract local features and subsequently utilizes the retinal Mamba to capture global dependencies. By integrating multi-scale global features, the MRVM enhances classification accuracy and overall performance. Additionally, the multi-directional selection mechanism (MSM) within the retinal Mamba improves feature extraction by concentrating on various directions, thereby better capturing complex, orientation-specific retinal patterns.</p>
</sec>
<sec>
<title>Results</title>
<p>Experimental results demonstrate that the MRVM model excels in differentiating retinal images with various lesions, achieving superior detection accuracy compared to traditional methods, with overall accuracies of 98.98% and 96.21% on two public datasets, respectively.</p>
</sec>
<sec>
<title>Discussion</title>
<p>This approach offers a novel perspective for accurately identifying retinal diseases and could contribute to the development of more robust artificial intelligence algorithms and recognition systems for medical image-assisted diagnosis.</p>
</sec>
</abstract>
<kwd-group>
<kwd>retinal disease detection</kwd>
<kwd>state-space model</kwd>
<kwd>global&#x2013;local feature</kwd>
<kwd>multi-scale fusion</kwd>
<kwd>multi-directional selective learning</kwd>
</kwd-group>
<contract-sponsor id="cn001">Natural Science Foundation of Hubei Province<named-content content-type="fundref-id">10.13039/501100003819</named-content>
</contract-sponsor>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Molecular and Cellular Pathology</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>The human body relies on the eyes to perceive external information. However, the eyes are easily damaged because of prolonged screen exposure, resulting in frequent vision problems and serious interference with daily life <xref ref-type="bibr" rid="B23">Rauchman et al. (2022)</xref>. In today&#x2019;s society, the popularity of electronic devices such as mobile phones and computers makes it almost impossible to work and study without using electronic screens, which undoubtedly poses a direct challenge to vision. Long-term immersion in front of electronic screens often leads to varying degrees of vision damage <xref ref-type="bibr" rid="B14">Lanzani et al. (2024)</xref>. Due to the large population base and uneven distribution of medical resources, not everyone can receive high-quality medical diagnosis and treatment in time, which increases the risk of delayed illness and makes some patients miss the best time for treatment. According to the World Health Organization, approximately 2.2 billion people in the world have vision problems caused by eye diseases <xref ref-type="bibr" rid="B20">Bashshur and Ross (2020)</xref>. It is particularly noteworthy that nearly half of these vision impairments could have been avoided or recovered through effective preventive measures or early and timely intervention. Therefore, in the field of clinical research, early detection and accurate diagnosis of eye diseases <xref ref-type="bibr" rid="B36">Xu et al. (2022)</xref>; <xref ref-type="bibr" rid="B30">Wan et al. (2023b)</xref>; <xref ref-type="bibr" rid="B31">Wan et al. (2024b)</xref> are particularly important. Accurate diagnosis of eye diseases can not only reduce avoidable vision loss, but also improve patients&#x2019; quality of life.</p>
<p>With the continuous advancements in optical theory and technology <xref ref-type="bibr" rid="B28">Wan et al. (2023a)</xref>; <xref ref-type="bibr" rid="B29">Wan et al. (2024a)</xref>; <xref ref-type="bibr" rid="B12">Ji et al. (2024)</xref>, optical coherence tomography (OCT) technology has emerged and rapidly penetrated into the medical field <xref ref-type="bibr" rid="B2">Bouma et al. (2022)</xref>. OCT has significant advantages such as high resolution, efficient detection, and non-invasiveness. It can be used for the detection and diagnosis of retinopathy and has now become an indispensable routine method in eye examinations <xref ref-type="bibr" rid="B35">Xu et al. (2023)</xref>. <xref ref-type="fig" rid="F1">Figure 1</xref> shows eight examples of retinal disease, namely, age-related macular degeneration (AMD), choroidal neovascularization (CNV), central serous chorioretinopathy (CSR), diabetic macular edema (DME), macular hole (MH), Drusen, diabetic retinopathy (DR), and normal. However, due to hardware and equipment factors, OCT images are often mixed with unavoidable noise during the imaging process, which undoubtedly increases the complexity and challenge of diagnosis for doctors. Moreover, OCT is a grayscale imaging technique. Since the characteristics of small lesions are not clear enough at the grayscale level, these subtle changes are often difficult to detect, which increases the risk of missed diagnosis by doctors. At the same time, although the number of patients with retinal eye diseases increases year by year, the number of doctors with professional diagnostic capabilities is relatively scarce. This contradiction is becoming increasingly prominent, making it difficult to effectively meet the diagnosis and treatment needs of a large patient population <xref ref-type="bibr" rid="B3">Daich Varela et al. (2023)</xref>. Automated, computer-aided diagnosis based on OCT image analysis is therefore urgently needed.
This technology can assist doctors in accurately assessing patients&#x2019; conditions, effectively reducing doctors&#x2019; workload, while improving the accuracy of eye disease screening and diagnosis. It has far-reaching significance for optimizing the allocation of medical resources and improving the quality of medical services.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Visualization of the eight retinal diseases.</p>
</caption>
<graphic xlink:href="fcell-12-1484880-g001.tif"/>
</fig>
<p>In the field of medical image processing, convolutional neural networks (CNNs) have performed well in medical image segmentation <xref ref-type="bibr" rid="B16">Li et al. (2024)</xref>; <xref ref-type="bibr" rid="B11">Hong et al. (2022b)</xref>; <xref ref-type="bibr" rid="B41">Zhang et al. (2023)</xref>, image generation <xref ref-type="bibr" rid="B37">You et al. (2022)</xref>; <xref ref-type="bibr" rid="B38">You et al. (2024)</xref>, and image classification <xref ref-type="bibr" rid="B40">Yu et al. (2022)</xref>; <xref ref-type="bibr" rid="B43">Zong et al. (2024)</xref>; <xref ref-type="bibr" rid="B45">Zuo et al. (2023a)</xref>. By stacking multiple layers of convolution and pooling layers, CNNs can effectively extract complex features and subtle lesions in images <xref ref-type="bibr" rid="B10">Hong et al. (2022a)</xref>, such as microaneurysms and exudates, which are key signs of diseases such as diabetic retinopathy. Combined with fully connected layers for feature integration and classification, CNN models can accurately distinguish different types of retinal diseases, providing ophthalmologists with fast and objective preliminary diagnostic references, thereby improving the diagnostic efficiency and accuracy and speeding up patient treatment. However, CNN models have difficulty modeling long-distance dependencies in images and are sensitive to position translation, which limits their application in certain complex retinal disease classification tasks.</p>
<p>Due to its remarkable work in natural language processing, the transformer network is now gradually entering the field of medical image computing <xref ref-type="bibr" rid="B44">Zuo et al. (2024)</xref>; <xref ref-type="bibr" rid="B46">Zuo et al. (2023b)</xref>, bringing improvements in performance of the task of retinal disease image classification <xref ref-type="bibr" rid="B21">Parvaiz et al. (2023)</xref>. Due to the unique self-attention mechanism, the transformer-based network is able to deeply analyze the complex relationship between each pixel and other pixels in the image, thereby capturing small but important pathological features in retinal disease images, such as subtle vascular abnormalities and exudate distribution. This global information integration capability enables the transformer network to more accurately identify different types of retinal diseases during the classification process, providing ophthalmologists with a more reliable and timely diagnostic basis. Since the network does not consider the spatial locality of the image, it may not capture detailed features as finely as CNNs when processing high-resolution medical images and requires larger data sets and computing resources to train, all of which limit the application scenarios of transformer-based models in medical image diagnosis.</p>
<p>Recently, the Mamba network, an innovative deep learning architecture, has excelled in long-distance relationship modeling <xref ref-type="bibr" rid="B6">Gu and Dao (2023)</xref>; <xref ref-type="bibr" rid="B42">Zhu et al. (2024)</xref>. Through its unique selection state mechanism, it effectively captures the spatial dependencies between distant regions in an image and ignores noise interference, thereby improving the learning efficiency and prediction accuracy of the model. Inspired by the above observations, we combined the CNN and Mamba networks and proposed the multi-resolution visual Mamba (MRVM) model for OCT image classification. The MRVM model first extracts local features from OCT images using convolution and then captures global long-range dependencies through the retinal Mamba. Next, by integrating multi-scale global features, the model enhances the classification accuracy and overall performance. The multi-directional selection mechanism (MSM) within the retinal Mamba improves feature extraction by focusing on various directions, thereby boosting the model&#x2019;s ability to detect complex, orientation-specific retinal patterns. Finally, the fused multi-scale features are sent to the classifier to discriminate disease-related OCT images. The proposed model has the potential to accurately detect retinal diseases and can be extended to other medical image classifications. The main contributions of this work are summarized as follows.<list list-type="simple">
<list-item>
<p>
<inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> The proposed MRVM model first extracts local features of OCT images through the convolution module and then extracts global long-range dependent features through the retinal Mamba, significantly improving the performance of image analysis and recognition tasks.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> We devised the MSM in the retinal Mamba to enhance feature extraction by focusing on multiple directions of the local receptive feature map. This enables the model to more effectively capture complex, orientation-specific patterns in retinal images, improving the performance of image classification and retinal disease detection.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> By fusing multi-scale global features, it can capture detailed lesion characteristics of retinal images at different scales, further improving the performance of OCT image classification and making the model more robust and accurate.</p>
</list-item>
</list>
</p>
<p>The subsequent sections of this work are structured as follows: In <xref ref-type="sec" rid="s2">Section 2</xref>, we review the literature on retinal disease detection. We detail the innovative MRVM model in <xref ref-type="sec" rid="s3">Section 3</xref> to introduce a novel approach for detecting retinal disease using OCT images. Subsequently, <xref ref-type="sec" rid="s4">Sections 4</xref> and <xref ref-type="sec" rid="s5">5</xref> present the experimental setup alongside comparative prediction outcomes utilizing alternative methods. Lastly, Section 6 delves into the credibility of this work and provides concise key findings.</p>
</sec>
<sec id="s2">
<title>2 Related works</title>
<p>The classification performance of retinal OCT images is also constantly improving with the advancement of artificial intelligence. These improved methods mainly focus on local feature learning and global feature learning.</p>
<p>The first approach focuses on local lesion characteristics. It deeply analyzes the key lesion signs in the image, such as changes in the vascular morphology, edema of the optic disc, and abnormal manifestations of the macular area, and accurately captures the specific characteristics of these lesions to achieve accurate classification of the retinal diseases. <xref ref-type="bibr" rid="B24">Rong et al. (2018)</xref> proposed a CNN-based automatic classification method to effectively classify OCT images through image denoising, mask extraction, and proxy image generation. This CNN-based method performs well in evaluation on different databases. <xref ref-type="bibr" rid="B1">Alqudah (2020)</xref> developed a more powerful CNN-based model to classify five types of retinal diseases (including AMD, CNV, DME, Drusen, and normal) with an overall accuracy of 95.3%. <xref ref-type="bibr" rid="B13">Karthik and Mahadevappa (2023)</xref> replaced the residual connection with the contrast of derivatives in the standard ResNet model. Experimental results on the two public OCT datasets show at least 1% improvement in the accuracy estimation. To reduce the model size, <xref ref-type="bibr" rid="B26">Sunija et al. (2021)</xref> designed only six convolutional blocks with downsampling and weight sharing mechanisms to classify four-label OCT images. Compared with the existing ResNet-50 model, it uses 6.9% of the learnable parameters but has a better classification performance. Considering the previous methods may ignore useful discriminative information at different scales, <xref ref-type="bibr" rid="B32">Wang and Wang (2019)</xref> designed a novel CNN-based method to automatically detect DME and AMD, which shows good classification performance in cross-dataset adaptability. In addition, <xref ref-type="bibr" rid="B4">Das et al. (2021)</xref> proposed a deep multi-scale fusion convolutional neural network (DMF-CNN) to extract and fuse different scale features for AMD/DME/normal classification. The multi-label classification results show excellent performance and good versatility on the UCSD and NEH datasets.</p>
<p>The second approach is modeling the global diseased areas, which focuses on the overall information of the image, comprehensively considers multiple visual elements and structural features in the image, and does not need to identify specific lesions separately but directly performs intelligent analysis on the entire image so as to determine the label of retinal diseases from a global perspective. <xref ref-type="bibr" rid="B39">Yu et al. (2021)</xref> applied the vision transformer (VIT) to the task of retinal disease classification. Their framework outperforms CNN models on two publicly available image datasets. <xref ref-type="bibr" rid="B25">Shen et al. (2023)</xref> incorporated the clinical prior knowledge to guide the transformer-based network for retinal disease prediction and achieved superior classification and good generality on the public nAMD dataset. <xref ref-type="bibr" rid="B7">Hammou et al. (2023)</xref> used the pre-trained state-of-the-art models as the prior knowledge and fine-tuned these models to classify OCT videos. This method has potential application in the real-time diagnosis of retinal diseases. To improve the accuracy and interpretability of these classification models, <xref ref-type="bibr" rid="B8">He et al. (2023)</xref> proposed a transformer-based model with Swin-poly strategy to classify retinal OCT images. They achieved state-of-the-art performance on the OCT2017 dataset, which is superior to that of both vision transformer (VIT) and convolutional neural network approaches. A similar work is presented in <xref ref-type="bibr" rid="B22">Playout et al. (2022)</xref>. <xref ref-type="bibr" rid="B33">Wen et al. (2022)</xref> combined the transformer and CNN to train this hybrid model for ophthalmic disease classification. This model extracts both local and global contexts for lesion area extraction and understanding with considerable accuracy improvement. In addition, <xref ref-type="bibr" rid="B15">Laouarem et al. (2024)</xref> designed a hybrid model to classify seven retinal diseases by combining visual transformers and CNN. They extracted multi-scale local features from OCT images by a hierarchical CNN and achieved good results on three public datasets. <xref ref-type="bibr" rid="B9">Hemalakshmi et al. (2024)</xref> proposed a SqueezeNet-Vit model to extract local and global features for more accurate OCT classification.</p>
</sec>
<sec sec-type="methods" id="s3">
<title>3 Methods</title>
<p>The proposed MRVM model is illustrated in <xref ref-type="fig" rid="F2">Figure 2</xref>. The input is an image with the size <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and the output is the retinal disease label. There are three main blocks: the convolutional block, the retinal Mamba block, and the classifier block. The convolutional block is used to extract local structures buried in the image by using local receptive fields and parameter sharing. The local receptive field allows the convolution kernel to focus on only a small area, thereby capturing local features. The retinal Mamba focuses on the long-range dependencies and mines the overall lesion area association in OCT images. Through the resampling modules, the three retinal Mamba modules can generate multi-scale global&#x2013;local features for capturing the characteristics of the lesion area from all directions. By cleverly integrating global features and local features, the proposed model not only fully retains disease-related global information but also significantly enhances its ability to keenly capture local subtle differences. This fusion strategy effectively improves the accuracy and robustness of classification tasks. Furthermore, by using the category loss function to optimize and calculate these fused multi-scale features, the model can generate more refined and representative representations for each retinal disease category. These representations accurately reflect the core characteristics of retinal diseases and can be used for analysis and decision-making on other downstream tasks. The details of these blocks are described in the following sections.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Architecture of the proposed MRVM model, consisting of the Conv block, retinal Mamba module, sampling module, and classifier. The input is a two-dimensional image, and the output is a vector representing the retinal disease label.</p>
</caption>
<graphic xlink:href="fcell-12-1484880-g002.tif"/>
</fig>
<sec id="s3-1">
<title>3.1 Convolutional block</title>
<p>In the convolution module, we designed three residual layers, and the output sizes of these three residual layers are as follows: <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>, <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>, and <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>. Adjacent residual layers are connected by 1 &#xd7; 1 convolution kernels with a stride of 2. After the third residual layer, a 1 &#xd7; 1 convolution kernel is used to change the number of channels from <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. The input image size is <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and the output size is <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:math>
</inline-formula>. The calculation formula can be expressed as follows:<disp-formula id="e1">
<mml:math id="m12">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
<disp-formula id="e2">
<mml:math id="m13">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
<disp-formula id="e3">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
<disp-formula id="e4">
<mml:math id="m15">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>B</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>A</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>B</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>A</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>where <xref ref-type="disp-formula" rid="e1">Equations 1</xref>&#x2013;<xref ref-type="disp-formula" rid="e3">3</xref> are based on <xref ref-type="disp-formula" rid="e4">Equation 4</xref>. The residual layer in <xref ref-type="disp-formula" rid="e4">Equation 4</xref> contains two sub-convolution layers. The first sub-convolution layer contains a 3 &#xd7; 3 convolution (Conv) kernel with a stride of 2, a batch normalization layer (BN), a <inline-formula id="inf12">
<mml:math id="m16">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> activation layer, and an average pooling layer (AvgPool); the second sub-convolution layer contains a 3 &#xd7; 3 convolution kernel with a stride of 1, a batch normalization layer, a ReLU activation layer, and an average pooling layer.</p>
</sec>
<sec id="s3-2">
<title>3.2 Retinal Mamba</title>
<p>This module extracts global disease-related patterns by selectively modeling different parts of the OCT image. To capture multi-scale patterns, we designed two resampling modules to obtain multi-resolution feature maps and utilize the retinal Mamba (RM) to learn the global lesion area relations from multi-scale perspectives. The resampling module between retinal Mamba modules consists of a batch-normalized <inline-formula id="inf13">
<mml:math id="m17">
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> convolutional layer with a stride of 2, which halves the image resolution and doubles the channel dimension. The multi-scale feature maps can be computed by the following formulas:<disp-formula id="e5">
<mml:math id="m18">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>M</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
<disp-formula id="e6">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>M</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
<disp-formula id="e7">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>M</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>where <inline-formula id="inf14">
<mml:math id="m21">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf15">
<mml:math id="m22">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf16">
<mml:math id="m23">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the outputs of <xref ref-type="disp-formula" rid="e5">Equations 5</xref>&#x2013;<xref ref-type="disp-formula" rid="e7">7</xref>, representing feature maps at three different resolutions. The feature map sizes are <inline-formula id="inf17">
<mml:math id="m24">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>8</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>8</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf18">
<mml:math id="m25">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>16</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>16</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf19">
<mml:math id="m26">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>32</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>32</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>4</mml:mn>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, respectively. Next, we use average pooling to normalize the three multi-resolution maps and concatenate them to fuse the multi-scale features. The fused feature <inline-formula id="inf20">
<mml:math id="m27">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> can be expressed by the following:<disp-formula id="e8">
<mml:math id="m28">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>A</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mspace width="0.3333em"/>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:mi>A</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mspace width="0.3333em"/>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:mi>A</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>
</p>
<p>The fused feature <inline-formula id="inf21">
<mml:math id="m29">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in <xref ref-type="disp-formula" rid="e8">Equation 8</xref> has the size <inline-formula id="inf22">
<mml:math id="m30">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>7</mml:mn>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<sec id="s3-2-1">
<title>3.2.1 Enhanced Mamba</title>
<p>In the retinal Mamba, four paths are used to extract features along different directions of the retinal OCT image. Considering the rich pattern correlations in different directions of time series and the complexity of spatial location dependencies, the outputs of the enhanced Mamba modules are summed to fuse the different directional features. The structure of each enhanced Mamba is shown in <xref ref-type="fig" rid="F3">Figure 3</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Detailed structure of the enhanced Mamba. It utilizes two gates to capture sequence dependencies for global complementary information. The input and output have the same dimension.</p>
</caption>
<graphic xlink:href="fcell-12-1484880-g003.tif"/>
</fig>
<p>We designed the enhanced Mamba with two pathways. The first pathway leverages a linear mapping (LM), a 1-D convolutional module, and a selective state-space model (SSM) to learn long-range sequence dependencies. The selective SSM can memorize long-term historical information in the HiPPO matrix. The second pathway generates two gates: the sigmoid-weighted linear unit (SiLU) and the reversed SiLU (R-SiLU). The SiLU gate processes the longer-term historical context, and the R-SiLU gate filters the complementary historical information to more comprehensively preserve the valuable long-term information. This design enables the enhanced Mamba to handle long-term sequence modeling tasks in a more nuanced and effective way. The computation process is illustrated in <xref ref-type="statement" rid="Algorithm_1">Algorithm 1</xref>.</p>
<p>
<statement content-type="algorithm" id="Algorithm_1">
<label>Algorithm 1</label>
<p>Computation process of enhanced Mamba.<list list-type="simple">
<list-item>
<p>
<bold>Input:</bold> <inline-formula id="inf23">
<mml:math id="m31">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>: <inline-formula id="inf24">
<mml:math id="m32">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>
<bold>Output:</bold> <inline-formula id="inf25">
<mml:math id="m33">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>: <inline-formula id="inf26">
<mml:math id="m34">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;1: <inline-formula id="inf27">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>11</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>: <inline-formula id="inf28">
<mml:math id="m36">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf29">
<mml:math id="m37">
<mml:mrow>
<mml:mo>&#x2190;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:mi>L</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>11</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>(<inline-formula id="inf30">
<mml:math id="m38">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>)</p>
</list-item>
<list-item>
<p>&#x2003;2: <inline-formula id="inf31">
<mml:math id="m39">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>21</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>: <inline-formula id="inf32">
<mml:math id="m40">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf33">
<mml:math id="m41">
<mml:mrow>
<mml:mo>&#x2190;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:mi>L</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>21</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>(<inline-formula id="inf34">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>)</p>
</list-item>
<list-item>
<p>&#x2003;3: <inline-formula id="inf35">
<mml:math id="m43">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>12</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>: <inline-formula id="inf36">
<mml:math id="m44">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf37">
<mml:math id="m45">
<mml:mrow>
<mml:mo>&#x2190;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:mi>S</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>U</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mn>1</mml:mn>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>11</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;4: <inline-formula id="inf38">
<mml:math id="m46">
<mml:mrow>
<mml:mi mathvariant="bold">A</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>: <inline-formula id="inf39">
<mml:math id="m47">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf40">
<mml:math id="m48">
<mml:mrow>
<mml:mo>&#x2190;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:mi>P</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
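<list-item>
<p>&#x2003;5: <inline-formula id="inf40a">
<mml:math id="m48a">
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>: <inline-formula id="inf40b">
<mml:math id="m48b">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf40c">
<mml:math id="m48c">
<mml:mrow>
<mml:mo>&#x2190;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:mi>L</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>12</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>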
<p>&#x2003;6: <inline-formula id="inf41">
<mml:math id="m49">
<mml:mrow>
<mml:mi mathvariant="bold">C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>: <inline-formula id="inf42">
<mml:math id="m50">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf43">
<mml:math id="m51">
<mml:mrow>
<mml:mo>&#x2190;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:mi>L</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>12</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;7: <inline-formula id="inf44">
<mml:math id="m52">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>: <inline-formula id="inf45">
<mml:math id="m53">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf46">
<mml:math id="m54">
<mml:mrow>
<mml:mo>&#x2190;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>e</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>12</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;8: <inline-formula id="inf47">
<mml:math id="m55">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">A</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>: <inline-formula id="inf48">
<mml:math id="m56">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf49">
<mml:math id="m57">
<mml:mrow>
<mml:mo>&#x2190;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> discretize(<inline-formula id="inf50">
<mml:math id="m58">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf51">
<mml:math id="m59">
<mml:mrow>
<mml:mi mathvariant="bold">A</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf52">
<mml:math id="m60">
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>)</p>
</list-item>
<list-item>
<p>&#x2003;9: <inline-formula id="inf53">
<mml:math id="m61">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>: <inline-formula id="inf54">
<mml:math id="m62">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf55">
<mml:math id="m63">
<mml:mrow>
<mml:mo>&#x2190;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:mi>S</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">A</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">C</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>(<inline-formula id="inf56">
<mml:math id="m64">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>12</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>)</p>
</list-item>
<list-item>
<p>&#x2003;10: <inline-formula id="inf57">
<mml:math id="m65">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>: <inline-formula id="inf58">
<mml:math id="m66">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf59">
<mml:math id="m67">
<mml:mrow>
<mml:mo>&#x2190;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>U</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>21</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>12</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>21</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;11: <inline-formula id="inf60">
<mml:math id="m68">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>: <inline-formula id="inf61">
<mml:math id="m69">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf62">
<mml:math id="m70">
<mml:mrow>
<mml:mo>&#x2190;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:mi>L</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;12: Return <inline-formula id="inf63">
<mml:math id="m71">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
</list>
</p>
</statement>
</p>
</sec>
<sec id="s3-2-2">
<title>3.2.2 Selective state-space model</title>
<p>The selective SSM can help the retinal Mamba to capture global dependencies in OCT images, capturing rich semantic disease-related information. The structure of the selective SSM is shown in <xref ref-type="fig" rid="F4">Figure 4</xref>; it is a discretized version of the SSM, where the input is <inline-formula id="inf64">
<mml:math id="m72">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and the output is <inline-formula id="inf65">
<mml:math id="m73">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Both of them are the features at the <inline-formula id="inf66">
<mml:math id="m74">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th time point. For the continuous condition, we map the one-dimensional sequence <inline-formula id="inf67">
<mml:math id="m75">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> to the output sequence <inline-formula id="inf68">
<mml:math id="m76">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> through latent historical representation <inline-formula id="inf69">
<mml:math id="m77">
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. The continuous SSM is expressed as follows:<disp-formula id="e9">
<mml:math id="m78">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="bold">A</mml:mi>
<mml:mi>h</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="bold">B</mml:mi>
<mml:mi>x</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
<disp-formula id="e10">
<mml:math id="m79">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="bold">C</mml:mi>
<mml:mi>h</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>Here, <inline-formula id="inf70">
<mml:math id="m80">
<mml:mrow>
<mml:mi mathvariant="bold">A</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represents the state matrix, which memorizes the history information of latent representations. <inline-formula id="inf71">
<mml:math id="m81">
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf72">
<mml:math id="m82">
<mml:mrow>
<mml:mi mathvariant="bold">C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> project the input sequence into the latent representation and the latent representation into the output sequence, respectively. The continuous formulation in <xref ref-type="disp-formula" rid="e9">Equations 9</xref>, <xref ref-type="disp-formula" rid="e10">10</xref> is not directly suitable for deep learning. To solve this problem, we discretize it by introducing the time-scale factor <inline-formula id="inf73">
<mml:math id="m83">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. The projection matrix <inline-formula id="inf74">
<mml:math id="m84">
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and the state matrix <inline-formula id="inf75">
<mml:math id="m85">
<mml:mrow>
<mml:mi mathvariant="bold">A</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> can be transformed into <inline-formula id="inf76">
<mml:math id="m86">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf77">
<mml:math id="m87">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">A</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>, respectively. The zero-order hold strategy is used to complete this task:<disp-formula id="e11">
<mml:math id="m88">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">A</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>e</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:mi mathvariant="bold">A</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>
<disp-formula id="e12">
<mml:math id="m89">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:mi mathvariant="bold">A</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">A</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:mi mathvariant="bold">B</mml:mi>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>
</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Structure of the selective state-space model.</p>
</caption>
<graphic xlink:href="fcell-12-1484880-g004.tif"/>
</fig>
<p>After discretizing with the step size <inline-formula id="inf78">
<mml:math id="m90">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in <xref ref-type="disp-formula" rid="e11">Equations 11</xref>, <xref ref-type="disp-formula" rid="e12">12</xref>, the SSM is defined with <xref ref-type="disp-formula" rid="e13">Equations 13</xref>, <xref ref-type="disp-formula" rid="e14">14</xref>:<disp-formula id="e13">
<mml:math id="m91">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">A</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>
<disp-formula id="e14">
<mml:math id="m92">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="bold">C</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>
</p>
<p>Finally, we employ a convolution operation for convenient optimization of the proposed model. The SSM computation is expressed as follows:<disp-formula id="e15">
<mml:math id="m93">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">K</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">C</mml:mi>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">C</mml:mi>
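<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">A</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>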
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">C</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">A</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(15)</label>
</disp-formula>
<disp-formula id="e16">
<mml:math id="m94">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="bold">x</mml:mi>
<mml:mo>&#x2217;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">K</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(16)</label>
</disp-formula>where, in <xref ref-type="disp-formula" rid="e15">Equation 15</xref>, <inline-formula id="inf79">
<mml:math id="m95">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">K</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> indicates a dynamic convolutional kernel, and <inline-formula id="inf80">
<mml:math id="m96">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes the sequence length. In <xref ref-type="disp-formula" rid="e16">Equation 16</xref>, <inline-formula id="inf81">
<mml:math id="m97">
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf82">
<mml:math id="m98">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are matrices that share the same size <inline-formula id="inf83">
<mml:math id="m99">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
</sec>
<sec id="s3-3">
<title>3.3 Classifier</title>
<p>The classifier is a five-layer perceptron network, including three hidden layers. The input layer receives the fused feature <inline-formula id="inf84">
<mml:math id="m100">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>7</mml:mn>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The three hidden layers have <inline-formula id="inf85">
<mml:math id="m101">
<mml:mrow>
<mml:mn>5</mml:mn>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf86">
<mml:math id="m102">
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf87">
<mml:math id="m103">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> neurons, respectively. The output layer contains <inline-formula id="inf88">
<mml:math id="m104">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> neurons corresponding to retinal disease labels, and a softmax activation function is used to convert the output into a probability distribution, representing the predicted probability of each category. This network is trained using a back-propagation algorithm, adjusting weights and biases to reduce the error between the predicted category and the actual category. During the training process, the model learns to map the features of the input data to the corresponding category labels, thereby achieving classification. We utilized the cross-entropy objective to optimize the proposed MVRM model.<disp-formula id="e17">
<mml:math id="m105">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">Y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(17)</label>
</disp-formula>
<disp-formula id="e18">
<mml:math id="m106">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">Y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>g</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">Y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(18)</label>
</disp-formula>where, in <xref ref-type="disp-formula" rid="e17">Equation 17</xref>, <inline-formula id="inf89">
<mml:math id="m107">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">Y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is an <inline-formula id="inf90">
<mml:math id="m108">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-length vector, and the index of the largest value of <inline-formula id="inf91">
<mml:math id="m109">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">Y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is the predicted label; <inline-formula id="inf92">
<mml:math id="m110">
<mml:mrow>
<mml:mi mathvariant="bold">Y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a one-hot vector representing the actual label. In <xref ref-type="disp-formula" rid="e18">Equation 18</xref>, L is the loss function, and <inline-formula id="inf93">
<mml:math id="m111">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the number of training images.</p>
</sec>
</sec>
<sec id="s4">
<title>4 Experimental configuration</title>
<sec id="s4-1">
<title>4.1 Dataset description</title>
<p>Due to the confidentiality and sensitivity of medical data, as well as the high expertise and time costs required for medical image annotation, the use of public datasets has become a common and effective practice in the field of medical image analysis research. Public datasets, such as OCT (optical coherence tomography) image datasets, have been carefully collected and annotated by professional teams to ensure the quality and accuracy of the data. To evaluate our model&#x2019;s effectiveness, we selected the two public OCT datasets: the OCT-2017 and the OCT-C8. The OCT-2017 dataset<xref ref-type="fn" rid="fn1">
<sup>1</sup>
</xref> covers four types of retinal disease images: age-related wet maculopathy (CNV), diabetic macular edema (DME), age-related dry maculopathy (DRUSEN), and normal retinal images (NORMAL). The dataset comes from 4,686 patients with different eye diseases and contains a total of 84,484 images. There are 37,205 CNV images, 8,616 DRUSEN images, 11,348 DME images, and 26,315 NORMAL images in the training set. The testing set contains 1,000 images, with 250 each of various lesions and normal images, which are used to evaluate model performance. The OCT-C8 dataset<xref ref-type="fn" rid="fn2">
<sup>2</sup>
</xref> contains a total of 24,000 images with eight categories. Each category has 2,300, 350, and 350 images for training, validation, and testing, respectively. The largest resolution of the OCT image is <inline-formula id="inf94">
<mml:math id="m112">
<mml:mrow>
<mml:mn>1536</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>496</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, and the smallest resolution of the OCT image is <inline-formula id="inf95">
<mml:math id="m113">
<mml:mrow>
<mml:mn>384</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>496</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>In order to develop a unified model framework, we resize every OCT image into the same size: <inline-formula id="inf96">
<mml:math id="m114">
<mml:mrow>
<mml:mn>512</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>512</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> pixels. The number of images per category in the original dataset is highly imbalanced. During the training process, the accuracy of the category with the largest number of images will greatly affect the overall accuracy of the model. To solve this problem, we randomly select an equal number of images from each category and set the ratio of training, validation, and testing images to 8:1:1. For the OCT-2017 dataset, we select 8,800 images for each category, including 7,040 training images, 880 validation images, and 880 testing images. For the OCT-C8 dataset, we partition the dataset in the same 8:1:1 ratio. The numbers of training, validation, and testing images for each category are 2,400, 300, and 300, respectively. The datasets used for this study are summarized in <xref ref-type="table" rid="T1">Table 1</xref>. To accelerate the training speed and enhance the model&#x2019;s ability to converge toward optimal weights, we normalize the image&#x2019;s pixel values across its channels to a uniform range [0, 1]. This process ensures that the feature values of the image data are within a comparable range, facilitating a more stable and efficient training process for neural networks. We also apply image augmentation techniques (i.e., random shuffling, cropping, and rotation) to enhance the generalization performance of the model.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Experimental data details used in this study.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Dataset</th>
<th align="left"/>
<th align="center">AMD</th>
<th align="center">CNV</th>
<th align="center">CSR</th>
<th align="center">DME</th>
<th align="center">MH</th>
<th align="center">Drusen</th>
<th align="center">DR</th>
<th align="center">Normal</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">OCT-2017</td>
<td align="center">Train</td>
<td align="center">&#x2014;</td>
<td align="center">7,040</td>
<td align="center">&#x2014;</td>
<td align="center">7,040</td>
<td align="center">&#x2014;</td>
<td align="center">7,040</td>
<td align="center">&#x2014;</td>
<td align="center">7,040</td>
</tr>
<tr>
<td align="left">
</td>
<td align="center">Val</td>
<td align="center">&#x2014;</td>
<td align="center">880</td>
<td align="center">&#x2014;</td>
<td align="center">880</td>
<td align="center">&#x2014;</td>
<td align="center">880</td>
<td align="center">&#x2014;</td>
<td align="center">880</td>
</tr>
<tr>
<td align="left"/>
<td align="center">Test</td>
<td align="center">&#x2014;</td>
<td align="center">880</td>
<td align="center">&#x2014;</td>
<td align="center">880</td>
<td align="center">&#x2014;</td>
<td align="center">880</td>
<td align="center">&#x2014;</td>
<td align="center">880</td>
</tr>
<tr>
<td align="center">OCT-C8</td>
<td align="center">Train</td>
<td align="center">2,400</td>
<td align="center">2,400</td>
<td align="center">2,400</td>
<td align="center">2,400</td>
<td align="center">2,400</td>
<td align="center">2,400</td>
<td align="center">2,400</td>
<td align="center">2,400</td>
</tr>
<tr>
<td align="left">
</td>
<td align="center">Val</td>
<td align="center">300</td>
<td align="center">300</td>
<td align="center">300</td>
<td align="center">300</td>
<td align="center">300</td>
<td align="center">300</td>
<td align="center">300</td>
<td align="center">300</td>
</tr>
<tr>
<td align="left"/>
<td align="center">Test</td>
<td align="center">300</td>
<td align="center">300</td>
<td align="center">300</td>
<td align="center">300</td>
<td align="center">300</td>
<td align="center">300</td>
<td align="center">300</td>
<td align="center">300</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4-2">
<title>4.2 Model training details</title>
<p>In the Conv block, <inline-formula id="inf97">
<mml:math id="m115">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>512</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf98">
<mml:math id="m116">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>4</mml:mn>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>8</mml:mn>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, there are <inline-formula id="inf99">
<mml:math id="m117">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> retinal Mamba modules. Our model was trained using the TensorFlow framework on an Nvidia RTX 4090 GPU. The Adam optimizer was selected for its adaptive learning rate adjustment capability, and the initial learning rate was set to 0.001 to promote rapid convergence while avoiding overfitting. The batch size was set to 64 to balance memory usage and training efficiency. The number of epochs was set to 150. After each training epoch, the model performance was evaluated on the validation set, and the learning rate or model structure was adjusted accordingly to optimize the results. During the training process, TensorBoard was used to monitor the changes in loss and accuracy to ensure that the training process was stable and effective. The trained model was evaluated on the testing set for comparison and analysis.</p>
</sec>
<sec id="s4-3">
<title>4.3 Evaluation metrics</title>
<p>In the multi-category classification task, we use the mean accuracy (mACC), mean sensitivity (mSEN), mean specificity (mSPE), mean precision (mPRE), mean F1-score (mF1), and overall accuracy (OACC) as evaluation metrics. First, we compute the ACC, SEN, SPE, PRE, and F1 for each category and then average them over all the categories. During the evaluation, each category is treated as a binary classification problem, where the positive label is the category itself and the negative label covers the remaining categories. Therefore, TP represents the count of samples that are correctly identified as belonging to the positive category by the network&#x2019;s predictions, matching their true positive labels. FP denotes the number of samples that are incorrectly labeled as positive by the network&#x2019;s predictions, despite their true labels being negative. TN stands for the count of samples that are accurately classified as negative by the network&#x2019;s predictions, aligning with their genuine negative labels. FN signifies the number of samples that are erroneously classified as negative by the network, whereas their true labels are positive.<disp-formula id="e19">
<mml:math id="m118">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>C</mml:mi>
<mml:mi>C</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mi>A</mml:mi>
<mml:mi>C</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(19)</label>
</disp-formula>
<disp-formula id="e20">
<mml:math id="m119">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(20)</label>
</disp-formula>
<disp-formula id="e21">
<mml:math id="m120">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mi>S</mml:mi>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(21)</label>
</disp-formula>
<disp-formula id="e22">
<mml:math id="m121">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>R</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mi>P</mml:mi>
<mml:mi>R</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(22)</label>
</disp-formula>
<disp-formula id="e23">
<mml:math id="m122">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mi>R</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>R</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(23)</label>
</disp-formula>where <inline-formula id="inf100">
<mml:math id="m123">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the testing image number and <inline-formula id="inf101">
<mml:math id="m124">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>C</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> means the accuracy for the <inline-formula id="inf102">
<mml:math id="m125">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th category. Another metric, the overall accuracy (OACC), evaluates the overall performance across all categories. In the confusion matrix, we define TL as the sum of the diagonal elements of the matrix, and the OACC is expressed by<disp-formula id="e24">
<mml:math id="m126">
<mml:mrow>
<mml:mi>O</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>C</mml:mi>
<mml:mi>C</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(24)</label>
</disp-formula>
</p>
<p>
<xref ref-type="disp-formula" rid="e19">Equations 19</xref>&#x2013;<xref ref-type="disp-formula" rid="e24">24</xref> are used to evaluate the diagnostic performance of different methods on the OCT2017 and OCT-C8 datasets.</p>
</sec>
</sec>
<sec sec-type="results" id="s5">
<title>5 Results</title>
<sec id="s5-1">
<title>5.1 Prediction results</title>
<p>
<xref ref-type="fig" rid="F5">Figure 5</xref> shows the details during the training. The left graph shows the curve of loss changing with epochs, and the right subfigure shows the curve of overall accuracy changing with the epochs. Both the training and validation losses show a stable trend. The small gap between them indicates that our model fits the data well. The confusion matrix of the classification results is shown in <xref ref-type="fig" rid="F6">Figure 6</xref>. Our model shows accurate classification performance on the OCT2017 dataset, with almost no errors in each category. In the OCT-C8 dataset, our model also performs well on most categories, except the CNV and DME categories. <xref ref-type="table" rid="T2">Table 2</xref> shows the classification performance of the model on two different datasets (OCT2017 and OCT-C8). For each category, the ACC, SEN, PRE, F1, and SPE are calculated according to the binary classification algorithm. For the OCT-2017 dataset, the average accuracy (mACC) and overall accuracy (oACC) of the model are 99.49% and 98.98%, respectively. For the OCT-C8 dataset, the overall accuracy of the model is 96.21%. Although the model achieved 100% on all indicators in the AMD category, the sensitivity in the CNV and DME classifications was relatively low (92.67% and 91.00%, respectively), resulting in a slight decrease in the <inline-formula id="inf103">
<mml:math id="m127">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> values of these categories. The results of these two datasets show that this model can maintain a high classification performance when dealing with tasks of multi-category classification.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Visualization of objective loss and overall accuracy during the model&#x2019;s training.</p>
</caption>
<graphic xlink:href="fcell-12-1484880-g005.tif"/>
</fig>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Confusion matrix of the predicted results on the OCT-2017 and OCT-C8 datasets using our model.</p>
</caption>
<graphic xlink:href="fcell-12-1484880-g006.tif"/>
</fig>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Detection results of our model on the two datasets. (%).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Dataset</th>
<th align="left">Label</th>
<th align="left">ACC</th>
<th align="left">SEN</th>
<th align="left">PRE</th>
<th align="left">F1</th>
<th align="left">SPE</th>
<th align="left">oACC</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">OCT2017</td>
<td align="left">CNV</td>
<td align="left">99.38</td>
<td align="left">98.75</td>
<td align="left">98.75</td>
<td align="left">98.75</td>
<td align="left">99.58</td>
<td align="left">98.98</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">DME</td>
<td align="left">99.63</td>
<td align="left">99.09</td>
<td align="left">99.43</td>
<td align="left">99.26</td>
<td align="left">99.81</td>
<td align="left">
</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">DRUSEN</td>
<td align="left">99.26</td>
<td align="left">98.64</td>
<td align="left">98.41</td>
<td align="left">98.52</td>
<td align="left">99.47</td>
<td align="left">
</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">Normal</td>
<td align="left">99.69</td>
<td align="left">99.43</td>
<td align="left">99.32</td>
<td align="left">99.38</td>
<td align="left">99.77</td>
<td align="left">
</td>
</tr>
<tr>
<td align="left"/>
<td align="left">Average</td>
<td align="left">99.49</td>
<td align="left">98.98</td>
<td align="left">98.98</td>
<td align="left">98.98</td>
<td align="left">99.66</td>
<td align="left"/>
</tr>
<tr>
<td align="left">OCT-C8</td>
<td align="left">AMD</td>
<td align="left">100.00</td>
<td align="left">100.00</td>
<td align="left">100.00</td>
<td align="left">100.00</td>
<td align="left">100.00</td>
<td align="left">96.21</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">CNV</td>
<td align="left">98.58</td>
<td align="left">92.67</td>
<td align="left">95.86</td>
<td align="left">94.24</td>
<td align="left">99.43</td>
<td align="left">
</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">CSR</td>
<td align="left">99.38</td>
<td align="left">99.00</td>
<td align="left">96.12</td>
<td align="left">97.54</td>
<td align="left">99.43</td>
<td align="left">
</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">DME</td>
<td align="left">98.29</td>
<td align="left">91.00</td>
<td align="left">95.12</td>
<td align="left">93.02</td>
<td align="left">99.33</td>
<td align="left">
</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">DR</td>
<td align="left">98.67</td>
<td align="left">95.00</td>
<td align="left">94.37</td>
<td align="left">94.68</td>
<td align="left">99.19</td>
<td align="left">
</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">DRUSEN</td>
<td align="left">99.00</td>
<td align="left">98.00</td>
<td align="left">94.23</td>
<td align="left">96.08</td>
<td align="left">99.14</td>
<td align="left">
</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">MH</td>
<td align="left">99.29</td>
<td align="left">97.33</td>
<td align="left">97.01</td>
<td align="left">97.17</td>
<td align="left">99.57</td>
<td align="left">
</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">Normal</td>
<td align="left">99.21</td>
<td align="left">96.67</td>
<td align="left">96.99</td>
<td align="left">96.83</td>
<td align="left">99.57</td>
<td align="left">
</td>
</tr>
<tr>
<td align="left"/>
<td align="left">Average</td>
<td align="left">99.05</td>
<td align="left">96.21</td>
<td align="left">96.21</td>
<td align="left">96.19</td>
<td align="left">99.46</td>
<td align="left"/>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s5-2">
<title>5.2 Comparative analysis</title>
<p>To demonstrate our model&#x2019;s superiority, we select seven competing methods and compare their classification performance with that of our model. These methods include the baseline ResNet <xref ref-type="bibr" rid="B27">Talo et al. (2019)</xref>, the CNN-based OctNet method <xref ref-type="bibr" rid="B26">Sunija et al. (2021)</xref>, the ViT model <xref ref-type="bibr" rid="B5">Dosovitskiy et al. (2020)</xref>, the Swin transformer model <xref ref-type="bibr" rid="B18">Liu et al. (2021)</xref>, the CVM-Cervix model <xref ref-type="bibr" rid="B17">Liu et al. (2022)</xref>, the CTransCNN model <xref ref-type="bibr" rid="B34">Wu et al. (2023)</xref>, and the MedViT model <xref ref-type="bibr" rid="B19">Manzari et al. (2023)</xref>. The last three hybrid models combine the CNN and transformer to conduct image classification.</p>
<p>
<xref ref-type="table" rid="T3">Table 3</xref> presents a comparison of the performance of different methods in multi-category classification tasks on the OCT2017 and OCT-C8 datasets. The evaluation indicators in the table include average accuracy (mACC), average sensitivity (mSEN), average precision (mPRE), average F1 value (mF1), average specificity (mSPE), and overall accuracy (oACC). On the OCT2017 dataset, our model performs best on all metrics, reaching an mACC value of 99.49% and an oACC value of 98.98%. On the OCT-C8 dataset, our model also demonstrates strong generalization capabilities, outperforming other methods with an mACC of 99.05% and an oACC of 96.21%. Furthermore, we compare our model with the three hybrid models in terms of the ACC and F1 for each category. <xref ref-type="fig" rid="F7">Figures 7</xref>, <xref ref-type="fig" rid="F8">8</xref> show the classification performance of four methods (CVM-Cervix, CTransCNN, MedViT, and Ours) on the OCT-2017 dataset and OCT-C8 dataset, respectively. For the CNV category, our method slightly outperforms other methods in both ACC and F1 values, but the advantage is not obvious. For the DME category, our method significantly outperforms other methods, especially on the F1 value. For the Drusen category, both ACC and F1 values of our method are better than CVM-Cervix and CTransCNN, but slightly lower compared to MedViT. For the normal category, our method has significant advantages in both ACC and F1 values. We also compare the ROC of these four methods, and the results are shown in <xref ref-type="fig" rid="F9">Figures 9</xref>, <xref ref-type="fig" rid="F10">10</xref>. Our model has the highest AUC value of 0.981 and 0.962 for OCT-2017 and OCT-C8, respectively. Our method achieves the best overall accuracy and the highest AUC among these competing methods.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Comparison of the multi-category classification using different methods. (%).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Dataset</th>
<th align="left">Method</th>
<th align="left">mACC</th>
<th align="left">mSEN</th>
<th align="left">mPRE</th>
<th align="left">mF1</th>
<th align="left">mSPE</th>
<th align="left">oACC</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">OCT2017</td>
<td align="left">ResNet50 <xref ref-type="bibr" rid="B27">Talo et al. (2019)</xref>
</td>
<td align="left">97.59</td>
<td align="left">95.17</td>
<td align="left">95.18</td>
<td align="left">95.17</td>
<td align="left">98.39</td>
<td align="left">95.17</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">OctNet <xref ref-type="bibr" rid="B26">Sunija et al. (2021)</xref>
</td>
<td align="left">98.37</td>
<td align="left">96.73</td>
<td align="left">96.74</td>
<td align="left">96.73</td>
<td align="left">98.91</td>
<td align="left">96.73</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">ViT <xref ref-type="bibr" rid="B5">Dosovitskiy et al. (2020)</xref>
</td>
<td align="left">98.93</td>
<td align="left">97.87</td>
<td align="left">97.87</td>
<td align="left">97.87</td>
<td align="left">99.29</td>
<td align="left">97.87</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">Swin Transformer <xref ref-type="bibr" rid="B18">Liu et al. (2021)</xref>
</td>
<td align="left">99.16</td>
<td align="left">98.32</td>
<td align="left">98.32</td>
<td align="left">98.32</td>
<td align="left">99.44</td>
<td align="left">98.32</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">CVM-Cervix <xref ref-type="bibr" rid="B17">Liu et al. (2022)</xref>
</td>
<td align="left">99.36</td>
<td align="left">98.72</td>
<td align="left">98.72</td>
<td align="left">98.72</td>
<td align="left">99.57</td>
<td align="left">98.72</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">CTransCNN <xref ref-type="bibr" rid="B34">Wu et al. (2023)</xref>
</td>
<td align="left">99.32</td>
<td align="left">98.64</td>
<td align="left">98.64</td>
<td align="left">98.64</td>
<td align="left">99.55</td>
<td align="left">98.64</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">MedViT <xref ref-type="bibr" rid="B19">Manzari et al. (2023)</xref>
</td>
<td align="left">99.39</td>
<td align="left">98.78</td>
<td align="left">98.78</td>
<td align="left">98.78</td>
<td align="left">99.59</td>
<td align="left">98.78</td>
</tr>
<tr>
<td align="left"/>
<td align="left">Ours</td>
<td align="left">99.49</td>
<td align="left">98.98</td>
<td align="left">98.98</td>
<td align="left">98.98</td>
<td align="left">99.66</td>
<td align="left">98.98</td>
</tr>
<tr>
<td align="left">OCT-C8</td>
<td align="left">ResNet50 <xref ref-type="bibr" rid="B27">Talo et al. (2019)</xref>
</td>
<td align="left">98.08</td>
<td align="left">92.33</td>
<td align="left">92.36</td>
<td align="left">92.34</td>
<td align="left">98.90</td>
<td align="left">92.33</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">OctNet <xref ref-type="bibr" rid="B26">Sunija et al. (2021)</xref>
</td>
<td align="left">98.32</td>
<td align="left">93.29</td>
<td align="left">93.31</td>
<td align="left">93.30</td>
<td align="left">99.04</td>
<td align="left">93.29</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">ViT <xref ref-type="bibr" rid="B5">Dosovitskiy et al. (2020)</xref>
</td>
<td align="left">98.47</td>
<td align="left">93.88</td>
<td align="left">93.88</td>
<td align="left">93.87</td>
<td align="left">99.13</td>
<td align="left">93.88</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">Swin Transformer <xref ref-type="bibr" rid="B18">Liu et al. (2021)</xref>
</td>
<td align="left">98.74</td>
<td align="left">94.96</td>
<td align="left">94.96</td>
<td align="left">94.94</td>
<td align="left">99.28</td>
<td align="left">94.96</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">CVM-Cervix <xref ref-type="bibr" rid="B17">Liu et al. (2022)</xref>
</td>
<td align="left">98.91</td>
<td align="left">95.63</td>
<td align="left">95.63</td>
<td align="left">95.61</td>
<td align="left">99.37</td>
<td align="left">95.63</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">CTransCNN <xref ref-type="bibr" rid="B34">Wu et al. (2023)</xref>
</td>
<td align="left">98.97</td>
<td align="left">95.88</td>
<td align="left">95.88</td>
<td align="left">95.86</td>
<td align="left">99.41</td>
<td align="left">95.88</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">MedViT <xref ref-type="bibr" rid="B19">Manzari et al. (2023)</xref>
</td>
<td align="left">98.99</td>
<td align="left">95.96</td>
<td align="left">95.96</td>
<td align="left">95.95</td>
<td align="left">99.42</td>
<td align="left">95.96</td>
</tr>
<tr>
<td align="left"/>
<td align="left">Ours</td>
<td align="left">99.05</td>
<td align="left">96.21</td>
<td align="left">96.21</td>
<td align="left">96.19</td>
<td align="left">99.46</td>
<td align="left">96.21</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Comparison of different methods on the accuracy and F1 for each label (OCT-2017 dataset).</p>
</caption>
<graphic xlink:href="fcell-12-1484880-g007.tif"/>
</fig>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Comparison of different methods on the accuracy and F1 for each label (OCT-C8 dataset).</p>
</caption>
<graphic xlink:href="fcell-12-1484880-g008.tif"/>
</fig>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>ROC comparison of the four different methods for both datasets.</p>
</caption>
<graphic xlink:href="fcell-12-1484880-g009.tif"/>
</fig>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Stick diagram of the oACC and AUC on both datasets.</p>
</caption>
<graphic xlink:href="fcell-12-1484880-g010.tif"/>
</fig>
</sec>
<sec id="s5-3">
<title>5.3 Ablation studies</title>
<p>To investigate the influence of different modules on the evaluation performance, we focus on the convolutional module (Conv), the multi-resolution (MR) strategy, the multi-path (MP) in the retinal Mamba, and the enhanced Mamba (EM). The MR removal means that we only keep the retinal Mamba in resolution-1. The MP removal means we remove the path-2, path-3, and path-4 in the retinal Mamba network. Removing EM means we delete the R-SiLU module in the enhanced Mamba.</p>
<p>
<xref ref-type="table" rid="T4">Table 4</xref> shows the impact of different modules (Conv, MR, MP, and EM) on the classification performance of our model. Specifically, the combination of all modules (Conv, MR, MP, and EM) performs best on both OCT2017 and OCT-C8 datasets. After removing the EM module, the classification performance shows an approximately 0.1 percent decrease. The Conv and MR modules both contribute to the improvement of our model&#x2019;s classification performance. We further remove the Mamba-related modules (including MR, MP, and EM), and the oACC decreases by approximately 2.1 and 1.2 percentage points on the OCT-2017 dataset and OCT-C8 dataset, respectively. This shows that each module plays an important role in the model, especially the Conv module and MR module, which are particularly critical to improving the overall performance. The lack of any module will lead to a decrease in the classification performance.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Impact of different MRVM modules on the detection performance. (%).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Model</th>
<th rowspan="2" align="center">Conv</th>
<th rowspan="2" align="center">MR</th>
<th rowspan="2" align="center">MP</th>
<th rowspan="2" align="center">EM</th>
<th colspan="2" align="left">OCT2017</th>
<th colspan="2" align="left">OCT-C8</th>
</tr>
<tr>
<th align="left">mACC</th>
<th align="left">oACC</th>
<th align="left">mACC</th>
<th align="left">oACC</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Our model</td>
<td align="center">&#x2717;</td>
<td align="center">
<inline-formula id="inf104">
<mml:math id="m128">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf105">
<mml:math id="m129">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf106">
<mml:math id="m130">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">99.05</td>
<td align="left">98.10</td>
<td align="left">98.83</td>
<td align="left">95.33</td>
</tr>
<tr>
<td align="left">
</td>
<td align="center">
<inline-formula id="inf107">
<mml:math id="m131">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">&#x2717;</td>
<td align="center">
<inline-formula id="inf108">
<mml:math id="m132">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf109">
<mml:math id="m133">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">99.16</td>
<td align="left">98.32</td>
<td align="left">98.93</td>
<td align="left">95.71</td>
</tr>
<tr>
<td align="left">
</td>
<td align="center">
<inline-formula id="inf110">
<mml:math id="m134">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf111">
<mml:math id="m135">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">&#x2717;</td>
<td align="center">
<inline-formula id="inf112">
<mml:math id="m136">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">99.23</td>
<td align="left">98.47</td>
<td align="left">98.89</td>
<td align="left">95.54</td>
</tr>
<tr>
<td align="left">
</td>
<td align="center">
<inline-formula id="inf113">
<mml:math id="m137">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf114">
<mml:math id="m138">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf115">
<mml:math id="m139">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">&#x2717;</td>
<td align="left">99.32</td>
<td align="left">98.64</td>
<td align="left">98.94</td>
<td align="left">95.75</td>
</tr>
<tr>
<td align="left">
</td>
<td align="center">
<inline-formula id="inf116">
<mml:math id="m140">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">&#x2717;</td>
<td align="center">&#x2717;</td>
<td align="center">
<inline-formula id="inf117">
<mml:math id="m141">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">98.69</td>
<td align="left">97.39</td>
<td align="left">98.83</td>
<td align="left">95.33</td>
</tr>
<tr>
<td align="left">
</td>
<td align="center">
<inline-formula id="inf118">
<mml:math id="m142">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">&#x2717;</td>
<td align="center">&#x2717;</td>
<td align="center">&#x2717;</td>
<td align="left">98.44</td>
<td align="left">96.88</td>
<td align="left">98.75</td>
<td align="left">95.00</td>
</tr>
<tr>
<td align="left"/>
<td align="center">
<inline-formula id="inf119">
<mml:math id="m143">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf120">
<mml:math id="m144">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf121">
<mml:math id="m145">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf122">
<mml:math id="m146">
<mml:mrow>
<mml:mi>&#x2713;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">99.49</td>
<td align="left">98.98</td>
<td align="left">99.05</td>
<td align="left">96.21</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s5-4">
<title>5.4 Discussion</title>
<p>Our model demonstrates good classification performance and generalization on two public datasets. Comparative analysis using different competing methods also shows our model&#x2019;s superiority. The good performance of our model can be attributed to its great ability in feature extraction at multi-scales. Both global dependencies and local receptive fields can explore the underlying complex disease-related cues. The gradient-weighted class activation mapping (Grad-CAM) visualization can analyze and understand activation regions of different classes. We use it to show how our model captures the key cues in the retinal OCT image classification. As shown in <xref ref-type="fig" rid="F11">Figure 11</xref>, the use of the Grad-CAM generates a heatmap with the size of the raw OCT image and shows the key areas in the OCT image that contribute most to the predicted label. To investigate our model&#x2019;s robustness, we added a certain degree of noise to the original OCT images and followed the same training procedures. <xref ref-type="table" rid="T5">Table 5</xref> shows the classification performance of our model under multiple noise levels. For the OCT2017 dataset, as the noise level increases from 0% to 10%, the mACC and oACC decrease from 99.49% and 98.98% to 99.19% and 98.38%, respectively. For the OCT-C8 dataset, the mACC and oACC decrease from 99.05% and 96.21% to 98.93% and 95.71%, respectively. Similarly, despite the slight performance degradation caused by noise, the model still maintains high accuracy and robustness under the influence of noise. Overall, the performance of the model under different noise conditions shows strong stability, especially under low-to-medium noise levels (1% and 5%); the classification performance only fluctuates slightly, indicating that the model has good resistance to noise.</p>
<fig id="F11" position="float">
<label>FIGURE 11</label>
<caption>
<p>Activation heatmaps of different retinal OCT images using our model. The upper row is the raw OCT images, and the bottom row is the activation heatmaps using the Grad-CAM method.</p>
</caption>
<graphic xlink:href="fcell-12-1484880-g011.tif"/>
</fig>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Impact of different levels of noise on the classification performance. (%).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Dataset</th>
<th align="left">Noise level (%)</th>
<th align="left">mACC</th>
<th align="left">mSEN</th>
<th align="left">mPRE</th>
<th align="left">mF1</th>
<th align="left">mSPE</th>
<th align="left">oACC</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">OCT2017</td>
<td align="left">0</td>
<td align="left">99.49</td>
<td align="left">98.98</td>
<td align="left">98.98</td>
<td align="left">98.98</td>
<td align="left">99.66</td>
<td align="left">98.98</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">1</td>
<td align="left">99.42</td>
<td align="left">98.84</td>
<td align="left">98.84</td>
<td align="left">98.84</td>
<td align="left">99.61</td>
<td align="left">98.84</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">5</td>
<td align="left">99.35</td>
<td align="left">98.69</td>
<td align="left">98.69</td>
<td align="left">98.69</td>
<td align="left">99.56</td>
<td align="left">98.69</td>
</tr>
<tr>
<td align="left"/>
<td align="left">10</td>
<td align="left">99.19</td>
<td align="left">98.38</td>
<td align="left">98.38</td>
<td align="left">98.38</td>
<td align="left">99.46</td>
<td align="left">98.38</td>
</tr>
<tr>
<td align="left">OCT-C8</td>
<td align="left">0</td>
<td align="left">99.05</td>
<td align="left">96.21</td>
<td align="left">96.21</td>
<td align="left">96.19</td>
<td align="left">99.46</td>
<td align="left">96.21</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">1</td>
<td align="left">99.04</td>
<td align="left">96.17</td>
<td align="left">96.16</td>
<td align="left">96.15</td>
<td align="left">99.45</td>
<td align="left">96.17</td>
</tr>
<tr>
<td align="left">
</td>
<td align="left">5</td>
<td align="left">99.00</td>
<td align="left">96.00</td>
<td align="left">96.00</td>
<td align="left">95.98</td>
<td align="left">99.43</td>
<td align="left">96.00</td>
</tr>
<tr>
<td align="left"/>
<td align="left">10</td>
<td align="left">98.93</td>
<td align="left">95.71</td>
<td align="left">95.71</td>
<td align="left">95.69</td>
<td align="left">99.39</td>
<td align="left">95.71</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The main limitation of our model is the lack of multimodal retinal images. Single-modality retinal OCT images may not capture all pathological features of the retina. Single-modality retinal images can only provide information on one aspect but lack a comprehensive understanding of the global perspective. A single modality may not be able to fully assess the progression of the disease or other relevant pathological features. In the next study, we will add multimodal retinal images (i.e., fundus images) to more precisely detect retinal diseases.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s6">
<title>6 Conclusion</title>
<p>This paper presents the multi-resolution visual Mamba (MRVM) model, designed to enhance OCT image classification performance by addressing long-range dependencies with linear computational complexity. The MRVM model first utilizes convolution operations to extract local features from OCT images and then leverages the retinal Mamba to capture global dependencies. By integrating multi-scale global features, the model not only improves classification accuracy but also boosts overall performance and robustness. A key innovation of the MRVM is its multi-directional selection mechanism, which enhances feature extraction by focusing on various directions to capture intricate, orientation-specific retinal patterns. Experimental results demonstrate that the MRVM model excels in distinguishing diverse retinopathy images, achieving a significant accuracy improvement over traditional methods&#x2014;0.2 percentage points higher&#x2014;with overall accuracies of 98.98% and 96.21% on the OCT2017 and OCT-C8 datasets, respectively. This advancement holds promise for automatic retinal disease diagnosis and could be valuable in clinical settings.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The raw data supporting the conclusion of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s8">
<title>Author contributions</title>
<p>QZ: conceptualization, formal analysis, funding acquisition, methodology, resources, software, validation, visualization, and writing&#x2013;original draft. ZS: formal analysis, investigation, validation, and writing&#x2013;review and editing. BL: conceptualization, formal analysis, software, validation, and writing&#x2013;review and editing. NP: data curation, formal analysis, investigation, and writing&#x2013;review and editing. JW: conceptualization, formal analysis, investigation, software, and writing&#x2013;review and editing. XC: data curation, formal analysis, methodology, and writing&#x2013;review and editing. KZ: data curation, software, validation, and writing&#x2013;review and editing. JG: conceptualization, methodology, validation, visualization, and writing&#x2013;review and editing. YW: data curation, investigation, software, and writing&#x2013;review and editing. JH: investigation, project administration, resources, supervision, and writing&#x2013;review and editing.</p>
</sec>
<sec sec-type="funding-information" id="s9">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. This work was supported in part by the National Natural Science Foundation of China (62406107 and 62466033), in part by the Natural Science Foundation of Hubei Province (2023AFB004 and 2023AFB003), in part by the Jiangxi Provincial Natural Science Foundation (20242BAB20070), and in part by the Education Department Scientific Research Program Project of Hubei Province of China (Grant Number Q20232206).</p>
</sec>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors, and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn id="fn1">
<label>1</label>
<p>
<ext-link ext-link-type="uri" xlink:href="http://data.mendeley.com/datasets/rscbjbr9sj/2">http://data.mendeley.com/datasets/rscbjbr9sj/2</ext-link>
</p>
</fn>
<fn id="fn2">
<label>2</label>
<p>
<ext-link ext-link-type="uri" xlink:href="http://kaggle.com/datasets/obulisainaren/retinal-oct-c8">http://kaggle.com/datasets/obulisainaren/retinal-oct-c8</ext-link>
</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alqudah</surname>
<given-names>A. M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Aoct-net: a convolutional network automated classification of multiclass retinal diseases using spectral-domain optical coherence tomography images</article-title>. <source>Med. Biol. Eng. Comput.</source> <volume>58</volume>, <fpage>41</fpage>&#x2013;<lpage>53</lpage>. <pub-id pub-id-type="doi">10.1007/s11517-019-02066-y</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bashshur</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ross</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>World report on vision</article-title>. <source>Int. J. Eye Bank.</source> <volume>8</volume> (<issue>3</issue>).</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bouma</surname>
<given-names>B. E.</given-names>
</name>
<name>
<surname>de Boer</surname>
<given-names>J. F.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Jang</surname>
<given-names>I.-K.</given-names>
</name>
<name>
<surname>Yonetsu</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Leggett</surname>
<given-names>C. L.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Optical coherence tomography</article-title>. <source>Nat. Rev. Methods Prim.</source> <volume>2</volume>, <fpage>79</fpage>. <pub-id pub-id-type="doi">10.1038/s43586-022-00162-2</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Daich Varela</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sen</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>De Guimaraes</surname>
<given-names>T. A. C.</given-names>
</name>
<name>
<surname>Kabiri</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Pontikos</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Balaskas</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Artificial intelligence in retinal disease: clinical application, challenges, and future directions</article-title>. <source>Graefe&#x2019;s Archive Clin. Exp. Ophthalmol.</source> <volume>261</volume>, <fpage>3283</fpage>&#x2013;<lpage>3297</lpage>. <pub-id pub-id-type="doi">10.1007/s00417-023-06052-x</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Das</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Dandapat</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Bora</surname>
<given-names>P. K.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Automated classification of retinal oct images using a deep multi-scale fusion cnn</article-title>. <source>IEEE Sensors J.</source> <volume>21</volume>, <fpage>23256</fpage>&#x2013;<lpage>23265</lpage>. <pub-id pub-id-type="doi">10.1109/jsen.2021.3108642</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Dosovitskiy</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Beyer</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Kolesnikov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Weissenborn</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Zhai</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Unterthiner</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>An image is worth 16x16 words: transformers for image recognition at scale</article-title>,&#x201d; in <conf-name>International conference on learning representations</conf-name>.</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gu</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Dao</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Mamba: linear-time sequence modeling with selective state spaces</article-title>. <source>arXiv Prepr. arXiv:2312.00752</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2312.00752</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hammou</surname>
<given-names>B. A.</given-names>
</name>
<name>
<surname>Antaki</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Boucher</surname>
<given-names>M.-C.</given-names>
</name>
<name>
<surname>Duval</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Mbt: model-based transformer for retinal optical coherence tomography image and video multi-classification</article-title>. <source>Int. J. Med. Inf.</source> <volume>178</volume>, <fpage>105178</fpage>. <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2023.105178</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>An interpretable transformer network for the retinal disease classification using optical coherence tomography</article-title>. <source>Sci. Rep.</source> <volume>13</volume>, <fpage>3637</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-023-30853-z</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hemalakshmi</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Murugappan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sikkandar</surname>
<given-names>M. Y.</given-names>
</name>
<name>
<surname>Begum</surname>
<given-names>S. S.</given-names>
</name>
<name>
<surname>Prakash</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Automated retinal disease classification using hybrid transformer model (svit) using optical coherence tomography images</article-title>. <source>Neural Comput. Appl.</source> <volume>36</volume>, <fpage>9171</fpage>&#x2013;<lpage>9188</lpage>. <pub-id pub-id-type="doi">10.1007/s00521-024-09564-7</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hong</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>S. C.-H.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2022a</year>). <article-title>Unsupervised domain adaptation for cross-modality liver segmentation via joint adversarial learning and self-learning</article-title>. <source>Appl. Soft Comput.</source> <volume>121</volume>, <fpage>108729</fpage>. <pub-id pub-id-type="doi">10.1016/j.asoc.2022.108729</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hong</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.-D.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2022b</year>). <article-title>Source-free unsupervised domain adaptation for cross-modality abdominal multi-organ segmentation</article-title>. <source>Knowledge-Based Syst.</source> <volume>250</volume>, <fpage>109155</fpage>. <pub-id pub-id-type="doi">10.1016/j.knosys.2022.109155</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ji</surname>
<given-names>Y.-K.</given-names>
</name>
<name>
<surname>Hua</surname>
<given-names>R.-R.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>C.-J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>S.-C.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>W.-H.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Intelligent diagnosis of retinal vein occlusion based on color fundus photographs</article-title>. <source>Int. J. Ophthalmol.</source> <volume>17</volume> (<issue>1</issue>), <fpage>1</fpage>&#x2013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.18240/ijo.2024.01.01</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Karthik</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Mahadevappa</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Convolution neural networks for optical coherence tomography (oct) image classification</article-title>. <source>Biomed. Signal Process. Control</source> <volume>79</volume>, <fpage>104176</fpage>. <pub-id pub-id-type="doi">10.1016/j.bspc.2022.104176</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lanzani</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Chiaravalli</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Colombo</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Manfredi</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Di Marco</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Vurro</surname>
<given-names>V.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Nanotechnology for vision restoration</article-title>. <source>Nat. Rev. Bioeng.</source>, <fpage>1</fpage>&#x2013;<lpage>20</lpage>. <pub-id pub-id-type="doi">10.1038/s44222-024-00210-4</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Laouarem</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kara-Mohamed</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Bourennane</surname>
<given-names>E.-B.</given-names>
</name>
<name>
<surname>Hamdi-Cherif</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Htc-retina: a hybrid retinal diseases classification model using transformer-convolutional neural network from optical coherence tomography images</article-title>. <source>Comput. Biol. Med.</source> <volume>178</volume>, <fpage>108726</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.108726</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hong</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Source-free unsupervised adaptive segmentation for knee joint mri</article-title>. <source>Biomed. Signal Process. Control</source> <volume>92</volume>, <fpage>106028</fpage>. <pub-id pub-id-type="doi">10.1016/j.bspc.2024.106028</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Rahaman</surname>
<given-names>M. M.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Cvm-cervix: a hybrid cervical pap-smear image classification framework using cnn, visual transformer and multilayer perceptron</article-title>. <source>Pattern Recognit.</source> <volume>130</volume>, <fpage>108829</fpage>. <pub-id pub-id-type="doi">10.1016/j.patcog.2022.108829</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). &#x201c;<article-title>Swin transformer: hierarchical vision transformer using shifted windows</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>, <fpage>10012</fpage>&#x2013;<lpage>10022</lpage>.</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Manzari</surname>
<given-names>O. N.</given-names>
</name>
<name>
<surname>Ahmadabadi</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Kashiani</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Shokouhi</surname>
<given-names>S. B.</given-names>
</name>
<name>
<surname>Ayatollahi</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Medvit: a robust vision transformer for generalized medical image classification</article-title>. <source>Comput. Biol. Med.</source> <volume>157</volume>, <fpage>106791</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.106791</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Parvaiz</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Khalid</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Zafar</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ameer</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Ali</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Fraz</surname>
<given-names>M. M.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Vision transformers in medical computer vision&#x2014;a contemplative retrospection</article-title>. <source>Eng. Appl. Artif. Intell.</source> <volume>122</volume>, <fpage>106126</fpage>. <pub-id pub-id-type="doi">10.1016/j.engappai.2023.106126</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Playout</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Duval</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Boucher</surname>
<given-names>M. C.</given-names>
</name>
<name>
<surname>Cheriet</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Focused attention in transformers for interpretable classification of retinal images</article-title>. <source>Med. Image Anal.</source> <volume>82</volume>, <fpage>102608</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2022.102608</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rauchman</surname>
<given-names>S. H.</given-names>
</name>
<name>
<surname>Albert</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Pinkhasov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Reiss</surname>
<given-names>A. B.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Mild-to-moderate traumatic brain injury: a review with focus on the visual system</article-title>. <source>Neurol. Int.</source> <volume>14</volume>, <fpage>453</fpage>&#x2013;<lpage>470</lpage>. <pub-id pub-id-type="doi">10.3390/neurolint14020038</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rong</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xiang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Surrogate-assisted retinal oct image classification based on convolutional neural networks</article-title>. <source>IEEE J. Biomed. Health Inf.</source> <volume>23</volume>, <fpage>253</fpage>&#x2013;<lpage>263</lpage>. <pub-id pub-id-type="doi">10.1109/JBHI.2018.2795545</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Gong</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kawasaki</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Structure-oriented transformer for retinal diseases grading from oct images</article-title>. <source>Comput. Biol. Med.</source> <volume>152</volume>, <fpage>106445</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2022.106445</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sunija</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kar</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Gayathri</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Gopi</surname>
<given-names>V. P.</given-names>
</name>
<name>
<surname>Palanisamy</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Octnet: a lightweight cnn for retinal disease classification from optical coherence tomography images</article-title>. <source>Comput. Methods Programs Biomed.</source> <volume>200</volume>, <fpage>105877</fpage>. <pub-id pub-id-type="doi">10.1016/j.cmpb.2020.105877</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Talo</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Yildirim</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Baloglu</surname>
<given-names>U. B.</given-names>
</name>
<name>
<surname>Aydin</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Acharya</surname>
<given-names>U. R.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Convolutional neural networks for multi-class brain disease detection using mri images</article-title>. <source>Comput. Med. Imaging Graph.</source> <volume>78</volume>, <fpage>101673</fpage>. <pub-id pub-id-type="doi">10.1016/j.compmedimag.2019.101673</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hua</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2023a</year>). <article-title>Automated detection of myopic maculopathy using five-category models based on vision outlooker for visual recognition</article-title>. <source>Front. Comput. Neurosci.</source> <volume>17</volume>, <fpage>1169464</fpage>. <pub-id pub-id-type="doi">10.3389/fncom.2023.1169464</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2024a</year>). <article-title>A new segmentation algorithm for peripapillary atrophy and optic disk from ultra-widefield photographs</article-title>. <source>Comput. Biol. Med.</source> <volume>172</volume>, <fpage>108281</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.108281</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Hua</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Hong</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2023b</year>). <article-title>Automatic diagnosis of different types of retinal vein occlusion based on fundus images</article-title>. <source>Int. J. Intelligent Syst.</source> <volume>2023</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1155/2023/1587410</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Mao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xi</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2024b</year>). <article-title>Dbpf-net: dual-branch structural feature extraction reinforcement network for ocular surface disease image classification</article-title>. <source>Front. Med.</source> <volume>10</volume>, <fpage>1309097</fpage>. <pub-id pub-id-type="doi">10.3389/fmed.2023.1309097</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>On oct image classification via deep learning</article-title>. <source>IEEE Photonics J.</source> <volume>11</volume>, <fpage>1</fpage>&#x2013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1109/jphot.2019.2934484</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Xiang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Towards more efficient ophthalmic disease classification and lesion location via convolution transformer</article-title>. <source>Comput. Methods Programs Biomed.</source> <volume>220</volume>, <fpage>106832</fpage>. <pub-id pub-id-type="doi">10.1016/j.cmpb.2022.106832</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Ctranscnn: combining transformer and cnn in multilabel medical image classification</article-title>. <source>Knowledge-Based Syst.</source> <volume>281</volume>, <fpage>111030</fpage>. <pub-id pub-id-type="doi">10.1016/j.knosys.2023.111030</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Wan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>A multi-modal fundus image based auxiliary location method of lesion boundary for guiding the layout of laser spot in central serous chorioretinopathy therapy</article-title>. <source>Comput. Biol. Med.</source> <volume>155</volume>, <fpage>106648</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.106648</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Wan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>An intelligent location method of key boundary points for assisting the diameter measurement of central serous chorioretinopathy lesion area</article-title>. <source>Comput. Biol. Med.</source> <volume>147</volume>, <fpage>105730</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2022.105730</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>You</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lei</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chui</surname>
<given-names>C. K.</given-names>
</name>
<name>
<surname>Cheung</surname>
<given-names>A. C.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Fine perceptive gans for brain mr image super-resolution in wavelet domain</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>34</volume>, <fpage>8802</fpage>&#x2013;<lpage>8814</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2022.3153088</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>You</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Lyu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Chui</surname>
<given-names>C. K.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>C. P.</given-names>
</name>
<name>
<surname>Lei</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Generative ai enables synthesizing cross-modality brain image via multi-level-latent representation learning</article-title>. <source>IEEE Trans. Comput. Imaging</source> <volume>10</volume>, <fpage>1152</fpage>&#x2013;<lpage>1164</lpage>. <pub-id pub-id-type="doi">10.1109/tci.2024.3434724</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Bi</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Bian</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Ning</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>N.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). &#x201c;<article-title>Mil-vt: multiple instance learning enhanced vision transformer for fundus image classification</article-title>,&#x201d; in <conf-name>Medical image computing and computer assisted intervention&#x2013;MICCAI 2021: 24th international conference, Strasbourg, France, September 27&#x2013;October 1, 2021, proceedings, Part VIII 24</conf-name> (<publisher-name>Springer</publisher-name>), <fpage>45</fpage>&#x2013;<lpage>54</lpage>.</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Lei</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Morphological feature visualization of Alzheimer&#x2019;s disease via multidirectional perception GAN</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>34</volume>, <fpage>4401</fpage>&#x2013;<lpage>4415</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2021.3118369</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hong</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Medical big data and artificial intelligence for healthcare</article-title>. <source>Appl. Sci.</source> <volume>13</volume>, <fpage>3745</fpage>. <pub-id pub-id-type="doi">10.3390/app13063745</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Vision Mamba: efficient visual representation learning with bidirectional state space model</article-title>. <source>arXiv Prepr. arXiv:2401.09417</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2401.09417</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zong</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zuo</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Ng</surname>
<given-names>M. K.-P.</given-names>
</name>
<name>
<surname>Lei</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>A new brain network construction paradigm for brain disorder via diffusion-based graph contrastive learning</article-title>. <source>IEEE Trans. Pattern Analysis Mach. Intell.</source>, <fpage>1</fpage>&#x2013;<lpage>16</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2024.3442811</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zuo</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ng</surname>
<given-names>M. K.-P.</given-names>
</name>
<name>
<surname>Lei</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>BDHT: generative AI enables causality analysis for mild cognitive impairment</article-title>. <source>IEEE Trans. Automation Sci. Eng.</source>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1109/TASE.2024.3425949</pub-id>
</citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zuo</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhong</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>C. P.</given-names>
</name>
<name>
<surname>Lei</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2023a</year>). <article-title>Alzheimer&#x2019;s disease prediction via brain structural-functional deep fusing network</article-title>. <source>IEEE Trans. Neural Syst. Rehabilitation Eng.</source> <volume>31</volume>, <fpage>4601</fpage>&#x2013;<lpage>4612</lpage>. <pub-id pub-id-type="doi">10.1109/TNSRE.2023.3333952</pub-id>
</citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zuo</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zhong</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Lei</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2023b</year>). <article-title>Brain structure-function fusing representation learning using adversarial decomposed-VAE for analyzing MCI</article-title>. <source>IEEE Trans. Neural Syst. Rehabilitation Eng.</source> <volume>31</volume>, <fpage>4017</fpage>&#x2013;<lpage>4028</lpage>. <pub-id pub-id-type="doi">10.1109/TNSRE.2023.3323432</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>