<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article article-type="research-article" dtd-version="1.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Bioinform.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Bioinformatics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Bioinform.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2673-7647</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1768786</article-id>
<article-id pub-id-type="doi">10.3389/fbinf.2026.1768786</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>ZR<sup>2</sup>ViM: a recursive vision Mamba model for boundary-preserving medical image segmentation</article-title>
<alt-title alt-title-type="left-running-head">Hua et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fbinf.2026.1768786">10.3389/fbinf.2026.1768786</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Hua</surname>
<given-names>Caijian</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2867058"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Xiang</surname>
<given-names>Caorong</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3319958"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Li</surname>
<given-names>Liuying</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Zhou</surname>
<given-names>Xia</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
</contrib>
</contrib-group>
<aff id="aff1">
<label>1</label>
<institution>School of Computer Science and Engineering, Sichuan University of Science and Engineering</institution>, <city>Yibin</city>, <country country="CN">China</country>
</aff>
<aff id="aff2">
<label>2</label>
<institution>Traditional Chinese Medicine Department, Zigong First People&#x2019;s Hospital</institution>, <city>Zigong</city>, <country country="CN">China</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Liuying Li, <email xlink:href="mailto:arenally@sina.com">arenally@sina.com</email>; Xia Zhou, <email xlink:href="mailto:zhoux1823@163.com">zhoux1823@163.com</email>
</corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-03-04">
<day>04</day>
<month>03</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>6</volume>
<elocation-id>1768786</elocation-id>
<history>
<date date-type="received">
<day>18</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>24</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>29</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Hua, Xiang, Li and Zhou.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Hua, Xiang, Li and Zhou</copyright-holder>
<license>
<ali:license_ref start_date="2026-03-04">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Medical image segmentation is fundamental to quantitative disease analysis and therapeutic decision-making. However, constrained by limited computational resources, existing deep learning methods often struggle to simultaneously model long-range dependencies and preserve boundary precision, particularly when delineating structures with complex morphology or blurred edges.</p>
</sec>
<sec>
<title>Method</title>
<p>To overcome these challenges, we propose <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM, a recursion-enhanced visual state space model designed for medical image segmentation. <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM augments the Vision Mamba framework with a Zigzag Recursive Reinforced (<inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>) Block that incorporates Stacked State Redistribution (SSR) and a Nested Recursive Connection (NRC). The NRC employs dual inner and outer pathways to iteratively fuse local details with global context while preserving 2D spatial adjacency. Furthermore, a Cross-directional Zigzag WKV (CZ-WKV) module executes multi-step recursive updates along multiple zigzag trajectories, injecting spatial directional information via Quad-Directional Token Shift (Q-Shift) directional priors. Collectively, these mechanisms mitigate serialization-induced banding artifacts and enhance the representation of fine, elongated, and low-contrast structures, all while maintaining near-linear computational complexity.</p>
</sec>
<sec>
<title>Results</title>
<p>Comprehensive evaluations across four medical imaging domains&#x2014;spanning dermatoscopic images, breast ultrasound, colorectal polyps, and abdominal multi-organ CT&#x2014;on five public datasets demonstrate that <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM consistently outperforms representative convolutional, attention-based, and visual state space architectures in region consistency and boundary localization. Notably, <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM achieves a 2.15 mm reduction in the HD95 on the Synapse multi-organ CT dataset relative to the CC-ViM baseline, substantiating its superior capability for precise, clinically relevant boundary delineation.</p>
</sec>
<sec>
<title>Conclusion</title>
<p>The <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM framework delivers accurate, boundary-preserving segmentation across diverse imaging modalities and anatomically complex structures, achieving these gains with near-linear computational complexity. These findings demonstrate that <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM offers a robust and efficient solution for medical image analysis, establishing a promising foundation for advanced clinical and research applications.</p>
</sec>
</abstract>
<kwd-group>
<kwd>boundary preservation</kwd>
<kwd>deep learning</kwd>
<kwd>medical image segmentation</kwd>
<kwd>state space models</kwd>
<kwd>vision mamba</kwd>
<kwd>zigzag scanning</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This research was supported by the National Natural Science Foundation of China (Grant No. 42471437), the Zigong Science and Technology Plan Project (Grant No. 2023ZC22), the Zigong Key Science and Technology Plan (Collaborative Innovation Class of Zigong Academy of Medical Sciences) Key Project (Grant No. 2023YKYXT03), and the Zigong Key Science and Technology Plan (Collaborative Innovation Project of Zigong Academy of Medical Sciences) in 2025 (Grant No. 2025YKY0304).</funding-statement>
</funding-group>
<counts>
<fig-count count="8"/>
<table-count count="10"/>
<equation-count count="28"/>
<ref-count count="41"/>
<page-count count="00"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Computational BioImaging</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Medical image segmentation is critical for disease screening, lesion localization, preoperative planning, and treatment efficacy assessment, as segmentation accuracy directly influences subsequent quantitative analysis and clinical decision-making (<xref ref-type="bibr" rid="B22">Litjens et al., 2017</xref>; <xref ref-type="bibr" rid="B6">Chan et al., 2020</xref>). Unlike natural images, medical images are often characterized by high resolution, low contrast, and irregular boundaries. These challenges are compounded by high annotation costs and limited sample sizes, complicating three core tasks: modeling long-range dependencies, preserving boundary continuity, and ensuring robust inference, particularly for low-contrast images and elongated structures (<xref ref-type="bibr" rid="B19">Isensee et al., 2021</xref>; <xref ref-type="bibr" rid="B32">Wang et al., 2021</xref>). Consequently, developing a unified framework that achieves both high precision and computational efficiency remains a central challenge in medical image segmentation.</p>
<p>Traditional convolutional neural networks (CNNs) rely on local convolution kernels and fixed receptive fields, a design that limits their ability to encode global anatomical context. Although architectures like UNet and its variants (<xref ref-type="bibr" rid="B29">Ronneberger et al., 2015</xref>; <xref ref-type="bibr" rid="B38">Zhou et al., 2018</xref>; <xref ref-type="bibr" rid="B27">Oktay et al., 2018</xref>; <xref ref-type="bibr" rid="B34">Xiao et al., 2018</xref>) enhance multi-scale representations through encoder&#x2013;decoder designs and skip connections, CNNs have inherent limitations in modeling long-range dependencies. They therefore often produce fragmented predictions or blurred boundaries, especially when segmenting large, weakly contrasted, or elongated anatomical structures. In contrast, Transformer-based models leverage global self-attention to model long-range dependencies more effectively. Architectures such as TransUNet (<xref ref-type="bibr" rid="B7">Chen et al., 2024</xref>) and Swin-Unet (<xref ref-type="bibr" rid="B23">Liu et al., 2021</xref>), inspired by the Vision Transformer (ViT) (<xref ref-type="bibr" rid="B15">Han et al., 2023</xref>), have demonstrated strong performance in medical imaging. However, the quadratic computational complexity of self-attention (<xref ref-type="bibr" rid="B15">Han et al., 2023</xref>) results in prohibitive memory costs and inference latency as image resolution increases. Furthermore, their lack of convolutional inductive bias can compromise robustness on small medical datasets and degrade the localization of fine-grained boundaries. Thus, despite their global modeling strengths, Transformers face intrinsic scalability and stability limitations in this domain.</p>
<p>Recently, Mamba-based state space models (SSMs) (<xref ref-type="bibr" rid="B14">Gu and Dao, 2024</xref>) have emerged as a compelling alternative, offering near-linear complexity and efficient long-sequence modeling. The Vision Mamba (ViM) (<xref ref-type="bibr" rid="B25">Liu Y. et al., 2024</xref>) extends SSMs to the visual domain by incorporating multi-directional selective scanning. Nevertheless, these models still flatten 2D feature maps into 1D sequences, a process that disrupts the intrinsic 2D spatial adjacency and directional continuity. This serialization weakens the model&#x2019;s ability to represent complex boundaries and slender structures, limiting segmentation performance on challenging targets. Another approach, the Receptance Weighted Key&#x2013;Value (RWKV) model (<xref ref-type="bibr" rid="B39">Zhou et al., 2023</xref>; <xref ref-type="bibr" rid="B12">Duan et al., 2024</xref>), integrates recurrent updates with key-value interactions and has emerged as a potential successor to Transformers due to its linear computational complexity and robust long-range dependency modeling. However, its direct application to segmentation reveals critical limitations: while it excels at modeling long-range dependencies, it lacks mechanisms for explicit modeling of local fine-grained features and 2D geometric adjacency. These shortcomings underscore the necessity of integrating explicit 2D structural priors into recurrent SSMs for robust medical image segmentation.</p>
<p>This analysis raises a central question: how can two-dimensional spatial adjacency and directional continuity be restored within a vision state space model, while maintaining near-linear complexity and enabling effective interaction between local details and global context? To address this, we introduce <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM (Zigzag Recursive Reinforced Vision Mamba), a novel architecture that restructures the foundational ViM framework. The fundamental component of our model is the Zigzag Recursive Reinforced (<inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>) Block, which implements a Selective State Recurrence (SSR) operator within a Nested Recurrent Cell (NRC). The NRC features two parallel paths, NRC-Inner and NRC-Outer, to process features at different scales. Within this block, we propose the Cyclic Zigzag Weighted Key Value (CZ-WKV) attention mechanism, which employs an expandable, multi-directional cyclic zigzag scan to aggregate contextual information. By integrating CZ-WKV into the nested recurrent architecture, SSR achieves superior spatial alignment and directional robustness compared to the unidirectional scanning in conventional SSMs. A residual connection from the NRC-Inner to the NRC-Outer path ensures the reliable injection of fine-grained details, enhancing boundary delineation while preserving the model&#x2019;s near-linear computational complexity.</p>
<p>The main contributions of this work are: <list list-type="simple">
<list-item>
<label>&#x2022;</label>
<p>A novel segmentation architecture, <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM, that enhances visual state space models with mechanisms for spatial continuity and global context while maintaining near-linear complexity.</p>
</list-item>
<list-item>
<label>&#x2022;</label>
<p>The core SSR operator, implemented via an NRC, which jointly models intra-patch details and inter-patch context. Its extensible zigzag scan explicitly restores 2D spatial and directional priors.</p>
</list-item>
<list-item>
<label>&#x2022;</label>
<p>The CZ-WKV attention mechanism, which efficiently aggregates multi-directional context in linear time within the recursive scanning framework, balancing global dependency modeling with spatial continuity.</p>
</list-item>
<list-item>
<label>&#x2022;</label>
<p>Comprehensive validation across multiple public datasets demonstrating that <inline-formula id="inf15">
<mml:math id="m15">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM significantly outperforms state-of-the-art methods&#x2014;particularly for images with complex boundaries, low contrast, and slender structures&#x2014;at a low computational cost.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<sec id="s2-1">
<label>2.1</label>
<title>Medical image segmentation</title>
<sec id="s2-1-1">
<label>2.1.1</label>
<title>CNN-based methods</title>
<p>CNNs have long been the cornerstone of medical image segmentation, prized for their powerful local feature extraction capabilities. The seminal UNet (<xref ref-type="bibr" rid="B29">Ronneberger et al., 2015</xref>) architecture introduced a symmetric encoder&#x2013;decoder design with skip connections, a structure that proved highly effective for fusing shallow, fine-grained details with deep semantic features, especially on limited medical datasets. However, the original UNet design offered limited interaction between features at different semantic scales. Subsequent variants sought to address these shortcomings. For instance, UNet&#x2b;&#x2b;(<xref ref-type="bibr" rid="B38">Zhou et al., 2018</xref>) introduced dense skip connections to bridge this &#x201c;semantic gap&#x201d; and improve feature fusion, yet it often failed to precisely delineate targets with complex boundaries. Similarly, Attention-UNet (Att-UNet) (<xref ref-type="bibr" rid="B27">Oktay et al., 2018</xref>) incorporated attention gates to suppress background noise and focus on salient regions, thereby improving accuracy for intricate structures. To enable the training of deeper, more powerful models, other frameworks integrated residual connections, inspired by ResNet (<xref ref-type="bibr" rid="B17">He et al., 2016</xref>), to mitigate the vanishing gradient problem and enhance feature representation in deep layers. Despite these advances, all CNN-based architectures are fundamentally constrained by the local nature of the convolution operation, which limits their ability to model long-range spatial dependencies. This intrinsic locality often results in inconsistent segmentation of large organs or blurred predictions along complex boundaries. While techniques like dilated convolutions expand the receptive field, they typically do so at the cost of sacrificing fine-grained local detail (<xref ref-type="bibr" rid="B15">Han et al., 2023</xref>).</p>
</sec>
<sec id="s2-1-2">
<label>2.1.2</label>
<title>Transformer-based methods</title>
<p>To overcome the intrinsic locality of CNNs, Transformers leverage a global self-attention mechanism to model long-range spatial dependencies (<xref ref-type="bibr" rid="B31">Shamshad et al., 2023</xref>). The ViT (<xref ref-type="bibr" rid="B15">Han et al., 2023</xref>) pioneered the use of pure Transformer architectures for vision tasks, but its direct application to medical imaging is challenging due to the data-hungry nature of self-attention and the typically small scale of medical datasets. Subsequent innovations like the Swin Transformer (<xref ref-type="bibr" rid="B23">Liu et al., 2021</xref>) addressed the prohibitive computational cost of global attention by introducing a hierarchical, windowed self-attention mechanism that scales linearly with image size. This efficient design was later adapted into U-shaped architectures such as Swin-UNet (<xref ref-type="bibr" rid="B5">Cao et al., 2022</xref>), creating a pure Transformer-based model for segmentation. A parallel research direction sought to combine the strengths of both paradigms in hybrid CNN-Transformer models. These architectures aim to retain the robust local feature extraction of CNNs while incorporating the global context modeling of Transformers. Prominent examples include TransUNet (<xref ref-type="bibr" rid="B7">Chen et al., 2024</xref>), which embeds a Transformer in the encoder of a U-Net to capture global context; UNETR (<xref ref-type="bibr" rid="B16">Hatamizadeh et al., 2022</xref>), which pairs a Transformer encoder with a convolutional decoder for 3D volumetric segmentation; and TransFuse (<xref ref-type="bibr" rid="B37">Zhang et al., 2021</xref>), which uses a dual-branch structure to fuse features from parallel CNN and Transformer backbones. Despite these architectural innovations, a fundamental bottleneck remains: the quadratic computational complexity of standard self-attention. 
This leads to prohibitive memory usage and inference latency on the high-resolution images common in clinical practice. Furthermore, the reduced inductive bias of Transformers compared to CNNs often necessitates extensive pre-training and can compromise model stability and performance on small datasets, particularly in localizing complex boundaries (<xref ref-type="bibr" rid="B15">Han et al., 2023</xref>; <xref ref-type="bibr" rid="B23">Liu et al., 2021</xref>).</p>
</sec>
<sec id="s2-1-3">
<label>2.1.3</label>
<title>Mamba-based methods</title>
<p>SSMs have recently emerged as a powerful alternative to Transformers, offering comparable long-range dependency modeling at near-linear computational complexity. The foundational Mamba model (<xref ref-type="bibr" rid="B14">Gu and Dao, 2024</xref>) achieves this efficiency through a selective state space mechanism (S6), but its unidirectional processing limits its native awareness of 2D spatial structures. To adapt this paradigm for vision, the ViM (<xref ref-type="bibr" rid="B25">Liu Y. et al., 2024</xref>) introduced a multi-directional scanning process (SS2D) that transforms image features into complementary 1D sequences, establishing a blueprint for visual SSMs. This foundational work prompted a rapid proliferation of Mamba-based architectures for medical segmentation. Many of these, such as U-Mamba (<xref ref-type="bibr" rid="B2">Bao et al., 2025</xref>) and VM-UNet (<xref ref-type="bibr" rid="B30">Ruan et al., 2024</xref>), integrated Mamba blocks into established U-shaped frameworks to enhance their feature encoders. Other efforts have focused on improving computational efficiency for resource-constrained environments (e.g., LightM-UNet (<xref ref-type="bibr" rid="B33">Wu et al., 2024</xref>)) or leveraging pre-training to improve generalization (e.g., Swin-U-Mamba (<xref ref-type="bibr" rid="B24">Liu J. et al., 2024</xref>)). However, a critical limitation pervades these first-generation visual SSMs. They all rely on flattening 2D feature maps into 1D sequences for processing. This serialization fundamentally disrupts the intrinsic 2D spatial adjacency and directional relationships inherent in images. As a result, their ability to model complex boundaries and preserve the continuity of slender anatomical structures is compromised, hindering their performance on challenging segmentation targets (<xref ref-type="bibr" rid="B25">Liu Y. et al., 2024</xref>). 
This architectural flaw underscores the need for a new approach that can process visual information in its native 2D context.</p>
</sec>
</sec>
<sec id="s2-2">
<label>2.2</label>
<title>Visual state space modeling and recursive enhancement</title>
<p>Visual state space models (<xref ref-type="bibr" rid="B14">Gu and Dao, 2024</xref>), exemplified by Mamba and ViM (<xref ref-type="bibr" rid="B25">Liu Y. et al., 2024</xref>), have shown considerable promise for medical image segmentation but also face inherent limitations. To address these, recent architectures have incorporated recursive mechanisms to enhance long-range information propagation and directional awareness. A prominent example is the RWKV model (<xref ref-type="bibr" rid="B39">Zhou et al., 2023</xref>), which integrates linear-time recursive updates with key&#x2013;value interactions. This design achieves low computational cost and stable long-range dependency modeling, and its efficacy has been proven in general-purpose vision and natural language tasks, such as Vision-RWKV (<xref ref-type="bibr" rid="B12">Duan et al., 2024</xref>). However, a critical challenge arises when applying RWKV directly to medical image segmentation. Its strength in modeling global context often comes at the expense of capturing fine-grained local spatial continuity. This limitation hinders its ability to accurately delineate complex anatomical structures, a fundamental requirement for clinical applications (<xref ref-type="bibr" rid="B39">Zhou et al., 2023</xref>; <xref ref-type="bibr" rid="B12">Duan et al., 2024</xref>).</p>
<p>To address this limitation, we introduce <inline-formula id="inf16">
<mml:math id="m16">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM, a model that advances the ViM-based (<xref ref-type="bibr" rid="B25">Liu Y. et al., 2024</xref>) U-shaped architecture through two synergistic innovations: a nested recursive block and a multi-directional zigzag space-mixing mechanism. This design explicitly restores 2D spatial adjacency and strengthens directional context aggregation, enabling the simultaneous modeling of fine-grained local details and global context with near-linear complexity. Consequently, <inline-formula id="inf17">
<mml:math id="m17">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM excels at accurately delineating complex boundaries, slender structures, and multi-scale anatomical regions within medical images. Critically, while maintaining a compact parameter footprint and near-linear computational cost, <inline-formula id="inf18">
<mml:math id="m18">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM outperforms leading convolutional, Transformer-based, and state space models across diverse public medical image segmentation benchmarks.</p>
</sec>
</sec>
<sec sec-type="methods" id="s3">
<label>3</label>
<title>Methods</title>
<sec id="s3-1">
<label>3.1</label>
<title>Preliminary knowledge</title>
<p>SSMs describe how an input sequence drives the evolution of a hidden state and generates an output sequence. In the continuous-time case, a first-order linear SSM can be written as in <xref ref-type="disp-formula" rid="e1">Equation 1</xref>:<disp-formula id="e1">
<mml:math id="m19">
<mml:mrow>
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mspace width="0.17em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mi>A</mml:mi>
<mml:mi>h</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>B</mml:mi>
<mml:mi>x</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mi>y</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mspace width="0.17em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mi>C</mml:mi>
<mml:mi>h</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where <inline-formula id="inf19">
<mml:math id="m20">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is the input sequence, <inline-formula id="inf20">
<mml:math id="m21">
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is the latent state, <inline-formula id="inf21">
<mml:math id="m22">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the output, and <inline-formula id="inf22">
<mml:math id="m23">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi>B</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are learnable parameter matrices.</p>
<p>For use in deep-learning models, this continuous-time system is usually converted into a discrete-time form. Using Zero-Order Hold (ZOH) with sampling interval <inline-formula id="inf23">
<mml:math id="m24">
<mml:mrow>
<mml:mo>&#x25b3;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, we obtain <xref ref-type="disp-formula" rid="e2">Equation 2</xref>:<disp-formula id="e2">
<mml:math id="m25">
<mml:mrow>
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mspace width="0.17em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mspace width="0.17em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mi>C</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>where <inline-formula id="inf24">
<mml:math id="m26">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>e</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x25b3;</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x25b3;</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x25b3;</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mo>&#x25b3;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf25">
<mml:math id="m27">
<mml:mrow>
<mml:mo>&#x25b3;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> controls the timescale of the dynamics.</p>
<p>The recurrent update above can be implemented efficiently as a one-dimensional convolution, as shown in <xref ref-type="disp-formula" rid="e3">Equation 3</xref>:<disp-formula id="e3">
<mml:math id="m28">
<mml:mrow>
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mi>y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mi>x</mml:mi>
<mml:mo>&#x2217;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>where <inline-formula id="inf26">
<mml:math id="m29">
<mml:mrow>
<mml:mo>&#x2217;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes convolution, <inline-formula id="inf27">
<mml:math id="m30">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is the convolution kernel induced by the SSM, and <inline-formula id="inf28">
<mml:math id="m31">
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the length of <inline-formula id="inf29">
<mml:math id="m32">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. This view allows SSMs to model long-range dependencies in linear time.</p>
<p>The ViM model (<xref ref-type="bibr" rid="B25">Liu Y. et al., 2024</xref>) adapts this SSM framework for visual tasks. Its architecture features two core components: the S6 block, which leverages the efficient convolutional representation to model dependencies within a sequence, and the SS2D mechanism, which flattens 2D image features into 1D sequences for the SSM. By employing bidirectional scanning (e.g., horizontal and vertical), SS2D embeds spatial context from the original image grid. Integrated into a U-Net architecture (<xref ref-type="bibr" rid="B29">Ronneberger et al., 2015</xref>), ViM has demonstrated strong performance in medical image segmentation. Despite its success, ViM exhibits two fundamental limitations. First, its capacity for fine-grained local modeling is constrained by the inherently 1D nature of the underlying SSM. Second, its fixed, axis-aligned scanning strategy is suboptimal for capturing the complex boundaries and irregular topologies characteristic of anatomical structures. Addressing these shortcomings is the primary motivation for our work. We enhance ViM by introducing mechanisms that significantly boost both local modeling fidelity and scanning flexibility, resulting in the recursion-enhanced <inline-formula id="inf30">
<mml:math id="m33">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM model.</p>
</sec>
<sec id="s3-2">
<label>3.2</label>
<title>Overall architecture</title>
<p>We introduce <inline-formula id="inf31">
<mml:math id="m34">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM, a recursive visual state space model designed for high-fidelity medical image segmentation. While built upon the classic U-shaped encoder-decoder architecture (<xref ref-type="bibr" rid="B29">Ronneberger et al., 2015</xref>), <inline-formula id="inf32">
<mml:math id="m35">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM&#x2019;s innovation lies in its core building blocks and sequence modeling mechanisms. Central to our design is the novel <inline-formula id="inf33">
<mml:math id="m36">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> Block, which replaces the conventional S6 state space unit. The <inline-formula id="inf34">
<mml:math id="m37">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> Block integrates a new attention mechanism, termed CZ-WKV attention. This block employs SSR within an NRC to model local and global features concurrently. This structure is complemented by an extensible multi-directional zigzag scanning strategy, which, combined with Quad-Directional Token Shift (Q-Shift), injects directional priors into the model. This architecture explicitly enhances spatial continuity, directional robustness, and cross-scale contextual modeling, all while maintaining near-linear computational complexity.</p>
<p>The overall architecture of <inline-formula id="inf35">
<mml:math id="m38">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM is illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref>. It consists of a patch-embedding layer, a hierarchical encoder, a symmetric decoder, skip connections, and a final projection layer. An input image <inline-formula id="inf36">
<mml:math id="m39">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is first partitioned into non-overlapping patches using a <inline-formula id="inf37">
<mml:math id="m40">
<mml:mrow>
<mml:mn>4</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> convolution with a stride of 4. This operation simultaneously projects the patches into a <inline-formula id="inf38">
<mml:math id="m41">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-dimensional feature space, yielding an embedded feature map <inline-formula id="inf39">
<mml:math id="m42">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Overall architecture of <inline-formula id="inf40">
<mml:math id="m43">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM. The model employs a U-shaped encoder&#x2013;decoder framework where the <inline-formula id="inf41">
<mml:math id="m44">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> Block, as the fundamental building block, integrates multi-scale features through skip connections.</p>
</caption>
<graphic xlink:href="fbinf-06-1768786-g001.tif">
<alt-text content-type="machine-generated">Flowchart diagram illustrating a medical image segmentation model with an input skin lesion image passed through patch embedding, sequential ZR squared blocks with patch merging down to lower resolutions, and symmetric patch expanding with skip connections, ending in a final projection that produces a segmented output mask.</alt-text>
</graphic>
</fig>
<p>The encoder comprises <inline-formula id="inf42">
<mml:math id="m45">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> stages. Let <inline-formula id="inf43">
<mml:math id="m46">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">enc</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denote the feature map at the <inline-formula id="inf44">
<mml:math id="m47">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th encoder stage. Each stage consists of a series of <inline-formula id="inf45">
<mml:math id="m48">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> Blocks for feature extraction, followed by a Patch Merging (PM) module (<xref ref-type="bibr" rid="B23">Liu et al., 2021</xref>). The PM module downsamples the feature map, halving its spatial resolution <inline-formula id="inf46">
<mml:math id="m49">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, and doubling its channel dimension <inline-formula id="inf47">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The stage-level update for the encoder is thus given in <xref ref-type="disp-formula" rid="e4">Equation 4</xref>:<disp-formula id="e4">
<mml:math id="m51">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">enc</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mi>M</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>Z</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">enc</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>Symmetrical to the encoder, the decoder also comprises <inline-formula id="inf48">
<mml:math id="m52">
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> stages. Let <inline-formula id="inf49">
<mml:math id="m53">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">dec</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#xd7;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#xd7;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> be the feature map at the <inline-formula id="inf50">
<mml:math id="m54">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th decoder stage. The decoding process begins with the bottleneck feature map from the encoder, <inline-formula id="inf51">
<mml:math id="m55">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">dec</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">enc</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>. Each decoder stage includes several <inline-formula id="inf52">
<mml:math id="m56">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> Blocks for feature refinement and a Patch Expanding (PE) module (<xref ref-type="bibr" rid="B23">Liu et al., 2021</xref>). The PE module upsamples the feature map, doubling its spatial resolution and halving its channel dimension. Skip connections fuse the upsampled decoder features with the corresponding high-resolution features from the encoder. This stage-wise decoder update is defined as in <xref ref-type="disp-formula" rid="e5">Equation 5</xref>:<disp-formula id="e5">
<mml:math id="m57">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">dec</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>Z</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>E</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">dec</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">enc</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>where the addition operation fuses the features from the decoder pathway and the corresponding encoder stage.</p>
<p>This symmetric, multi-scale architecture, combined with the recursive feature refinement of the <inline-formula id="inf53">
<mml:math id="m58">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> Block and residual cross-layer fusion, enhances long-range dependency modeling and ensures robust feature consistency across scales.</p>
</sec>
<sec id="s3-3">
<label>3.3</label>
<title>ZR<sup>2</sup> block</title>
<p>The <inline-formula id="inf54">
<mml:math id="m59">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> Block is the fundamental computational unit of <inline-formula id="inf55">
<mml:math id="m60">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM. It adapts the ViM paradigm of lightweight state space modeling but crucially substitutes the standard S6 operator with our SSR module to explicitly model spatial continuity. As depicted in <xref ref-type="fig" rid="F2">Figure 2</xref>, input features first undergo Layer Normalization and linear projection. The resulting features are then processed by the SSR module, which performs the recursive state updates. The output from the SSR is subsequently normalized, modulated by a sigmoid gate, and fused with the original input via a residual connection to ensure stable information propagation.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Architecture of the <inline-formula id="inf56">
<mml:math id="m61">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> Block. The <inline-formula id="inf57">
<mml:math id="m62">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> Block, the foundational computational unit of <inline-formula id="inf58">
<mml:math id="m63">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM, comprises a central SSR operator, a normalization layer, a linear projection, and a gated residual connection.</p>
</caption>
<graphic xlink:href="fbinf-06-1768786-g002.tif">
<alt-text content-type="machine-generated">Block diagram illustrating a neural network module with the following flow: input passes through layer normalization, linear, and SSR blocks, followed by another layer normalization, gated with a parallel linear path, then passes through another linear block and a residual addition before producing output.</alt-text>
</graphic>
</fig>
<p>The efficacy of this design stems from the SSR module&#x2019;s unique ability to jointly model fine-grained local details and global context through its nested recursive structure and multi-directional scanning. By integrating this module, the <inline-formula id="inf59">
<mml:math id="m64">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> Block surpasses the performance of the original S6 operator in ViM (<xref ref-type="bibr" rid="B25">Liu Y. et al., 2024</xref>), particularly in delineating the complex boundaries and irregular structures characteristic of medical images.</p>
</sec>
<sec id="s3-4">
<label>3.4</label>
<title>SSR module</title>
<sec id="s3-4-1">
<label>3.4.1</label>
<title>NRC collaborative architecture</title>
<p>The architecture of the SSR module is detailed in <xref ref-type="fig" rid="F3">Figure 3</xref>. Its core design features a nested dual-path system composed of an NRC-Inner and an NRC-Outer pathway (<xref ref-type="fig" rid="F3">Figure 3a</xref>). The NRC-Inner path processes fine-grained tokens within local image patches to capture high-frequency details and local spatial dependencies. Concurrently, the NRC-Outer path models the global context across these patches, capturing the long-range dependencies essential for holistic scene understanding. A multi-directional zigzag scanning strategy is applied to both pathways, creating a shared geometric prior that ensures spatial consistency and sequence alignment during their interaction.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Architecture of the SSR module. <bold>(a)</bold> The NRC structure, which integrates multiple NRC-Inner blocks with a single NRC-Outer block to jointly model local and global spatial relationships. <bold>(b)</bold> The CZ-WKV attention mechanism, which aggregates multi-directional context by recursively applying m Bi-WKV steps across four distinct zigzag scanning patterns.</p>
</caption>
<graphic xlink:href="fbinf-06-1768786-g003.tif">
<alt-text content-type="machine-generated">Diagram illustrating the Nested Recurrent Cell (NRC) collaborative architecture, Cyclic Zigzag Weighted Key Value (CZ-WKV) attention mechanism, spatial and channel mixing flows, visualization of zigzag schemes on image grids, and legends for element-wise operations and activation functions used in the model.</alt-text>
</graphic>
</fig>
<p>In the NRC-Inner path, let <inline-formula id="inf60">
<mml:math id="m65">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denote the index of the NRC-Inner update (patch group) within the <inline-formula id="inf61">
<mml:math id="m66">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th layer. An input token sequence <inline-formula id="inf62">
<mml:math id="m67">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> (where <inline-formula id="inf63">
<mml:math id="m68">
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the sequence length and <inline-formula id="inf64">
<mml:math id="m69">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the channel dimension) undergoes a recursive refinement process. The sequence is first normalized to stabilize the feature distribution. Next, spatial dependencies are modeled using a Spatial Mix operation on the zigzag-ordered sequence, which incorporates a lightweight directional displacement to enhance directional awareness. State updates are then performed via a recursive <inline-formula id="inf65">
<mml:math id="m70">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-step aggregation based on our CZ-WKV module. This update is formulated as a residual operation in <xref ref-type="disp-formula" rid="e6">Equation 6</xref>:<disp-formula id="e6">
<mml:math id="m71">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>Z</mml:mi>
<mml:mo>-</mml:mo>
<mml:mi>W</mml:mi>
<mml:mi>K</mml:mi>
<mml:mi>V</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>here, <inline-formula id="inf66">
<mml:math id="m72">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the refined local-token sequence after the <inline-formula id="inf67">
<mml:math id="m73">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th NRC-Inner update in layer <inline-formula id="inf68">
<mml:math id="m74">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. The <inline-formula id="inf69">
<mml:math id="m75">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>Z</mml:mi>
<mml:mo>-</mml:mo>
<mml:mi>W</mml:mi>
<mml:mi>K</mml:mi>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> function aggregates information from neighboring tokens along the scan path, allowing the model to progressively refine local details while maintaining representational stability.</p>
<p>The two pathways are coupled via a cross-path feature injection mechanism that allows local features to inform the global context. After its update, the local sequence from the Inner path, <inline-formula id="inf70">
<mml:math id="m76">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, is summarized into a global descriptor and injected into the Outer path&#x2019;s state, <inline-formula id="inf71">
<mml:math id="m77">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>:<disp-formula id="e7">
<mml:math id="m78">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>C</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>in this step, <inline-formula id="inf72">
<mml:math id="m79">
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> flattens the sequence <inline-formula id="inf73">
<mml:math id="m80">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and a fully connected layer, <inline-formula id="inf74">
<mml:math id="m81">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, projects it to match the channel dimension of the NRC-Outer path. This enriched global state is then updated through the Outer path&#x2019;s own recursive unit, which applies the same CZ-RWKV operator under the shared zigzag scheme:<disp-formula id="e8">
<mml:math id="m82">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>Z</mml:mi>
<mml:mo>-</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>W</mml:mi>
<mml:mi>K</mml:mi>
<mml:mi>V</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>N</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>here, <inline-formula id="inf75">
<mml:math id="m83">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the global context representation after the update at step <inline-formula id="inf76">
<mml:math id="m84">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf77">
<mml:math id="m85">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is layer normalization applied to the previous state. In practice, <inline-formula id="inf78">
<mml:math id="m86">
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is small and the state is updated in-place; we maintain only the current state without storing intermediate <inline-formula id="inf79">
<mml:math id="m87">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. This cyclical process&#x2014;where the global memory is first enriched by fine-grained local details (<xref ref-type="disp-formula" rid="e7">Equation 7</xref>) and subsequently updated to model long-range interactions (<xref ref-type="disp-formula" rid="e8">Equation 8</xref>)&#x2014;enables local structural information to directly inform the global semantic state. This tight integration is critical for preserving spatial continuity along complex object boundaries in medical images.</p>
</sec>
<sec id="s3-4-2">
<label>3.4.2</label>
<title>Quad-directional token shift</title>
<p>The Q-Shift operation (<xref ref-type="bibr" rid="B12">Duan et al., 2024</xref>) introduces directional priors before feature serialization to establish local dependencies between adjacent tokens, thereby enabling subsequent multi-directional context modeling. Operationally, for an input feature map <inline-formula id="inf80">
<mml:math id="m88">
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, Q-Shift splits the channel dimension into four groups. It then shifts each group along one of the four orthogonal directions (up, down, left, right), concatenates the resulting features, and integrates them with the original input <inline-formula id="inf81">
<mml:math id="m89">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> via a gated residual connection, as shown in <xref ref-type="disp-formula" rid="e9">Equations 9</xref>, <xref ref-type="disp-formula" rid="e10">10</xref>:<disp-formula id="e9">
<mml:math id="m90">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mo>-</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:msup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2020;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
<disp-formula id="e10">
<mml:math id="m91">
<mml:mrow>
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2020;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mfenced open="(" close="">
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>w</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right"/>
<mml:mtd columnalign="left">
<mml:mi>X</mml:mi>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>w</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>4</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right"/>
<mml:mtd columnalign="left">
<mml:mi>X</mml:mi>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>w</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>:</mml:mo>
<mml:mn>3</mml:mn>
<mml:mi>C</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right"/>
<mml:mtd columnalign="left">
<mml:mfenced open="" close=")">
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>w</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>3</mml:mn>
<mml:mi>C</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>4</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>where <inline-formula id="inf82">
<mml:math id="m92">
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> denotes that separate Q-Shift branches are applied for the subsequent computation of <inline-formula id="inf83">
<mml:math id="m93">
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf84">
<mml:math id="m94">
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf85">
<mml:math id="m95">
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf86">
<mml:math id="m96">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is a learnable, channel-wise gating vector. This quad-directional shifting mechanism explicitly injects anisotropic neighborhood information into distinct channel groups with negligible computational overhead. This process enhances local directional awareness and provides essential guidance for the subsequent CZ-WKV-based recursive modeling.</p>
</sec>
<sec id="s3-4-3">
<label>3.4.3</label>
<title>Spatial mix</title>
<p>Following directional modeling, the feature map is serialized along a zigzag scanning path into a token sequence <inline-formula id="inf87">
<mml:math id="m97">
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, a process that preserves spatial continuity between adjacent tokens. The Spatial Mix module then processes this sequence as in <xref ref-type="disp-formula" rid="e11">Equation 11</xref>, by first applying the Q-Shift operation and three distinct linear projections to generate the primary components:<disp-formula id="e11">
<mml:math id="m98">
<mml:mrow>
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mi>Q</mml:mi>
<mml:mo>-</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mi>Q</mml:mi>
<mml:mo>-</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mi>Q</mml:mi>
<mml:mo>-</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>where <inline-formula id="inf88">
<mml:math id="m99">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are learnable projection matrices. Here, <inline-formula id="inf89">
<mml:math id="m100">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> functions as a content-dependent gating signal, while <inline-formula id="inf90">
<mml:math id="m101">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf91">
<mml:math id="m102">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> provide the key and value sequences for the subsequent state space computation.</p>
<p>The global spatial response is then computed by the linear-complexity CZ-WKV mechanism as in <xref ref-type="disp-formula" rid="e12">Equation 12</xref>:<disp-formula id="e12">
<mml:math id="m103">
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>v</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>Z</mml:mi>
<mml:mo>-</mml:mo>
<mml:mi>W</mml:mi>
<mml:mi>K</mml:mi>
<mml:mi>V</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>this mechanism executes an <inline-formula id="inf92">
<mml:math id="m104">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-step alternating recursion along multiple zigzag directions, enabling each token to aggregate information from the entire sequence while maintaining spatial ordering. The final output of the Spatial Mix module is formulated as in <xref ref-type="disp-formula" rid="e13">Equation 13</xref>:<disp-formula id="e13">
<mml:math id="m105">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>L</mml:mi>
<mml:mi>N</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2299;</mml:mo>
<mml:mi>w</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>O</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>where <inline-formula id="inf93">
<mml:math id="m106">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> is the sigmoid function, <inline-formula id="inf94">
<mml:math id="m107">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>O</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is an output projection matrix, <inline-formula id="inf95">
<mml:math id="m108">
<mml:mrow>
<mml:mo>&#x2299;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes element-wise multiplication, and <inline-formula id="inf96">
<mml:math id="m109">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represents Layer Normalization. In this formulation, <inline-formula id="inf97">
<mml:math id="m110">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> acts as a dynamic gate, modulating the infusion of the global response <inline-formula id="inf98">
<mml:math id="m111">
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> into each token. The residual connection with <inline-formula id="inf99">
<mml:math id="m112">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> ensures the preservation of the original local representation. Consequently, the Spatial Mix module effectively integrates long-range spatial context with local feature consistency.</p>
</sec>
<sec id="s3-4-4">
<label>3.4.4</label>
<title>Channel mix</title>
<p>The Channel Mixing module is designed to fuse cross-semantic features along the channel dimension (<xref ref-type="bibr" rid="B39">Zhou et al., 2023</xref>), functioning as a gated feed-forward branch that complements the Spatial Mix module. This module operates on the spatially enhanced features <inline-formula id="inf100">
<mml:math id="m113">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, again applying the Q-Shift operation and distinct linear projections to generate a gating descriptor and a pre-activation sequence, as shown in <xref ref-type="disp-formula" rid="e14">Equation 14</xref>:<disp-formula id="e14">
<mml:math id="m114">
<mml:mrow>
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mspace width="0.17em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mi>Q</mml:mi>
<mml:mo>-</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mspace width="0.17em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mi>Q</mml:mi>
<mml:mo>-</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>here, <inline-formula id="inf101">
<mml:math id="m115">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> functions as a channel-wise gating descriptor, while <inline-formula id="inf102">
<mml:math id="m116">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> serves as the pre-activation input for constructing the value branch. The value sequence <inline-formula id="inf103">
<mml:math id="m117">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is then computed as in <xref ref-type="disp-formula" rid="e15">Equation 15</xref>:<disp-formula id="e15">
<mml:math id="m118">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>U</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(15)</label>
</disp-formula>where the <inline-formula id="inf104">
<mml:math id="m119">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>U</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> activation function suppresses negative responses and amplifies strong positive activations. The final output, <inline-formula id="inf105">
<mml:math id="m120">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, is computed via a gated residual connection as in <xref ref-type="disp-formula" rid="e16">Equation 16</xref>:<disp-formula id="e16">
<mml:math id="m121">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>L</mml:mi>
<mml:mi>N</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2299;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>O</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(16)</label>
</disp-formula>the core of this operation is the element-wise product <inline-formula id="inf106">
<mml:math id="m122">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2299;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which functions as a dynamic, data-dependent gate to selectively amplify informative channels while attenuating irrelevant ones. The residual connection with <inline-formula id="inf107">
<mml:math id="m123">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> preserves the input feature representation, enabling the module to enhance nonlinear cross-channel interactions efficiently and without the quadratic computational complexity typical of self-attention mechanisms.</p>
</sec>
</sec>
<sec id="s3-5">
<label>3.5</label>
<title>CZ-WKV attention mechanism</title>
<p>The Bidirectional WKV (Bi-WKV) module, a core component of the Vision-RWKV (<xref ref-type="bibr" rid="B12">Duan et al., 2024</xref>) spatial mixture, effectively models long-range dependencies in linear time. However, its performance is highly sensitive to the orientation of token scanning. As the scanning path changes across a 2D image, the resulting token sequence is altered, leading to inconsistent model outputs. While subsequent methods like Re-WKV (<xref ref-type="bibr" rid="B36">Yang et al., 2025</xref>) mitigate this sensitivity by applying Bi-WKV across multiple scanning directions, this approach disrupts the image&#x2019;s inherent spatial continuity. Consequently, it compromises the model&#x2019;s ability to leverage crucial image-space inductive biases. Zigzag-WKV (<xref ref-type="bibr" rid="B8">Chen et al., 2025</xref>) addresses this by preserving spatial continuity via a zigzag scanning pattern. Nevertheless, its single-pass, unidirectional computation provides only a static representation of global context, limiting its capacity to model complex, long-range dependencies effectively. To overcome these limitations, we introduce the CZ-WKV module (<xref ref-type="fig" rid="F3">Figure 3b</xref>). CZ-WKV performs a cascaded, <inline-formula id="inf108">
<mml:math id="m124">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-step sequence of Bi-WKV operations along a single zigzag path. This design enables the model to dynamically and recursively refine global token interactions. As a result, it preserves the global receptive field and spatial continuity of zigzag scanning while achieving robust performance irrespective of the initial scanning orientation.</p>
<sec id="s3-5-1">
<label>3.5.1</label>
<title>Bi-WKV</title>
<p>To address the limited receptive field of the unidirectional WKV (Uni-WKV) mechanism, we adopt the Bi-WKV formulation from Vision-RWKV (<xref ref-type="bibr" rid="B12">Duan et al., 2024</xref>). This bidirectional approach expands the receptive field to encompass the entire token sequence, enabling global context modeling while maintaining linear-time complexity. Specifically, for a given projected key <inline-formula id="inf109">
<mml:math id="m125">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and value <inline-formula id="inf110">
<mml:math id="m126">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, the attention output for the <inline-formula id="inf111">
<mml:math id="m127">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th token, <inline-formula id="inf112">
<mml:math id="m128">
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mi>k</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, is computed as in <xref ref-type="disp-formula" rid="e17">Equation 17</xref>:<disp-formula id="e17">
<mml:math id="m129">
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mi>k</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>B</mml:mi>
<mml:mi>i</mml:mi>
<mml:mo>-</mml:mo>
<mml:mi>W</mml:mi>
<mml:mi>K</mml:mi>
<mml:mi>V</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2260;</mml:mo>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mi>w</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2260;</mml:mo>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mi>w</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(17)</label>
</disp-formula>in this formulation, <inline-formula id="inf113">
<mml:math id="m130">
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the total number of tokens, while <inline-formula id="inf114">
<mml:math id="m131">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are the key and value vectors for the <inline-formula id="inf115">
<mml:math id="m132">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th token, respectively. The term <inline-formula id="inf116">
<mml:math id="m133">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> encodes the relative position between tokens <inline-formula id="inf117">
<mml:math id="m134">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf118">
<mml:math id="m135">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. A learnable vector <inline-formula id="inf119">
<mml:math id="m136">
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> modulates the spatial decay based on this relative position, while a second learnable vector, <inline-formula id="inf120">
<mml:math id="m137">
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, applies a specific weighting to the current token <inline-formula id="inf121">
<mml:math id="m138">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, thereby amplifying its feature contribution.</p>
<p>The Bi-WKV mechanism concurrently achieves a global receptive field and high computational efficiency. First, the output for each token incorporates information from all other tokens in the sequence, thereby establishing a global receptive field. Second, this mechanism avoids the quadratic complexity characteristic of standard self-attention by eliminating explicit query-key matrix multiplications. For an input sequence of length <inline-formula id="inf122">
<mml:math id="m139">
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> with channel dimension <inline-formula id="inf123">
<mml:math id="m140">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, the computational cost of Bi-WKV scales linearly with <inline-formula id="inf124">
<mml:math id="m141">
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (i.e., <inline-formula id="inf125">
<mml:math id="m142">
<mml:mrow>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>), as demonstrated in Vision-RWKV (<xref ref-type="bibr" rid="B12">Duan et al., 2024</xref>). This linear scalability makes the model particularly well-suited for processing the long token sequences generated from high-resolution medical images, where standard attention mechanisms would be computationally prohibitive.</p>
</sec>
<sec id="s3-5-2">
<label>3.5.2</label>
<title>CZ-WKV</title>
<p>Segmenting 2D medical images&#x2014;such as those from dermatoscopy, ultrasound, or single-slice CT&#x2014;presents substantial challenges. These images are frequently characterized by low contrast, complex boundaries that are often elongated or jagged, significant geometric deformations, and wide variations in lesion shape and scale. Consequently, an effective segmentation model must satisfy two critical requirements: it must preserve local spatial continuity to honor anatomical and geometric priors, and simultaneously model global, long-range dependencies to achieve robustness against changes in object orientation and scanning direction. Existing state space methods, however, fall short of concurrently meeting these demands. The standard Bi-WKV is sensitive to the sequence unrolling direction. While Re-WKV introduces multi-directional interactions, it does so at the cost of disrupting 2D spatial adjacency. Conversely, Zigzag-WKV maintains spatial continuity but is limited by a fixed scanning path in each forward pass, which provides insufficient cross-layer global context for complex structures. To address these limitations, we propose the CZ-WKV. This module executes a scalable <inline-formula id="inf126">
<mml:math id="m143">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-step sequence of Bi-WKV operations, where each step is guided by a cyclically shifting zigzag scanning pattern. This design strategically balances directional robustness with the preservation of spatial continuity, all while maintaining linear-time efficiency. The core of CZ-WKV is the following recursive formulation:<disp-formula id="e18">
<mml:math id="m144">
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mi>k</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>B</mml:mi>
<mml:mi>i</mml:mi>
<mml:mo>-</mml:mo>
<mml:mi>W</mml:mi>
<mml:mi>K</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">zig</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">zig</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mi>k</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(18)</label>
</disp-formula>let <inline-formula id="inf127">
<mml:math id="m145">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>i</mml:mi>
<mml:mo>-</mml:mo>
<mml:mi>W</mml:mi>
<mml:mi>K</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denote the <inline-formula id="inf128">
<mml:math id="m146">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th Bi-WKV operation and <inline-formula id="inf129">
<mml:math id="m147">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">zig</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represent the spatial permutation operation corresponding to the scanning direction for the <inline-formula id="inf130">
<mml:math id="m148">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th zigzag scheme at step <inline-formula id="inf131">
<mml:math id="m149">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf132">
<mml:math id="m150">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mn>1,2,3,4</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf133">
<mml:math id="m151">
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Within a given scheme <inline-formula id="inf134">
<mml:math id="m152">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, the scanning direction, defined by <inline-formula id="inf135">
<mml:math id="m153">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">zig</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, alternates between successive steps (e.g., <inline-formula id="inf136">
<mml:math id="m154">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">zig</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> for odd steps, <inline-formula id="inf137">
<mml:math id="m155">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">zig</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> for even steps) to ensure comprehensive feature aggregation from opposing orientations. To promote directional diversity across layers, the scheme index is cycled as <inline-formula id="inf138">
<mml:math id="m156">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mspace width="0.5em"/>
<mml:mi>mod</mml:mi>
<mml:mspace width="0.5em"/>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, which traverses the four distinct zigzag patterns in a round-robin manner.</p>
<p>The process is defined recursively. We denote <inline-formula id="inf139">
<mml:math id="m157">
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mi>k</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> as the attention output of the <inline-formula id="inf140">
<mml:math id="m158">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th Bi-WKV iteration. The process is initialized using the value projection <inline-formula id="inf141">
<mml:math id="m159">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, such that <inline-formula id="inf142">
<mml:math id="m160">
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mi>k</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. As formulated in <xref ref-type="disp-formula" rid="e18">Equation 18</xref>, at each subsequent step <inline-formula id="inf143">
<mml:math id="m161">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, the Bi-WKV module uses the output from the preceding iteration, <inline-formula id="inf144">
<mml:math id="m162">
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mi>k</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, as its new value input. This recursive mechanism thereby integrates the attention output derived from a different scanning direction in the previous step. After <inline-formula id="inf145">
<mml:math id="m163">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> iterations, the final output is given by <xref ref-type="disp-formula" rid="e19">Equation 19</xref>:<disp-formula id="e19">
<mml:math id="m164">
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>v</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>Z</mml:mi>
<mml:mo>-</mml:mo>
<mml:mi>W</mml:mi>
<mml:mi>K</mml:mi>
<mml:mi>V</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>w</mml:mi>
<mml:mi>k</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(19)</label>
</disp-formula>
</p>
<p>Thus, CZ-WKV synergistically combines a recurrent attention mechanism with multiple, cycling zigzag scanning paths. This approach strengthens global token interactions far more effectively than Zigzag-WKV while, unlike Re-WKV, preserving vital 2D spatial continuity. Furthermore, this enhanced modeling capability is achieved without sacrificing computational efficiency. As the number of iterations <inline-formula id="inf146">
<mml:math id="m165">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a small constant much less than the sequence length (<inline-formula id="inf147">
<mml:math id="m166">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>&#x226a;</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>), the computational complexity remains linear with respect to sequence length, scaling as <inline-formula id="inf148">
<mml:math id="m167">
<mml:mrow>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. This makes the proposed CZ-WKV module an efficient yet powerful solution for robust medical image segmentation.</p>
</sec>
</sec>
<sec id="s3-6">
<label>3.6</label>
<title>Loss function</title>
<p>To train our model, we employ a composite loss function engineered to balance pixel-level precision with region-level consistency. This function is the sum of a cross-entropy (CE) loss and a Dice loss, formulated as in <xref ref-type="disp-formula" rid="e20">Equation 20</xref>:<disp-formula id="e20">
<mml:math id="m168">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>E</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">Dice</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(20)</label>
</disp-formula>the CE component, <inline-formula id="inf149">
<mml:math id="m169">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>E</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, targets pixel-wise classification accuracy, while the Dice component, <inline-formula id="inf150">
<mml:math id="m170">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">Dice</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, addresses the common challenge of class imbalance in medical segmentation by maximizing the geometric overlap between the model&#x2019;s prediction and the ground-truth annotation (<xref ref-type="bibr" rid="B19">Isensee et al., 2021</xref>; <xref ref-type="bibr" rid="B33">Wu et al., 2024</xref>). The cross-entropy loss is defined as in <xref ref-type="disp-formula" rid="e21">Equation 21</xref>:<disp-formula id="e21">
<mml:math id="m171">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>E</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(21)</label>
</disp-formula>and the Dice loss is defined as in <xref ref-type="disp-formula" rid="e22">Equation 22</xref>:<disp-formula id="e22">
<mml:math id="m172">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">Dice</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(22)</label>
</disp-formula>
</p>
<p>In these equations, <inline-formula id="inf151">
<mml:math id="m173">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the total number of pixels in a batch and <inline-formula id="inf152">
<mml:math id="m174">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the number of segmentation classes. For a given pixel <inline-formula id="inf153">
<mml:math id="m175">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf154">
<mml:math id="m176">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is a binary indicator for the ground-truth label (1 if pixel <inline-formula id="inf155">
<mml:math id="m177">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> belongs to class <inline-formula id="inf156">
<mml:math id="m178">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>; 0 otherwise), and <inline-formula id="inf157">
<mml:math id="m179">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the model&#x2019;s predicted probability of pixel <inline-formula id="inf158">
<mml:math id="m180">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> belonging to class <inline-formula id="inf159">
<mml:math id="m181">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. The terms <inline-formula id="inf160">
<mml:math id="m182">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf161">
<mml:math id="m183">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represent the flattened ground-truth and prediction vectors, respectively. This dual-component loss function compels the model to produce segmentations that are not only precise at the pixel level but also structurally coherent, which is critical for delineating fine anatomical details and ensuring region integrity.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments</title>
<sec id="s4-1">
<label>4.1</label>
<title>Datasets</title>
<p>To assess the performance and scalability of <inline-formula id="inf162">
<mml:math id="m184">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM, we benchmarked the model on five publicly available medical image segmentation datasets. These datasets encompass a range of clinical applications&#x2014;skin, breast lesion, colorectal polyp, and organ segmentation&#x2014;and feature diverse imaging modalities and resolutions, providing a comprehensive testbed for evaluating the model&#x2019;s efficacy and generalizability.<list list-type="simple">
<list-item>
<label>1.</label>
<p>Skin lesion datasets. We utilized two benchmarks from the International Skin Imaging Collaboration (ISIC): ISIC 2017 (<xref ref-type="bibr" rid="B4">Berseth, 2017</xref>) and ISIC 2018 (<xref ref-type="bibr" rid="B9">Codella et al., 2019</xref>). The ISIC 2017 dataset contains 2,150 images and ISIC 2018 contains 2,694 images; each image is paired with a corresponding ground-truth lesion mask. Following established protocols (<xref ref-type="bibr" rid="B30">Ruan et al., 2024</xref>), we partitioned ISIC 2017 into training (1,500 images) and test (650 images) sets. For ISIC 2018, the split was 1,886 images for training and 808 for testing.</p>
</list-item>
<list-item>
<label>2.</label>
<p>Polyp dataset. For polyp segmentation, we used the CVC-ClinicDB dataset (<xref ref-type="bibr" rid="B3">Bernal et al., 2015</xref>), originally from the MICCAI 2015 colonoscopic polyp detection challenge. This dataset comprises 612 colonoscopic images with expert-annotated polyp masks. It presents clinically relevant challenges, including polyps of varying sizes and morphologies, inconsistent illumination, complex mucosal structures, and specular artifacts. Adhering to standard splits (<xref ref-type="bibr" rid="B20">Jha et al., 2019</xref>), we divided the dataset into 429 images for training and 183 for testing.</p>
</list-item>
<list-item>
<label>3.</label>
<p>Ultrasound dataset. The Breast Ultrasound Images (BUSI) dataset (<xref ref-type="bibr" rid="B1">Al-Dhabyani et al., 2020</xref>) was used to evaluate performance on ultrasound data. It consists of 780 images, each with a ground-truth mask of a breast lesion. This dataset is particularly challenging due to inherent speckle noise, low contrast, and the irregular lesion morphologies characteristic of ultrasound imaging. The dataset was partitioned into 624 training and 156 test images.</p>
</list-item>
<list-item>
<label>4.</label>
<p>Multi-organ dataset. To evaluate multi-organ segmentation, we employed the Synapse multi-organ CT dataset from the MICCAI 2015 Multi-Atlas Abdomen Labeling Challenge (<xref ref-type="bibr" rid="B21">Landman et al., 2015</xref>). This dataset includes 30 abdominal CT volumes, corresponding to 3,779 axial slices, with segmentations for eight organs: aorta, gallbladder, left kidney, right kidney, liver, pancreas, spleen, and stomach. Consistent with prior work, we used 18 cases for training and the remaining 12 cases for testing.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s4-2">
<label>4.2</label>
<title>Implementation details</title>
<p>We applied a standardized training protocol across all segmentation tasks to ensure fair and reproducible comparisons. All models were trained and evaluated using the PyTorch framework on a single NVIDIA RTX 3080 GPU. Input images were uniformly resized to <inline-formula id="inf163">
<mml:math id="m185">
<mml:mrow>
<mml:mn>256</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> pixels. To prevent overfitting and enhance model generalizability, we applied online data augmentation, including random horizontal flips, vertical flips, and rotations. Models were trained for 150 epochs with a batch size of 16 using the AdamW optimizer (<xref ref-type="bibr" rid="B40">Zhou et al., 2024</xref>). The learning rate was initialized to 1e-3 and adjusted using a linear warmup schedule followed by polynomial decay.</p>
<p>Following common practice, we further split 10% of the training set as a validation set for model selection and checkpointing, while keeping the official test split unchanged and using it only for final evaluation. The validation split was fixed across random seeds to ensure fair paired comparisons. Unless otherwise specified, all reported metrics are obtained using the checkpoint with the best validation DSC on each run. To account for training variability, all experiments were repeated five times using different random seeds while keeping the data splits and all hyperparameters fixed. For fair paired comparisons, the same set of seeds was used for all competing methods on each benchmark.</p>
<p>The results reported in <xref ref-type="table" rid="T1">Tables 1</xref>&#x2013;<xref ref-type="table" rid="T4">4</xref> are presented as mean <inline-formula id="inf164">
<mml:math id="m186">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> standard deviation over the five runs. To further demonstrate optimization stability and convergence behavior under this standardized protocol, we provide training and validation curves on ISIC 2018 in <xref ref-type="fig" rid="F4">Figure 4</xref>, including loss and Dice Similarity Coefficient (DSC) over 150 epochs. For visual clarity, we plot one run for illustration.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Performance comparison with state-of-the-art methods on the ISIC 2017 dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Type</th>
<th align="center">
<inline-formula id="inf165">
<mml:math id="m187">
<mml:mrow>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">t</mml:mi>
<mml:mi mathvariant="normal">h</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">d</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf166">
<mml:math id="m188">
<mml:mrow>
<mml:mi mathvariant="normal">D</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">C</mml:mi>
<mml:mi mathvariant="italic">&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf167">
<mml:math id="m189">
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">I</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">U</mml:mi>
<mml:mi mathvariant="italic">&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf168">
<mml:math id="m190">
<mml:mrow>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="italic">&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf169">
<mml:math id="m191">
<mml:mrow>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">p</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="italic">&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf170">
<mml:math id="m192">
<mml:mrow>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mi mathvariant="italic">&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf171">
<mml:math id="m193">
<mml:mrow>
<mml:mi mathvariant="normal">B</mml:mi>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="italic">&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="3" align="left">CNN</td>
<td align="center">UNet (<xref ref-type="bibr" rid="B29">Ronneberger et al., 2015</xref>)</td>
<td align="center">86.99 <inline-formula id="inf172">
<mml:math id="m194">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.89</td>
<td align="center">76.98 <inline-formula id="inf173">
<mml:math id="m195">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.50</td>
<td align="center">94.65 <inline-formula id="inf174">
<mml:math id="m196">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.49</td>
<td align="center">97.43 <inline-formula id="inf175">
<mml:math id="m197">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.12</td>
<td align="center">86.82 <inline-formula id="inf176">
<mml:math id="m198">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.94</td>
<td align="center">83.81 <inline-formula id="inf177">
<mml:math id="m199">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.55</td>
</tr>
<tr>
<td align="center">UNet&#x2b;&#x2b; (<xref ref-type="bibr" rid="B38">Zhou et al., 2018</xref>)</td>
<td align="center">86.00 <inline-formula id="inf178">
<mml:math id="m200">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.82</td>
<td align="center">75.44 <inline-formula id="inf179">
<mml:math id="m201">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.62</td>
<td align="center">94.35 <inline-formula id="inf180">
<mml:math id="m202">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.47</td>
<td align="center">97.34 <inline-formula id="inf181">
<mml:math id="m203">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.11</td>
<td align="center">85.40 <inline-formula id="inf182">
<mml:math id="m204">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.91</td>
<td align="center">83.42 <inline-formula id="inf183">
<mml:math id="m205">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.43</td>
</tr>
<tr>
<td align="center">Att-Unet (<xref ref-type="bibr" rid="B27">Oktay et al., 2018</xref>)</td>
<td align="center">87.08 <inline-formula id="inf184">
<mml:math id="m206">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.18</td>
<td align="center">77.12 <inline-formula id="inf185">
<mml:math id="m207">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.53</td>
<td align="center">94.79 <inline-formula id="inf186">
<mml:math id="m208">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.52</td>
<td align="center">97.78 <inline-formula id="inf187">
<mml:math id="m209">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.13</td>
<td align="center">85.65 <inline-formula id="inf188">
<mml:math id="m210">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.20</td>
<td align="center">84.12 <inline-formula id="inf189">
<mml:math id="m211">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.78</td>
</tr>
<tr>
<td rowspan="3" align="left">Transformer</td>
<td align="center">TransUNet (<xref ref-type="bibr" rid="B7">Chen et al., 2024</xref>)</td>
<td align="center">88.13 <inline-formula id="inf190">
<mml:math id="m212">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.78</td>
<td align="center">78.79 <inline-formula id="inf191">
<mml:math id="m213">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.57</td>
<td align="center">95.12 <inline-formula id="inf192">
<mml:math id="m214">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.41</td>
<td align="center">98.14 <inline-formula id="inf193">
<mml:math id="m215">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.11</td>
<td align="center">86.05 <inline-formula id="inf194">
<mml:math id="m216">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.01</td>
<td align="center">85.47 <inline-formula id="inf195">
<mml:math id="m217">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.47</td>
</tr>
<tr>
<td align="center">TransFuse (<xref ref-type="bibr" rid="B37">Zhang et al., 2021</xref>)</td>
<td align="center">84.40 <inline-formula id="inf196">
<mml:math id="m218">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.71</td>
<td align="center">79.21 <inline-formula id="inf197">
<mml:math id="m219">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.45</td>
<td align="center">95.17 <inline-formula id="inf198">
<mml:math id="m220">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.39</td>
<td align="center">97.98 <inline-formula id="inf199">
<mml:math id="m221">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.14</td>
<td align="center">87.14 <inline-formula id="inf200">
<mml:math id="m222">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.64</td>
<td align="center">82.31 <inline-formula id="inf201">
<mml:math id="m223">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.29</td>
</tr>
<tr>
<td align="center">TC-Net (<xref ref-type="bibr" rid="B10">Dong et al., 2022</xref>)</td>
<td align="center">87.23 <inline-formula id="inf202">
<mml:math id="m224">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.91</td>
<td align="center">77.35 <inline-formula id="inf203">
<mml:math id="m225">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.43</td>
<td align="center">94.84 <inline-formula id="inf204">
<mml:math id="m226">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.58</td>
<td align="center">98.05 <inline-formula id="inf205">
<mml:math id="m227">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.16</td>
<td align="center">85.85 <inline-formula id="inf206">
<mml:math id="m228">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.81</td>
<td align="center">84.28 <inline-formula id="inf207">
<mml:math id="m229">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.48</td>
</tr>
<tr>
<td rowspan="4" align="left">SSM</td>
<td align="center">VM-UNet (<xref ref-type="bibr" rid="B30">Ruan et al., 2024</xref>)</td>
<td align="center">89.03 <inline-formula id="inf208">
<mml:math id="m230">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.96</td>
<td align="center">80.23 <inline-formula id="inf209">
<mml:math id="m231">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.40</td>
<td align="center">95.29 <inline-formula id="inf210">
<mml:math id="m232">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.42</td>
<td align="center">97.58 <inline-formula id="inf211">
<mml:math id="m233">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.10</td>
<td align="center">
<bold>89.90</bold> <inline-formula id="inf212">
<mml:math id="m234">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.15</bold>
</td>
<td align="center">86.51 <inline-formula id="inf213">
<mml:math id="m235">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.58</td>
</tr>
<tr>
<td align="center">CC-ViM (<xref ref-type="bibr" rid="B41">Zhu et al., 2025</xref>)</td>
<td align="center">89.74 <inline-formula id="inf214">
<mml:math id="m236">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.91</td>
<td align="center">81.40 <inline-formula id="inf215">
<mml:math id="m237">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.38</td>
<td align="center">95.60 <inline-formula id="inf216">
<mml:math id="m238">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.49</td>
<td align="center">98.19 <inline-formula id="inf217">
<mml:math id="m239">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.90</td>
<td align="center">88.70 <inline-formula id="inf218">
<mml:math id="m240">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.95</td>
<td align="center">86.72 <inline-formula id="inf219">
<mml:math id="m241">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.55</td>
</tr>
<tr>
<td align="center">SliceMamba (<xref ref-type="bibr" rid="B13">Fan et al., 2025</xref>)</td>
<td align="center">89.93 <inline-formula id="inf220">
<mml:math id="m242">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.71</td>
<td align="center">81.70 <inline-formula id="inf221">
<mml:math id="m243">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.36</td>
<td align="center">
<bold>95.75</bold> <inline-formula id="inf222">
<mml:math id="m244">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.56</bold>
</td>
<td align="center">98.30 <inline-formula id="inf223">
<mml:math id="m245">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.06</td>
<td align="center">88.81 <inline-formula id="inf224">
<mml:math id="m246">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.93</td>
<td align="center">86.98 <inline-formula id="inf225">
<mml:math id="m247">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.35</td>
</tr>
<tr>
<td align="center">SA-UMamba (<xref ref-type="bibr" rid="B26">Liu et al., 2025</xref>)</td>
<td align="center">89.40 <inline-formula id="inf226">
<mml:math id="m248">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.93</td>
<td align="center">80.83 <inline-formula id="inf227">
<mml:math id="m249">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.34</td>
<td align="center">94.44 <inline-formula id="inf228">
<mml:math id="m250">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.53</td>
<td align="center">97.82 <inline-formula id="inf229">
<mml:math id="m251">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.15</td>
<td align="center">89.60 <inline-formula id="inf230">
<mml:math id="m252">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.87</td>
<td align="center">86.32 <inline-formula id="inf231">
<mml:math id="m253">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.55</td>
</tr>
<tr>
<td rowspan="3" align="left">RWKV</td>
<td align="center">Zig-RiR (<xref ref-type="bibr" rid="B8">Chen et al., 2025</xref>)</td>
<td align="center">84.71 <inline-formula id="inf232">
<mml:math id="m254">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.76</td>
<td align="center">76.76 <inline-formula id="inf233">
<mml:math id="m255">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.42</td>
<td align="center">95.10 <inline-formula id="inf234">
<mml:math id="m256">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.27</td>
<td align="center">98.21 <inline-formula id="inf235">
<mml:math id="m257">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.68</td>
<td align="center">88.64 <inline-formula id="inf236">
<mml:math id="m258">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.28</td>
<td align="center">82.56 <inline-formula id="inf237">
<mml:math id="m259">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.34</td>
</tr>
<tr>
<td align="center">HER-Seg (<xref ref-type="bibr" rid="B35">Xu et al., 2025</xref>)</td>
<td align="center">87.71 <inline-formula id="inf238">
<mml:math id="m260">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.64</td>
<td align="center">80.85 <inline-formula id="inf239">
<mml:math id="m261">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.44</td>
<td align="center">94.76 <inline-formula id="inf240">
<mml:math id="m262">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.65</td>
<td align="center">97.83 <inline-formula id="inf241">
<mml:math id="m263">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.66</td>
<td align="center">88.26 <inline-formula id="inf242">
<mml:math id="m264">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.98</td>
<td align="center">84.38 <inline-formula id="inf243">
<mml:math id="m265">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.46</td>
</tr>
<tr>
<td align="center">ZR<sup>2</sup>ViM (Ours)</td>
<td align="center">
<bold>92.12</bold> <inline-formula id="inf244">
<mml:math id="m266">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.27</bold>
</td>
<td align="center">
<bold>85.83</bold> <inline-formula id="inf245">
<mml:math id="m267">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.31</bold>
</td>
<td align="center">95.68 <inline-formula id="inf246">
<mml:math id="m268">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.34</td>
<td align="center">
<bold>98.36</bold> <inline-formula id="inf247">
<mml:math id="m269">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.12</bold>
</td>
<td align="center">89.86 <inline-formula id="inf248">
<mml:math id="m270">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.91</td>
<td align="center">
<bold>89.64</bold> <inline-formula id="inf249">
<mml:math id="m271">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.23</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Top results are highlighted in bold. Results are mean <inline-formula id="inf250">
<mml:math id="m272">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> SD over five runs with different random seeds. BFS uses a 2-pixel tolerance at <inline-formula id="inf251">
<mml:math id="m273">
<mml:mrow>
<mml:mn>256</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> with the same protocol for all methods.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Performance comparison with state-of-the-art methods on the ISIC 2018 dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Type</th>
<th align="center">
<inline-formula id="inf252">
<mml:math id="m274">
<mml:mrow>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">t</mml:mi>
<mml:mi mathvariant="normal">h</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">d</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf253">
<mml:math id="m275">
<mml:mrow>
<mml:mi mathvariant="normal">D</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">C</mml:mi>
<mml:mi mathvariant="italic">&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf254">
<mml:math id="m276">
<mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">I</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">U</mml:mi>
<mml:mi mathvariant="italic">&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf255">
<mml:math id="m277">
<mml:mrow>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="italic">&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf256">
<mml:math id="m278">
<mml:mrow>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">p</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="italic">&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf257">
<mml:math id="m279">
<mml:mrow>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mi mathvariant="italic">&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf258">
<mml:math id="m280">
<mml:mrow>
<mml:mi mathvariant="normal">B</mml:mi>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="italic">&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="3" align="left">CNN</td>
<td align="center">UNet (<xref ref-type="bibr" rid="B29">Ronneberger et al., 2015</xref>)</td>
<td align="center">87.55 <inline-formula id="inf259">
<mml:math id="m281">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.01</td>
<td align="center">77.86 <inline-formula id="inf260">
<mml:math id="m282">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.58</td>
<td align="center">93.05 <inline-formula id="inf261">
<mml:math id="m283">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.37</td>
<td align="center">96.69 <inline-formula id="inf262">
<mml:math id="m284">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.62</td>
<td align="center">85.86 <inline-formula id="inf263">
<mml:math id="m285">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.48</td>
<td align="center">84.52 <inline-formula id="inf264">
<mml:math id="m286">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.93</td>
</tr>
<tr>
<td align="center">UNet&#x2b;&#x2b; (<xref ref-type="bibr" rid="B38">Zhou et al., 2018</xref>)</td>
<td align="center">87.83 <inline-formula id="inf265">
<mml:math id="m287">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.79</td>
<td align="center">78.31 <inline-formula id="inf266">
<mml:math id="m288">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.61</td>
<td align="center">93.02 <inline-formula id="inf267">
<mml:math id="m289">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.46</td>
<td align="center">95.75 <inline-formula id="inf268">
<mml:math id="m290">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.49</td>
<td align="center">88.65 <inline-formula id="inf269">
<mml:math id="m291">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.93</td>
<td align="center">85.23 <inline-formula id="inf270">
<mml:math id="m292">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.77</td>
</tr>
<tr>
<td align="center">Att-Unet (<xref ref-type="bibr" rid="B27">Oktay et al., 2018</xref>)</td>
<td align="center">87.91 <inline-formula id="inf271">
<mml:math id="m293">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.74</td>
<td align="center">78.43 <inline-formula id="inf272">
<mml:math id="m294">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.49</td>
<td align="center">93.13 <inline-formula id="inf273">
<mml:math id="m295">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.30</td>
<td align="center">96.23 <inline-formula id="inf274">
<mml:math id="m296">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.33</td>
<td align="center">87.60 <inline-formula id="inf275">
<mml:math id="m297">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.89</td>
<td align="center">84.89 <inline-formula id="inf276">
<mml:math id="m298">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.75</td>
</tr>
<tr>
<td rowspan="3" align="left">Transformer</td>
<td align="center">TransUNet (<xref ref-type="bibr" rid="B7">Chen et al., 2024</xref>)</td>
<td align="center">89.56 <inline-formula id="inf277">
<mml:math id="m299">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.42</td>
<td align="center">81.09 <inline-formula id="inf278">
<mml:math id="m300">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.51</td>
<td align="center">93.99 <inline-formula id="inf279">
<mml:math id="m301">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24</td>
<td align="center">97.02 <inline-formula id="inf280">
<mml:math id="m302">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.33</td>
<td align="center">88.14 <inline-formula id="inf281">
<mml:math id="m303">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.49</td>
<td align="center">86.78 <inline-formula id="inf282">
<mml:math id="m304">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.62</td>
</tr>
<tr>
<td align="center">TransFuse (<xref ref-type="bibr" rid="B37">Zhang et al., 2021</xref>)</td>
<td align="center">89.27 <inline-formula id="inf283">
<mml:math id="m305">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.35</td>
<td align="center">80.63 <inline-formula id="inf284">
<mml:math id="m306">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.47</td>
<td align="center">93.66 <inline-formula id="inf285">
<mml:math id="m307">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
<td align="center">95.74 <inline-formula id="inf286">
<mml:math id="m308">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.28</td>
<td align="center">
<bold>91.26</bold> <inline-formula id="inf287">
<mml:math id="m309">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.67</bold>
</td>
<td align="center">87.12 <inline-formula id="inf288">
<mml:math id="m310">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.81</td>
</tr>
<tr>
<td align="center">TC-Net (<xref ref-type="bibr" rid="B10">Dong et al., 2022</xref>)</td>
<td align="center">88.25 <inline-formula id="inf289">
<mml:math id="m311">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.89</td>
<td align="center">78.97 <inline-formula id="inf290">
<mml:math id="m312">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.44</td>
<td align="center">93.32 <inline-formula id="inf291">
<mml:math id="m313">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.55</td>
<td align="center">96.48 <inline-formula id="inf292">
<mml:math id="m314">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.57</td>
<td align="center">87.60 <inline-formula id="inf293">
<mml:math id="m315">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.52</td>
<td align="center">85.31 <inline-formula id="inf294">
<mml:math id="m316">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.73</td>
</tr>
<tr>
<td rowspan="4" align="left">SSM</td>
<td align="center">VM-UNet (<xref ref-type="bibr" rid="B30">Ruan et al., 2024</xref>)</td>
<td align="center">89.71 <inline-formula id="inf295">
<mml:math id="m317">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.78</td>
<td align="center">81.35 <inline-formula id="inf296">
<mml:math id="m318">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.42</td>
<td align="center">93.91 <inline-formula id="inf297">
<mml:math id="m319">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">96.13 <inline-formula id="inf298">
<mml:math id="m320">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.14</td>
<td align="center">91.12 <inline-formula id="inf299">
<mml:math id="m321">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.31</td>
<td align="center">87.36 <inline-formula id="inf300">
<mml:math id="m322">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.66</td>
</tr>
<tr>
<td align="center">CC-ViM (<xref ref-type="bibr" rid="B41">Zhu et al., 2025</xref>)</td>
<td align="center">90.06 <inline-formula id="inf301">
<mml:math id="m323">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.56</td>
<td align="center">81.92 <inline-formula id="inf302">
<mml:math id="m324">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.39</td>
<td align="center">94.23 <inline-formula id="inf303">
<mml:math id="m325">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.39</td>
<td align="center">
<bold>97.32</bold> <inline-formula id="inf304">
<mml:math id="m326">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.57</bold>
</td>
<td align="center">88.74 <inline-formula id="inf305">
<mml:math id="m327">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.40</td>
<td align="center">87.42 <inline-formula id="inf306">
<mml:math id="m328">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.38</td>
</tr>
<tr>
<td align="center">SliceMamba (<xref ref-type="bibr" rid="B13">Fan et al., 2025</xref>)</td>
<td align="center">90.30 <inline-formula id="inf307">
<mml:math id="m329">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.09</td>
<td align="center">82.32 <inline-formula id="inf308">
<mml:math id="m330">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.36</td>
<td align="center">94.29 <inline-formula id="inf309">
<mml:math id="m331">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.54</td>
<td align="center">97.14 <inline-formula id="inf310">
<mml:math id="m332">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.59</td>
<td align="center">89.58 <inline-formula id="inf311">
<mml:math id="m333">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.82</td>
<td align="center">87.89 <inline-formula id="inf312">
<mml:math id="m334">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.95</td>
</tr>
<tr>
<td align="center">SA-UMamba (<xref ref-type="bibr" rid="B26">Liu et al., 2025</xref>)</td>
<td align="center">89.49 <inline-formula id="inf313">
<mml:math id="m335">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.71</td>
<td align="center">80.98 <inline-formula id="inf314">
<mml:math id="m336">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.40</td>
<td align="center">85.90 <inline-formula id="inf315">
<mml:math id="m337">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.35</td>
<td align="center">96.75 <inline-formula id="inf316">
<mml:math id="m338">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.42</td>
<td align="center">89.16 <inline-formula id="inf317">
<mml:math id="m339">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.90</td>
<td align="center">86.81 <inline-formula id="inf318">
<mml:math id="m340">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.55</td>
</tr>
<tr>
<td rowspan="3" align="left">RWKV</td>
<td align="center">Zig-RiR (<xref ref-type="bibr" rid="B8">Chen et al., 2025</xref>)</td>
<td align="center">87.42 <inline-formula id="inf319">
<mml:math id="m341">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.13</td>
<td align="center">79.78 <inline-formula id="inf320">
<mml:math id="m342">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.49</td>
<td align="center">94.02 <inline-formula id="inf321">
<mml:math id="m343">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.53</td>
<td align="center">95.18 <inline-formula id="inf322">
<mml:math id="m344">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.98</td>
<td align="center">89.50 <inline-formula id="inf323">
<mml:math id="m345">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">85.05 <inline-formula id="inf324">
<mml:math id="m346">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.35</td>
</tr>
<tr>
<td align="center">HER-Seg (<xref ref-type="bibr" rid="B35">Xu et al., 2025</xref>)</td>
<td align="center">88.63 <inline-formula id="inf325">
<mml:math id="m347">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.17</td>
<td align="center">81.62 <inline-formula id="inf326">
<mml:math id="m348">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.46</td>
<td align="center">93.74 <inline-formula id="inf327">
<mml:math id="m349">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.47</td>
<td align="center">95.75 <inline-formula id="inf328">
<mml:math id="m350">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.87</td>
<td align="center">88.65 <inline-formula id="inf329">
<mml:math id="m351">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.38</td>
<td align="center">86.11 <inline-formula id="inf330">
<mml:math id="m352">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.28</td>
</tr>
<tr>
<td align="center">ZR<sup>2</sup>ViM (Ours)</td>
<td align="center">
<bold>92.22</bold> <inline-formula id="inf331">
<mml:math id="m353">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.41</bold>
</td>
<td align="center">
<bold>85.65</bold> <inline-formula id="inf332">
<mml:math id="m354">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.34</bold>
</td>
<td align="center">
<bold>94.33</bold> <inline-formula id="inf333">
<mml:math id="m355">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.26</bold>
</td>
<td align="center">97.24 <inline-formula id="inf334">
<mml:math id="m356">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
<td align="center">91.18 <inline-formula id="inf335">
<mml:math id="m357">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.80</td>
<td align="center">
<bold>90.25</bold> <inline-formula id="inf336">
<mml:math id="m358">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.21</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Top results are highlighted in bold. Results are mean <inline-formula id="inf337">
<mml:math id="m359">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> SD over five runs with different random seeds. BFS uses a 2-pixel tolerance at <inline-formula id="inf338">
<mml:math id="m360">
<mml:mrow>
<mml:mn>256</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> with the same protocol for all methods.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Performance comparison with state-of-the-art methods on the BUSI and CVC-ClinicDB datasets.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Type</th>
<th rowspan="2" align="center">Method</th>
<th colspan="3" align="center">BUSI</th>
<th colspan="3" align="center">CVC-ClinicDB</th>
</tr>
<tr>
<th align="center">DSC <inline-formula id="inf339">
<mml:math id="m361">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">mIoU <inline-formula id="inf340">
<mml:math id="m362">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">BFS <inline-formula id="inf341">
<mml:math id="m363">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">DSC <inline-formula id="inf342">
<mml:math id="m364">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">mIoU <inline-formula id="inf343">
<mml:math id="m365">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">BFS <inline-formula id="inf344">
<mml:math id="m366">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="3" align="left">CNN</td>
<td align="center">UNet (<xref ref-type="bibr" rid="B29">Ronneberger et al., 2015</xref>)</td>
<td align="center">76.33 <inline-formula id="inf345">
<mml:math id="m367">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.60</td>
<td align="center">62.40 <inline-formula id="inf346">
<mml:math id="m368">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.53</td>
<td align="center">63.48 <inline-formula id="inf347">
<mml:math id="m369">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.28</td>
<td align="center">82.72 <inline-formula id="inf348">
<mml:math id="m370">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.72</td>
<td align="center">70.53 <inline-formula id="inf349">
<mml:math id="m371">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.75</td>
<td align="center">74.22 <inline-formula id="inf350">
<mml:math id="m372">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.16</td>
</tr>
<tr>
<td align="center">UNet&#x2b;&#x2b; (<xref ref-type="bibr" rid="B38">Zhou et al., 2018</xref>)</td>
<td align="center">76.47 <inline-formula id="inf351">
<mml:math id="m373">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.89</td>
<td align="center">65.92 <inline-formula id="inf352">
<mml:math id="m374">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.56</td>
<td align="center">65.62 <inline-formula id="inf353">
<mml:math id="m375">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.33</td>
<td align="center">81.20 <inline-formula id="inf354">
<mml:math id="m376">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.53</td>
<td align="center">68.35 <inline-formula id="inf355">
<mml:math id="m377">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.33</td>
<td align="center">72.70 <inline-formula id="inf356">
<mml:math id="m378">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.13</td>
</tr>
<tr>
<td align="center">Att-Unet (<xref ref-type="bibr" rid="B27">Oktay et al., 2018</xref>)</td>
<td align="center">76.35 <inline-formula id="inf357">
<mml:math id="m379">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.96</td>
<td align="center">68.46 <inline-formula id="inf358">
<mml:math id="m380">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.39</td>
<td align="center">65.14 <inline-formula id="inf359">
<mml:math id="m381">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.96</td>
<td align="center">88.55 <inline-formula id="inf360">
<mml:math id="m382">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.29</td>
<td align="center">79.46 <inline-formula id="inf361">
<mml:math id="m383">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.46</td>
<td align="center">81.03 <inline-formula id="inf362">
<mml:math id="m384">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.05</td>
</tr>
<tr>
<td rowspan="3" align="left">Transformer</td>
<td align="center">TransUNet (<xref ref-type="bibr" rid="B7">Chen et al., 2024</xref>)</td>
<td align="center">71.27 <inline-formula id="inf363">
<mml:math id="m385">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.86</td>
<td align="center">60.09 <inline-formula id="inf364">
<mml:math id="m386">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.43</td>
<td align="center">60.27 <inline-formula id="inf365">
<mml:math id="m387">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.41</td>
<td align="center">86.77 <inline-formula id="inf366">
<mml:math id="m388">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.40</td>
<td align="center">79.95 <inline-formula id="inf367">
<mml:math id="m389">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.65</td>
<td align="center">80.42 <inline-formula id="inf368">
<mml:math id="m390">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.03</td>
</tr>
<tr>
<td align="center">Swin-Unet (<xref ref-type="bibr" rid="B5">Cao et al., 2022</xref>)</td>
<td align="center">82.35 <inline-formula id="inf369">
<mml:math id="m391">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.80</td>
<td align="center">73.65 <inline-formula id="inf370">
<mml:math id="m392">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.68</td>
<td align="center">72.23 <inline-formula id="inf371">
<mml:math id="m393">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.88</td>
<td align="center">87.03 <inline-formula id="inf372">
<mml:math id="m394">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.31</td>
<td align="center">81.68 <inline-formula id="inf373">
<mml:math id="m395">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.50</td>
<td align="center">81.54 <inline-formula id="inf374">
<mml:math id="m396">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.21</td>
</tr>
<tr>
<td align="center">MISSFormer (<xref ref-type="bibr" rid="B18">Huang et al., 2022</xref>)</td>
<td align="center">75.81 <inline-formula id="inf375">
<mml:math id="m397">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.92</td>
<td align="center">65.33 <inline-formula id="inf376">
<mml:math id="m398">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.49</td>
<td align="center">63.81 <inline-formula id="inf377">
<mml:math id="m399">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.73</td>
<td align="center">86.66 <inline-formula id="inf378">
<mml:math id="m400">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.48</td>
<td align="center">80.40 <inline-formula id="inf379">
<mml:math id="m401">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.27</td>
<td align="center">80.16 <inline-formula id="inf380">
<mml:math id="m402">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.13</td>
</tr>
<tr>
<td rowspan="4" align="left">SSM</td>
<td align="center">VM-UNet (<xref ref-type="bibr" rid="B30">Ruan et al., 2024</xref>)</td>
<td align="center">78.88 <inline-formula id="inf381">
<mml:math id="m403">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.69</td>
<td align="center">67.55 <inline-formula id="inf382">
<mml:math id="m404">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.33</td>
<td align="center">66.88 <inline-formula id="inf383">
<mml:math id="m405">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.85</td>
<td align="center">88.60 <inline-formula id="inf384">
<mml:math id="m406">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">80.50 <inline-formula id="inf385">
<mml:math id="m407">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.76</td>
<td align="center">82.12 <inline-formula id="inf386">
<mml:math id="m408">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.18</td>
</tr>
<tr>
<td align="center">CC-ViM (<xref ref-type="bibr" rid="B41">Zhu et al., 2025</xref>)</td>
<td align="center">81.39 <inline-formula id="inf387">
<mml:math id="m409">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.52</td>
<td align="center">73.58 <inline-formula id="inf388">
<mml:math id="m410">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.59</td>
<td align="center">71.58 <inline-formula id="inf389">
<mml:math id="m411">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.69</td>
<td align="center">87.73 <inline-formula id="inf390">
<mml:math id="m412">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.19</td>
<td align="center">81.16 <inline-formula id="inf391">
<mml:math id="m413">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.35</td>
<td align="center">81.23 <inline-formula id="inf392">
<mml:math id="m414">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.11</td>
</tr>
<tr>
<td align="center">AEMMamba (<xref ref-type="bibr" rid="B11">Dong et al., 2025</xref>)</td>
<td align="center">84.24 <inline-formula id="inf394">
<mml:math id="m416">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.81</td>
<td align="center">76.12 <inline-formula id="inf395">
<mml:math id="m417">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.67</td>
<td align="center">74.24 <inline-formula id="inf396">
<mml:math id="m418">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.14</td>
<td align="center">92.41 <inline-formula id="inf397">
<mml:math id="m419">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
<td align="center">87.69 <inline-formula id="inf398">
<mml:math id="m420">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.37</td>
<td align="center">86.91 <inline-formula id="inf399">
<mml:math id="m421">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.07</td>
</tr>
<tr>
<td align="center">Swin-UMamba (<xref ref-type="bibr" rid="B24">Liu J. et al., 2024</xref>)</td>
<td align="center">82.56 <inline-formula id="inf401">
<mml:math id="m423">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.78</td>
<td align="center">73.62 <inline-formula id="inf402">
<mml:math id="m424">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.50</td>
<td align="center">72.56 <inline-formula id="inf403">
<mml:math id="m425">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.01</td>
<td align="center">89.49 <inline-formula id="inf404">
<mml:math id="m426">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.12</td>
<td align="center">84.47 <inline-formula id="inf405">
<mml:math id="m427">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.14</td>
<td align="center">84.69 <inline-formula id="inf406">
<mml:math id="m428">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.05</td>
</tr>
<tr>
<td rowspan="3" align="left">RWKV</td>
<td align="center">Zig-RiR (<xref ref-type="bibr" rid="B8">Chen et al., 2025</xref>)</td>
<td align="center">72.61 <inline-formula id="inf407">
<mml:math id="m429">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.39</td>
<td align="center">62.55 <inline-formula id="inf408">
<mml:math id="m430">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.43</td>
<td align="center">61.61 <inline-formula id="inf409">
<mml:math id="m431">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.95</td>
<td align="center">83.52 <inline-formula id="inf410">
<mml:math id="m432">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.17</td>
<td align="center">76.22 <inline-formula id="inf411">
<mml:math id="m433">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.19</td>
<td align="center">77.04 <inline-formula id="inf412">
<mml:math id="m434">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.08</td>
</tr>
<tr>
<td align="center">HER-Seg (<xref ref-type="bibr" rid="B35">Xu et al., 2025</xref>)</td>
<td align="center">70.63 <inline-formula id="inf413">
<mml:math id="m435">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.67</td>
<td align="center">60.47 <inline-formula id="inf414">
<mml:math id="m436">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.32</td>
<td align="center">60.79 <inline-formula id="inf415">
<mml:math id="m437">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.46</td>
<td align="center">87.07 <inline-formula id="inf416">
<mml:math id="m438">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.15</td>
<td align="center">81.39 <inline-formula id="inf417">
<mml:math id="m439">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.20</td>
<td align="center">81.57 <inline-formula id="inf418">
<mml:math id="m440">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.07</td>
</tr>
<tr>
<td align="center">ZR<sup>2</sup>ViM (Ours)</td>
<td align="center">
<bold>86.45</bold> <inline-formula id="inf419">
<mml:math id="m441">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.32</bold>
</td>
<td align="center">
<bold>77.84</bold> <inline-formula id="inf420">
<mml:math id="m442">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.24</bold>
</td>
<td align="center">
<bold>77.11</bold> <inline-formula id="inf421">
<mml:math id="m443">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.66</bold>
</td>
<td align="center">
<bold>93.95</bold> <inline-formula id="inf422">
<mml:math id="m444">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.13</bold>
</td>
<td align="center">
<bold>89.00</bold> <inline-formula id="inf423">
<mml:math id="m445">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.18</bold>
</td>
<td align="center">
<bold>88.68</bold> <inline-formula id="inf424">
<mml:math id="m446">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.05</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Top results are highlighted in bold. Results are mean <inline-formula id="inf425">
<mml:math id="m447">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> SD over five runs with different random seeds. BFS uses a 2-pixel tolerance at <inline-formula id="inf426">
<mml:math id="m448">
<mml:mrow>
<mml:mn>256</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> with the same protocol for all methods.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Performance comparison with state-of-the-art methods on the Synapse multi-organ CT dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Type</th>
<th align="left">Method</th>
<th align="center">DSC <inline-formula id="inf428">
<mml:math id="m450">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">HD95 <inline-formula id="inf429">
<mml:math id="m451">
<mml:mrow>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">Aorta</th>
<th align="center">Gallbladder</th>
<th align="center">Kidney (L)</th>
<th align="center">Kidney (R)</th>
<th align="center">Liver</th>
<th align="center">Pancreas</th>
<th align="center">Spleen</th>
<th align="center">Stomach</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">CNN</td>
<td align="left">UNet (<xref ref-type="bibr" rid="B29">Ronneberger et al., 2015</xref>)</td>
<td align="center">76.85 <inline-formula id="inf437">
<mml:math id="m459">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.19</td>
<td align="center">39.70 <inline-formula id="inf438">
<mml:math id="m460">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.69</td>
<td align="center">80.07</td>
<td align="center">69.72</td>
<td align="center">77.77</td>
<td align="center">68.60</td>
<td align="center">93.43</td>
<td align="center">53.98</td>
<td align="center">86.67</td>
<td align="center">75.58</td>
</tr>
<tr>
<td align="left"/>
<td align="left">UNet&#x2b;&#x2b; (<xref ref-type="bibr" rid="B38">Zhou et al., 2018</xref>)</td>
<td align="center">78.11 <inline-formula id="inf439">
<mml:math id="m461">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.07</td>
<td align="center">36.87 <inline-formula id="inf440">
<mml:math id="m462">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.42</td>
<td align="center">81.46</td>
<td align="center">68.57</td>
<td align="center">80.46</td>
<td align="center">78.59</td>
<td align="center">93.74</td>
<td align="center">56.94</td>
<td align="center">87.47</td>
<td align="center">77.61</td>
</tr>
<tr>
<td align="left"/>
<td align="left">Att-Unet (<xref ref-type="bibr" rid="B27">Oktay et al., 2018</xref>)</td>
<td align="center">77.77 <inline-formula id="inf441">
<mml:math id="m463">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.86</td>
<td align="center">36.02 <inline-formula id="inf442">
<mml:math id="m464">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.60</td>
<td align="center">
<bold>89.55</bold>
</td>
<td align="center">68.88</td>
<td align="center">77.98</td>
<td align="center">71.11</td>
<td align="center">93.57</td>
<td align="center">58.04</td>
<td align="center">87.30</td>
<td align="center">75.75</td>
</tr>
<tr>
<td align="left">Transformer</td>
<td align="left">TransUNet (<xref ref-type="bibr" rid="B7">Chen et al., 2024</xref>)</td>
<td align="center">77.48 <inline-formula id="inf443">
<mml:math id="m465">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.91</td>
<td align="center">31.69 <inline-formula id="inf444">
<mml:math id="m466">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.01</td>
<td align="center">87.23</td>
<td align="center">63.13</td>
<td align="center">81.87</td>
<td align="center">77.02</td>
<td align="center">94.08</td>
<td align="center">55.86</td>
<td align="center">85.08</td>
<td align="center">75.62</td>
</tr>
<tr>
<td align="left"/>
<td align="left">Swin-Unet (<xref ref-type="bibr" rid="B5">Cao et al., 2022</xref>)</td>
<td align="center">79.51 <inline-formula id="inf445">
<mml:math id="m467">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.73</td>
<td align="center">21.55 <inline-formula id="inf446">
<mml:math id="m468">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.70</td>
<td align="center">85.47</td>
<td align="center">66.53</td>
<td align="center">83.28</td>
<td align="center">79.61</td>
<td align="center">94.29</td>
<td align="center">56.58</td>
<td align="center">90.66</td>
<td align="center">79.62</td>
</tr>
<tr>
<td align="left"/>
<td align="left">MISSFormer (<xref ref-type="bibr" rid="B18">Huang et al., 2022</xref>)</td>
<td align="center">81.96 <inline-formula id="inf447">
<mml:math id="m469">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.85</td>
<td align="center">18.20 <inline-formula id="inf448">
<mml:math id="m470">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.59</td>
<td align="center">86.99</td>
<td align="center">68.65</td>
<td align="center">85.21</td>
<td align="center">82.00</td>
<td align="center">94.41</td>
<td align="center">65.67</td>
<td align="center">91.92</td>
<td align="center">80.81</td>
</tr>
<tr>
<td align="left">SSM</td>
<td align="left">VM-UNet (<xref ref-type="bibr" rid="B30">Ruan et al., 2024</xref>)</td>
<td align="center">81.08 <inline-formula id="inf449">
<mml:math id="m471">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.80</td>
<td align="center">19.21 <inline-formula id="inf450">
<mml:math id="m472">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.42</td>
<td align="center">86.45</td>
<td align="center">69.49</td>
<td align="center">86.36</td>
<td align="center">82.76</td>
<td align="center">94.17</td>
<td align="center">59.36</td>
<td align="center">89.51</td>
<td align="center">80.54</td>
</tr>
<tr>
<td align="left"/>
<td align="left">CC-ViM (<xref ref-type="bibr" rid="B41">Zhu et al., 2025</xref>)</td>
<td align="center">82.65 <inline-formula id="inf451">
<mml:math id="m473">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.75</td>
<td align="center">17.83 <inline-formula id="inf452">
<mml:math id="m474">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.47</td>
<td align="center">87.63</td>
<td align="center">68.45</td>
<td align="center">86.23</td>
<td align="center">83.22</td>
<td align="center">94.67</td>
<td align="center">
<bold>67.12</bold>
</td>
<td align="center">92.05</td>
<td align="center">
<bold>81.82</bold>
</td>
</tr>
<tr>
<td align="left"/>
<td align="left">SliceMamba (<xref ref-type="bibr" rid="B13">Fan et al., 2025</xref>)</td>
<td align="center">81.95 <inline-formula id="inf453">
<mml:math id="m475">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.77</td>
<td align="center">16.04 <inline-formula id="inf454">
<mml:math id="m476">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.38</td>
<td align="center">87.78</td>
<td align="center">68.77</td>
<td align="center">88.30</td>
<td align="center">84.26</td>
<td align="center">95.25</td>
<td align="center">64.49</td>
<td align="center">86.91</td>
<td align="center">79.82</td>
</tr>
<tr>
<td align="left"/>
<td align="left">SA-UMamba (<xref ref-type="bibr" rid="B26">Liu et al., 2025</xref>)</td>
<td align="center">82.54 <inline-formula id="inf455">
<mml:math id="m477">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.90</td>
<td align="center">16.80 <inline-formula id="inf456">
<mml:math id="m478">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.53</td>
<td align="center">88.07</td>
<td align="center">70.46</td>
<td align="center">86.46</td>
<td align="center">83.96</td>
<td align="center">94.42</td>
<td align="center">65.32</td>
<td align="center">89.89</td>
<td align="center">81.76</td>
</tr>
<tr>
<td align="left">RWKV</td>
<td align="left">Zig-RiR (<xref ref-type="bibr" rid="B8">Chen et al., 2025</xref>)</td>
<td align="center">82.26 <inline-formula id="inf457">
<mml:math id="m479">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.99</td>
<td align="center">16.65 <inline-formula id="inf458">
<mml:math id="m480">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.81</td>
<td align="center">88.14</td>
<td align="center">70.15</td>
<td align="center">87.51</td>
<td align="center">83.38</td>
<td align="center">94.29</td>
<td align="center">66.02</td>
<td align="center">90.72</td>
<td align="center">77.86</td>
</tr>
<tr>
<td align="left"/>
<td align="left">HER-Seg (<xref ref-type="bibr" rid="B35">Xu et al., 2025</xref>)</td>
<td align="center">82.37 <inline-formula id="inf459">
<mml:math id="m481">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.64</td>
<td align="center">18.74 <inline-formula id="inf460">
<mml:math id="m482">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.48</td>
<td align="center">87.46</td>
<td align="center">
<bold>71.33</bold>
</td>
<td align="center">87.46</td>
<td align="center">84.17</td>
<td align="center">94.75</td>
<td align="center">66.13</td>
<td align="center">89.08</td>
<td align="center">78.59</td>
</tr>
<tr>
<td align="left"/>
<td align="left">ZR<sup>2</sup>ViM (Ours)</td>
<td align="center">
<bold>83.04</bold> <inline-formula id="inf461">
<mml:math id="m483">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.84</bold>
</td>
<td align="center">
<bold>15.68</bold> <inline-formula id="inf462">
<mml:math id="m484">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.36</bold>
</td>
<td align="center">87.92</td>
<td align="center">69.37</td>
<td align="center">
<bold>88.65</bold>
</td>
<td align="center">
<bold>84.54</bold>
</td>
<td align="center">
<bold>95.28</bold>
</td>
<td align="center">64.42</td>
<td align="center">
<bold>92.81</bold>
</td>
<td align="center">81.32</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Top results are highlighted in bold. Overall DSC and HD95 are reported as mean <inline-formula id="inf463">
<mml:math id="m485">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> SD over five runs with different random seeds. For space constraints, per-organ results report only the mean DSC over five runs (SD omitted).</p>
</fn>
</table-wrap-foot>
</table-wrap>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Training and validation curves on ISIC 2018. The plots show the loss (left) and DSC (right) over 150 epochs, indicating stable convergence.</p>
</caption>
<graphic xlink:href="fbinf-06-1768786-g004.tif">
<alt-text content-type="machine-generated">Side-by-side line graphs show model training metrics over one hundred fifty epochs. Left panel plots loss for training and validation, both decreasing and stabilizing over time. Right panel charts DSC (Dice Similarity Coefficient), with training and validation curves rising and plateauing.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4-3">
<label>4.3</label>
<title>Evaluation metrics</title>
<p>To ensure a rigorous evaluation of model generalization across diverse imaging modalities and anatomical structures, we employed task-specific protocols aligned with established benchmarks in medical image segmentation (<xref ref-type="bibr" rid="B19">Isensee et al., 2021</xref>; <xref ref-type="bibr" rid="B7">Chen et al., 2024</xref>). For 2D segmentation tasks, we quantified performance using a comprehensive suite of metrics, including mean Intersection over Union (mIoU), Dice Similarity Coefficient (DSC), sensitivity (Sen), specificity (Spe), accuracy (Acc), and Boundary F1-score (BFS). While all metrics were computed, we prioritized DSC and mIoU as the primary overlap-based metrics for all 2D benchmarks. To explicitly assess contour accuracy and boundary integrity, we additionally adopted BFS to quantify boundary alignment and contour continuity between predicted masks and ground-truth annotations. For the 3D multi-organ segmentation on the Synapse multi-organ CT dataset, we used DSC to assess volumetric overlap and supplemented it with the 95% Hausdorff distance (HD95) to specifically evaluate the accuracy of boundary delineation. BFS was computed on binarized masks with a boundary tolerance of 2 pixels at <inline-formula id="inf464">
<mml:math id="m486">
<mml:mrow>
<mml:mn>256</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> resolution for all 2D datasets, and the same setting was applied consistently to all methods. We used a standard distance-transform based implementation to match predicted and ground-truth boundaries within the tolerance band (<xref ref-type="bibr" rid="B28">Perazzi et al., 2016</xref>).</p>
<p>To account for training variability, all results are reported as mean <inline-formula id="inf465">
<mml:math id="m487">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> standard deviation (SD) over five independent runs with different random seeds under identical settings. We further computed 95% confidence intervals (CI) of the mean using a t-distribution-based interval across the five runs. CIs are reported for the primary metrics only (DSC/mIoU for 2D tasks and DSC/HD95 for Synapse). For 2D benchmarks, we additionally report the CI for BFS to directly characterize the reliability of boundary accuracy improvements. Statistical significance was evaluated using a two-sided paired t-test on the primary metric between <inline-formula id="inf466">
<mml:math id="m488">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM and the strongest competing baseline on each benchmark, based on the five paired runs with matched random seeds, with a significance level of <inline-formula id="inf467">
<mml:math id="m489">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.05</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. The strongest baseline was selected per benchmark according to the primary metric (DSC/mIoU for 2D tasks and DSC/HD95 for Synapse).</p>
<p>These metrics are defined as follows in <xref ref-type="disp-formula" rid="e23">Equations 23</xref>&#x2013;<xref ref-type="disp-formula" rid="e28">28</xref>:<disp-formula id="e23">
<mml:math id="m490">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(23)</label>
</disp-formula>
<disp-formula id="e24">
<mml:math id="m491">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>C</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(24)</label>
</disp-formula>
<disp-formula id="e25">
<mml:math id="m492">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>c</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(25)</label>
</disp-formula>
<disp-formula id="e26">
<mml:math id="m493">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(26)</label>
</disp-formula>
<disp-formula id="e27">
<mml:math id="m494">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(27)</label>
</disp-formula>
<disp-formula id="e28">
<mml:math id="m495">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>F</mml:mi>
<mml:mi>S</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(28)</label>
</disp-formula>where <inline-formula id="inf468">
<mml:math id="m496">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf469">
<mml:math id="m497">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denote the numbers of true positives, false positives, true negatives, and false negatives, respectively. For mIoU, <inline-formula id="inf470">
<mml:math id="m498">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf471">
<mml:math id="m499">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the corresponding values for class <inline-formula id="inf472">
<mml:math id="m500">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> across <inline-formula id="inf473">
<mml:math id="m501">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> total classes. In the BFS equation, <inline-formula id="inf474">
<mml:math id="m502">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf475">
<mml:math id="m503">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represent boundary precision and boundary recall, respectively, which are calculated based on the distance between the predicted boundaries and the ground-truth boundaries within a specified tolerance. HD95 is calculated as the 95th percentile of the bidirectional surface distances between the predicted and ground-truth segmentation boundaries.</p>
</sec>
<sec id="s4-4">
<label>4.4</label>
<title>Comparisons with state-of-the-art methods</title>
<p>We rigorously evaluated the performance and generalization of <inline-formula id="inf476">
<mml:math id="m504">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM across five representative medical image segmentation benchmarks. To ensure a fair and robust comparison, all experiments adhered to identical data splits and standardized evaluation protocols. Across these benchmarks, <inline-formula id="inf477">
<mml:math id="m505">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM consistently achieved superior performance with improved boundary delineation. Improvements on the primary metrics are statistically significant under a two-sided paired t-test <inline-formula id="inf478">
<mml:math id="m506">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.05</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> across five matched-seed runs. We further report 95% confidence intervals (CI) of the mean for the primary metrics; the CIs show limited dispersion across seeds, supporting that the observed gains are stable and not driven by random-seed listvariability. <list list-type="order">
<list-item>
<p>Results on skin lesion segmentation. ZR<sup>2</sup>ViM was evaluated on the ISIC 2017 (<xref ref-type="bibr" rid="B4">Berseth, 2017</xref>) and ISIC 2018 (<xref ref-type="bibr" rid="B9">Codella et al., 2019</xref>) skin lesion segmentation benchmarks. On these tasks, SSMs such as VM-UNet (<xref ref-type="bibr" rid="B30">Ruan et al., 2024</xref>) and CC-ViM (<xref ref-type="bibr" rid="B41">Zhu et al., 2025</xref>) generally surpass conventional CNN-based (e.g., U-Net (<xref ref-type="bibr" rid="B29">Ronneberger et al., 2015</xref>)) and Transformer-based architectures (e.g., TransUNet (<xref ref-type="bibr" rid="B7">Chen et al., 2024</xref>)), largely due to their superior capacity for modeling long-range dependencies. This trend highlights the promise of SSMs for vision tasks. However, <inline-formula id="inf479">
<mml:math id="m507">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM substantially improves upon existing SSMs, establishing new state-of-the-art performance on both benchmarks (<xref ref-type="table" rid="T1">Tables 1</xref>, <xref ref-type="table" rid="T2">2</xref>). On the ISIC 2017 dataset, <inline-formula id="inf480">
<mml:math id="m508">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM achieved a DSC of 92.12% (95% CI: 91.78%&#x2013;92.46%) and an mIoU of 85.83% (95% CI: 85.45%&#x2013;86.21%), surpassing the previous leading model, SliceMamba (<xref ref-type="bibr" rid="B13">Fan et al., 2025</xref>), by 2.19 and 4.13 percentage points, respectively (DSC: <inline-formula id="inf481">
<mml:math id="m509">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.01</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>). Notably, it also attained the highest specificity (98.36%), indicating a low false-positive rate, and achieved a BFS of 89.64% (95% CI: 89.35%&#x2013;89.93%), reflecting improved boundary alignment. This superior performance was replicated on the ISIC 2018 dataset, where <inline-formula id="inf482">
<mml:math id="m510">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM again outperformed all baseline models, achieving a DSC of 92.22% (95% CI: 91.71%&#x2013;92.73%) and an mIoU of 85.65% (95% CI: 85.23%&#x2013;86.07%) (improvements of 1.92 and 3.33 percentage points over SliceMamba, DSC:<inline-formula id="inf483">
<mml:math id="m511">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.01</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>). Furthermore, the model achieved a BFS of 90.25% (95% CI: 89.99%&#x2013;90.51%), quantitatively confirming the finer margin delineation observed in the qualitative results (<xref ref-type="fig" rid="F5">Figure 5</xref>). Qualitative results (<xref ref-type="fig" rid="F5">Figure 5</xref>) visually corroborate these quantitative gains. Predictions from <inline-formula id="inf484">
<mml:math id="m512">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM align more closely with ground-truth contours, producing segmentation masks with more continuous boundaries and fewer false positives or extensions. This precision is particularly evident for lesions with intricate boundaries, irregular morphologies, and small sizes. In contrast, models like TransUNet (<xref ref-type="bibr" rid="B7">Chen et al., 2024</xref>) and VM-UNet (<xref ref-type="bibr" rid="B30">Ruan et al., 2024</xref>) frequently produce over-segmented or blurred boundaries. These improvements stem directly from the architectural innovations of <inline-formula id="inf485">
<mml:math id="m513">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM. The scalable, multi-directional zigzag scanning ensures that the model captures features with directional robustness, which is critical for preserving boundary continuity. Concurrently, the nested recursive connections within the SSR module enable an efficient fusion of fine-grained local details with global contextual information. This dual-pronged approach allows <inline-formula id="inf486">
<mml:math id="m514">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM to excel at characterizing fine local features without sacrificing global region consistency, leading to more accurate and reliable segmentation.</p>
</list-item>
<list-item>
<p>Results on breast ultrasound and colorectal polyp segmentation. To assess its generalization capabilities, we evaluated <inline-formula id="inf487">
<mml:math id="m515">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM on two clinically challenging and distinct imaging modalities: breast ultrasound (BUSI (<xref ref-type="bibr" rid="B1">Al-Dhabyani et al., 2020</xref>)) and colonoscopic polyp imaging (CVC-ClinicDB (<xref ref-type="bibr" rid="B3">Bernal et al., 2015</xref>)). These datasets present formidable challenges, including segmenting lesions obscured by severe speckle noise in ultrasound and delineating polyps with varied morphologies against low-contrast mucosa in endoscopy. On the BUSI dataset, <inline-formula id="inf488">
<mml:math id="m516">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM established state-of-the-art performance, achieving a DSC of 86.45% (95% CI: 86.05%&#x2013;86.85%) and an mIoU of 77.84% (95% CI: 77.54%&#x2013;78.14%) (<xref ref-type="table" rid="T3">Table 3</xref>). This represents a significant <inline-formula id="inf489">
<mml:math id="m517">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.05</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 2.21 percentage point improvement in DSC over the next-best model, AEMMamba (<xref ref-type="bibr" rid="B11">Dong et al., 2025</xref>). Notably, the high BFS of 77.11% (95% CI: 76.29%&#x2013;77.93%) demonstrates the model&#x2019;s capability to accurately localize lesion boundaries even in the presence of severe speckle noise. This performance gain is directly attributable to the multi-directional recurrent aggregation within the CZ-WKV module, which effectively suppresses acoustic artifacts and enhances boundary discrimination in low-contrast conditions. Similarly, on the CVC-ClinicDB dataset, <inline-formula id="inf490">
<mml:math id="m518">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM again surpassed all baselines, achieving a DSC of 93.95% (95% CI: 93.79%&#x2013;94.11%) and an mIoU of 89.00% (95% CI: 88.78%&#x2013;89.22%). This result outperformed AEMMamba (<xref ref-type="bibr" rid="B11">Dong et al., 2025</xref>) by 1.54 and 1.31 percentage points, respectively (DSC: <inline-formula id="inf491">
<mml:math id="m519">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.05</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>). The model also achieved a robust BFS of 88.68% (95% CI: 88.62%&#x2013;88.74%), substantiating its effectiveness in delineating polyp margins against low-contrast mucosa. To our knowledge, a DSC of 93.95% is unprecedented for this benchmark, establishing a new state-of-the-art performance. Qualitative analysis further substantiates these quantitative results (<xref ref-type="fig" rid="F6">Figure 6</xref>). In breast ultrasound images, <inline-formula id="inf492">
<mml:math id="m520">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM demonstrates a remarkable ability to suppress speckle noise while accurately localizing lesion boundaries. In colonoscopic images, it produces visibly finer margin delineation and superior structural continuity compared to competing methods. Collectively, these results across two disparate and challenging modalities underscore the robustness and broad applicability of the <inline-formula id="inf493">
<mml:math id="m521">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM architecture.</p>
</list-item>
<list-item>
<p>Results on synapse multi-organ CT segmentation. We further assessed <inline-formula id="inf494">
<mml:math id="m522">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM on the Synapse multi-organ CT dataset (<xref ref-type="bibr" rid="B21">Landman et al., 2015</xref>), a demanding benchmark characterized by intricate anatomical structures, significant scale variation, and complex inter-organ boundaries. This task places stringent requirements on a model&#x2019;s ability to integrate global context with fine-grained local detail. <inline-formula id="inf495">
<mml:math id="m523">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM established new state-of-the-art performance, achieving an average DSC of 83.04% (95% CI: 82.00%&#x2013;84.08%) (<xref ref-type="table" rid="T4">Table 4</xref>). More critically, it demonstrated a substantial improvement in boundary delineation accuracy, reducing the average HD95 from 17.83 mm (CC-ViM (<xref ref-type="bibr" rid="B41">Zhu et al., 2025</xref>)) to 15.68 mm (95% CI: 15.23&#x2013;16.13). This represents a clinically significant 12.1% reduction in surface distance error (HD95: <inline-formula id="inf496">
<mml:math id="m524">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.01</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>), highlighting the model&#x2019;s superior precision. An organ-level analysis reveals a balanced and robust performance profile. The model excelled in segmenting large organs, attaining high DSC scores for the liver (95.28%) and left kidney (88.65%), as well as for complex structures like the spleen (92.81%) and aorta (87.92%). While its performance on smaller, more challenging organs like the gallbladder and pancreas was highly competitive, it did not uniformly surpass every baseline. Nevertheless, the absence of pronounced weaknesses on any single organ underscores the model&#x2019;s reliability for comprehensive anatomical segmentation. Visual inspection of the segmentation results (<xref ref-type="fig" rid="F7">Figure 7</xref>) corroborates these quantitative findings. <inline-formula id="inf497">
<mml:math id="m525">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM consistently generates masks with sharper, more anatomically plausible organ boundaries and superior structural integrity, especially in regions with low tissue contrast. This qualitative evidence directly supports the marked improvement observed in the HD95 metric, confirming the model&#x2019;s advanced capability for precise 3D segmentation.</p>
</list-item>
</list>
</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Qualitative segmentation results on the ISIC 2017 and ISIC 2018 datasets.</p>
</caption>
<graphic xlink:href="fbinf-06-1768786-g005.tif">
<alt-text content-type="machine-generated">Four rows of skin lesion images in the first column are followed by corresponding binary segmentation masks. Columns include ground truth, TransUNet, VM-UNet, and ZR squared ViM model results, showing differences in segmentation accuracy for each method.</alt-text>
</graphic>
</fig>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Qualitative segmentation results on the BUSI and CVC-ClinicDB datasets.</p>
</caption>
<graphic xlink:href="fbinf-06-1768786-g006.tif">
<alt-text content-type="machine-generated">Medical image segmentation comparison grid showing four rows and five columns. Rows display original ultrasound and endoscopic images, followed by corresponding ground truth masks and predicted masks from TransUNet, VM-UNet, and ZR2ViM models, illustrating region detection accuracy.</alt-text>
</graphic>
</fig>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Qualitative segmentation results on the Synapse multi-organ CT dataset.</p>
</caption>
<graphic xlink:href="fbinf-06-1768786-g007.tif">
<alt-text content-type="machine-generated">Comparison of abdominal CT slices in the first column with colored organ segmentation masks in four columns: ground truth (GT), TransUNet, VM-UNet, and ZR squared ViM; each method visually demonstrates varied segmentation accuracy by overlaying distinct color-coded organs on a black background for each of four patients.</alt-text>
</graphic>
</fig>
<p>Experimental results across five diverse medical imaging datasets establish that <inline-formula id="inf498">
<mml:math id="m526">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM consistently delivers state-of-the-art or near state-of-the-art segmentation performance. Unlike traditional CNNs and Transformers, <inline-formula id="inf499">
<mml:math id="m527">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM maintains the linear-time complexity of SSMs, enabling the efficient modeling of global dependencies. Furthermore, it surpasses other SSM- and RWKV-based counterparts by capturing richer feature representations and achieving superior spatial-modeling accuracy. This enhanced performance is directly attributable to its core architectural innovations: the <inline-formula id="inf500">
<mml:math id="m528">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> Block for nested recursive state modeling, integrated Q-Shift for directional-prior injection, and a scalable multi-directional zigzag scanning mechanism coupled with CZ-WKV attention. Collectively, these innovations render <inline-formula id="inf501">
<mml:math id="m529">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM highly effective for addressing critical challenges in medical imaging, such as segmenting complex boundaries, small targets, and multi-scale structures across various modalities. This combination of high accuracy and computational efficiency positions <inline-formula id="inf502">
<mml:math id="m530">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM as a robust and versatile solution for practical clinical applications.</p>
</sec>
<sec id="s4-5">
<label>4.5</label>
<title>Efficiency analysis</title>
<p>To assess computational efficiency, we evaluate each model in terms of parameter count (Params), floating-point operations (FLOPs), and representative inference latency. For complexity profiling, all methods are measured with a unified input size of <inline-formula id="inf503">
<mml:math id="m531">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>256</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> using the same profiling script. For efficiency benchmarking in <xref ref-type="table" rid="T5">Table 5</xref>, all methods are profiled under the same GPU, input resolution, and batch size. Under this setting, <inline-formula id="inf504">
<mml:math id="m532">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM requires 38.66 M parameters and 17.84 G FLOPs, positioning it within the lightweight-to-midrange complexity regime. This computational profile places <inline-formula id="inf505">
<mml:math id="m533">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM in a highly advantageous position regarding the accuracy-efficiency trade-off, as illustrated in <xref ref-type="fig" rid="F8">Figure 8</xref> and detailed in <xref ref-type="table" rid="T5">Table 5</xref>. The bubble plot (<xref ref-type="fig" rid="F8">Figure 8</xref>), which visualizes the relationship between mean DSC score and FLOPs with bubble size encoding parameter count, shows <inline-formula id="inf506">
<mml:math id="m534">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM consistently occupying the upper-left quadrant. This position demonstrates superior segmentation accuracy at a comparable or lower computational cost. This performance contrasts sharply with that of other major architectural classes. For instance, CNN-based models like U-Net (<xref ref-type="bibr" rid="B29">Ronneberger et al., 2015</xref>) present an unfavorable trade-off, with our model reducing FLOPs by 72.8% for only a 12% increase in parameters. Transformer-based architectures such as TransUNet (<xref ref-type="bibr" rid="B7">Chen et al., 2024</xref>) and Swin-UNet (<xref ref-type="bibr" rid="B5">Cao et al., 2022</xref>) achieve high accuracy but at the cost of substantial computational and parameter overhead. Conversely, while other lightweight state space models like CC-ViM (<xref ref-type="bibr" rid="B41">Zhu et al., 2025</xref>) minimize complexity, they do not reach the same accuracy ceiling as <inline-formula id="inf507">
<mml:math id="m535">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM, which delivers state-of-the-art accuracy for a modest increase in computational cost. In addition to FLOPs and parameter count, we report a representative inference latency to provide practical insight into runtime efficiency. Inference time is measured on a single NVIDIA RTX 3080 GPU with batch size 1 and an input resolution of <inline-formula id="inf508">
<mml:math id="m536">
<mml:mrow>
<mml:mn>256</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, excluding data loading and preprocessing overhead. All models are warmed up prior to measurement, and the reported latency is averaged over multiple forward passes. As shown in <xref ref-type="table" rid="T5">Table 5</xref>, <inline-formula id="inf509">
<mml:math id="m537">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM maintains competitive inference latency despite incorporating cross-directional zigzag scanning and recursive connections, indicating that the proposed design does not introduce prohibitive runtime overhead in practice.</p>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Model complexity and inference latency comparison.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Type</th>
<th align="left">Method</th>
<th align="center">Params <inline-formula id="inf510">
<mml:math id="m538">
<mml:mrow>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (M)</th>
<th align="center">FLOPs <inline-formula id="inf511">
<mml:math id="m539">
<mml:mrow>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (G)</th>
<th align="center">Inference <inline-formula id="inf512">
<mml:math id="m540">
<mml:mrow>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (ms)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="3" align="left">CNN</td>
<td align="left">UNet (<xref ref-type="bibr" rid="B29">Ronneberger et al., 2015</xref>)</td>
<td align="center">34.50</td>
<td align="center">65.52</td>
<td align="center">28.87</td>
</tr>
<tr>
<td align="left">UNet&#x2b;&#x2b; (<xref ref-type="bibr" rid="B38">Zhou et al., 2018</xref>)</td>
<td align="center">
<bold>9.26</bold>
</td>
<td align="center">34.65</td>
<td align="center">18.79</td>
</tr>
<tr>
<td align="left">Att-UNet (<xref ref-type="bibr" rid="B27">Oktay et al., 2018</xref>)</td>
<td align="center">34.87</td>
<td align="center">66.63</td>
<td align="center">29.49</td>
</tr>
<tr>
<td rowspan="5" align="left">Transformer</td>
<td align="left">TransUNet (<xref ref-type="bibr" rid="B7">Chen et al., 2024</xref>)</td>
<td align="center">109.54</td>
<td align="center">56.66</td>
<td align="center">48.65</td>
</tr>
<tr>
<td align="left">TransFuse (<xref ref-type="bibr" rid="B37">Zhang et al., 2021</xref>)</td>
<td align="center">43.40</td>
<td align="center">47.28</td>
<td align="center">28.83</td>
</tr>
<tr>
<td align="left">TC-Net (<xref ref-type="bibr" rid="B10">Dong et al., 2022</xref>)</td>
<td align="center">33.71</td>
<td align="center">33.56</td>
<td align="center">29.02</td>
</tr>
<tr>
<td align="left">Swin-Unet (<xref ref-type="bibr" rid="B5">Cao et al., 2022</xref>)</td>
<td align="center">82.30</td>
<td align="center">67.30</td>
<td align="center">34.82</td>
</tr>
<tr>
<td align="left">MISSFormer (<xref ref-type="bibr" rid="B18">Huang et al., 2022</xref>)</td>
<td align="center">42.46</td>
<td align="center">27.36</td>
<td align="center">23.22</td>
</tr>
<tr>
<td rowspan="6" align="left">SSM</td>
<td align="left">VM-UNet (<xref ref-type="bibr" rid="B30">Ruan et al., 2024</xref>)</td>
<td align="center">26.35</td>
<td align="center">21.38</td>
<td align="center">26.73</td>
</tr>
<tr>
<td align="left">CC-ViM (<xref ref-type="bibr" rid="B41">Zhu et al., 2025</xref>)</td>
<td align="center">23.56</td>
<td align="center">14.45</td>
<td align="center">
<bold>18.57</bold>
</td>
</tr>
<tr>
<td align="left">Swin-UMamba (<xref ref-type="bibr" rid="B24">Liu J. et al., 2024</xref>)</td>
<td align="center">60.18</td>
<td align="center">68.00</td>
<td align="center">30.71</td>
</tr>
<tr>
<td align="left">SliceMamba (<xref ref-type="bibr" rid="B13">Fan et al., 2025</xref>)</td>
<td align="center">20.53</td>
<td align="center">16.52</td>
<td align="center">25.45</td>
</tr>
<tr>
<td align="left">SA-UMamba (<xref ref-type="bibr" rid="B26">Liu et al., 2025</xref>)</td>
<td align="center">43.45</td>
<td align="center">24.72</td>
<td align="center">21.78</td>
</tr>
<tr>
<td align="left">
<inline-formula id="inf513">
<mml:math id="m541">
<mml:mrow>
<mml:mtext>AEMMamba</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="bibr" rid="B11">Dong et al., 2025</xref>)</td>
<td align="center">52.96</td>
<td align="center">39.31</td>
<td align="center">29.66</td>
</tr>
<tr>
<td rowspan="3" align="left">RWKV</td>
<td align="left">Zig-RiR (<xref ref-type="bibr" rid="B8">Chen et al., 2025</xref>)</td>
<td align="center">24.58</td>
<td align="center">
<bold>12.45</bold>
</td>
<td align="center">24.27</td>
</tr>
<tr>
<td align="left">HER-Seg (<xref ref-type="bibr" rid="B35">Xu et al., 2025</xref>)</td>
<td align="center">25.21</td>
<td align="center">14.48</td>
<td align="center">22.71</td>
</tr>
<tr>
<td align="left">ZR<sup>2</sup>ViM (Ours)</td>
<td align="center">38.66</td>
<td align="center">17.84</td>
<td align="center">22.35</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p> Top results are highlighted in bold. FLOPs and Params are profiled with a unified input <inline-formula id="inf514">
<mml:math id="m542">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>256</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>256</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Inference latency is measured under the fixed setting described in the main text (RTX 3080, batch size 1, <inline-formula id="inf515">
<mml:math id="m543">
<mml:mrow>
<mml:mn>256</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, warm-up, averaged over multiple forward passes).</p>
</fn>
</table-wrap-foot>
</table-wrap>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Comparative analysis of model accuracy and efficiency. Each point represents a model, plotting its average DSC coefficient against computational cost (FLOPs) across five medical imaging datasets: ISIC 2017, ISIC 2018, BUSI, CVC-ClinicDB, and Synapse. Bubble size corresponds to the parameter count (M), illustrating the three-way trade-off between performance, computational demand, and model size.</p>
</caption>
<graphic xlink:href="fbinf-06-1768786-g008.tif">
<alt-text content-type="machine-generated">Bubble chart comparing segmentation models, with FLOPs (G) on the x-axis and average DSC percentage on the y-axis. Each bubble represents a model, labeled and sized differently. ZR2ViM has the highest average DSC and relatively low FLOPs, while U-Net, Att-UNet, and TransUNet have higher FLOPs and slightly lower DSC. CC-ViM, VM-UNet, HER-Seg, Zig-RiR, and U-Net&#x2b;&#x2b; are shown with distinct positions and bubble sizes.</alt-text>
</graphic>
</fig>
<p>This advantageous balance between efficiency and effectiveness originates from a targeted architectural restructuring of the ViM framework. First, we replaced the standard S6 state space kernel with our SSR nested recursive state units. This design, coupled with NRC-driven local&#x2013;global modeling under a unified scanning framework, minimizes the redundancy inherent in unstructured multi-branch approaches. Concurrently, the integration of the CZ-WKV module, featuring scalable <inline-formula id="inf516">
<mml:math id="m544">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-step recursion, explicitly restores 2D spatial adjacency prior to sequence unrolling, thereby enhancing directional robustness. Notably, all components of <inline-formula id="inf517">
<mml:math id="m545">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM, including the cross-directional zigzag scanning and recursive operations, are implemented using standard PyTorch operators without relying on custom CUDA kernels or hardware-specific optimizations, facilitating consistent evaluation and fair comparison with Transformer-based and other baseline models.</p>
</sec>
<sec id="s4-6">
<label>4.6</label>
<title>Ablation study</title>
<p>To quantify the contribution of each key component in <inline-formula id="inf518">
<mml:math id="m546">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM, we conducted a series of module-wise ablation studies on the ISIC 2018 and Synapse multi-organ CT datasets. These two datasets were selected as representative testbeds because they cover two complementary segmentation regimes relevant to boundary-preserving modeling: ISIC 2018 is a 2D dermatoscopic lesion dataset with highly irregular contours, whereas Synapse is a multi-organ CT benchmark where boundary quality can be directly assessed using the boundary-sensitive HD95 metric in addition to DSC. Across all experiments, the network architecture and training hyperparameters were held constant, with only the specific module under investigation being modified. For skin-lesion segmentation (ISIC 2018), we report mIoU and DSC to measure overall accuracy. For multi-organ segmentation (Synapse), we report DSC and HD95 to assess boundary precision. This dual-metric design allows us to validate the proposed mechanisms under both region-overlap and boundary-focused criteria, while keeping the evaluation tailored to the distinct objectives of each task.</p>
<p>To clarify the performance&#x2013;complexity trade-off of individual components, we additionally report the parameter count for each ablation variant in <xref ref-type="table" rid="T6">Tables 6</xref>&#x2013;<xref ref-type="table" rid="T10">10</xref>. Notably, several ablation settings have identical or near-identical parameter counts, as these variants keep the overall network configuration fixed and only modify the target mechanism (e.g., scan scheduling, recursion depth, or path coordination) without introducing additional learnable layers. Therefore, the performance differences mainly reflect modeling capability rather than increased model capacity. Moreover, the full <inline-formula id="inf519">
<mml:math id="m547">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM model is evaluated on five public datasets across four imaging domains in the main experiments, and the ablations on these two representative datasets are used to explain why the overall gains occur.</p>
<table-wrap id="T6" position="float">
<label>TABLE 6</label>
<caption>
<p>Contribution of key architectural components.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Methods</th>
<th rowspan="2" align="center">Params <inline-formula id="inf520">
<mml:math id="m548">
<mml:mrow>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (M)</th>
<th colspan="2" align="center">ISIC18</th>
<th colspan="2" align="center">Synapse</th>
</tr>
<tr>
<th align="center">DSC <inline-formula id="inf521">
<mml:math id="m549">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">mIoU <inline-formula id="inf522">
<mml:math id="m550">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">DSC <inline-formula id="inf523">
<mml:math id="m551">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">HD95 <inline-formula id="inf524">
<mml:math id="m552">
<mml:mrow>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">ViM (Baseline)</td>
<td align="center">26.35</td>
<td align="center">89.90</td>
<td align="center">82.08</td>
<td align="center">78.73</td>
<td align="center">23.09</td>
</tr>
<tr>
<td align="left">ViM &#x2b; SSR (Ours)</td>
<td align="center">38.66</td>
<td align="center">
<bold>92.22</bold>
</td>
<td align="center">
<bold>85.65</bold>
</td>
<td align="center">
<bold>83.04</bold>
</td>
<td align="center">
<bold>15.68</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Top results are highlighted in bold. Params denote the total number of parameters (M). This table directly compares ViM (Baseline) and ViM &#x2b; SSR under the same settings to illustrate the accuracy&#x2013;complexity trade-off.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T7" position="float">
<label>TABLE 7</label>
<caption>
<p>Efficacy of different spatial-mixing mechanisms.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Methods</th>
<th rowspan="2" align="center">Params <inline-formula id="inf525">
<mml:math id="m553">
<mml:mrow>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (M)</th>
<th colspan="2" align="center">ISIC18</th>
<th colspan="2" align="center">Synapse</th>
</tr>
<tr>
<th align="center">DSC <inline-formula id="inf526">
<mml:math id="m554">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">mIoU <inline-formula id="inf527">
<mml:math id="m555">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">DSC <inline-formula id="inf528">
<mml:math id="m556">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">HD95 <inline-formula id="inf529">
<mml:math id="m557">
<mml:mrow>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Bi-WKV</td>
<td align="center">38.19</td>
<td align="center">91.14</td>
<td align="center">84.82</td>
<td align="center">81.42</td>
<td align="center">16.30</td>
</tr>
<tr>
<td align="left">Re-WKV</td>
<td align="center">38.19</td>
<td align="center">91.31</td>
<td align="center">85.08</td>
<td align="center">81.59</td>
<td align="center">16.19</td>
</tr>
<tr>
<td align="left">Zigzag-WKV</td>
<td align="center">38.19</td>
<td align="center">91.39</td>
<td align="center">85.17</td>
<td align="center">82.19</td>
<td align="center">16.04</td>
</tr>
<tr>
<td align="left">CZ-WKV (Ours)</td>
<td align="center">38.66</td>
<td align="center">
<bold>92.22</bold>
</td>
<td align="center">
<bold>85.65</bold>
</td>
<td align="center">
<bold>83.04</bold>
</td>
<td align="center">
<bold>15.68</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Top results are highlighted in bold. Params denote the total number of parameters (M). The compared variants have near-identical parameter counts because only the spatial-mixing mechanism is altered within a fixed architecture.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T8" position="float">
<label>TABLE 8</label>
<caption>
<p>Effectiveness of different scanning schemes.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Methods</th>
<th rowspan="2" align="center">Params <inline-formula id="inf530">
<mml:math id="m558">
<mml:mrow>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (M)</th>
<th colspan="2" align="center">ISIC18</th>
<th colspan="2" align="center">Synapse</th>
</tr>
<tr>
<th align="center">DSC <inline-formula id="inf531">
<mml:math id="m559">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">mIoU <inline-formula id="inf532">
<mml:math id="m560">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">DSC <inline-formula id="inf533">
<mml:math id="m561">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">HD95 <inline-formula id="inf534">
<mml:math id="m562">
<mml:mrow>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Sweep</td>
<td align="center">38.19</td>
<td align="center">91.64</td>
<td align="center">84.38</td>
<td align="center">81.08</td>
<td align="center">16.77</td>
</tr>
<tr>
<td align="left">CZ-Scan-1 (Single-Dir)</td>
<td align="center">38.66</td>
<td align="center">91.96</td>
<td align="center">84.82</td>
<td align="center">81.71</td>
<td align="center">16.34</td>
</tr>
<tr>
<td align="left">CZ-Scan-1 (Alt-Dir)</td>
<td align="center">38.66</td>
<td align="center">92.11</td>
<td align="center">85.21</td>
<td align="center">82.26</td>
<td align="center">16.07</td>
</tr>
<tr>
<td align="left">CZ-Scan-4 (Single-Dir)</td>
<td align="center">38.66</td>
<td align="center">92.15</td>
<td align="center">85.37</td>
<td align="center">82.58</td>
<td align="center">15.86</td>
</tr>
<tr>
<td align="left">CZ-Scan-4 (Alt-Dir)</td>
<td align="center">38.66</td>
<td align="center">
<bold>92.22</bold>
</td>
<td align="center">
<bold>85.65</bold>
</td>
<td align="center">
<bold>83.04</bold>
</td>
<td align="center">
<bold>15.68</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Top results are highlighted in bold. Params denote the total number of parameters (M). Scan scheduling changes the traversal order and does not introduce additional learnable parameters.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T9" position="float">
<label>TABLE 9</label>
<caption>
<p>Sensitivity to recursion depth.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Setting</th>
<th rowspan="2" align="center">Params <inline-formula id="inf535">
<mml:math id="m563">
<mml:mrow>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (M)</th>
<th colspan="2" align="center">ISIC18</th>
<th colspan="2" align="center">Synapse</th>
</tr>
<tr>
<th align="center">DSC <inline-formula id="inf536">
<mml:math id="m564">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">mIoU <inline-formula id="inf537">
<mml:math id="m565">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">DSC <inline-formula id="inf538">
<mml:math id="m566">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">HD95 <inline-formula id="inf539">
<mml:math id="m567">
<mml:mrow>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">m &#x3d; 1</td>
<td align="center">38.66</td>
<td align="center">91.21</td>
<td align="center">83.99</td>
<td align="center">82.32</td>
<td align="center">16.17</td>
</tr>
<tr>
<td align="left">m &#x3d; 2</td>
<td align="center">38.66</td>
<td align="center">91.35</td>
<td align="center">84.22</td>
<td align="center">82.58</td>
<td align="center">16.03</td>
</tr>
<tr>
<td align="left">m &#x3d; 3</td>
<td align="center">38.66</td>
<td align="center">91.58</td>
<td align="center">84.60</td>
<td align="center">82.82</td>
<td align="center">15.84</td>
</tr>
<tr>
<td align="left">
<bold>m &#x3d; 4</bold>
</td>
<td align="center">38.66</td>
<td align="center">
<bold>92.22</bold>
</td>
<td align="center">
<bold>85.65</bold>
</td>
<td align="center">
<bold>83.04</bold>
</td>
<td align="center">
<bold>15.68</bold>
</td>
</tr>
<tr>
<td align="left">m &#x3d; 5</td>
<td align="center">38.66</td>
<td align="center">91.80</td>
<td align="center">84.96</td>
<td align="center">82.72</td>
<td align="center">15.81</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Top results are highlighted in bold. Params denote the total number of parameters (M). Varying the recursion depth <inline-formula id="inf540">
<mml:math id="m568">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> changes the number of recursion steps but does not add learnable parameters.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T10" position="float">
<label>TABLE 10</label>
<caption>
<p>Impact of the NRC structure.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Methods</th>
<th rowspan="2" align="center">Params <inline-formula id="inf541">
<mml:math id="m569">
<mml:mrow>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (M)</th>
<th colspan="2" align="center">ISIC18</th>
<th colspan="2" align="center">Synapse</th>
</tr>
<tr>
<th align="center">DSC <inline-formula id="inf542">
<mml:math id="m570">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">mIoU <inline-formula id="inf543">
<mml:math id="m571">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">DSC <inline-formula id="inf544">
<mml:math id="m572">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">HD95 <inline-formula id="inf545">
<mml:math id="m573">
<mml:mrow>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">w/o NRC-Inner</td>
<td align="center">37.89</td>
<td align="center">91.08</td>
<td align="center">83.01</td>
<td align="center">81.35</td>
<td align="center">16.33</td>
</tr>
<tr>
<td align="left">w/o NRC-Outer</td>
<td align="center">37.89</td>
<td align="center">91.44</td>
<td align="center">83.53</td>
<td align="center">82.03</td>
<td align="center">16.05</td>
</tr>
<tr>
<td align="left">NRC-Inner <inline-formula id="inf546">
<mml:math id="m574">
<mml:mrow>
<mml:mo>&#x2192;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> DWConv</td>
<td align="center">37.92</td>
<td align="center">91.87</td>
<td align="center">84.16</td>
<td align="center">82.57</td>
<td align="center">15.87</td>
</tr>
<tr>
<td align="left">NRC (Ours)</td>
<td align="center">38.66</td>
<td align="center">
<bold>92.22</bold>
</td>
<td align="center">
<bold>85.65</bold>
</td>
<td align="center">
<bold>83.04</bold>
</td>
<td align="center">
<bold>15.68</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Top results are highlighted in bold. Params denote the total number of parameters (M). Minor parameter differences arise from removing or substituting NRC pathways (e.g., replacing the inner pathway with DWConv).</p>
</fn>
</table-wrap-foot>
</table-wrap>
<sec id="s4-6-1">
<label>4.6.1</label>
<title>Efficacy of the SSR core operator</title>
<p>To determine if the proposed SSR operator mitigates spatial information loss during the 2D-to-1D sequence conversion, we compared its performance against the native S6 operator from the original ViM framework. As shown in <xref ref-type="table" rid="T6">Table 6</xref>, replacing the standard S6 operator with SSR yields substantial performance gains, accompanied by a moderate increase in parameter count. On the ISIC 2018 dataset, DSC and mIoU improved from 89.90% and 82.08% to 92.22% and 85.65%, respectively. On the Synapse multi-organ CT dataset, DSC increased from 78.73% to 83.04%, while HD95 decreased from 23.09 mm to 15.68 mm. These results indicate that the performance improvements introduced by SSR are highly cost-effective relative to the added model capacity. The nested recursion and direction-aware mechanisms effectively suppress the inherent &#x201c;stripe bias&#x201d; of unidirectional scanning, better preserving spatial adjacency during sequence serialization and enabling more precise delineation of complex anatomical structures, particularly those with long boundaries or weak contrast.</p>
</sec>
<sec id="s4-6-2">
<label>4.6.2</label>
<title>Comparison of attention-like kernels</title>
<p>To assess the efficacy of different spatial-mixing mechanisms for dynamic context aggregation, we compared several variants of the selective scan (WKV) kernel within a fixed architectural configuration. As presented in <xref ref-type="table" rid="T7">Table 7</xref>, the compared variants exhibit nearly identical parameter counts, isolating the effect of the spatial-mixing strategy itself. Performance improves progressively from Bi-WKV to Re-WKV and Zigzag-WKV, culminating in the highest accuracy with the proposed CZ-WKV. On the Synapse multi-organ CT dataset, CZ-WKV improves DSC by &#x2b;1.62, &#x2b;1.45, and &#x2b;0.85 points over Bi-WKV, Re-WKV, and Zigzag-WKV, respectively, while also reducing HD95. This monotonic performance improvement underscores the effectiveness of cyclic recursion combined with zigzag scanning in enhancing long-range context aggregation with only a negligible increase in model complexity.</p>
</sec>
<sec id="s4-6-3">
<label>4.6.3</label>
<title>Impact of scan scheduling strategies</title>
<p>To assess the contribution of the scanning strategy to spatial context modeling, we compared the conventional Sweep scanning algorithm with our proposed Zigzag scheme in both unidirectional (Single-Dir) and alternating-direction (Alt-Dir) configurations. As detailed in <xref ref-type="table" rid="T8">Table 8</xref>, all Zigzag-based variants share identical parameter counts, allowing a fair comparison focused solely on scanning behavior. A consistent trend emerged across both datasets. First, all Zigzag scan variants (CZ-Scan) consistently outperformed the Sweep baseline, confirming that Zigzag paths better preserve spatial adjacency during sequence serialization. Second, within the Zigzag schemes, the Alt-Dir configuration consistently surpassed the Single-Dir one. On Synapse, for example, CZ-Scan-4 (Alt-Dir) increased DSC from 82.58% to 83.04% and reduced HD95 from 15.86 mm to 15.68 mm relative to its single-direction counterpart. This performance gain can be attributed to a &#x201c;cross-path gap-filling&#x201d; effect, wherein alternating scan directions correct for perceptual blind spots inherent in any single-direction scan. This mechanism significantly improves the continuity of segmented boundaries, particularly for intricate anatomical structures. These results empirically validate that the Zigzag scanning strategy enhances directional robustness while maintaining spatial coherence. Crucially, these findings demonstrate that substantial improvements in boundary continuity and directional robustness can be achieved solely through strategic scan scheduling, without increasing model capacity.</p>
</sec>
<sec id="s4-6-4">
<label>4.6.4</label>
<title>Sensitivity to recursion depth</title>
<p>To determine the optimal recursion depth for modeling long-range feature interactions, we conducted a sensitivity analysis on the number of recursion steps, m. As reported in <xref ref-type="table" rid="T9">Table 9</xref>, all tested settings share identical parameter counts. Performance improved monotonically as m increased from 1 to 4, peaking at m &#x3d; 4 (e.g., DSC on ISIC 2018 rose from 91.21% to 92.22%). However, performance slightly declined at m &#x3d; 5 across both datasets. This finding suggests that a moderate increase in recursion efficiently expands the model&#x2019;s effective receptive field, enabling more comprehensive aggregation of global contextual information. Conversely, excessive depth (m &#x3d; 5) appears to introduce optimization challenges or result in saturated gains. Accordingly, we set m &#x3d; 4 as the default, empirically optimal setting, which balances receptive-field expansion with training stability while preserving linear time complexity.</p>
</sec>
<sec id="s4-6-5">
<label>4.6.5</label>
<title>Role of the NRC coordination mechanism</title>
<p>To elucidate the synergistic interaction between the inner and outer pathways of the NRC, we conducted systematic path removal and substitution experiments. As shown in <xref ref-type="table" rid="T10">Table 10</xref>, the ablation variants introduce only minor differences in parameter count, allowing a fair assessment of performance changes attributable to architectural coordination rather than increased model capacity. Despite these minimal parameter variations, removing or simplifying either pathway leads to pronounced performance degradation, revealing a clear performance hierarchy. Specifically, the complete nested structure achieves optimal performance, replacing the inner path with a depthwise convolution (DWConv) degrades accuracy, and ablating either the outer or inner pathway causes a more substantial drop. This gradient demonstrates that both pathways are indispensable and that their hierarchical coordination outperforms simpler parallel or partially ablated designs. These results suggest a functional division of labor within the NRC. The inner pathway focuses on restoring fine-grained local details, while the outer pathway leverages these cues to enhance global structural coherence. The &#x201c;inside-out&#x201d; residual injection mechanism is therefore essential for effectively integrating local and global context, enabling the nested architecture to deliver improved performance without relying on increased model capacity.</p>
</sec>
</sec>
<sec id="s4-7">
<label>4.7</label>
<title>Discussion</title>
<p>Conventional segmentation frameworks, including those based on CNNs, Transformers, and SSMs (<xref ref-type="bibr" rid="B29">Ronneberger et al., 2015</xref>; <xref ref-type="bibr" rid="B15">Han et al., 2023</xref>; <xref ref-type="bibr" rid="B25">Liu Y. et al., 2024</xref>), universally rely on serializing two-dimensional feature maps into one-dimensional sequences. However, this serialization process disrupts the inherent spatial adjacency critical for accurate boundary localization, particularly in medical images where anatomical structures are elongated, tortuous, or have low contrast. This disruption manifests as stripe-like artifacts and inconsistencies along fine boundaries, resulting in higher (worse) HD95 scores on datasets like Synapse and fragmented lesion contours in ISIC. Consequently, the serialization bottleneck, rather than model capacity, emerges as the primary factor limiting boundary fidelity in these architectures.</p>
<p>To address this limitation, <inline-formula id="inf547">
<mml:math id="m575">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM restores two-dimensional spatial adjacency by introducing SSR and the CZ-WKV. By integrating direction-aware recursive updates with a zigzag-based sequence modeling strategy, <inline-formula id="inf548">
<mml:math id="m576">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM mitigates the anisotropy induced by conventional scanning methods. This dual approach enhances long-range dependency learning while preserving spatial adjacency during feature unfolding, ensuring a consistent and stable boundary representation. This is particularly effective for structures with irregular or elongated morphologies, as demonstrated in our experiments.</p>
<sec id="s4-7-1">
<label>4.7.1</label>
<title>Clinical significance</title>
<p>The model&#x2019;s enhanced boundary preservation and robustness for low-contrast or slender anatomical structures hold substantial clinical value. Accurate delineation of irregular skin lesions enables more reliable tumor burden estimation, while clearer breast mass boundaries can reduce diagnostic uncertainty in ultrasound imaging. Furthermore, more consistent multi-organ segmentation is crucial for improving the precision of radiotherapy planning and preoperative assessments. By reducing boundary fragmentation and improving regional continuity, <inline-formula id="inf549">
<mml:math id="m577">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM can potentially decrease the manual correction workload for radiologists and enhance the reliability of downstream quantitative analyses.</p>
</sec>
<sec id="s4-7-2">
<label>4.7.2</label>
<title>Limitations and future directions</title>
<p>Despite its strong performance, <inline-formula id="inf550">
<mml:math id="m578">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM has several limitations that suggest avenues for future research. First, the model exhibits slightly reduced accuracy on small or highly variable organs (e.g., the pancreas and gallbladder), highlighting the need for more powerful fine-grained feature modeling. Second, although the <inline-formula id="inf551">
<mml:math id="m579">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-step recursive updates in CZ-WKV enhance directional context aggregation, they yield diminishing returns with increasing recursion depth; adaptive or data-driven recursion strategies could better balance computational cost and performance. Third, while zigzag scanning effectively restores spatial adjacency, its multi-path scheduling increases architectural complexity. More lightweight or learnable scanning schemes could reduce this structural overhead while preserving directional robustness.</p>
<p>Moreover, our current design primarily targets 2D feature serialization and thus does not explicitly model inter-slice correlations in volumetric data. Given the near-linear computational complexity of <inline-formula id="inf552">
<mml:math id="m580">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM, a promising direction is to extend the framework to direct 3D volumetric segmentation or video-based analysis. The proposed zigzag scanning strategy could be naturally generalized to 3D traversals (e.g., volumetric space-filling scheduling) to preserve adjacency along all three axes. This can be further combined with depth-wise recursive state propagation to capture cross-slice anatomical continuity without incurring the prohibitive memory cost of 3D Transformers. Similarly, for medical video segmentation (e.g., in ultrasound or endoscopic videos), the recursion-enhanced mechanism in SSR can be extended along the time axis by propagating the global state across frames while retaining spatial zigzag unfolding within each frame, enabling robust spatiotemporal consistency under motion and deformation.</p>
<p>Finally, large-scale evaluation on multi-center datasets representing diverse clinical scenarios is necessary to fully validate the generalizability of <inline-formula id="inf553">
<mml:math id="m581">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM and to facilitate its translation into real-world medical workflows.</p>
</sec>
</sec>
</sec>
<sec sec-type="conclusion" id="s5">
<label>5</label>
<title>Conclusion</title>
<p>This paper proposes a recursion-enhanced visual state space model, <inline-formula id="inf554">
<mml:math id="m582">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM, that addresses the loss of two-dimensional spatial adjacency and directional continuity arising from the serialization of medical images into one-dimensional sequences. Through its innovative SSR and CZ-WKV modules, <inline-formula id="inf555">
<mml:math id="m583">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM restores these critical spatial relationships. This approach enables the seamless integration of fine-grained local details with global semantic context while maintaining near-linear computational complexity. Evaluated across four imaging domains (dermatoscopic, ultrasound, endoscopic, and multi-organ CT) on five public datasets, <inline-formula id="inf556">
<mml:math id="m584">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM consistently outperforms representative CNN, Transformer, and SSM baselines. For instance, compared to the strong CC-ViM baseline, <inline-formula id="inf557">
<mml:math id="m585">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM improves the DSC by 2.32% on the ISIC 2018 dataset and reduces the HD95 by 2.15 mm on the Synapse multi-organ CT dataset. These quantitative improvements underscore the model&#x2019;s superior capacity for preserving fine boundaries, maintaining structural continuity, and robustly segmenting challenging anatomical regions, such as those that are elongated or exhibit low contrast. By preserving spatial adjacency during sequence modeling and ensuring coherent information propagation along complex anatomical structures, <inline-formula id="inf558">
<mml:math id="m586">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>ZR</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>ViM achieves robust long-range dependency modeling and precise boundary representation. This combination of high accuracy and computational efficiency makes the framework particularly valuable for clinical workflows requiring reliable boundary delineation, especially under resource constraints. Future research will extend this approach to 3D volumetric segmentation and explore adaptive or learnable scanning strategies. We also plan to investigate the benefits of large-scale pre-training, multimodal data integration, and rigorous multi-center clinical validation to further enhance the model&#x2019;s robustness and generalizability for real-world medical applications.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: The datasets analyzed for this study can be found in the following public repositories. All links provided were accessible at the time of manuscript submission. 1. ISIC 2018 Dataset Repository/Platform: International Skin Imaging Collaboration (ISIC) Archive Direct URL: <ext-link ext-link-type="uri" xlink:href="https://challenge.isic-archive.com/data/#2018">https://challenge.isic-archive.com/data/&#x23;2018</ext-link> Reference/Citation: Codella, N., Rotemberg, V., Tschandl, P., Celebi, M. E., Dusza, S., Gutman, D., and Halpern, A. (2019). Skin lesion analysis toward melanoma detection 2018: A challenge hosted by the international skin imaging collaboration (ISIC). arXiv preprint arXiv:1902.03368. 2. BUSI (Breast Ultrasound Images) Dataset Repository/Platform: Cairo University Scholar Direct URL: <ext-link ext-link-type="uri" xlink:href="https://scholar.cu.edu.eg/?q=afahmy/pages/dataset">https://scholar.cu.edu.eg/?q=afahmy/pages/dataset</ext-link> Reference/Citation: Al-Dhabyani, W., Gomaa, M., Khaled, H., &#x26; Fahmy, A. (2020). Dataset of breast ultrasound images. Data in Brief, 28, 104863. 3. CVC-ClinicDB Dataset Repository/Platform: EndoVis (MICCAI Endoscopic Vision) Grand Challenge Direct URL: <ext-link ext-link-type="uri" xlink:href="https://polyp.grand-challenge.org/CVCClinicDB/">https://polyp.grand-challenge.org/CVCClinicDB/</ext-link> Reference/Citation: Bernal, J., S&#xe1;nchez, F. J., Fern&#xe1;ndez-Esparrach, G., Gil, D., Rodr&#xed;guez, C., &#x26; Vilari&#xf1;o, F. (2015). WM-DOVA maps for accurate polyp highlighting in colonoscopy: Validation vs. saliency maps from physicians. Computerized Medical Imaging and Graphics, 43, 99-111. 4. 
Synapse Multi-Organ CT Dataset Repository/Platform: Synapse Direct URL: <ext-link ext-link-type="uri" xlink:href="https://www.synapse.org/#!Synapse:syn3193805/wiki/217789">https://www.synapse.org/&#x23;!Synapse:syn3193805/wiki/217789</ext-link> Reference/Citation: Landman, B., Xu, Z., Iglesias, J. E., Styner, M., Langerak, T., &#x26; Klein, A. (2015). MICCAI multi-atlas labeling beyond the cranial vault&#x2013;workshop and challenge. In Proc. of the MICCAI Multi-Atlas Labeling Beyond Cranial Vault Workshop. The data supporting the findings of this study are publicly available from the repositories mentioned above. The corresponding accession numbers, DOIs, or direct links are provided in the list.</p>
</sec>
<sec sec-type="ethics-statement" id="s7">
<title>Ethics statement</title>
<p>This study involved secondary analysis of existing, de-identified, and publicly available datasets. No new data from human participants were collected directly by the authors. The use of such data for research purposes is exempt from ethical approval according to our institutional guidelines and international regulations (e.g., the Declaration of Helsinki) governing the use of publicly available retrospective data. The studies were conducted in accordance with the local legislation and institutional requirements. Written informed consent for participation was not required from the participants or the participants&#x2019; legal guardians/next of kin in accordance with the national legislation and institutional requirements because the research involved the secondary analysis of existing, de-identified, and publicly available medical image datasets. All datasets used (e.g., ISIC, BUSI, CVC-ClinicDB, Synapse) are publicly accessible for non-commercial research purposes. Ethical approval and participant informed consent were obtained in the original studies in which these datasets were created. No personally identifiable information is accessible to the authors, and no new data were collected from human participants directly for this work. Therefore, the requirement for written informed consent was waived.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>CH: Project administration, Methodology, Formal Analysis, Investigation, Writing &#x2013; review and editing, Conceptualization, Funding acquisition, Supervision. CX: Visualization, Writing &#x2013; original draft, Validation, Data curation, Conceptualization, Methodology, Investigation, Software. LL: Resources, Data curation, Project administration, Funding acquisition, Writing &#x2013; review and editing. XZ: Project administration, Data curation, Funding acquisition, Resources, Writing &#x2013; review and editing.</p>
</sec>
<ack>
<title>Acknowledgements</title>
<p>Many thanks to the staff of the Department of Traditional Chinese Medicine at Zigong First People&#x2019;s Hospital for their help in this work.</p>
</ack>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s11">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Al-Dhabyani</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Gomaa</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Khaled</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Fahmy</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Dataset of breast ultrasound images</article-title>. <source>Data Brief</source> <volume>28</volume>, <fpage>104863</fpage>. <pub-id pub-id-type="doi">10.1016/j.dib.2019.104863</pub-id>
<pub-id pub-id-type="pmid">31867417</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>W.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>Deep ensemble learning-driven fully automated multi-structure segmentation for precision craniomaxillofacial surgery</article-title>. <source>Front. Bioeng. Biotechnol.</source> <volume>13</volume>, <fpage>1580502</fpage>. <pub-id pub-id-type="doi">10.3389/fbioe.2025.1580502</pub-id>
<pub-id pub-id-type="pmid">40406586</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bernal</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>S&#xe1;nchez</surname>
<given-names>F. J.</given-names>
</name>
<name>
<surname>Fern&#xe1;ndez-Esparrach</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Gil</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Rodr&#xed;guez</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Vilari&#xf1;o</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Wm-dova maps for accurate polyp highlighting in colonoscopy: validation vs. saliency maps from physicians</article-title>. <source>Comput. Medical Imaging Graphics</source> <volume>43</volume>, <fpage>99</fpage>&#x2013;<lpage>111</lpage>. <pub-id pub-id-type="doi">10.1016/j.compmedimag.2015.02.007</pub-id>
<pub-id pub-id-type="pmid">25863519</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Berseth</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Isic 2017-skin lesion analysis towards melanoma detection</article-title>. <pub-id pub-id-type="doi">10.48550/arXiv.1703.00523</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Cao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>Q.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). &#x201c;<article-title>Swin-unet: unet-like pure transformer for medical image segmentation</article-title>,&#x201d; in <conf-name>European conference on computer vision</conf-name> (<publisher-name>Springer</publisher-name>), <fpage>205</fpage>&#x2013;<lpage>218</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-031-25066-8_9</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chan</surname>
<given-names>H.-P.</given-names>
</name>
<name>
<surname>Samala</surname>
<given-names>R. K.</given-names>
</name>
<name>
<surname>Hadjiiski</surname>
<given-names>L. M.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Deep learning in medical image analysis</article-title>. <source>Adv. Exp. Med. Biol.</source> <volume>1213</volume>, <fpage>3</fpage>&#x2013;<lpage>21</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-33128-3_1</pub-id>
<pub-id pub-id-type="pmid">32030660</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Mei</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>Q.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Transunet: rethinking the u-net architecture design for medical image segmentation through the lens of transformers</article-title>. <source>Med. Image Anal.</source> <volume>97</volume>, <fpage>103280</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2024.103280</pub-id>
<pub-id pub-id-type="pmid">39096845</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>Zig-rir: Zigzag rwkv-in-rwkv for efficient medical image segmentation</article-title>. <source>IEEE Trans. Med. Imaging</source> <volume>44</volume>, <fpage>3245</fpage>&#x2013;<lpage>3257</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2025.3561797</pub-id>
<pub-id pub-id-type="pmid">40244838</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Codella</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Rotemberg</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Tschandl</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Celebi</surname>
<given-names>M. E.</given-names>
</name>
<name>
<surname>Dusza</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Gutman</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Skin lesion analysis toward melanoma detection 2018: a challenge hosted by the international skin imaging collaboration (isic)</article-title>. <pub-id pub-id-type="doi">10.48550/arXiv.1902.03368</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dong</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Tc-net: dual coding network of transformer and cnn for skin lesion segmentation</article-title>. <source>Plos One</source> <volume>17</volume>, <fpage>e0277578</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0277578</pub-id>
<pub-id pub-id-type="pmid">36409714</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dong</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>I. Y.</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>AEmmamba: an efficient medical segmentation model with edge enhancement</article-title>. <source>IEEE J. Biomed. Health Inf.</source> <fpage>1</fpage>&#x2013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1109/JBHI.2025.3572088</pub-id>
<pub-id pub-id-type="pmid">40397628</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Duan</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Vision-rwkv: efficient and scalable visual perception with rwkv-like architectures</article-title>. <pub-id pub-id-type="doi">10.48550/arXiv.2403.02308</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Slicemamba with neural architecture search for medical image segmentation</article-title>. <source>IEEE J. Biomed. Health Inf.</source> <volume>29</volume>, <fpage>7446</fpage>&#x2013;<lpage>7458</lpage>. <pub-id pub-id-type="doi">10.1109/JBHI.2025.3564381</pub-id>
<pub-id pub-id-type="pmid">40279217</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Gu</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Dao</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Mamba: linear-time sequence modeling with selective state spaces</article-title>,&#x201d; in <source>First conference on language modeling</source>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2312.00752">https://arxiv.org/abs/2312.00752</ext-link> (Accessed May 31, 2024)</comment>.</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Han</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>A survey on vision transformer</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>45</volume>, <fpage>87</fpage>&#x2013;<lpage>110</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2022.3152247</pub-id>
<pub-id pub-id-type="pmid">35180075</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hatamizadeh</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Nath</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Myronenko</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Landman</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). &#x201c;<article-title>Unetr: transformers for 3d medical image segmentation</article-title>,&#x201d; in <conf-name>2022 IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)</conf-name>, <fpage>1748</fpage>&#x2013;<lpage>1758</lpage>. <pub-id pub-id-type="doi">10.1109/WACV51458.2022.00181</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Deep residual learning for image recognition</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>, <fpage>770</fpage>&#x2013;<lpage>778</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2016.90</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Missformer: an effective transformer for 2d medical image segmentation</article-title>. <source>IEEE Trans. Med. Imaging</source> <volume>42</volume>, <fpage>1484</fpage>&#x2013;<lpage>1494</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2022.3230943</pub-id>
<pub-id pub-id-type="pmid">37015444</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Isensee</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Jaeger</surname>
<given-names>P. F.</given-names>
</name>
<name>
<surname>Kohl</surname>
<given-names>S. A.</given-names>
</name>
<name>
<surname>Petersen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Maier-Hein</surname>
<given-names>K. H.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>nnu-net: a self-configuring method for deep learning-based biomedical image segmentation</article-title>. <source>Nat. Methods</source> <volume>18</volume>, <fpage>203</fpage>&#x2013;<lpage>211</lpage>. <pub-id pub-id-type="doi">10.1038/s41592-020-01008-z</pub-id>
<pub-id pub-id-type="pmid">33288961</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Jha</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Smedsrud</surname>
<given-names>P. H.</given-names>
</name>
<name>
<surname>Riegler</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Halvorsen</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>De Lange</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Johansen</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Kvasir-seg: a segmented polyp dataset</article-title> in <conf-name>International conference on multimedia modeling</conf-name> (<publisher-name>Springer</publisher-name>), <fpage>451</fpage>&#x2013;<lpage>462</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-37734-2_37</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Landman</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Iglesias</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Styner</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Langerak</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Klein</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Miccai multi-atlas labeling beyond the cranial vault&#x2013;workshop and challenge</article-title>,&#x201d; in <conf-name>Proc. MICCAI multi-atlas labeling beyond cranial vault&#x2014;workshop challenge</conf-name> (<publisher-loc>Munich, Germany</publisher-loc>), <fpage>12</fpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2103.10504</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Litjens</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Kooi</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Bejnordi</surname>
<given-names>B. E.</given-names>
</name>
<name>
<surname>Setio</surname>
<given-names>A. A. A.</given-names>
</name>
<name>
<surname>Ciompi</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Ghafoorian</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>A survey on deep learning in medical image analysis</article-title>. <source>Med. Image Analysis</source> <volume>42</volume>, <fpage>60</fpage>&#x2013;<lpage>88</lpage>. <pub-id pub-id-type="doi">10.1016/j.media.2017.07.005</pub-id>
<pub-id pub-id-type="pmid">28778026</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). &#x201c;<article-title>Swin transformer: hierarchical vision transformer using shifted windows</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>, <fpage>10012</fpage>&#x2013;<lpage>10022</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV48922.2021.00986</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>H.-Y.</given-names>
</name>
<name>
<surname>Xi</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). &#x201c;<article-title>Swin-umamba: Mamba-based unet with imagenet-based pretraining</article-title>,&#x201d; in <conf-name>International conference on medical image computing and computer-assisted intervention</conf-name> (<publisher-name>Springer</publisher-name>), <fpage>615</fpage>&#x2013;<lpage>625</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-031-72114-4_59</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Vmamba: visual state space model</article-title>. <source>Adv. Neural Information Processing Systems</source> <volume>37</volume>, <fpage>103031</fpage>&#x2013;<lpage>103063</lpage>. <pub-id pub-id-type="doi">10.5555/3737916.3741189</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Sa-umamba: spatial attention convolutional neural networks for medical image segmentation</article-title>. <source>PLoS One</source> <volume>20</volume>, <fpage>e0325899</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0325899</pub-id>
<pub-id pub-id-type="pmid">40504872</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="web">
<person-group person-group-type="author">
<name>
<surname>Oktay</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Schlemper</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Folgoc</surname>
<given-names>L. L.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Heinrich</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Misawa</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Attention u-net: learning where to look for the pancreas</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1804.03999">http://arxiv.org/abs/1804.03999</ext-link>
</comment> (<comment>Accessed May 20, 2018</comment>).</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Perazzi</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Pont-Tuset</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>McWilliams</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Van Gool</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Gross</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sorkine-Hornung</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>A benchmark dataset and evaluation methodology for video object segmentation</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>, <fpage>724</fpage>&#x2013;<lpage>732</lpage>.</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ronneberger</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Fischer</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Brox</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>U-net: convolutional networks for biomedical image segmentation</article-title>,&#x201d; <source>Int. Conf. Med. Image Computing Computer-Assisted Intervention</source>, <volume>9351</volume>. <publisher-name>Springer</publisher-name>, <fpage>234</fpage>&#x2013;<lpage>241</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-24574-4_28</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ruan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Xiang</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2024</year>). <source>Vm-unet: vision mamba unet for medical image segmentation</source>. <publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>. <pub-id pub-id-type="doi">10.48550/arXiv.2402.02491</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shamshad</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Khan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zamir</surname>
<given-names>S. W.</given-names>
</name>
<name>
<surname>Khan</surname>
<given-names>M. H.</given-names>
</name>
<name>
<surname>Hayat</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Khan</surname>
<given-names>F. S.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Transformers in medical imaging: a survey</article-title>. <source>Med. Image Analysis</source> <volume>88</volume>, <fpage>102802</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2023.102802</pub-id>
<pub-id pub-id-type="pmid">37315483</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Annotation-efficient deep learning for automatic medical image segmentation</article-title>. <source>Nat. Communications</source> <volume>12</volume>, <fpage>5915</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-021-26216-9</pub-id>
<pub-id pub-id-type="pmid">34625565</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ning</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Ultralight vm-unet: parallel vision mamba significantly reduces parameters for skin lesion segmentation</article-title>. <source>Patterns</source> <volume>6</volume>, <fpage>101298</fpage>. <pub-id pub-id-type="doi">10.1016/j.patter.2025.101298</pub-id>
<pub-id pub-id-type="pmid">41328156</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Xiao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Lian</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Weighted res-unet for high-quality retina vessel segmentation</article-title>,&#x201d; in <conf-name>2018 9th international conference on information technology in medicine and education (ITME)</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>327</fpage>&#x2013;<lpage>331</lpage>. <pub-id pub-id-type="doi">10.1109/ITME.2018.00080</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Lou</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Berhanu</surname>
<given-names>T. F.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>Her-seg: holistically efficient segmentation for high-resolution medical images</article-title>. <pub-id pub-id-type="doi">10.48550/arXiv.2504.06205</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Restore-rwkv: efficient and effective medical image restoration with rwkv</article-title>. <source>IEEE J. Biomed. Health Inf.</source> <volume>30</volume>, <fpage>1</fpage>&#x2013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1109/JBHI.2025.3588555</pub-id>
<pub-id pub-id-type="pmid">40663663</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Transfuse: fusing transformers and cnns for medical image segmentation</article-title>,&#x201d; in <conf-name>International conference on medical image computing and computer-assisted intervention</conf-name> (<publisher-name>Springer</publisher-name>), <fpage>14</fpage>&#x2013;<lpage>24</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-87193-2_2</pub-id>
</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Rahman Siddiquee</surname>
<given-names>M. M.</given-names>
</name>
<name>
<surname>Tajbakhsh</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Unet&#x2b;&#x2b;: a nested u-net architecture for medical image segmentation</article-title>,&#x201d; in <source>Deep Learn. Med. Image Anal. Multimodal Learn. Clin. Decis. Support</source> (<publisher-name>Springer</publisher-name>), <fpage>3</fpage>&#x2013;<lpage>11</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-00889-5_1</pub-id>
</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Xiao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Ning</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Rwkv-based encoder-decoder model for code completion</article-title>,&#x201d; in <conf-name>2023 3rd International Conference on Electronic Information Engineering and Computer (EIECT)</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>425</fpage>&#x2013;<lpage>428</lpage>. <pub-id pub-id-type="doi">10.1109/EIECT60552.2023.10442108</pub-id>
</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Towards understanding convergence and generalization of adamw</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>46</volume>, <fpage>6486</fpage>&#x2013;<lpage>6493</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2024.3382294</pub-id>
<pub-id pub-id-type="pmid">38536692</pub-id>
</mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Merging context clustering with visual state space models for medical image segmentation</article-title>. <source>IEEE Trans. Med. Imaging</source> <volume>44</volume>, <fpage>2131</fpage>&#x2013;<lpage>2142</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2025.3525673</pub-id>
<pub-id pub-id-type="pmid">40030866</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/6849/overview">Badri Roysam</ext-link>, University of Houston, United States</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3210077/overview">Bhavana Jamalpur</ext-link>, SR University, India</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3217226/overview">Shyam Reddy Kotha</ext-link>, CellChorus Inc., United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3326470/overview">Lin Bai</ext-link>, University of Houston, United States</p>
</fn>
</fn-group>
</back>
</article>