<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Med.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Medicine</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Med.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-858X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmed.2026.1759114</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>LBMNet: a hybrid multi-scale CNN&#x02013;Mamba framework for enhanced 3D stroke lesion segmentation in MRI</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Kuang</surname> <given-names>Zhejun</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<uri xlink:href="https://loop.frontiersin.org/people/2766411"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Yan</surname> <given-names>Xingxue</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Yu</surname> <given-names>Jiaxuan</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Sun</surname> <given-names>Dawen</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<uri xlink:href="https://loop.frontiersin.org/people/3280386"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Zhao</surname> <given-names>Jian</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/1696795"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Sun</surname> <given-names>Lei</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>College of Computer Science and Technology, Changchun University</institution>, <city>Changchun</city>, <country country="CN">China</country></aff>
<aff id="aff2"><label>2</label><institution>Jilin Provincial Key Laboratory of Human Health Status Identification Function &#x00026; Enhancement</institution>, <city>Changchun</city>, <country country="CN">China</country></aff>
<aff id="aff3"><label>3</label><institution>Key Laboratory of Intelligent Rehabilitation and Barrier-Free for the Disabled, Changchun University, Ministry of Education</institution>, <city>Changchun</city>, <country country="CN">China</country></aff>
<aff id="aff4"><label>4</label><institution>College of Artificial Intelligence, Nankai University</institution>, <city>Tianjin</city>, <country country="CN">China</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Dawen Sun, <email xlink:href="mailto:sundawen_ccdx@163.com">sundawen_ccdx@163.com</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-09">
<day>09</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>13</volume>
<elocation-id>1759114</elocation-id>
<history>
<date date-type="received">
<day>02</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>16</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>22</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Kuang, Yan, Yu, Sun, Zhao and Sun.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Kuang, Yan, Yu, Sun, Zhao and Sun</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-09">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Brain stroke is one of the leading causes of death and disability worldwide, and accurate lesion segmentation from MRI is critical for clinical diagnosis and treatment planning. However, existing methods struggle with the high variability of stroke lesions in size and morphology. In particular, they fail to detect small lesions due to the limited receptive fields of CNNs and the computational inefficiency of Transformer-based approaches. To address these challenges, we propose LBMNet, a novel CNN&#x02013;Mamba network that integrates multi-scale convolutional encoding with Mamba-based decoding.</p></sec>
<sec>
<title>Methods</title>
<p>Owing to the high heterogeneity of stroke lesions, the encoder design employs a top-down LSC module to capture cross-scale representations. The decoder designs the BSC-Mamba (Bidirectional Spatial Context Mamba) model, integrating bidirectional state space modeling with adaptive spatial convolutions to enhance local feature information while modeling global dependencies with linear complexity. Furthermore, asymmetric adaptive gated feature fusion (BAGF) bridges the semantic gap by selectively merging encoder and decoder features, suppressing redundant information whilst highlighting critical lesion details.</p></sec>
<sec>
<title>Results</title>
<p>Extensive experiments on two benchmark datasets demonstrate state-of-the-art performance, achieving Dice coefficients of 67.57% on ATLAS v2.0 and 82.03% on ISLES 2022. Compared with existing CNN, Transformer, and hybrid models, LBMNet shows significant improvements in small lesion segmentation. This study presents a robust and efficient framework with strong clinical potential for accurate stroke lesion segmentation across diverse lesion sizes and morphologies.</p></sec></abstract>
<kwd-group>
<kwd>brain stroke segmentation</kwd>
<kwd>deep learning</kwd>
<kwd>hybrid architecture</kwd>
<kwd>Mamba</kwd>
<kwd>medical imaging</kwd>
<kwd>MRI</kwd>
<kwd>multi-scale convolution</kwd>
<kwd>state space model</kwd>
</kwd-group>
<funding-group>
 <funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This research was funded by the Natural Science Foundation of Jilin Province (Grant No. YDZJ202501ZYTS589).</funding-statement>
</funding-group>
<counts>
<fig-count count="11"/>
<table-count count="9"/>
<equation-count count="20"/>
<ref-count count="44"/>
<page-count count="18"/>
<word-count count="10512"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Pathology</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Stroke is a leading global cause of disability and mortality (<xref ref-type="bibr" rid="B1">1</xref>, <xref ref-type="bibr" rid="B2">2</xref>). According to the World Stroke Organization, it has become the third leading cause of death globally, with stroke-related fatalities projected to rise by nearly 50% between 2020 and 2050. Etiologically, stroke is categorized into ischemic and hemorrhagic types, with ischemic</p>
<p>stroke being predominant, accounting for approximately 80% of all cases. In clinical practice, rapid and precise lesion segmentation is crucial for treatment planning, formulating rehabilitation strategies, and assessing prognosis (<xref ref-type="bibr" rid="B3">3</xref>). However, stroke lesions on medical images often exhibit pronounced heterogeneity, posing formidable challenges for manual segmentation. Manual annotation depends heavily on the operator&#x00027;s expertise, introduces subjectivity, and is both time-consuming and inconsistent (<xref ref-type="bibr" rid="B4">4</xref>). Consequently, developing efficient and accurate automated segmentation techniques has become an urgent priority in this domain.</p>
<p>Despite the rapid progress of deep learning in medical image segmentation (<xref ref-type="bibr" rid="B5">5</xref>), automatic segmentation of stroke lesions remains challenging. Stroke lesions vary extensively in morphology: they can manifest as small ischemic regions with a volume under 10 cm<sup>3</sup>&#x02014;highly susceptible to omission&#x02014;or as large, irregular hemorrhagic lesions with indistinct boundaries that are difficult to delineate. As illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref>, stroke lesions exhibit substantial variability in size, morphology, and anatomical location, ranging from small focal lesions to large irregular regions, which poses significant challenges for accurate segmentation.</p>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>Representative examples of stroke lesions with diverse sizes, shapes, and anatomical locations. Red regions indicate lesions (ATLAS v2.0).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1759114-g0001.tif">
<alt-text content-type="machine-generated">Six MRI brain scans with outlined red markings indicating specific areas of interest. Three scans are in the top row and three in the bottom row, each highlighting different regions with irregular red boundaries.</alt-text>
</graphic>
</fig>
<p>Due to their limited receptive fields, CNN-based methods cannot capture global context information well. For example, U-Net (<xref ref-type="bibr" rid="B6">6</xref>) and its variants can obtain high Dice scores for medium-to-large lesions, but often miss small lesions due to their limited ability to capture fine-grained features. On the other hand, transformer-based architectures, built on self-attention, can capture global context information well. However, these methods require large-scale annotated datasets, which are hard to obtain. Because labeled samples are scarce in stroke imaging and lesion boundaries are often ambiguous, transformers tend to overfit and consequently perform poorly in boundary delineation (<xref ref-type="bibr" rid="B7">7</xref>). Recently, CNN-Transformer hybrids such as TransUNet (<xref ref-type="bibr" rid="B8">8</xref>) and Swin-UNETR (<xref ref-type="bibr" rid="B9">9</xref>) alleviate the above issues to a certain degree; however, they adopt naive skip connections that simply concatenate features, which introduces redundant information and feature conflicts and degrades robustness under multi-scale lesion conditions.</p>
<p>These limitations indicate that current methods are unable to effectively capture global context information and simultaneously localize the boundaries of small and irregular stroke lesions accurately. CNNs are good at local feature extraction, while Transformers are good at global modeling. Still, these two approaches, used individually or even in a trivial combination, are unable to adaptively integrate the heterogeneous characteristics of stroke lesions.</p>
<p>To address these challenges, we propose LBMNet&#x02014;a three-dimensional stroke lesion segmentation hybrid network tailored for highly heterogeneous lesion-scale scenarios. During the encoding phase, LBMNet employs an LSC module that captures global contextual information through dual large-kernel convolutions, while progressively refining local details via small-kernel cascading processing. This effectively characterizes lesion features ranging from minute to extensively distributed. During decoding, the BSC-Mamba module builds upon the Mamba state space modeling framework (<xref ref-type="bibr" rid="B10">10</xref>) to construct local spatial augmentations. Combined with bidirectional state space scanning, this approach efficiently models long-range dependencies while preserving spatial continuity. These multi-level features are further coordinated through the BAGF (Bidirectional Adaptive Gate Fusion) module in the skip connection. By dynamically integrating complementary information from the encoder and decoder via attention mechanisms, the model achieves precise segmentation of minute and morphologically complex stroke lesions.</p>
<p>To address these challenges, we propose a multi-scale CNN&#x02013;Mamba fusion network (LBMNet) tailored for stroke lesion segmentation. Unlike Transformers with quadratic computational complexity (O(n<sup>2</sup>)), our model leverages Mamba&#x00027;s state-space mechanism (<xref ref-type="bibr" rid="B10">10</xref>) to achieve this goal efficiently. The main contributions are as follows:</p>
<list list-type="order">
<list-item><p>To address the extreme scale variation of stroke lesions-ranging from minute localized foci to extensive diffuse areas-we designed a &#x0201C;coarse-to-fine&#x0201D; LSC module. Unlike standard convolutional neural networks constrained by fixed receptive fields, this architecture employs dual-branch large-kernel convolutions to concurrently capture global semantic information, followed by cascaded small-kernel convolutions to refine local structural details. This dynamic adaptation mechanism ensures precise feature capture across the entire lesion size spectrum, effectively mitigating omissions caused by lesion heterogeneity.</p></list-item>
<list-item><p>To address the inevitable information loss during global context modeling and sequence flattening, we propose BSC-Mamba. As standard state-space models necessitate flattening 3D volumes into 1D sequences, this process disrupts the voxel-level neighborhood structure essential for defining irregular boundaries. BSC-Mamba&#x00027;s design effectively compensates for this information loss by integrating Adaptive Spatial Convolution (ASC) to pre-enhance local structural representations. Through bidirectional state space processing, it simultaneously captures global long-range dependencies with linear complexity while preserving voxel integrity for precise segmentation.</p></list-item>
<list-item><p>We have implemented the Bidirectional Adaptive Gated Feature Fusion (BAGF) module to overcome the semantic gaps and feature conflicts arising from highly heterogeneous lesions. Conventional methods employ direct concatenation for skip connections, which frequently introduces noise due to mismatched encoder and decoder representations. BAGF employs an asymmetric attention strategy: spatial attention in the encoder and channel attention in the decoder enable autonomous feature selection and dynamic fusion. This design ensures only lesion-relevant information is transmitted, effectively suppressing redundant and superfluous feature information.</p></list-item>
</list>
<p>Extensive experiments conducted on two benchmark datasets, ATLAS v2.0 and ISLES 2022, demonstrate that LBMNet substantially outperforms existing CNN-based methods, Transformer-based methods, and hybrid architectures across multiple evaluation metrics. LBMNet achieves a Dice score of 67.57% on ATLAS v2.0 and 82.03% on ISLES 2022. Compared with state-of-the-art baselines, the proposed model exhibits superior segmentation performance, particularly in detecting small lesions.</p></sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<sec>
<label>2.1</label>
<title>CNN-based methods</title>
<sec>
<label>2.1.1</label>
<title>U-Net and its variants in general medical segmentation</title>
<p>Convolutional neural networks are considered among the most suitable models for medical image segmentation. U-Net successfully segments medical images using an encoder-decoder design and skip-connection architecture. R2U-Net (<xref ref-type="bibr" rid="B11">11</xref>) enhances feature representation by utilizing recurrent convolution and residual modules, while Attention U-Net (<xref ref-type="bibr" rid="B12">12</xref>) suppresses irrelevant features in skip connections through an attention gating mechanism. The 3D U-Net proposed by &#x000C7;i&#x000E7;ek et al. operates directly on volumetric data, fully leveraging the spatial continuity of 3D scans. However, convolutional neural networks cannot capture the long-range dependencies involved in segmenting morphologically complex lesions. Because lesion sizes vary widely, small lesions are frequently missed and large lesions are only partially segmented, which remains a major cause of segmentation failure.</p></sec>
<sec>
<label>2.1.2</label>
<title>Multi-scale convolutional strategies</title>
<p>To adapt to the scale variations of visual targets, Inception networks (<xref ref-type="bibr" rid="B13">13</xref>), Res2Net (<xref ref-type="bibr" rid="B14">14</xref>), and HRNet (<xref ref-type="bibr" rid="B15">15</xref>) employ parallel or multi-branch convolutions for feature extraction in 2D classification and detection tasks. While these methods demonstrate excellent performance, they exhibit limitations in stroke lesion segmentation, failing to capture fine-grained lesion features due to a lack of voxel consistency. In contrast, our LSC module achieves complete preservation of semantic information through a dual mechanism of global modeling (large-kernel convolutions) and local refinement (small-kernel convolutions).</p></sec>
<sec>
<label>2.1.3</label>
<title>CNN approach for stroke detection</title>
<p>Several studies have also proposed CNN architectures for stroke lesion segmentation. For instance, Dolz et al. (<xref ref-type="bibr" rid="B16">16</xref>) presented a dense multi-path CNN that passes features of different scales in parallel. It is able to capture rich 2D features but cannot make good use of the full 3D spatial context. Another method, ML-Net, is strong in terms of distance-based metrics (HD95) but not stable in the Dice score, especially for small lesions. Unlike FRPNet (<xref ref-type="bibr" rid="B17">17</xref>), which employs a feature refinement pyramid to enhance segmentation accuracy, these methods rely on fixed-scale kernels and are thus limited in dealing with the high heterogeneity of stroke lesion sizes. They still lack the adaptability to cover the wide range of lesion sizes, and accurately segmenting both small lesions and large diffuse lesions remains a challenging task (<xref ref-type="bibr" rid="B18">18</xref>).</p></sec></sec>
<sec>
<label>2.2</label>
<title>Transformer-based methods</title>
<p>Transformers, based on self-attention, provide powerful global context modeling for medical segmentation. TransUNet (<xref ref-type="bibr" rid="B8">8</xref>) combined Transformers with CNNs to achieve better segmentation boundaries. Swin-UNETR (<xref ref-type="bibr" rid="B9">9</xref>) further improved feature encoding by using a hierarchical Swin Transformer, achieving excellent results for multi-organ segmentation, but it requires very large labeled datasets. In stroke imaging, where annotated samples are scarce and lesion boundaries are often ambiguous, Transformers may overfit and fail to segment boundaries (<xref ref-type="bibr" rid="B7">7</xref>) accurately. Their quadratic complexity (O(n<sup>2</sup>)) also limits scalability on high-resolution 3D data. In stroke segmentation, this results in imprecise fine-boundary delineation and instability in small-lesion segmentation. While hybrid CNN-Transformer models partly address these problems, their simple concatenation of skip connections can result in feature redundancy and cross-scale instability.</p></sec>
<sec>
<label>2.3</label>
<title>State space models and the Mamba architecture</title>
<p>Owing to its strong sequence modeling capability, the State Space Model (SSM) has recently received considerable attention. Mamba (<xref ref-type="bibr" rid="B10">10</xref>) models global dependencies with linear complexity (O(n)), thereby eliminating the quadratic bottleneck of Transformers. Recently, researchers have applied Mamba to medical image segmentation. Earlier works such as VMamba (<xref ref-type="bibr" rid="B19">19</xref>) and VM-U-Net (<xref ref-type="bibr" rid="B20">20</xref>) successfully adapted the Mamba block for vision tasks. More recently, UMamba (<xref ref-type="bibr" rid="B21">21</xref>), SegMamba (<xref ref-type="bibr" rid="B22">22</xref>) and HCMUNet (<xref ref-type="bibr" rid="B23">23</xref>) were proposed for medical image analysis. The common strategy in these methods is to serialize the 2D/3D patches of the image into 1D sequences as the input of the Mamba blocks. This &#x0201C;flattening&#x0201D; process, however, may destroy the 3D spatial connectivity of voxels and lose the fine-grained information needed to delineate boundaries.</p>
<p>In this work, we propose our method, the BSC-Mamba decoder, which introduces a local convolutional enhancement prior to the state space modeling and preserves local features as much as possible, accompanied by a bidirectional scanning strategy. While previous applications of Mamba focus on improving efficiency, our method aims at balancing global context and local spatial details to provide more robust performance for various lesion shapes.</p></sec></sec>
<sec id="s3">
<label>3</label>
<title>Methods</title>
<p>We present LBMNet, whose architecture is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. The network architecture includes three main components: the Encoder, the Adaptive Feature Fusion (BAGF) module, and the Decoder. For feature extraction in the encoding process, multiple LSC Block layers are stacked in the encoder. The input medical image has shape (C, D, H, W), where C, D, H, and W are the number of channels, depth, height, and width, respectively. The input image first goes through a STEM convolution layer for initial feature extraction, which keeps the spatial structure while enhancing channel expressiveness for multi-level feature extraction. The image is then fed into an encoder consisting of 4 stages, which are alternately stacked with LSC Blocks. As the network goes deeper, the features are gradually compressed: spatially downsampled while the channels become deeper. Since information is lost when downsampling the feature maps, BAGF modules on the skip connections combine the features generated layer by layer by the encoder with the decoder output features and transfer them to the corresponding lower decoder stage. The decoder adopts a progressive upsampling-based architecture, and BSC-Mamba modules are used to combine local and global information to recover spatial resolution. The final segmentation output is obtained by a 3D convolution layer that maps the fused features.</p>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>LBMNet consists of a multi-scale convolutional encoder (LSC), a locally enhanced bidirectional Mamba decoder (BSC-Mamba), and an adaptive feature fusion module (BAGF).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1759114-g0002.tif">
<alt-text content-type="machine-generated">Diagram illustrating an encoder-decoder architecture for image processing. The encoder, starting with a STEM block, processes input through multiple LSC blocks across four stages, each increasing in complexity. The decoder reconstructs the output using BSC-Mamba blocks and trilinear interpolation, with data flow indicated by arrows. Feature maps and operations like batch normalization, convolution, and activation are part of the process. The layout incorporates various connection types, such as skip connections and up samples, to maintain data integrity.</alt-text>
</graphic>
</fig>
<sec>
<label>3.1</label>
<title>Encoder based on multi-scale dynamic receptive fields</title>
<p>Stroke lesions exhibit a high degree of heterogeneity at different scales. Fine-grained features need to be extracted for accurate boundary segmentation in small lesions (&#x0003C; 10 cm<sup>3</sup>), while the global context needs to be captured using larger receptive fields in relatively larger lesions (&#x0003E;50 cm<sup>3</sup>). However, standard convolutional neural networks employing fixed-size kernels suffer from inherent limitations: they either fail to detect minute lesions due to excessive stride sizes or are unable to fully capture comprehensive information from large lesions. To address this scale-sensitivity issue and capture such scale-dependent heterogeneity, we design the &#x0201C;Large-to-Small Convolution&#x0201D; module in the encoder, as shown in <xref ref-type="fig" rid="F3">Figure 3</xref>, in a similar way to the large kernels in RepLKNet (<xref ref-type="bibr" rid="B24">24</xref>) and ConvNeXt (<xref ref-type="bibr" rid="B25">25</xref>). The most distinctive contribution of the LSC module is that it explicitly introduces a &#x0201C;coarse-to-fine&#x0201D; hierarchical refinement process, whereas previous models solely depend on a single large-kernel branch.</p>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>LSC adopts dual large kernels with 3 &#x000D7; 3 refinement for hierarchical &#x0201C;large-to-small&#x0201D; feature extraction.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1759114-g0003.tif">
<alt-text content-type="machine-generated">Flowchart of a convolutional neural network architecture featuring two branches. Each branch starts with a 3D reshaping process. The first path involves a depthwise convolution followed by two 3D convolutions. The second path contains one 3D convolution and a depthwise convolution followed by another 3D convolution. Both paths merge, leading to batch normalization, another 3D convolution, and final output. Legend indicates &#x0201C;R&#x0201D; for reshape and &#x0201C;C&#x0201D; for convolution.</alt-text>
</graphic>
</fig>
<p>LSC initially extracts extensive multi-scale contextual information using parallel large-kernel branches, followed by immediate feature refinement through a dedicated small-kernel path. Specifically, the input feature map <italic>x</italic> is first processed through two parallel large-kernel depthwise separable convolutions (with kernel sizes 5<sup>3</sup> and 7<sup>3</sup>):</p>
<disp-formula id="EQ1"><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>5</mml:mn><mml:mo>,</mml:mo><mml:mn>7</mml:mn></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:munder></mml:mstyle><mml:mtext>DWConv</mml:mtext><mml:mn>3</mml:mn><mml:msub><mml:mrow><mml:mtext>D</mml:mtext></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="true">(</mml:mo><mml:msub><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(1)</label></disp-formula>
<p>Subsequently, small-kernel convolutions are introduced for local refinement:</p>
<disp-formula id="EQ2"><mml:math id="M2"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mtext>Con</mml:mtext><mml:msub><mml:mrow><mml:mtext>v</mml:mtext></mml:mrow><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>3</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(2)</label></disp-formula>
<p>Finally, feature stability is ensured through residual paths and batch normalization:</p>
<disp-formula id="EQ3"><mml:math id="M3"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>Y</mml:mi><mml:mo>=</mml:mo><mml:mi>x</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">BN</mml:mtext><mml:mo stretchy="true">(</mml:mo><mml:msub><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(3)</label></disp-formula>
<p>The LSC module combines dual large kernels with small-kernel refinement, specifically addressing lesion-scale heterogeneity. It provides a powerful mechanism for extracting robust, multi-scale features tailored to the specific distribution characteristics of stroke lesions.</p></sec>
<sec>
<label>3.2</label>
<title>Global&#x02013;local state space module</title>
<p>Segmentation of stroke lesions is typically applied on large-scale, irregularly shaped 3D data, thus requiring models with a global receptive field to model long-range dependencies. Transformer models are capable of modeling long-range dependencies, but their self-attention incurs a cost of O(n<sup>2</sup>). For typical 3D medical images, this leads to prohibitive computational cost and memory consumption. Therefore, we use the Mamba architecture to model global context with linear complexity O(n) and greatly reduce the computational overhead of global context modeling. This makes it highly scalable to modern medical imaging. However, training Mamba directly on images is not straightforward. Mamba takes data as one-dimensional sequences, which implies that 3D feature maps need to be &#x0201C;unrolled.&#x0201D; However, this &#x0201C;unrolling&#x0201D; process inevitably degrades spatial information at the voxel level, preventing the standard Mamba architecture from effectively delineating the irregular boundaries characteristic of stroke lesions. To address this, we propose the Bidirectional Spatial Context Mamba (BSC-Mamba) decoder, which incorporates Adaptive Spatial Convolution (ASC) prior to bidirectional global scanning. This enhances local features while efficiently modeling global information. The BSC-Mamba module comprises two main stages, as shown in <xref ref-type="fig" rid="F4">Figure 4</xref>.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>BSC-Mamba module combines bidirectional state space modeling with local space context enhancement.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1759114-g0004.tif">
<alt-text content-type="machine-generated">Diagram of a neural network architecture with labeled sections. The top section shows data flow through processes labeled &#x0201C;ASC,&#x0201D; &#x0201C;Backward Scan,&#x0201D; &#x0201C;Forward Scan,&#x0201D; and &#x0201C;Mamba,&#x0201D; with arrows indicating direction and operations like &#x0201C;Linear&#x0201D; and &#x0201C;De-flatten.&#x0201D; The bottom section details the &#x0201C;ASC&#x0201D; process with components like &#x0201C;DWConv 3x3x3,&#x0201D; &#x0201C;SE,&#x0201D; &#x0201C;Conv3D,&#x0201D; and &#x0201C;SeLU,&#x0201D; leading to &#x0201C;DWConv 7x7x7.&#x0201D; The right section highlights &#x0201C;Mamba&#x0201D; components such as &#x0201C;SSM,&#x0201D; &#x0201C;Conv3D,&#x0201D; and &#x0201C;Linear,&#x0201D; with operations like &#x0201C;Deserialization,&#x0201D; &#x0201C;Concatenation,&#x0201D; and &#x0201C;Sigmoid&#x0201D; shown.</alt-text>
</graphic>
</fig>
<sec>
<label>3.2.1</label>
<title>Adaptive spatial convolution (ASC) for local enhancement</title>
<p>Before sequence flattening, the input feature map <italic>x</italic> is processed by the ASC module. This module is designed to enhance local spatial information that may be lost during the serialization process. Initially, a depthwise separable 3 &#x000D7; 3 &#x000D7; 3 convolution is applied to capture direct neighborhood relationships. Then, the SE attention mechanism adapts the channel feature information based on the local spatial context, effectively &#x0201C;preprocessing&#x0201D; the features before passing them to the global Mamba stage. This approach ensures the encoding and preservation of fine details, such as the edges of minute lesions.</p>
<disp-formula id="EQ4"><mml:math id="M4"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">SE-Attn</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>3</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02297;</mml:mo><mml:msub><mml:mrow><mml:mtext class="textrm" mathvariant="normal">DWConv</mml:mtext></mml:mrow><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>3</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(4)</label></disp-formula>
<p>Subsequently, a gated convolutional unit and a large kernel (7 &#x000D7; 7 &#x000D7; 7) depthwise convolution further expand the local receptive field, thereby creating a robust feature representation with local perceptual capabilities <italic>y</italic>.</p>
<disp-formula id="EQ5"><mml:math id="M5"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mi>x</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>7</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>7</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>7</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02297;</mml:mo><mml:mi>&#x003D5;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(5)</label></disp-formula></sec>
<sec>
<label>3.2.2</label>
<title>Bidirectional state space modeling for global context</title>
<p>After local enhancement by ASC, the 3D feature map <italic>y</italic> is flattened into a one-dimensional sequence <italic>y</italic><sub><italic>seq</italic></sub>. Standard unidirectional Mamba scans are limited in that they can only capture context from preceding elements. This approach is inadequate for spatial data, where the spatial context is isotropic. The features of a lesion at a given voxel depend on its global context, which must be considered to capture the full spatial relationship. Therefore, we adopted a bidirectional scanning mechanism to process the sequence in both forward and backward directions:</p>
<disp-formula id="EQ6"><mml:math id="M6"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">fwd</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">Mamba</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>q</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(6)</label></disp-formula>
<disp-formula id="EQ7"><mml:math id="M7"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">bwd</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">reverse</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Mamba</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext class="textrm" mathvariant="normal">reverse</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>q</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(7)</label></disp-formula>
<p>Forward scanning captures dependencies from the &#x0201C;beginning&#x0201D; to the &#x0201C;end&#x0201D; of the flattened sequence, while backward scanning tracks dependencies from the &#x0201C;end&#x0201D; to the &#x0201C;beginning.&#x0201D; By concatenating <italic>y</italic><sub>fwd</sub> and <italic>y</italic><sub>bwd</sub>, we ensure that the feature representation at each position is informed by the complete global context from both directions along the scan axis. This bidirectional approach is essential for accurately modeling complex structures, such as stroke lesions.</p>
<p>Finally, the concatenated features are projected back to the original channel dimension, reshaped into 3D volumetric data, and subsequently linked through residual connections to preserve the integrity of the information.</p>
<disp-formula id="EQ8"><mml:math id="M8"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">output</mml:mtext><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">reshape</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">fwd</mml:mtext></mml:mrow></mml:msub><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">bwd</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>x</mml:mi></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(8)</label></disp-formula></sec></sec>
<sec>
<label>3.3</label>
<title>Bidirectional feature fusion</title>
<p>As demonstrated by the U-Net, these skip connections help reintroduce high-resolution information into the decoding path. However, a significant &#x0201C;semantic gap&#x0201D; persists between encoder and decoder features. While encoder features from deeper layers provide highly semantic information, their weak spatial resolution makes it challenging to precisely localize minute lesions and delineate their boundaries accurately. Conversely, the decoder&#x00027;s highly spatially resolved information is semantically weaker and more susceptible to noise.</p>
<p>This leads to poor segmentation results and may even cause over-segmentation. If the two types of feature information are directly concatenated, conflicts and redundant information will arise, resulting in segmentation errors.</p>
<p>Therefore, we propose the adaptive fusion (BAGF) module shown in <xref ref-type="fig" rid="F5">Figure 5</xref>, which efficiently utilizes effective information from both the encoder and decoder. Encoder features contain multi-scale contextual information but lack spatial information; decoder features contain spatial information but may contain noise. To address this, we apply a spatial attention mechanism in the encoder path to suppress irrelevant contextual regions while enhancing effective spatial information. Simultaneously, a channel attention mechanism is employed in the decoder path to highlight lesion-relevant feature channels while suppressing noisy ones. Finally, these two optimized feature streams are selectively fused to achieve the desired integration.</p>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>The BAGF module applies channel and spatial attention to encoder&#x02013;decoder features and adaptively fuses them.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1759114-g0005.tif">
<alt-text content-type="machine-generated">Diagram showing a neural network architecture with two parts. On the left, sequences D and E undergo 3D convolutions followed by depthwise convolutions, merging into Integration. On the right, D and E influence channel and spatial attention modules, followed by a softmax function and spatial attention, with results integrated through a 3D convolution.</alt-text>
</graphic>
</fig>
<p>First, the input encoder features (<italic>E</italic>) and decoder features (<italic>D</italic>) are aligned along a common channel dimension and subsequently augmented. The encoder features undergo further refinement through a lightweight residual block, enhancing local feature representations. These refined features are denoted as <italic>E</italic>&#x02032; and <italic>D</italic>&#x02032;, respectively. Next, an asymmetric attention mechanism is applied. The spatially attention-enhanced encoder features <italic>F</italic><sub>enc</sub> and the channel attention-enhanced decoder features <italic>F</italic><sub>dec</sub> are generated as follows:</p>
<disp-formula id="EQ9"><mml:math id="M9"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">enc</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>A</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">dec</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>A</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(9)</label></disp-formula>
<p>Here, <italic>SA</italic> denotes spatial attention and <italic>CA</italic> denotes channel attention. Then, these two enhanced feature maps are fused through a gating mechanism. This mechanism adaptively calculates weight coefficients &#x003B1; and &#x003B2; to balance the feature information from both paths.</p>
<disp-formula id="EQ10"><mml:math id="M10"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>&#x003B1;</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x003B2;</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">Softmax</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Gate</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">enc</mml:mtext></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">dec</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(10)</label></disp-formula>
<disp-formula id="EQ11"><mml:math id="M11"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">fused</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mo>&#x000B7;</mml:mo><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">enc</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B2;</mml:mi><mml:mo>&#x000B7;</mml:mo><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">dec</mml:mtext></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(11)</label></disp-formula>
<p>Finally, the fused feature map passes through the final convolutional layer to generate the output <italic>O</italic>, which is then fed into the next decoder stage.</p>
<disp-formula id="EQ12"><mml:math id="M12"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>O</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">fused</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(12)</label></disp-formula>
<p>The BAGF module significantly enhances the model&#x00027;s ability to segment complex lesions by effectively integrating multi-level features. This module improves segmentation accuracy by effectively emphasizing lesion regions while suppressing irrelevant information.</p></sec>
<sec>
<label>3.4</label>
<title>Loss function</title>
<p>In medical image segmentation, the significant foreground-background class imbalance is addressed by a hybrid loss function that combines Dice loss and weighted cross-entropy (CE) loss. This hybrid approach is widely used in current segmentation tasks.</p>
<disp-formula id="EQ13"><mml:math id="M13"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>y</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x00177;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mtext>&#x003A9;</mml:mtext></mml:mrow></mml:msub><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo class="qopname">log</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x00177;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B2;</mml:mi><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mstyle displaystyle="true"><mml:munder 
class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mtext>&#x003A9;</mml:mtext></mml:mrow></mml:munder></mml:mstyle><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>&#x00177;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003F5;</mml:mi></mml:mrow><mml:mrow><mml:mstyle displaystyle="true"><mml:msub><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mtext>&#x003A9;</mml:mtext></mml:mrow></mml:msub></mml:mstyle><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mstyle displaystyle="true"><mml:msub><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mtext>&#x003A9;</mml:mtext></mml:mrow></mml:msub></mml:mstyle><mml:msub><mml:mrow><mml:mi>&#x00177;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003F5;</mml:mi></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(13)</label></disp-formula>
<p>Here, &#x00177; represents the predicted probability distribution, and <italic>y</italic> denotes the true label. &#x003B1; and &#x003B2; are the weighting coefficients for cross-entropy and Dice loss, respectively. &#x003F5; is the smoothing factor for numerical stability, &#x003A9; is the pixel set, and <italic>C</italic> is the total number of categories.</p></sec>
<sec>
<label>3.5</label>
<title>Evaluation metrics</title>
<p>To quantitatively evaluate the segmentation performance of LBMNet and compare it with other methods, we employed several standard metrics widely used in medical image segmentation, including the Dice coefficient, Intersection over Union (IoU), Recall, F2 score, and the 95th percentile Hausdorff Distance (HD95). All metrics were computed based on true positives (TP), false positives (FP), and false negatives (FN). The Dice coefficient measures the overlap between the predicted and ground truth regions, the IoU (or Jaccard index) evaluates the ratio of intersection to union, Recall quantifies the proportion of correctly identified positive voxels, and the F2 score emphasizes Recall, while HD95 measures the 95th percentile bidirectional Hausdorff distance between the predicted contour (<italic>P</italic>) and the ground truth (<italic>G</italic>). Their definitions are as follows:</p>
<disp-formula id="EQ14"><mml:math id="M15"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">Dice</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">FP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">FN</mml:mtext></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(14)</label></disp-formula>
<disp-formula id="EQ15"><mml:math id="M16"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">IoU</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">FP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">FN</mml:mtext></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(15)</label></disp-formula>
<disp-formula id="EQ16"><mml:math id="M17"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">Recall</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">FN</mml:mtext></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(16)</label></disp-formula>
<disp-formula id="EQ17"><mml:math id="M18"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">F2</mml:mtext><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000D7;</mml:mo><mml:mfrac><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Precision</mml:mtext><mml:mo>&#x000D7;</mml:mo><mml:mtext class="textrm" mathvariant="normal">Recall</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>&#x000D7;</mml:mo><mml:mtext class="textrm" mathvariant="normal">Precision</mml:mtext></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">Recall</mml:mtext></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(17)</label></disp-formula>
<disp-formula id="E18"><mml:math id="M19"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">HD95</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>P</mml:mi><mml:mo>,</mml:mo><mml:mi>G</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo class="qopname">max</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mn>95</mml:mn><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="true">(</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">min</mml:mo></mml:mrow><mml:mrow><mml:mi>g</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mi>&#x02202;</mml:mi><mml:mi>G</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mo>||</mml:mo><mml:mi>p</mml:mi><mml:mo>-</mml:mo><mml:mi>g</mml:mi><mml:msub><mml:mrow><mml:mo>||</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mi>&#x02202;</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mn>95</mml:mn><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo 
class="qopname">min</mml:mo></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mi>&#x02202;</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mo>||</mml:mo><mml:mi>g</mml:mi><mml:mo>-</mml:mo><mml:mi>p</mml:mi><mml:msub><mml:mrow><mml:mo>||</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>g</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mi>&#x02202;</mml:mi><mml:mi>G</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="true">)</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(18)</label></disp-formula></sec></sec>
<sec id="s4">
<label>4</label>
<title>Results and discussion</title>
<sec>
<label>4.1</label>
<title>Datasets</title>
<p>This is shown through experiments on two publicly available stroke lesion segmentation datasets: ATLAS v2.0 and ISLES 2022. ATLAS v2.0 is a dataset of single-modality T1-weighted (T1W) MR images from chronic-phase stroke patients, extending the previous public dataset of the same name (<xref ref-type="bibr" rid="B26">26</xref>). The dataset consists of 1,271 cases, comprising 655 publicly available training cases, 300 public test cases with annotations covered up (hidden annotations), and 316 new test cases that are unpublished. The training dataset released by the organizers was split into training, validation, and test sets with 8:1:1 ratio.</p>
<p>ISLES 2022 is a dataset of ischemic stroke lesions in acute and sub-acute stages using multimodal MRI imaging, including FLAIR, DWI, and ADC images. Although the DWI and ADC images were registered to each other, the FLAIR images were not. Since the DWI and ADC images are most effective at representing the ischemic stroke lesions, we used these two images and discarded the FLAIR image. We used 250 cases from ISLES 2022 dataset (<xref ref-type="bibr" rid="B27">27</xref>). To guarantee a rigorous and fair comparison with state-of-the-art baselines, and to strictly align with the widely accepted evaluation protocol of the nnUNet (<xref ref-type="bibr" rid="B28">28</xref>) framework, we maintained a consistent data partitioning strategy. Consequently, the dataset was randomly split into training, validation, and test sets with a fixed ratio of 8:1:1.</p></sec>
<sec>
<label>4.2</label>
<title>Implementation details</title>
<p>All the experiments in this study are conducted with PyTorch in NVIDIA GeForce RTX 3090. We employ Python 3.10 and CUDA 11.8 to integrate LBMNet into nnUNet (<xref ref-type="bibr" rid="B28">28</xref>). We use PyTorch&#x00027;s Auto Mixed Precision (AMP) to perform mixed precision training (FP16). AMP can decrease the GPU memory consumption while keeping good numerical stability.</p>
<p>We adopt the standardized preprocessing pipeline of the nnUNet framework, which has been validated in numerous medical imaging challenges. The preprocessing pipeline includes the following steps:</p>
<list list-type="bullet">
<list-item><p>Resampling: all volumetric data were resampled to the median voxel spacing of 1.0 &#x000D7; 1.0 &#x000D7; 1.0 mm<sup>3</sup>. The images were processed using cubic spline interpolation, while segmentation masks utilized nearest-neighbor interpolation to maintain the integrity of the labels.</p></list-item>
<list-item><p>Intensity normalization: for each MRI modality, we apply <italic>z</italic>-score normalization based on foreground voxels (non-zero regions).</p></list-item>
<list-item><p>Trim: trim the volume to a non-zero bounding box while preserving a 5-voxel margin.</p></list-item>
</list>
<p>Reasons for adopting nnUNet preprocessing: this protocol ensures alignment with cutting-edge medical image segmentation techniques, enabling equitable comparisons with baseline models.</p>
<p>During training, the initial learning rate was set to 0.001, the weight decay coefficient to 3 &#x000D7; 10<sup>&#x02212;5</sup>, and the optimizer selected was SGD with a momentum parameter of 0.99. Deep supervision was enabled. The model was trained for 300 epochs with a batch size of 2. The loss function employed a hybrid Dice-cross-entropy loss. Mamba utilized a state dimension (<italic>d</italic><sub><italic>state</italic></sub>) of 16 and an expansion factor of 2. To ensure fair comparison, all baseline methods employed the same configuration.</p></sec>
<sec>
<label>4.3</label>
<title>Analysis of experimental results</title>
<sec>
<label>4.3.1</label>
<title>Quantitative and qualitative analysis</title>
<p>In order to assess LBMNet&#x00027;s performance, we trained the model on the ATLAS v2.0 dataset and compared LBMNet&#x00027;s results with conventional CNN-based networks [U-Net (<xref ref-type="bibr" rid="B6">6</xref>), 3DUnet (<xref ref-type="bibr" rid="B29">29</xref>), U-Net&#x0002B;&#x0002B; (<xref ref-type="bibr" rid="B30">30</xref>)], Transformer-based networks (UNETR, TransUNet, SwinUNet), and recently proposed hybrid models [STU-Net (<xref ref-type="bibr" rid="B31">31</xref>), FRPNet, UMamba]. As shown in <xref ref-type="table" rid="T1">Table 1</xref>, LBMNet receives a Dice score of 67.57% as compared to the baseline U-Net (Dice = 48.34%), an improvement of 19.23%. In comparison with the previous state-of-the-art UMamba (Dice = 63.31%), LBMNet improves by 4.26%.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Comparison of different models on the ATLAS v2.0 dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Methods</bold></th>
<th valign="top" align="center"><bold>Dice (%)</bold></th>
<th valign="top" align="center"><bold>IoU (%)</bold></th>
<th valign="top" align="center"><bold>F2 (%)</bold></th>
<th valign="top" align="center"><bold>Recall (%)</bold></th>
<th valign="top" align="center"><bold>HD95 (px)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">3Dunet</td>
<td valign="top" align="center">50.06</td>
<td valign="top" align="center">41.39</td>
<td valign="top" align="center">51.12</td>
<td valign="top" align="center">57.07</td>
<td valign="top" align="center">39.57</td>
</tr>
<tr>
<td valign="top" align="left">U-Net&#x0002B;&#x0002B;</td>
<td valign="top" align="center">45.85</td>
<td valign="top" align="center">34.07</td>
<td valign="top" align="center">50.86</td>
<td valign="top" align="center">42.62</td>
<td valign="top" align="center">40.88</td>
</tr>
<tr>
<td valign="top" align="left">SwinUNet</td>
<td valign="top" align="center">51.10</td>
<td valign="top" align="center">40.00</td>
<td valign="top" align="center">57.23</td>
<td valign="top" align="center">62.47</td>
<td valign="top" align="center">34.52</td>
</tr>
<tr>
<td valign="top" align="left">TransUNet</td>
<td valign="top" align="center">50.01</td>
<td valign="top" align="center">46.14</td>
<td valign="top" align="center">55.47</td>
<td valign="top" align="center">58.29</td>
<td valign="top" align="center">42.86</td>
</tr>
<tr>
<td valign="top" align="left">UNETR</td>
<td valign="top" align="center">56.23</td>
<td valign="top" align="center">44.11</td>
<td valign="top" align="center">57.43</td>
<td valign="top" align="center">58.33</td>
<td valign="top" align="center">45.44</td>
</tr>
<tr>
<td valign="top" align="left">STU-Net</td>
<td valign="top" align="center">59.92</td>
<td valign="top" align="center">44.65</td>
<td valign="top" align="center">62.39</td>
<td valign="top" align="center">64.00</td>
<td valign="top" align="center">38.30</td>
</tr>
<tr>
<td valign="top" align="left">FRPNet</td>
<td valign="top" align="center">60.16</td>
<td valign="top" align="center">47.06</td>
<td valign="top" align="center">64.35</td>
<td valign="top" align="center">67.74</td>
<td valign="top" align="center">36.20</td>
</tr>
<tr>
<td valign="top" align="left">Umamba</td>
<td valign="top" align="center">63.31</td>
<td valign="top" align="center">49.64</td>
<td valign="top" align="center">68.31</td>
<td valign="top" align="center">68.25</td>
<td valign="top" align="center">29.06</td>
</tr>
<tr>
<td valign="top" align="left">LBMNet</td>
<td valign="top" align="center">67.57</td>
<td valign="top" align="center">52.03</td>
<td valign="top" align="center">72.39</td>
<td valign="top" align="center">69.38</td>
<td valign="top" align="center">22.32</td>
</tr></tbody>
</table>
</table-wrap>
<p>This impressive performance boost is due to our proposed architectural design. First, the performance gap with respect to CNN models shows the effectiveness of our LSC encoder. By dynamically modeling features from various receptive fields, the LSC module effectively alleviates the limitation of great variation of lesion-size existing in fixed-kernels CNN models. Second, compared with the Transformer-based models (note they are strong at modeling global context but are likely to ignore fine-grained information), LBMNet shows the effectiveness of our method. Specifically, the cooperation between BSC-Mamba decoder and BAGF skip-connection module is strong. BSC-Mamba decoder is able to model effective global dependencies and local spatial contexts efficiently, while BAGF module can filter and aggregate most informative features from both encoder and decoder adaptively. Thus, the information loss in pure Transformer model is alleviated. The global-connected segmentation result of BSC-Mamba decoder can help to refine the local segmentation results to accurately segment the lesion boundary.</p>
<p>In addition, as can be seen from the four metrics, LBMNet shows the best performance. As shown in <xref ref-type="fig" rid="F6">Figure 6</xref>, the IoU of 52.03% is the largest, which means the segmentation result is closer to the ground truth. What is more interesting is that the F2-score of LBMNet achieves 72.39% and the Recall rate of LBMNet is 69.38%. In a medical application, high recall indicates that the model is unlikely to lead to false negative, which means the model is unlikely to miss the actual lesion area and further misdiagnose the disease. Because the lesion area to segment is very complex, having a globally connected segmentation result helps to minimize the false negatives in local segmentation.</p>
<fig position="float" id="F6">
<label>Figure 6</label>
<caption><p>ATLAS v2.0 dataset visualization qualitative analysis.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1759114-g0006.tif">
<alt-text content-type="machine-generated">Eleven images display the results of a medical image segmentation task using different models. The first image shows a brain scan with a highlighted red region labeled &#x0201C;Case.&#x0201D; The other ten images display segmentation outputs in white on a black background, titled: Ours, Umamba, FRPnet, STU-Net, UNETR, Transunet, SwinUNet, U-Net++, and 3Dunet. The dataset is ATLAS 2.0.</alt-text>
</graphic>
</fig>
<p>In addition, in <xref ref-type="fig" rid="F6">Figure 6</xref>, we give the qualitative results, which are visually consistent with the quantitative results above. Specifically, it is obvious that for the pure CNN-based model U-Net&#x0002B;&#x0002B;, due to the limitation of its receptive field, the global continuity of large lesions cannot be well captured, which leads to undersegmentation. Meanwhile, TransUNet, although better at modeling the global information of lesions, tends to over-smooth the result or even omit the detailed boundary information, which leads to unsatisfactory segmentation. In contrast, LBMNet combines the global connectivity that U-Net&#x0002B;&#x0002B; lacks with the detailed boundary delineation that TransUNet loses, yielding a more accurate final segmentation.</p></sec>
<sec>
<label>4.3.2</label>
<title>Diagnostic analysis of lesions of different sizes</title>
<p>To evaluate the model&#x00027;s adaptability to lesion size, we specifically categorized the ATLAS v2.0 test set into three groups based on lesion volume: small lesions (&#x0003C; 10 cm<sup>3</sup>), medium lesions (10&#x02013;50 cm<sup>3</sup>), and large lesions (&#x0003E;50 cm<sup>3</sup>). Due to the limited sample size of the ISLES 2022 dataset, we did not conduct lesion size stratification experiments on it. The detailed results for ATLAS v2.0 are provided in <xref ref-type="table" rid="T2">Table 2</xref>.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Performance evaluation of the model on the ATLAS v2.0 test set for different lesion sizes.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Size (cm<sup>3</sup>)</bold></th>
<th valign="top" align="center"><bold>Proportion (%)</bold></th>
<th valign="top" align="center"><bold>Dice (%)</bold></th>
<th valign="top" align="center"><bold>IoU (%)</bold></th>
<th valign="top" align="center"><bold>Recall (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">&#x0003C; 10</td>
<td valign="top" align="center">61.8</td>
<td valign="top" align="center">58.47</td>
<td valign="top" align="center">48.31</td>
<td valign="top" align="center">66.52</td>
</tr>
<tr>
<td valign="top" align="left">10&#x02013;50</td>
<td valign="top" align="center">20.6</td>
<td valign="top" align="center">70.06</td>
<td valign="top" align="center">53.94</td>
<td valign="top" align="center">69.74</td>
</tr>
<tr>
<td valign="top" align="left">&#x0003E;50</td>
<td valign="top" align="center">17.6</td>
<td valign="top" align="center">74.18</td>
<td valign="top" align="center">59.51</td>
<td valign="top" align="center">74.02</td>
</tr></tbody>
</table>
</table-wrap>
<p>In the segmentation of small lesions, we obtain satisfactory model performance, and the Dice coefficient and recall are 58.47 and 66.52%, respectively. The above results demonstrate that LBMNet can still discover and retain most of the lesion areas for minute targets. The hierarchical strategy of LSC module &#x0201C;coarse-grained capture and then fine-grained refinement&#x0201D; is very important to avoid losing the local details, which are usually ignored by ordinary convolutional layers. As expected, the evaluation metrics increase gradually with the increase of lesion volume. For the medium-sized lesions, the Dice score achieves 70.06%. And for the large lesions, the Dice score can also reach an excellent 74.18%. The above consistent and robust performance for all the lesion sizes further demonstrates the strength of our model on handling such large scale variation for stroke lesions.</p>
<p>The effectiveness of LBMNet is also verified in the comparison shown in <xref ref-type="fig" rid="F7">Figure 7</xref>. Specifically, for the case of small lesions (&#x0003C; 10 cm<sup>3</sup>), in addition to Umamba, our LBMNet also achieves state-of-the-art performance. It is of great importance that missing small lesions are highly risky, and the effectiveness of our model in maintaining details is remarkable due to its multi-scale feature extraction. For medium-sized lesions, our model still has an advantage over the other models, while some models also get similar performance for this range. For large lesions, our model has its own advantage, which shows the ability of our model to integrate global context with local details and keep the segmentation part intact. To rigorously validate the model&#x00027;s robustness on small lesions (&#x0003C; 10 cm<sup>3</sup>), we conducted a comparative analysis against state-of-the-art methods (<xref ref-type="table" rid="T3">Table 3</xref>). Consequently, we utilized Recall (Sensitivity) and Hausdorff Distance (HD95) as more clinically relevant metrics to experimentally validate the performance on the small lesion subset. As shown in <xref ref-type="table" rid="T3">Table 3</xref>, while baseline models like SwinUNet struggle with a low Recall of 47.55%, LBMNet achieves a superior Recall of 66.52%. This indicates a significantly lower rate of missed diagnoses, which is critical for early stroke screening. Furthermore, our HD95 is reduced to 24.12 px (compared to 29.45 px for UMamba), demonstrating that LBMNet preserves the topological structure of small lesions more accurately than competitors, even when the Dice metric fluctuates.</p>
<fig position="float" id="F7">
<label>Figure 7</label>
<caption><p>The figure compares Dice scores of LBMNet and other models across lesion volumes.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1759114-g0007.tif">
<alt-text content-type="machine-generated">Six line graphs compare LBMNet with different networks (Umamba, FRPnet, UNETR, Transunet, SwinUNet, U-Net++) by plotting Dice scores against lesion volume. Each graph shows LBMNet outperforming its counterpart as lesion volume increases.</alt-text>
</graphic>
</fig>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Performance comparison on the small lesion subset (&#x0003C; 10 cm<sup>3</sup>) of ATLAS v2.0.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Methods</bold></th>
<th valign="top" align="center"><bold>Dice (%)</bold></th>
<th valign="top" align="center"><bold>IoU (%)</bold></th>
<th valign="top" align="center"><bold>F2 (%)</bold></th>
<th valign="top" align="center"><bold>Recall (%)</bold></th>
<th valign="top" align="center"><bold>HD95 (px)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">3D U-Net</td>
<td valign="top" align="center">39.15</td>
<td valign="top" align="center">27.65</td>
<td valign="top" align="center">43.10</td>
<td valign="top" align="center">45.20</td>
<td valign="top" align="center">42.18</td>
</tr>
<tr>
<td valign="top" align="left">U-Net&#x0002B;&#x0002B;</td>
<td valign="top" align="center">41.50</td>
<td valign="top" align="center">30.15</td>
<td valign="top" align="center">45.80</td>
<td valign="top" align="center">48.25</td>
<td valign="top" align="center">39.50</td>
</tr>
<tr>
<td valign="top" align="left">SwinUNet</td>
<td valign="top" align="center">40.82</td>
<td valign="top" align="center">28.56</td>
<td valign="top" align="center">44.20</td>
<td valign="top" align="center">47.55</td>
<td valign="top" align="center">40.15</td>
</tr>
<tr>
<td valign="top" align="left">TransUNet</td>
<td valign="top" align="center">42.15</td>
<td valign="top" align="center">29.85</td>
<td valign="top" align="center">46.50</td>
<td valign="top" align="center">49.12</td>
<td valign="top" align="center">38.92</td>
</tr>
<tr>
<td valign="top" align="left">UNETR</td>
<td valign="top" align="center">38.45</td>
<td valign="top" align="center">26.90</td>
<td valign="top" align="center">42.10</td>
<td valign="top" align="center">45.30</td>
<td valign="top" align="center">43.20</td>
</tr>
<tr>
<td valign="top" align="left">STU-Net</td>
<td valign="top" align="center">47.92</td>
<td valign="top" align="center">35.88</td>
<td valign="top" align="center">52.85</td>
<td valign="top" align="center">54.20</td>
<td valign="top" align="center">33.56</td>
</tr>
<tr>
<td valign="top" align="left">FRPNet</td>
<td valign="top" align="center">49.65</td>
<td valign="top" align="center">37.12</td>
<td valign="top" align="center">54.10</td>
<td valign="top" align="center">56.33</td>
<td valign="top" align="center">31.20</td>
</tr>
<tr>
<td valign="top" align="left">Umamba</td>
<td valign="top" align="center">51.28</td>
<td valign="top" align="center">39.85</td>
<td valign="top" align="center">56.42</td>
<td valign="top" align="center">58.14</td>
<td valign="top" align="center">29.45</td>
</tr>
<tr>
<td valign="top" align="left">LBMNet (Ours)</td>
<td valign="top" align="center">58.47</td>
<td valign="top" align="center">48.31</td>
<td valign="top" align="center">63.85</td>
<td valign="top" align="center">66.52</td>
<td valign="top" align="center">24.12</td>
</tr></tbody>
</table>
</table-wrap>
<p>To further assess the accuracy of volume segmentation, we conducted a Bland&#x02013;Altman analysis to evaluate segmentation errors across lesions of various sizes, as illustrated in <xref ref-type="fig" rid="F8">Figure 8</xref>. The Bland&#x02013;Altman analysis revealed a mean bias of &#x02013;5.22%, indicating a slight systematic underestimation of lesion volume. The 95% confidence interval ranged from &#x02013;68.11 to 57.68%, with larger relative deviations mainly observed in small lesions. This bias was most pronounced in small lesions (<xref ref-type="fig" rid="F8">Figure 8a</xref>), where boundary ambiguity likely contributed to larger relative volume discrepancies. However, even within this challenging subgroup, the majority of data points remained within the confidence interval. As the lesion size increased to medium (<xref ref-type="fig" rid="F8">Figure 8b</xref>) and large (<xref ref-type="fig" rid="F8">Figure 8c</xref>), the variability in segmentation error notably decreased, with data points clustering tightly around the mean bias line. This progressively improved consistency underscores the model&#x00027;s reliability for clinically relevant lesions. Overall, the Bland&#x02013;Altman analysis confirms that LBMNet provides robust and accurate volume assessments across the full range of lesion sizes.</p>
<fig position="float" id="F8">
<label>Figure 8</label>
<caption><p>Bland&#x02013;Altman plots for different lesion volumes: <bold>(a)</bold> small lesions, <bold>(b)</bold> medium lesions, and <bold>(c)</bold> large lesions.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1759114-g0008.tif">
<alt-text content-type="machine-generated">Bland-Altman analysis comparing volume differences for small, medium, and large lesions across three panels. Each panel shows scattered data points with mean bias at -5.22% (blue line) and limits of agreement between 57.68% and -68.11% (red lines).</alt-text>
</graphic>
</fig></sec></sec>
<sec>
<label>4.4</label>
<title>Experimental analysis on the ISLES 2022 dataset</title>
<p>In order to thoroughly evaluate the generalization ability and robustness of LBMNet, we extend the analysis to the ISLES 2022 dataset. Unlike ATLAS v2.0, the ISLES 2022 dataset provides MRI images containing stroke lesions in both acute and sub-acute phases. These stroke lesions show greater signal heterogeneity and have less distinct boundaries, which challenges the model.</p>
<p>As shown in <xref ref-type="table" rid="T4">Table 4</xref>, LBMNet achieves a Dice coefficient of 82.03% and F2 score of 85.50%. Not only does its performance outperform other prominent CNNs (U-Net&#x0002B;&#x0002B; 62.85%), or even Transformer-based networks (UNETR 73.23%), but it also exhibits remarkable consistency across datasets with different characteristics. In addition, the HD95 decreases dramatically to 21.15 pixels, outperforming most competing models. Such an impressive improvement in boundary delineation accuracy further verifies the effectiveness of our proposed BAGF module, which can retain some spatial details via selecting salient features from both encoder and decoder adaptively and accurately segment the lesions with blurry boundaries.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Comparison of different models on the ISLES 2022 dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Methods</bold></th>
<th valign="top" align="center"><bold>Dice (%)</bold></th>
<th valign="top" align="center"><bold>IoU (%)</bold></th>
<th valign="top" align="center"><bold>F2 (%)</bold></th>
<th valign="top" align="center"><bold>Recall (%)</bold></th>
<th valign="top" align="center"><bold>HD95 (px)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">U-Net</td>
<td valign="top" align="center">56.34</td>
<td valign="top" align="center">45.47</td>
<td valign="top" align="center">59.47</td>
<td valign="top" align="center">60.80</td>
<td valign="top" align="center">36.35</td>
</tr>
<tr>
<td valign="top" align="left">3Dunet</td>
<td valign="top" align="center">58.06</td>
<td valign="top" align="center">48.39</td>
<td valign="top" align="center">59.12</td>
<td valign="top" align="center">67.07</td>
<td valign="top" align="center">39.57</td>
</tr>
<tr>
<td valign="top" align="left">U-Net&#x0002B;&#x0002B;</td>
<td valign="top" align="center">62.85</td>
<td valign="top" align="center">50.07</td>
<td valign="top" align="center">67.86</td>
<td valign="top" align="center">59.62</td>
<td valign="top" align="center">35.88</td>
</tr>
<tr>
<td valign="top" align="left">SwinUNet</td>
<td valign="top" align="center">68.10</td>
<td valign="top" align="center">57.00</td>
<td valign="top" align="center">74.23</td>
<td valign="top" align="center">79.47</td>
<td valign="top" align="center">35.52</td>
</tr>
<tr>
<td valign="top" align="left">TransUNet</td>
<td valign="top" align="center">67.01</td>
<td valign="top" align="center">63.14</td>
<td valign="top" align="center">72.47</td>
<td valign="top" align="center">75.29</td>
<td valign="top" align="center">36.86</td>
</tr>
<tr>
<td valign="top" align="left">UNETR</td>
<td valign="top" align="center">73.23</td>
<td valign="top" align="center">61.11</td>
<td valign="top" align="center">74.43</td>
<td valign="top" align="center">75.33</td>
<td valign="top" align="center">33.44</td>
</tr>
<tr>
<td valign="top" align="left">STU-Net</td>
<td valign="top" align="center">76.92</td>
<td valign="top" align="center">61.65</td>
<td valign="top" align="center">79.39</td>
<td valign="top" align="center">81.00</td>
<td valign="top" align="center">27.31</td>
</tr>
<tr>
<td valign="top" align="left">FRPNet</td>
<td valign="top" align="center">77.16</td>
<td valign="top" align="center">64.03</td>
<td valign="top" align="center">79.33</td>
<td valign="top" align="center">79.74</td>
<td valign="top" align="center">29.20</td>
</tr>
<tr>
<td valign="top" align="left">Umamba</td>
<td valign="top" align="center">74.31</td>
<td valign="top" align="center">57.64</td>
<td valign="top" align="center">80.31</td>
<td valign="top" align="center">78.25</td>
<td valign="top" align="center">26.06</td>
</tr>
<tr>
<td valign="top" align="left">LBMNet</td>
<td valign="top" align="center">82.03</td>
<td valign="top" align="center">61.50</td>
<td valign="top" align="center">85.50</td>
<td valign="top" align="center">84.30</td>
<td valign="top" align="center">21.15</td>
</tr></tbody>
</table>
</table-wrap>
<p>As shown in <xref ref-type="fig" rid="F9">Figure 9</xref>, qualitative analysis also reveals the benefits of LBMNet in the case of segmentation of small lesions. It can be seen from the figures that our model delineates the small lesion from the data. Severe under-segmentation by UMamba and SwinUNet, as well as under-segmentation by U-Net&#x0002B;&#x0002B;, is evident in these visualizations. The strong global context provided by the BSC-Mamba decoder (bidirectional state space modeling) allows the model to segment small but highly intricate lesions. Furthermore, the convolution strategy of the LSC encoder makes the model sensitive to small targets. In conclusion, extensive evaluations on the ISLES 2022 dataset confirm that LBMNet is not solely fine-tuned for a particular data distribution, but instead exhibits strong generalization ability and robustness.</p>
<fig position="float" id="F9">
<label>Figure 9</label>
<caption><p>ISLES 2022 dataset visualization qualitative analysis.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1759114-g0009.tif">
<alt-text content-type="machine-generated">Medical imaging comparison showing a brain scan with a highlighted red area labeled &#x0201C;Case&#x0201D; and nine black images with small white regions labeled &#x0201C;Ours,&#x0201D; &#x0201C;Umamba,&#x0201D; &#x0201C;FRPnet,&#x0201D; &#x0201C;STU-Net,&#x0201D; &#x0201C;UNETR,&#x0201D; &#x0201C;Transunet,&#x0201D; &#x0201C;SwinUNet,&#x0201D; &#x0201C;U-Net++,&#x0201D; and &#x0201C;3Dunet.&#x0201D; Dataset is ISLES 2022.</alt-text>
</graphic>
</fig></sec>
<sec>
<label>4.5</label>
<title>Robustness analysis via repeated random splits</title>
<p>While the fixed 8:1:1 split ensures fair comparison with benchmarks, the relatively small size of the test set (especially for ISLES 2022) may raise concerns regarding statistical reliability. To further validate the stability and generalization capability of LBMNet, we conducted an additional robustness analysis using 5 independent runs of repeated random splits. In this experiment, which is independent of the main benchmark comparison, the dataset was randomly shuffled and re-split into 8:1:1 for each run with different random seeds.</p>
<p>As presented in <xref ref-type="table" rid="T5">Table 5</xref>, the average performance across these five random splits remains highly consistent with our reported main results. Specifically, for the ISLES 2022 dataset, LBMNet achieved an average Dice score of 81.75% &#x000B1; 1.45%, which is very close to the 82.03% reported in the fixed split. The low standard deviation (&#x000B1;1.45%) indicates that the model&#x00027;s performance is stable and not dependent on a specific data partition. Similarly, for ATLAS v2.0, the mean Dice score of 67.42% &#x000B1; 1.24% confirms the robustness of our method against data variations.</p>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Robustness analysis results under five repeated random splits.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="center"><bold>Dice (%)</bold></th>
<th valign="top" align="center"><bold>IoU (%)</bold></th>
<th valign="top" align="center"><bold>Recall (%)</bold></th>
<th valign="top" align="center"><bold>HD95 (px)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">ATLAS v2.0</td>
<td valign="top" align="center">67.42 &#x000B1; 1.24</td>
<td valign="top" align="center">51.88 &#x000B1; 1.15</td>
<td valign="top" align="center">69.15 &#x000B1; 2.10</td>
<td valign="top" align="center">22.95 &#x000B1; 3.40</td>
</tr>
<tr>
<td valign="top" align="left">ISLES 2022</td>
<td valign="top" align="center">81.75 &#x000B1; 1.45</td>
<td valign="top" align="center">61.30 &#x000B1; 1.60</td>
<td valign="top" align="center">84.10 &#x000B1; 1.90</td>
<td valign="top" align="center">21.90 &#x000B1; 3.80</td>
</tr></tbody>
</table>
</table-wrap></sec>
<sec>
<label>4.6</label>
<title>Ablation study</title>
<sec>
<label>4.6.1</label>
<title>Components ablation</title>
<p>To further analyze the effectiveness of each proposed component, we conducted a series of ablation experiments on the ATLAS v2.0 dataset. These experiments were performed in independent runs and were primarily used to analyze the relative performance contributions of each component. We use 3D U-Net as a baseline model and successively add our approach, namely multi-scale LSC encoder, BSC-Mamba decoder, and adaptive BAGF skip-connection module, to evaluate the improvement in both quantitative and qualitative results shown in <xref ref-type="table" rid="T6">Table 6</xref> and <xref ref-type="fig" rid="F10">Figure 10</xref>, respectively.</p>
<table-wrap position="float" id="T6">
<label>Table 6</label>
<caption><p>Performance comparison of different components.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>LSC encoder</bold></th>
<th valign="top" align="center"><bold>Mamba decode</bold></th>
<th valign="top" align="center"><bold>BAGF</bold></th>
<th valign="top" align="center"><bold>Dice (%)</bold></th>
<th valign="top" align="center"><bold>IoU (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Model 1</td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">50.06</td>
<td valign="top" align="center">41.39</td>
</tr>
<tr>
<td valign="top" align="left">Model 2</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">60.09</td>
<td valign="top" align="center">47.01</td>
</tr>
<tr>
<td valign="top" align="left">Model 3</td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">63.11</td>
<td valign="top" align="center">51.38</td>
</tr>
<tr>
<td valign="top" align="left">Model 4</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">65.03</td>
<td valign="top" align="center">53.23</td>
</tr>
<tr>
<td valign="top" align="left">Model 5</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">67.57</td>
<td valign="top" align="center">54.97</td>
</tr></tbody>
</table>
</table-wrap>
<fig position="float" id="F10">
<label>Figure 10</label>
<caption><p>Qualitative comparison of various models in the ablation study on the ATLAS v2.0 dataset.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1759114-g0010.tif">
<alt-text content-type="machine-generated">Six brain MRI images labeled as &#x0201C;Case,&#x0201D; &#x0201C;Model5,&#x0201D; &#x0201C;Model4,&#x0201D; &#x0201C;Model3,&#x0201D; &#x0201C;Model2,&#x0201D; and &#x0201C;Model1&#x0201D; are displayed in a two-row grid. Each image has a yellow-highlighted inset showing a zoomed-in portion with red and blue outlines, indicating areas of interest or abnormalities.</alt-text>
</graphic>
</fig>
<p>Our baseline model, the standard 3D U-Net, obtains Dice = 50.06% and IoU = 41.39%. As shown in <xref ref-type="fig" rid="F10">Figure 10</xref>, the model can roughly locate the lesions, but it clearly undersegments them and fails to capture the complicated boundaries of the lesions. These results demonstrate the limitations of traditional convolutional networks in modeling the variety of sizes and irregular shapes of stroke lesions.</p>
<p>To alleviate the limitations of the baseline in multi-scale feature extraction, we replaced the conventional encoder with our proposed LSC encoder (Model 2). The single modification leads to a remarkable improvement in the Dice score (&#x0002B;10.03 percentage points, 60.09%). As shown in <xref ref-type="fig" rid="F10">Figure 10</xref>, the qualitative results also support this improvement, where the model outlines a more complete region of the primary lesion area. These results demonstrate the effectiveness of the LSC module in learning features from various receptive fields, which are crucial for modeling both core and peripheral areas of lesions. However, the model is still unable to detect the lesions in the opposite hemisphere.</p>
<p>To validate the effectiveness of global context modeling, we swapped out the baseline decoder for BSC-Mamba (Model 3). With this setup, we obtained Dice score of 63.11%. From <xref ref-type="fig" rid="F10">Figure 10</xref>, we can see that Model 3 was able to model bilateral lesions, which demonstrated the power of BSC-Mamba to model long-range dependencies and utilize distant spatial information. Meanwhile, the model lost its fine-grained local information and thus led to slight under-segmentation in boundary regions.</p>
<p>After independently validating the effectiveness of the encoder and decoder, we concatenate the encoder and decoder to build Model 4. This new model takes advantage of LSC&#x00027;s ability to extract multi-scale local features and BSC-Mamba&#x00027;s ability to integrate global&#x02013;local information. The Dice coefficient achieves 65.03%. Visualization results show that the lesion region is more completely captured by Model 4, and the contour is sharper and clearer. However, directly fusing features may cause over-segmentation at blurred boundaries, which means that conflicting or redundant features are probably forwarded via skip connections.</p>
<p>To address the suboptimal feature fusion issue, we further developed the BAGF module to obtain the complete LBMNet (Model 5). Specifically, the BAGF module adaptively chooses and fuses salient features from encoder and decoder pathways while suppressing noise and redundant information. With the further improvement of feature fusion, the final model achieves the best segmentation performance in <xref ref-type="table" rid="T6">Table 6</xref> (Model 5: Dice = 67.57%, IoU = 54.97%). As shown in <xref ref-type="fig" rid="F10">Figure 10</xref>, the segmentation masks generated by Model 5 are most accurate. That is, the boundaries of segmentation masks are clear and there is neither over-segmentation nor under-segmentation. The above results also demonstrate that the BAGF module plays a key role in integrating multi-scale and global representations to obtain robust and accurate segmentation results.</p>
<sec>
<label>4.6.2</label>
<title>LSC module ablation</title>
<p>To analyze the influence of our particular design choices in the LSC module, we perform an ablation study in <xref ref-type="table" rid="T7">Table 7</xref>. The baseline model in <xref ref-type="table" rid="T7">Table 7</xref> is different from Model 1 in <xref ref-type="table" rid="T6">Table 6</xref>. Specifically, the baseline is the complete LBMNet model (Model 5), where we replace its LSC module with a typical 3D residual block with only 3<sup>3</sup> convolutions. With this controlled setup, the only part of the network that varies from the baseline is the LSC module, meaning that if the LSC design choices lead to any improvement, this excludes the possibility that other components of the decoder or skip connections might be causing it.</p>
<table-wrap position="float" id="T7">
<label>Table 7</label>
<caption><p>Experimental results of LSC ablation.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>Configuration</bold></th>
<th valign="top" align="center"><bold>Kernel</bold></th>
<th valign="top" align="center"><bold>Dice (%)</bold></th>
<th valign="top" align="center"><bold>IoU (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Baseline</td>
<td valign="top" align="center">Res-block</td>
<td valign="top" align="center">Single 3<sup>3</sup></td>
<td valign="top" align="center">58.21</td>
<td valign="top" align="center">44.39</td>
</tr>
<tr>
<td valign="top" align="left">Model A</td>
<td valign="top" align="center">Kernel only</td>
<td valign="top" align="center">Single 5<sup>3</sup></td>
<td valign="top" align="center">62.15</td>
<td valign="top" align="center">50.90</td>
</tr>
<tr>
<td valign="top" align="left">Model B</td>
<td valign="top" align="center">Kernel only</td>
<td valign="top" align="center">Single 7<sup>3</sup></td>
<td valign="top" align="center">64.93</td>
<td valign="top" align="center">52.78</td>
</tr>
<tr>
<td valign="top" align="left">Model C</td>
<td valign="top" align="center">Kernel only</td>
<td valign="top" align="center">Single 9<sup>3</sup></td>
<td valign="top" align="center">64.52</td>
<td valign="top" align="center">52.41</td>
</tr>
<tr>
<td valign="top" align="left">Model D</td>
<td valign="top" align="center">Dual-branch refinement</td>
<td valign="top" align="center">No 3<sup>3</sup></td>
<td valign="top" align="center">66.17</td>
<td valign="top" align="center">54.39</td>
</tr>
<tr>
<td valign="top" align="left">Ours</td>
<td valign="top" align="center">LSC module</td>
<td valign="top" align="center">5<sup>3</sup>&#x0002B;7<sup>3</sup>&#x0002B;3<sup>3</sup></td>
<td valign="top" align="center">67.57</td>
<td valign="top" align="center">54.97</td>
</tr></tbody>
</table>
</table-wrap>
<p>Replacing the baseline 3<sup>3</sup> convolutional kernel with a single large kernel (Models A and B) resulted in substantial performance improvements, with gains of &#x0002B;3.94% for 5<sup>3</sup> and &#x0002B;6.72% for 7<sup>3</sup>. These findings confirm the importance of large receptive fields in effectively capturing stroke lesions. However, increasing the kernel size from 7<sup>3</sup> to 9<sup>3</sup> (Model C) provided only a marginal gain of &#x0002B;0.19% in Dice score, accompanied by higher computational costs. More importantly, a single large kernel struggles to balance global context with local details. In contrast, the dual-branch design, which combines 5<sup>3</sup> and 7<sup>3</sup> convolutional kernels (Model D), significantly outperformed any single-kernel configuration, achieving a Dice score of 66.17%. This highlights the superiority of explicitly modeling features across multiple scales, rather than relying on a single large convolutional kernel. Our final LSC module design, which incorporates a 3<sup>3</sup> refinement path after the dual-branch output (Ours), achieves the highest Dice score of 67.57%. The &#x0002B;1.4% improvement over Model D strongly supports our core &#x0201C;coarse-to-fine&#x0201D; hypothesis: capturing broad context with large kernels, followed by refinement with small kernels, is the optimal strategy.</p></sec>
<sec>
<label>4.6.3</label>
<title>BSC-Mamba module ablation</title>
<p>To rigorously validate the design of our BSC-Mamba module, we conducted two sets of targeted ablation experiments.</p>
<p>First, our goal is to analyze the independent and synergistic advantages of two parts in BSC-Mamba: the ASC module in local enhancement and the bidirectional state-space model in global modeling. We designed four variants on top of a standard convolutional decoder baseline. The baseline (Baseline) uses LBMNet with a basic 3D convolutional decoder. <xref ref-type="table" rid="T8">Table 8</xref> shows the necessity of these two parts. With only the ASC module (1.14 Dice gain), the network can still greatly improve local feature representations and thus proves the advantage of its design to preserve local information. With only Bi-directional Mamba (2.22 Dice gain), the network demonstrates the necessity of global context modeling. But the performance of Bi-directional Mamba is limited due to the information loss from sequence flattening. The whole BSC-Mamba that applies ASC in local preprocessing and Bi-SSM in global modeling obtains the best performance.</p>
<table-wrap position="float" id="T8">
<label>Table 8</label>
<caption><p>Ablation study results of the BSC-Mamba module on stroke segmentation tasks.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>SSM</bold></th>
<th valign="top" align="center"><bold>ASC</bold></th>
<th valign="top" align="center"><bold>Dice (%)</bold></th>
<th valign="top" align="center"><bold>IoU (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Baseline</td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">64.01</td>
<td valign="top" align="center">52.76</td>
</tr>
<tr>
<td valign="top" align="left">Only-ASC</td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">65.15</td>
<td valign="top" align="center">53.28</td>
</tr>
<tr>
<td valign="top" align="left">Only-SSM</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">66.13</td>
<td valign="top" align="center">53.86</td>
</tr>
<tr>
<td valign="top" align="left">Ours</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">67.57</td>
<td valign="top" align="center">54.97</td>
</tr></tbody>
</table>
</table-wrap>
<p>Next, we evaluated the necessity of the proposed bidirectional scanning strategy by comparing it with alternative scanning methods, as illustrated in <xref ref-type="table" rid="T9">Table 9</xref>. Transitioning from unidirectional to bidirectional scanning (our approach) produced a substantial performance improvement, with the Dice score increasing by 1.9%. This result highlights the critical role of reverse contextual information in accurately segmenting non-causal objects. Further extending the approach to quad-directional scanning yielded negligible additional gains in Dice score while introducing higher computational overhead. These findings strongly support the effectiveness of the bidirectional scanning strategy, which captures nearly all essential global information without the unnecessary computational burden of multi-axis scanning.</p>
<table-wrap position="float" id="T9">
<label>Table 9</label>
<caption><p>Ablation study results of different scanning strategies in the BSC-Mamba module on stroke segmentation tasks.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Strategy</bold></th>
<th valign="top" align="center"><bold>Description</bold></th>
<th valign="top" align="center"><bold>Dice (%)</bold></th>
<th valign="top" align="center"><bold>IoU (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">One-way Mamba</td>
<td valign="top" align="center">Forward (D)</td>
<td valign="top" align="center">65.67</td>
<td valign="top" align="center">53.76</td>
</tr>
<tr>
<td valign="top" align="left">Two-way Mamba</td>
<td valign="top" align="center">Bidirectional (D)</td>
<td valign="top" align="center">67.57</td>
<td valign="top" align="center">54.97</td>
</tr>
<tr>
<td valign="top" align="left">Four-way Mamba</td>
<td valign="top" align="center">Bidirectional (D &#x0002B; H)</td>
<td valign="top" align="center">65.31</td>
<td valign="top" align="center">53.36</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
<sec>
<label>4.6.4</label>
<title>BAGF module ablation</title>
<p>To test the performance of our proposed BAGF module and its components, we conducted an ablation study as follows: we started with a key baseline model in which we replaced the BAGF module with a concatenation operation following the design of the standard U-Net. Then, we successively added each of our proposed components to this baseline: Enc-Res: residual depthwise separable convolutional blocks added to the encoder path to improve feature representation; Enc-SA: spatial attention operation added to the encoder path; and Dec-CA: channel attention operation added to the decoder path. As illustrated in <xref ref-type="fig" rid="F11">Figure 11</xref>, introducing the Enc-Res component alone improves the Dice score by 1.18%, demonstrating that strengthening local feature representations in the encoder path prior to feature fusion is advantageous. Incorporating the Enc-SA component results in an even greater gain, enhancing the Dice score by 2.02%, which confirms the effectiveness of selectively emphasizing spatially relevant regions within the encoder&#x00027;s coarse feature maps. Applying the Dec-CA component independently also produces a notable improvement, raising the IoU by 1.22%. This outcome highlights its capacity to refine decoder features by emphasizing lesion-relevant channels while suppressing noise-induced oversegmentation. The complete BAGF model achieves the best overall performance, with a Dice score of 67.57% and an IoU of 54.97%. These results exceed the additive contributions of individual components, suggesting strong synergistic interactions among them. Collectively, these findings demonstrate that optimizing feature information within the skip-connection framework is essential for achieving accurate stroke lesion segmentation.</p>
<fig position="float" id="F11">
<label>Figure 11</label>
<caption><p>Experimental results of BAGF module ablation.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1759114-g0011.tif">
<alt-text content-type="machine-generated">Line graphs comparing performance metrics. The left graph shows Dice Score percentages, increasing from 65.0% at Baseline to 68.0% at &#x0201C;Ours.&#x0201D; The right graph displays IoU Score percentages, rising from 53.0% at Baseline to 55.5% at &#x0201C;Ours.&#x0201D; Each graph indicates improvements across different models: Enc-Res, Enc-SA, Dec-CA, and &#x0201C;Ours.&#x0201D; </alt-text>
</graphic>
</fig></sec></sec>
<sec>
<label>4.7</label>
<title>Discussion</title>
<p>We show that multi-scale convolutional encoding combined with Mamba-based decoding supported by adaptive feature fusion can substantially improve stroke lesion segmentation performance for small lesions (&#x0003C; 10 cm<sup>3</sup>), which have been highlighted as particularly challenging for recent ATLAS v2.0 benchmarks and systematic reviews (<xref ref-type="bibr" rid="B32">32</xref>&#x02013;<xref ref-type="bibr" rid="B34">34</xref>). These lesions account for 61.8% of the ATLAS v2.0 test set and are often missed by other methods (<xref ref-type="bibr" rid="B35">35</xref>). Our model achieves a Dice score of 58.47%. This performance is attributed to the proposed architecture addressing some of the limitations in scale awareness and local detail preservation in current stroke models (<xref ref-type="bibr" rid="B36">36</xref>, <xref ref-type="bibr" rid="B37">37</xref>) and improving semantic consistency under sequence-based decoders. We design the LSC module as a top-down hierarchical convolution, exploiting large convolution kernels to capture global semantics and then refining local structures with small kernels, thereby avoiding the loss of fine-grained features that classical CNNs suffer after downsampling. The dual-branch large-kernel design achieves a 1.4%&#x02013;5.4% performance gain over a single-kernel approach, showing that multi-scale context is important for small lesions. Compared to Transformers&#x00027; quadratic complexity and risk of overfitting on small-scale medical datasets, Mamba&#x00027;s linear complexity and selective state space mechanism are better suited for 3D medical data. However, directly applying it to 3D sequences disrupts local spatial relationships. Therefore, we incorporated an ASC module into BSC-Mamba to compensate for local structural information and adopted bidirectional scanning to integrate spatial dependencies from all directions. 
Experiments demonstrate that bidirectional scanning improves performance by 1.9% over unidirectional scanning, while further expanding scanning directions yields negligible gains, indicating that essential global relationships have been effectively captured. We achieve Dice scores of 67.57 and 82.03% on ATLAS v2.0 and ISLES 2022, respectively, validating this mechanism&#x00027;s effectiveness in medical imaging.</p>
<p>In order to reduce semantic differences between encoder and decoder features, we propose BAGF. It employs spatial attention to suppress background interference at the encoding stage and channel attention to highlight lesion-related features at the decoding stage, achieving adaptive fusion through a gating mechanism. This strategy achieves a 2.5% improvement over simple concatenation while significantly reducing boundary noise and segmentation errors&#x02014;particularly crucial in stroke imaging where signal similarities often exist between different pathological tissues. Clinically, small lesions, though inconspicuous, hold significant prognostic value and are closely associated with cerebrovascular disease burden and future stroke risk (<xref ref-type="bibr" rid="B11">11</xref>, <xref ref-type="bibr" rid="B38">38</xref>). The model&#x00027;s 66.52% recall rate for small lesions reduces the risk of missed diagnoses, facilitating early intervention. It demonstrates good generalization across different imaging modalities and disease stages. However, greater imaging variability exists in real-world clinical settings, necessitating prospective validation. Regarding the loss function, our experiments demonstrate that the standard hybrid loss (Dice &#x0002B; Weighted Cross-Entropy) is effective for this task. The Weighted Cross-Entropy term effectively addresses the class imbalance of small lesions, which is supported by the high Recall and low HD95 scores achieved on the small lesion subset (&#x0003C; 10 cm<sup>3</sup>). These results indicate that our current optimization strategy is sufficient. However, we acknowledge that boundary delineation remains a challenge, and future work may incorporate boundary-sensitive loss functions to further refine edge precision. 
Because the data are manually annotated for fully supervised learning, scalability is limited; semi-supervised methods may overcome this limitation (<xref ref-type="bibr" rid="B39">39</xref>, <xref ref-type="bibr" rid="B40">40</xref>). The model&#x00027;s concept of &#x0201C;multiscale local modeling &#x0002B; efficient global modeling&#x0201D; also exhibits cross-task transferability, and the experiments indicate its potential for multi-organ and tumor segmentation (<xref ref-type="bibr" rid="B41">41</xref>).</p>
<sec sec-type="conclusions" id="s5">
<label>5</label>
<title>Conclusion</title>
<p>In this work, we introduce LBMNet, a hybrid CNN&#x02013;Mamba architecture developed to improve the segmentation of stroke lesions across a wide range of sizes and morphologies. By combining multi-scale convolutional encoding with efficient global sequence modeling and an adaptive fusion strategy, the model integrates fine structural details with broader contextual information while maintaining linear computational complexity. Specifically, LBMNet achieves a parameter count of 66.42 M and 54.80 G FLOPs, demonstrating a favorable balance between complexity and accuracy that fully meets the practical demands of automated lesion quantification and treatment planning&#x02014;highlighting its strong deployment potential in clinical workstations. Future work will explore lightweight variants to further facilitate widespread adoption on resource-constrained devices.</p>
<p>Across two widely used benchmarks, ATLAS v2.0 and ISLES 2022, LBMNet consistently and significantly outperforms state-of-the-art CNN-based methods, Transformer-based methods, and hybrid architectures. The model achieves Dice scores of 67.57 and 82.03%, respectively, and shows marked improvements in detecting small lesions&#x02014;a critical challenge in stroke neuroimaging. Ablation analyses further support the functional roles of the LSC encoder, the BSC-Mamba decoder, and the BAGF module, each contributing to more effective multi-scale representation, global&#x02013;local feature integration, and encoder&#x02013;decoder alignment. Although the results are promising, several challenges remain. Boundary delineation can still be difficult in cases where lesion margins are poorly defined (<xref ref-type="bibr" rid="B42">42</xref>, <xref ref-type="bibr" rid="B43">43</xref>), and the reliance on fully annotated datasets limits broader clinical deployment. Moreover, while initial findings suggest that LBMNet may generalize beyond stroke imaging, further evaluation on additional neuroimaging tasks is needed.</p>
<p>Future development will focus on incorporating boundary-aware loss functions to enhance edge precision (<xref ref-type="bibr" rid="B44">44</xref>), exploring semi-supervised or weakly supervised learning to reduce annotation requirements, extending the framework to other brain segmentation tasks, and performing prospective clinical validation to assess performance under real-world imaging variability. Collectively, these results suggest that LBMNet provides an effective and efficient algorithm to achieve automated stroke lesion segmentation and demonstrates the merits of hybrid architectures that combine convolutional modeling with state-space mechanisms to advance neuroimaging analysis.</p></sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>ZK: Supervision, Conceptualization, Writing &#x02013; review &#x00026; editing, Data curation, Resources. XY: Validation, Writing &#x02013; review &#x00026; editing, Visualization, Writing &#x02013; original draft, Methodology, Conceptualization, Software. JY: Methodology, Investigation, Writing &#x02013; review &#x00026; editing. DS: Methodology, Funding acquisition, Supervision, Project administration, Resources, Writing &#x02013; original draft. JZ: Resources, Supervision, Validation, Writing &#x02013; review &#x00026; editing. LS: Resources, Validation, Writing &#x02013; review &#x00026; editing, Supervision.</p>
</sec>
<ack><title>Acknowledgments</title><p>We gratefully acknowledge the contributors of the ATLAS v2.0 and ISLES 2022 datasets for making these valuable public resources available to the research community.</p></ack>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Feigin</surname> <given-names>VL</given-names></name> <name><surname>Stark</surname> <given-names>BA</given-names></name> <name><surname>Johnson</surname> <given-names>CO</given-names></name> <name><surname>Roth</surname> <given-names>GA</given-names></name> <name><surname>Bisignano</surname> <given-names>C</given-names></name> <name><surname>Abady</surname> <given-names>GG</given-names></name> <etal/></person-group>. <article-title>Global, regional, and national burden of stroke and its risk factors, 1990&#x02013;2019: a systematic analysis for the global burden of disease study 2019</article-title>. <source>Lancet Neurol</source>. (<year>2021</year>) <volume>20</volume>:<fpage>795</fpage>&#x02013;<lpage>820</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S1474-4422(21)00252-0</pub-id><pub-id pub-id-type="pmid">34487721</pub-id></mixed-citation>
</ref>
<ref id="B2">
<label>2.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Katan</surname> <given-names>M</given-names></name> <name><surname>Luft</surname> <given-names>A</given-names></name></person-group>. <article-title>Global burden of stroke</article-title>. <source>Semin Neurol</source>. (<year>2018</year>) <volume>38</volume>:<fpage>208</fpage>&#x02013;<lpage>11</lpage>. doi: <pub-id pub-id-type="doi">10.1055/s-0038-1649503</pub-id><pub-id pub-id-type="pmid">29791947</pub-id></mixed-citation>
</ref>
<ref id="B3">
<label>3.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Maier</surname> <given-names>O</given-names></name> <name><surname>Menze</surname> <given-names>BH</given-names></name> <name><surname>Von Der Gablentz</surname> <given-names>J</given-names></name> <name><surname>H&#x000E4;ni</surname> <given-names>L</given-names></name> <name><surname>Heinrich</surname> <given-names>MP</given-names></name> <name><surname>Liebrand</surname> <given-names>M</given-names></name> <etal/></person-group>. <article-title>ISLES 2015&#x02014;a public evaluation benchmark for ischemic stroke lesion segmentation from multispectral MRI</article-title>. <source>Med Image Anal</source>. (<year>2017</year>) <volume>35</volume>:<fpage>250</fpage>&#x02013;<lpage>69</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2016.07.009</pub-id></mixed-citation>
</ref>
<ref id="B4">
<label>4.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ghafoorian</surname> <given-names>M</given-names></name> <name><surname>Karssemeijer</surname> <given-names>N</given-names></name> <name><surname>Heskes</surname> <given-names>T</given-names></name> <name><surname>Van Uden</surname> <given-names>IWM</given-names></name> <name><surname>Sanchez</surname> <given-names>CI</given-names></name> <name><surname>Litjens</surname> <given-names>G</given-names></name> <etal/></person-group>. <article-title>Location sensitive deep convolutional neural networks for segmentation of white matter hyperintensities</article-title>. <source>Sci Rep</source>. (<year>2017</year>) <volume>7</volume>:<fpage>5110</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-017-05300-5</pub-id><pub-id pub-id-type="pmid">28698556</pub-id></mixed-citation>
</ref>
<ref id="B5">
<label>5.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Litjens</surname> <given-names>G</given-names></name> <name><surname>Kooi</surname> <given-names>T</given-names></name> <name><surname>Bejnordi</surname> <given-names>BE</given-names></name> <name><surname>Setio</surname> <given-names>AAA</given-names></name> <name><surname>Ciompi</surname> <given-names>F</given-names></name> <name><surname>Ghafoorian</surname> <given-names>M</given-names></name> <etal/></person-group>. <article-title>A survey on deep learning in medical image analysis</article-title>. <source>Med Image Anal</source>. (<year>2017</year>) <volume>42</volume>:<fpage>60</fpage>&#x02013;<lpage>88</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2017.07.005</pub-id><pub-id pub-id-type="pmid">28778026</pub-id></mixed-citation>
</ref>
<ref id="B6">
<label>6.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ronneberger</surname> <given-names>O</given-names></name> <name><surname>Fischer</surname> <given-names>P</given-names></name> <name><surname>Brox</surname> <given-names>T</given-names></name></person-group>. <article-title>U-Net: convolutional networks for biomedical image segmentation</article-title>. In:<person-group person-group-type="editor"><name><surname>Navab</surname> <given-names>N</given-names></name> <name><surname>Hornegger</surname> <given-names>J</given-names></name> <name><surname>Wells</surname> <given-names>WM</given-names></name> <name><surname>Frangi</surname> <given-names>AF</given-names></name></person-group>, editors. <source>Medical Image Computing and Computer-Assisted Intervention-MICCAI 2015</source>. vol. 9351. Cham: Springer International Publishing (<year>2015</year>). p. <fpage>234</fpage>&#x02013;<lpage>41</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-3-319-24574-4_28</pub-id></mixed-citation>
</ref>
<ref id="B7">
<label>7.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shamshad</surname> <given-names>F</given-names></name> <name><surname>Khan</surname> <given-names>S</given-names></name> <name><surname>Zamir</surname> <given-names>SW</given-names></name> <name><surname>Khan</surname> <given-names>MH</given-names></name> <name><surname>Hayat</surname> <given-names>M</given-names></name> <name><surname>Khan</surname> <given-names>FS</given-names></name> <etal/></person-group>. <article-title>Transformers in medical imaging: a survey</article-title>. <source>Med Image Anal</source>. (<year>2023</year>) <volume>88</volume>:<fpage>102802</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2023.102802</pub-id><pub-id pub-id-type="pmid">37315483</pub-id></mixed-citation>
</ref>
<ref id="B8">
<label>8.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>J</given-names></name> <name><surname>Lu</surname> <given-names>Y</given-names></name> <name><surname>Yu</surname> <given-names>Q</given-names></name> <name><surname>Luo</surname> <given-names>X</given-names></name> <name><surname>Adeli</surname> <given-names>E</given-names></name> <name><surname>Wang</surname> <given-names>Y</given-names></name> <etal/></person-group>. <article-title>TransUNet: transformers make strong encoders for medical image segmentation</article-title>. <source>arXiv</source> [Preprint]. (<year>2021</year>) arXiv:2102.04306. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2102.04306</pub-id></mixed-citation>
</ref>
<ref id="B9">
<label>9.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Hatamizadeh</surname> <given-names>A</given-names></name> <name><surname>Nath</surname> <given-names>V</given-names></name> <name><surname>Tang</surname> <given-names>Y</given-names></name> <name><surname>Yang</surname> <given-names>D</given-names></name> <name><surname>Roth</surname> <given-names>HR</given-names></name> <name><surname>Xu</surname> <given-names>D</given-names></name></person-group>. <article-title>Swin UNETR: swin transformers for semantic segmentation of brain tumors in MRI images</article-title>. In:<person-group person-group-type="editor"><name><surname>Crimi</surname> <given-names>A</given-names></name> <name><surname>Bakas</surname> <given-names>S</given-names></name></person-group>, editors. <source>Brainlesion: Glioma, Multiple Sclerosis, Stroke and Traumatic Brain Injuries</source>. vol. 12962. Cham: Springer International Publishing (<year>2022</year>). p. <fpage>272</fpage>&#x02013;<lpage>84</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-3-031-08999-2_22</pub-id></mixed-citation>
</ref>
<ref id="B10">
<label>10.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gu</surname> <given-names>A</given-names></name> <name><surname>Dao</surname> <given-names>T</given-names></name></person-group>. <article-title>Mamba: linear-time sequence modeling with selective state spaces</article-title>. <source>arXiv</source> [Preprint]. (<year>2023</year>) arXiv:2312.00752. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2312.00752</pub-id></mixed-citation>
</ref>
<ref id="B11">
<label>11.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Alom</surname> <given-names>MZ</given-names></name> <name><surname>Hasan</surname> <given-names>M</given-names></name> <name><surname>Yakopcic</surname> <given-names>C</given-names></name> <name><surname>Taha</surname> <given-names>TM</given-names></name> <name><surname>Asari</surname> <given-names>VK</given-names></name></person-group>. <article-title>Recurrent residual convolutional neural network based on U-Net (R2U-Net) for medical image segmentation</article-title>. <source>arXiv</source> [Preprint]. (<year>2018</year>) arXiv:1802.06955. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1802.06955</pub-id></mixed-citation>
</ref>
<ref id="B12">
<label>12.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Oktay</surname> <given-names>O</given-names></name> <name><surname>Schlemper</surname> <given-names>J</given-names></name> <name><surname>Folgoc</surname> <given-names>LL</given-names></name> <name><surname>Lee</surname> <given-names>M</given-names></name> <name><surname>Heinrich</surname> <given-names>M</given-names></name> <name><surname>Misawa</surname> <given-names>K</given-names></name> <etal/></person-group>. <article-title>Attention U-Net: learning where to look for the pancreas</article-title>. <source>arXiv</source> [Preprint]. (<year>2018</year>) arXiv:1804.03999. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1804.03999</pub-id></mixed-citation>
</ref>
<ref id="B13">
<label>13.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Szegedy</surname> <given-names>C</given-names></name> <name><surname>Liu</surname> <given-names>W</given-names></name> <name><surname>Jia</surname> <given-names>Y</given-names></name> <name><surname>Sermanet</surname> <given-names>P</given-names></name> <name><surname>Reed</surname> <given-names>S</given-names></name> <name><surname>Anguelov</surname> <given-names>D</given-names></name> <etal/></person-group>. <article-title>Going deeper with convolutions</article-title>. In: <source>2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</source>. <publisher-loc>Boston, MA</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2015</year>). p. <fpage>1</fpage>&#x02013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR.2015.7298594</pub-id></mixed-citation>
</ref>
<ref id="B14">
<label>14.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gao</surname> <given-names>SH</given-names></name> <name><surname>Cheng</surname> <given-names>MM</given-names></name> <name><surname>Zhao</surname> <given-names>K</given-names></name> <name><surname>Zhang</surname> <given-names>XY</given-names></name> <name><surname>Yang</surname> <given-names>MH</given-names></name> <name><surname>Torr</surname> <given-names>P</given-names></name></person-group>. <article-title>Res2Net: a new multi-scale backbone architecture</article-title>. <source>IEEE Trans Pattern Anal Mach Intell</source>. (<year>2021</year>) <volume>43</volume>:<fpage>652</fpage>&#x02013;<lpage>62</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2019.2938758</pub-id><pub-id pub-id-type="pmid">31484108</pub-id></mixed-citation>
</ref>
<ref id="B15">
<label>15.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>J</given-names></name> <name><surname>Sun</surname> <given-names>K</given-names></name> <name><surname>Cheng</surname> <given-names>T</given-names></name> <name><surname>Jiang</surname> <given-names>B</given-names></name> <name><surname>Deng</surname> <given-names>C</given-names></name> <name><surname>Zhao</surname> <given-names>Y</given-names></name> <etal/></person-group>. <article-title>Deep high-resolution representation learning for visual recognition</article-title>. <source>IEEE Trans Pattern Anal Mach Intell</source>. (<year>2021</year>) <volume>43</volume>:<fpage>3349</fpage>&#x02013;<lpage>64</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2020.2983686</pub-id><pub-id pub-id-type="pmid">32248092</pub-id></mixed-citation>
</ref>
<ref id="B16">
<label>16.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Dolz</surname> <given-names>J</given-names></name> <name><surname>Ben Ayed</surname> <given-names>I</given-names></name> <name><surname>Desrosiers</surname> <given-names>C</given-names></name></person-group>. <article-title>Dense multi-path U-Net for ischemic stroke lesion segmentation in multiple image modalities</article-title>. In:<person-group person-group-type="editor"><name><surname>Crimi</surname> <given-names>A</given-names></name> <name><surname>Bakas</surname> <given-names>S</given-names></name> <name><surname>Kuijf</surname> <given-names>H</given-names></name> <name><surname>Keyvan</surname> <given-names>F</given-names></name> <name><surname>Reyes</surname> <given-names>M</given-names></name> <name><surname>Van Walsum</surname> <given-names>T</given-names></name></person-group>, editors. <source>Brainlesion: Glioma, Multiple Sclerosis, Stroke and Traumatic Brain Injuries</source>. vol. 11383. Cham: Springer International Publishing (<year>2019</year>). p. <fpage>271</fpage>&#x02013;<lpage>82</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-3-030-11723-8_27</pub-id></mixed-citation>
</ref>
<ref id="B17">
<label>17.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>Z</given-names></name> <name><surname>Zhang</surname> <given-names>X</given-names></name> <name><surname>Li</surname> <given-names>F</given-names></name> <name><surname>Wang</surname> <given-names>S</given-names></name> <name><surname>Li</surname> <given-names>J</given-names></name></person-group>. <article-title>A feature-enhanced network for stroke lesion segmentation from brain MRI images</article-title>. <source>Comput Biol Med</source>. (<year>2024</year>) <volume>174</volume>:<fpage>108326</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.108326</pub-id><pub-id pub-id-type="pmid">38599066</pub-id></mixed-citation>
</ref>
<ref id="B18">
<label>18.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liew</surname> <given-names>SL</given-names></name> <name><surname>Anglin</surname> <given-names>JM</given-names></name> <name><surname>Banks</surname> <given-names>NW</given-names></name> <name><surname>Sondag</surname> <given-names>M</given-names></name> <name><surname>Ito</surname> <given-names>KL</given-names></name> <name><surname>Kim</surname> <given-names>H</given-names></name> <etal/></person-group>. <article-title>A large, open source dataset of stroke anatomical brain images and manual lesion segmentations</article-title>. <source>Sci Data</source>. (<year>2018</year>) <volume>5</volume>:<fpage>180011</fpage>. doi: <pub-id pub-id-type="doi">10.1038/sdata.2018.11</pub-id><pub-id pub-id-type="pmid">29461514</pub-id></mixed-citation>
</ref>
<ref id="B19">
<label>19.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Y</given-names></name> <name><surname>Tian</surname> <given-names>Y</given-names></name> <name><surname>Zhao</surname> <given-names>Y</given-names></name> <name><surname>Yu</surname> <given-names>H</given-names></name> <name><surname>Xie</surname> <given-names>L</given-names></name> <name><surname>Wang</surname> <given-names>Y</given-names></name> <etal/></person-group>. <article-title>VMamba: visual state space model</article-title>. <source>arXiv</source> [Preprint]. (<year>2024</year>) arXiv:2401.10166. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2401.10166</pub-id></mixed-citation>
</ref>
<ref id="B20">
<label>20.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ruan</surname> <given-names>J</given-names></name> <name><surname>Li</surname> <given-names>J</given-names></name> <name><surname>Xiang</surname> <given-names>S</given-names></name></person-group>. <article-title>VM-UNet: vision Mamba UNet for medical image segmentation</article-title>. <source>arXiv</source> [Preprint]. (<year>2024</year>) arXiv:2402.02491. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2402.02491</pub-id></mixed-citation>
</ref>
<ref id="B21">
<label>21.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ma</surname> <given-names>J</given-names></name> <name><surname>Li</surname> <given-names>F</given-names></name> <name><surname>Wang</surname> <given-names>B</given-names></name></person-group>. <article-title>U-Mamba: enhancing long-range dependency for biomedical image segmentation</article-title>. <source>arXiv</source> [Preprint]. (<year>2024</year>) arXiv:2401.04722. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2401.04722</pub-id></mixed-citation>
</ref>
<ref id="B22">
<label>22.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Xing</surname> <given-names>Z</given-names></name> <name><surname>Ye</surname> <given-names>T</given-names></name> <name><surname>Yang</surname> <given-names>Y</given-names></name> <name><surname>Liu</surname> <given-names>G</given-names></name> <name><surname>Zhu</surname> <given-names>L</given-names></name></person-group>. <article-title>SegMamba: long-range sequential modeling Mamba for 3D medical image segmentation</article-title>. In:<person-group person-group-type="editor"><name><surname>Linguraru</surname> <given-names>MG</given-names></name> <name><surname>Dou</surname> <given-names>Q</given-names></name> <name><surname>Feragen</surname> <given-names>A</given-names></name> <name><surname>Giannarou</surname> <given-names>S</given-names></name> <name><surname>Glocker</surname> <given-names>B</given-names></name> <name><surname>Lekadir</surname> <given-names>K</given-names></name> <etal/></person-group>, editors. <source>Medical Image Computing and Computer Assisted Intervention-MICCAI 2024</source>. vol. 15008. Cham: Springer Nature Switzerland (<year>2024</year>). p. <fpage>578</fpage>&#x02013;<lpage>88</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-3-031-72111-3_54</pub-id></mixed-citation>
</ref>
<ref id="B23">
<label>23.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ma</surname> <given-names>X</given-names></name> <name><surname>Du</surname> <given-names>Y</given-names></name> <name><surname>Sui</surname> <given-names>D</given-names></name></person-group>. <article-title>A U-shaped architecture based on hybrid CNN and Mamba for medical image segmentation</article-title>. <source>Appl Sci</source>. (<year>2025</year>) <volume>15</volume>:<fpage>7821</fpage>. doi: <pub-id pub-id-type="doi">10.3390/app15147821</pub-id></mixed-citation>
</ref>
<ref id="B24">
<label>24.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Ding</surname> <given-names>X</given-names></name> <name><surname>Zhang</surname> <given-names>X</given-names></name> <name><surname>Han</surname> <given-names>J</given-names></name> <name><surname>Ding</surname> <given-names>G</given-names></name></person-group>. <article-title>Scaling up your kernels to 31 &#x000D7; 31: revisiting large kernel design in CNNs</article-title>. In: <source>2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source>. <publisher-loc>New Orleans, LA</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2022</year>). p. <fpage>11953</fpage>&#x02013;<lpage>65</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR52688.2022.01166</pub-id></mixed-citation>
</ref>
<ref id="B25">
<label>25.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Z</given-names></name> <name><surname>Mao</surname> <given-names>H</given-names></name> <name><surname>Wu</surname> <given-names>CY</given-names></name> <name><surname>Feichtenhofer</surname> <given-names>C</given-names></name> <name><surname>Darrell</surname> <given-names>T</given-names></name> <name><surname>Xie</surname> <given-names>S</given-names></name></person-group>. <article-title>A ConvNet for the 2020s</article-title>. In: <source>2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source>. <publisher-loc>New Orleans, LA</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2022</year>). p. <fpage>11966</fpage>&#x02013;<lpage>76</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR52688.2022.01167</pub-id></mixed-citation>
</ref>
<ref id="B26">
<label>26.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liew</surname> <given-names>SL</given-names></name> <name><surname>Lo</surname> <given-names>BP</given-names></name> <name><surname>Donnelly</surname> <given-names>MR</given-names></name> <name><surname>Zavaliangos-Petropulu</surname> <given-names>A</given-names></name> <name><surname>Jeong</surname> <given-names>JN</given-names></name> <name><surname>Barisano</surname> <given-names>G</given-names></name> <etal/></person-group>. <article-title>A large, curated, open-source stroke neuroimaging dataset to improve lesion segmentation algorithms</article-title>. <source>Sci Data</source>. (<year>2022</year>) <volume>9</volume>:<fpage>320</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41597-022-01401-7</pub-id><pub-id pub-id-type="pmid">35710678</pub-id></mixed-citation>
</ref>
<ref id="B27">
<label>27.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hernandez Petzsche</surname> <given-names>MR</given-names></name> <name><surname>De La Rosa</surname> <given-names>E</given-names></name> <name><surname>Hanning</surname> <given-names>U</given-names></name> <name><surname>Wiest</surname> <given-names>R</given-names></name> <name><surname>Valenzuela</surname> <given-names>W</given-names></name> <name><surname>Reyes</surname> <given-names>M</given-names></name> <etal/></person-group>. <article-title>ISLES 2022: a multi-center magnetic resonance imaging stroke lesion segmentation dataset</article-title>. <source>Sci Data</source>. (<year>2022</year>) <volume>9</volume>:<fpage>762</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41597-022-01875-5</pub-id><pub-id pub-id-type="pmid">36496501</pub-id></mixed-citation>
</ref>
<ref id="B28">
<label>28.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Isensee</surname> <given-names>F</given-names></name> <name><surname>Jaeger</surname> <given-names>PF</given-names></name> <name><surname>Kohl</surname> <given-names>SAA</given-names></name> <name><surname>Petersen</surname> <given-names>J</given-names></name> <name><surname>Maier-Hein</surname> <given-names>KH</given-names></name></person-group>. <article-title>nnU-Net: a self-configuring method for deep learning-based biomedical image segmentation</article-title>. <source>Nat Methods</source>. (<year>2021</year>) <volume>18</volume>:<fpage>203</fpage>&#x02013;<lpage>11</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41592-020-01008-z</pub-id><pub-id pub-id-type="pmid">33288961</pub-id></mixed-citation>
</ref>
<ref id="B29">
<label>29.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>&#x000C7;i&#x000E7;cek</surname> <given-names>&#x000D6;</given-names></name> <name><surname>Abdulkadir</surname> <given-names>A</given-names></name> <name><surname>Lienkamp</surname> <given-names>SS</given-names></name> <name><surname>Brox</surname> <given-names>T</given-names></name> <name><surname>Ronneberger</surname> <given-names>O</given-names></name></person-group>. <article-title>3D U-Net: learning dense volumetric segmentation from sparse annotation</article-title>. In:<person-group person-group-type="editor"><name><surname>Ourselin</surname> <given-names>S</given-names></name> <name><surname>Joskowicz</surname> <given-names>L</given-names></name> <name><surname>Sabuncu</surname> <given-names>MR</given-names></name> <name><surname>Unal</surname> <given-names>G</given-names></name> <name><surname>Wells</surname> <given-names>W</given-names></name></person-group>, editors. <source>Medical Image Computing and Computer-Assisted Intervention-MICCAI 2016</source>. vol. 9901. Cham: Springer International Publishing (<year>2016</year>). p. <fpage>424</fpage>&#x02013;<lpage>32</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-3-319-46723-8_49</pub-id></mixed-citation>
</ref>
<ref id="B30">
<label>30.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>Z</given-names></name> <name><surname>Siddiquee</surname> <given-names>MMR</given-names></name> <name><surname>Tajbakhsh</surname> <given-names>N</given-names></name> <name><surname>Liang</surname> <given-names>J</given-names></name></person-group>. <article-title>UNet&#x0002B;&#x0002B;: redesigning skip connections to exploit multiscale features in image segmentation</article-title>. <source>IEEE Trans Med Imaging</source>. (<year>2020</year>) <volume>39</volume>:<fpage>1856</fpage>&#x02013;<lpage>67</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TMI.2019.2959609</pub-id><pub-id pub-id-type="pmid">31841402</pub-id></mixed-citation>
</ref>
<ref id="B31">
<label>31.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>Z</given-names></name> <name><surname>Wang</surname> <given-names>H</given-names></name> <name><surname>Deng</surname> <given-names>Z</given-names></name> <name><surname>Ye</surname> <given-names>J</given-names></name> <name><surname>Su</surname> <given-names>Y</given-names></name> <name><surname>Sun</surname> <given-names>H</given-names></name> <etal/></person-group>. <article-title>STU-Net: scalable and transferable medical image segmentation models empowered by large-scale supervised pre-training</article-title>. <source>arXiv</source> [Preprint]. (<year>2023</year>) arXiv:2304.06716. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2304.06716</pub-id></mixed-citation>
</ref>
<ref id="B32">
<label>32.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Deb</surname> <given-names>P</given-names></name> <name><surname>Bharadwaj Baru</surname> <given-names>L</given-names></name> <name><surname>Dadi</surname> <given-names>K</given-names></name> <name><surname>Raju</surname> <given-names>SB</given-names></name></person-group>. <article-title>BeSt-LeS: benchmarking stroke lesion segmentation using deep supervision</article-title>. In:<person-group person-group-type="editor"><name><surname>Baid</surname> <given-names>U</given-names></name> <name><surname>Dorent</surname> <given-names>R</given-names></name> <name><surname>Malec</surname> <given-names>S</given-names></name> <name><surname>Pytlarz</surname> <given-names>M</given-names></name> <name><surname>Su</surname> <given-names>R</given-names></name> <name><surname>Wijethilake</surname> <given-names>N</given-names></name> <etal/></person-group>, editors. <source>Brainlesion: Glioma, Multiple Sclerosis, Stroke and Traumatic Brain Injuries</source>. vol. 14668. Cham: Springer Nature Switzerland (<year>2024</year>). p. <fpage>23</fpage>&#x02013;<lpage>35</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-3-031-76160-7_3</pub-id></mixed-citation>
</ref>
<ref id="B33">
<label>33.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ahmed</surname> <given-names>R</given-names></name> <name><surname>Al Shehhi</surname> <given-names>A</given-names></name> <name><surname>Hassan</surname> <given-names>B</given-names></name> <name><surname>Werghi</surname> <given-names>N</given-names></name> <name><surname>Seghier</surname> <given-names>ML</given-names></name></person-group>. <article-title>An appraisal of the performance of AI tools for chronic stroke lesion segmentation</article-title>. <source>Comput Biol Med</source>. (<year>2023</year>) <volume>164</volume>:<fpage>107302</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.107302</pub-id><pub-id pub-id-type="pmid">37572443</pub-id></mixed-citation>
</ref>
<ref id="B34">
<label>34.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Abbasi</surname> <given-names>H</given-names></name> <name><surname>Orouskhani</surname> <given-names>M</given-names></name> <name><surname>Asgari</surname> <given-names>S</given-names></name> <name><surname>Zadeh</surname> <given-names>SS</given-names></name></person-group>. <article-title>Automatic brain ischemic stroke segmentation with deep learning: a review</article-title>. <source>Neurosci Inform</source>. (<year>2023</year>) <volume>3</volume>:<fpage>100145</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neuri.2023.100145</pub-id></mixed-citation>
</ref>
<ref id="B35">
<label>35.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Luo</surname> <given-names>J</given-names></name> <name><surname>Dai</surname> <given-names>P</given-names></name> <name><surname>He</surname> <given-names>Z</given-names></name> <name><surname>Huang</surname> <given-names>Z</given-names></name> <name><surname>Liao</surname> <given-names>S</given-names></name> <name><surname>Liu</surname> <given-names>K</given-names></name></person-group>. <article-title>Deep learning models for ischemic stroke lesion segmentation in medical images: a survey</article-title>. <source>Comput Biol Med</source>. (<year>2024</year>) <volume>175</volume>:<fpage>108509</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.108509</pub-id><pub-id pub-id-type="pmid">38677171</pub-id></mixed-citation>
</ref>
<ref id="B36">
<label>36.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Tang</surname> <given-names>F</given-names></name> <name><surname>Ding</surname> <given-names>J</given-names></name> <name><surname>Quan</surname> <given-names>Q</given-names></name> <name><surname>Wang</surname> <given-names>L</given-names></name> <name><surname>Ning</surname> <given-names>C</given-names></name> <name><surname>Zhou</surname> <given-names>SK</given-names></name></person-group>. <article-title>CMUNEXT: an efficient medical image segmentation network based on large kernel and skip fusion</article-title>. In: <source>2024 IEEE International Symposium on Biomedical Imaging (ISBI)</source>. <publisher-loc>Athens</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2024</year>). p. <fpage>1</fpage>&#x02013;<lpage>5</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ISBI56570.2024.10635609</pub-id></mixed-citation>
</ref>
<ref id="B37">
<label>37.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sun</surname> <given-names>Y</given-names></name> <name><surname>Dai</surname> <given-names>D</given-names></name> <name><surname>Zhang</surname> <given-names>Q</given-names></name> <name><surname>Wang</surname> <given-names>Y</given-names></name> <name><surname>Xu</surname> <given-names>S</given-names></name> <name><surname>Lian</surname> <given-names>C</given-names></name></person-group>. <article-title>MSCA-Net: multi-scale contextual attention network for skin lesion segmentation</article-title>. <source>Pattern Recognit</source>. (<year>2023</year>) <volume>139</volume>:<fpage>109524</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.patcog.2023.109524</pub-id></mixed-citation>
</ref>
<ref id="B38">
<label>38.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nguyen</surname> <given-names>J</given-names></name> <name><surname>Vo</surname> <given-names>N</given-names></name> <name><surname>Chang</surname> <given-names>PD</given-names></name> <name><surname>Chantaduly</surname> <given-names>C</given-names></name> <name><surname>Yu</surname> <given-names>W</given-names></name> <name><surname>Soun</surname> <given-names>JE</given-names></name></person-group>. <article-title>Evaluation of small vessel disease burden on MRI and stroke outcomes</article-title>. <source>Front Neurol</source>. (<year>2025</year>) <volume>16</volume>:<fpage>1628787</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fneur.2025.1628787</pub-id><pub-id pub-id-type="pmid">40703774</pub-id></mixed-citation>
</ref>
<ref id="B39">
<label>39.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>Y</given-names></name> <name><surname>Xiao</surname> <given-names>B</given-names></name> <name><surname>Bi</surname> <given-names>X</given-names></name> <name><surname>Li</surname> <given-names>W</given-names></name> <name><surname>Gao</surname> <given-names>X</given-names></name></person-group>. <article-title>MCF: mutual correction framework for semi-supervised medical image segmentation</article-title>. In: <source>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source>. <publisher-loc>Vancouver, BC</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2023</year>). p. <fpage>15651</fpage>&#x02013;<lpage>60</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR52729.2023.01502</pub-id></mixed-citation>
</ref>
<ref id="B40">
<label>40.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Han</surname> <given-names>K</given-names></name> <name><surname>Sheng</surname> <given-names>VS</given-names></name> <name><surname>Song</surname> <given-names>Y</given-names></name> <name><surname>Liu</surname> <given-names>Y</given-names></name> <name><surname>Qiu</surname> <given-names>C</given-names></name> <name><surname>Ma</surname> <given-names>S</given-names></name> <etal/></person-group>. <article-title>Deep semi-supervised learning for medical image segmentation: a review</article-title>. <source>Expert Syst Appl</source>. (<year>2024</year>) <volume>245</volume>:<fpage>123052</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.eswa.2023.123052</pub-id></mixed-citation>
</ref>
<ref id="B41">
<label>41.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>S</given-names></name> <name><surname>Wang</surname> <given-names>H</given-names></name> <name><surname>Meng</surname> <given-names>Y</given-names></name> <name><surname>Zhang</surname> <given-names>C</given-names></name> <name><surname>Song</surname> <given-names>Z</given-names></name></person-group>. <article-title>Multi-organ segmentation: a progressive exploration of learning paradigms under scarce annotation</article-title>. <source>Phys Med Biol</source>. (<year>2024</year>) <volume>69</volume>:<fpage>11TR01</fpage>. doi: <pub-id pub-id-type="doi">10.1088/1361-6560/ad33b5</pub-id><pub-id pub-id-type="pmid">38479023</pub-id></mixed-citation>
</ref>
<ref id="B42">
<label>42.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lin</surname> <given-names>Q</given-names></name> <name><surname>Chen</surname> <given-names>X</given-names></name> <name><surname>Chen</surname> <given-names>C</given-names></name> <name><surname>Garibaldi</surname> <given-names>JM</given-names></name></person-group>. <article-title>Boundary-wise loss for medical image segmentation based on fuzzy rough sets</article-title>. <source>Inf Sci</source>. (<year>2024</year>) <volume>661</volume>:<fpage>120183</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ins.2024.120183</pub-id></mixed-citation>
</ref>
<ref id="B43">
<label>43.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Sun</surname> <given-names>F</given-names></name> <name><surname>Luo</surname> <given-names>Z</given-names></name> <name><surname>Li</surname> <given-names>S</given-names></name></person-group>. <article-title>Boundary difference over union loss for medical image segmentation</article-title>. In:<person-group person-group-type="editor"><name><surname>Greenspan</surname> <given-names>H</given-names></name> <name><surname>Madabhushi</surname> <given-names>A</given-names></name> <name><surname>Mousavi</surname> <given-names>P</given-names></name> <name><surname>Salcudean</surname> <given-names>S</given-names></name> <name><surname>Duncan</surname> <given-names>J</given-names></name> <name><surname>Syeda-Mahmood</surname> <given-names>T</given-names></name> <etal/></person-group>, editors. <source>Medical Image Computing and Computer Assisted Intervention-MICCAI 2023</source>. vol. 14223. Cham: Springer Nature Switzerland (<year>2023</year>). p. <fpage>292</fpage>&#x02013;<lpage>301</lpage>.</mixed-citation>
</ref>
<ref id="B44">
<label>44.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>S</given-names></name> <name><surname>Zheng</surname> <given-names>J</given-names></name> <name><surname>Li</surname> <given-names>D</given-names></name></person-group>. <article-title>Precise segmentation of non-enhanced computed tomography in patients with ischemic stroke based on multi-scale U-Net deep network model</article-title>. <source>Comput Methods Programs Biomed</source>. (<year>2021</year>) <volume>208</volume>:<fpage>106278</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cmpb.2021.106278</pub-id><pub-id pub-id-type="pmid">34274610</pub-id></mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2099776/overview">Vinayakumar Ravi</ext-link>, Prince Mohammad bin Fahd University, Saudi Arabia</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3324274/overview">Tongan Cai</ext-link>, The Pennsylvania State University (PSU), United States</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3325296/overview">Sureshkumar Govondarajan</ext-link>, Pondicherry University, Karaikal Campus, India</p>
</fn>
</fn-group>
</back>
</article>