<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurosci.</journal-id>
<journal-title>Frontiers in Neuroscience</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurosci.</abbrev-journal-title>
<issn pub-type="epub">1662-453X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnins.2023.1259677</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Neuroscience</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>TransRender: a transformer-based boundary rendering segmentation network for stroke lesions</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Wu</surname> <given-names>Zelin</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2379336/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Zhang</surname> <given-names>Xueying</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1972575/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Li</surname> <given-names>Fenglian</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/965425/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname> <given-names>Suzhe</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Li</surname> <given-names>Jiaying</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2379369/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>College of Electronic Information and Optical Engineering, Taiyuan University of Technology</institution>, <addr-line>Taiyuan</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>The First Clinical Medical College, Shanxi Medical University</institution>, <addr-line>Taiyuan</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Xiangzhi Bai, Beihang University, China</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Caglar Gurkan, Eskisehir Technical University, T&#x000FC;rkiye; Zhenghua Xu, Hebei University of Technology, China</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Xueying Zhang <email>zhangxy&#x00040;tyut.edu.cn</email></corresp>
<corresp id="c002">Fenglian Li <email>lifenglian&#x00040;tyut.edu.cn</email></corresp></author-notes>
<pub-date pub-type="epub">
<day>12</day>
<month>10</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>17</volume>
<elocation-id>1259677</elocation-id>
<history>
<date date-type="received">
<day>17</day>
<month>07</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>26</day>
<month>09</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2023 Wu, Zhang, Li, Wang and Li.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Wu, Zhang, Li, Wang and Li</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license></permissions>
<abstract>
<p>Vision transformer architectures attract widespread interest due to their robust representation capabilities of global features. Transformer-based methods as the encoder achieve superior performance compared to convolutional neural networks and other popular networks in many segmentation tasks for medical images. Due to the complex structure of the brain and the approximate grayscale of healthy tissue and lesions, lesion segmentation suffers from over-smooth boundaries or inaccurate segmentation. Existing methods, including the transformer, utilize stacked convolutional layers as the decoder to uniformly treat each pixel as a grid, which is convenient for feature computation. However, they often neglect the high-frequency features of the boundary and focus excessively on the region features. We propose an effective method for lesion boundary rendering called TransRender, which adaptively selects a series of important points to compute the boundary features in a point-based rendering way. The transformer-based method is selected to capture global information during the encoding stage. Several renders efficiently map the encoded features of different levels to the original spatial resolution by combining global and local features. Furthermore, the point-based function is employed to supervise the render module generating points, so that TransRender can continuously refine the uncertainty region. We conducted substantial experiments on different stroke lesion segmentation datasets to prove the efficiency of TransRender. Several evaluation metrics illustrate that our method can automatically segment the stroke lesion with relatively high accuracy and low calculation complexity.</p></abstract>
<kwd-group>
<kwd>transformer</kwd>
<kwd>deep learning</kwd>
<kwd>stroke</kwd>
<kwd>segmentation</kwd>
<kwd>boundary</kwd>
</kwd-group>
<counts>
<fig-count count="7"/>
<table-count count="7"/>
<equation-count count="12"/>
<ref-count count="41"/>
<page-count count="13"/>
<word-count count="7593"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Brain Imaging Methods</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1. Introduction</title>
<p>Reliable segmentation is the cornerstone for identifying disease types and making treatment strategies, and it plays an indispensable role in assisted therapy and intelligent healthcare (Tajbakhsh et al., <xref ref-type="bibr" rid="B26">2020</xref>). Deep learning-based methods attract enormous research interest in various segmentation tasks, such as stroke lesion segmentation (<xref ref-type="bibr" rid="B8">GBD 2016 lifetime risk of stroke collaborators</xref>, <xref ref-type="bibr" rid="B8">2018</xref>; Wu Z. et al., <xref ref-type="bibr" rid="B31">2023</xref>), skin lesion segmentation (Yuan et al., <xref ref-type="bibr" rid="B34">2017</xref>; Khattar and Kaur, <xref ref-type="bibr" rid="B15">2022</xref>), and brain tumor segmentation (Pereira et al., <xref ref-type="bibr" rid="B23">2016</xref>; Huang P. et al., <xref ref-type="bibr" rid="B12">2022</xref>). Ischemic stroke is a series of sudden neurological deficits caused by localized cerebral ischemia and permanent infarction, and it has become a major cause of injury and even death (Matsuo et al., <xref ref-type="bibr" rid="B20">2017</xref>). For the detection and treatment of stroke, magnetic resonance imaging (MRI) has become an indispensable method with the advantage of high resolution. Deep learning-based techniques produce rapid and accurate lesion segmentation that assists physicians in making timely medical decisions (Nielsen et al., <xref ref-type="bibr" rid="B22">2018</xref>). In the last decade, convolutional neural networks (CNN) have grown popular for researchers in the image processing field due to their success at extracting feature representations (Wu J. et al., <xref ref-type="bibr" rid="B29">2023</xref>). U-Net (Ronneberger et al., <xref ref-type="bibr" rid="B24">2015</xref>) is a popular encoder-decoder symmetric structure that achieves great success for various 2D segmentation tasks. 
Many of the proposed methods (Milletari et al., <xref ref-type="bibr" rid="B21">2016</xref>; Schlemper et al., <xref ref-type="bibr" rid="B25">2019</xref>; Zhou Y. et al., <xref ref-type="bibr" rid="B39">2021</xref>) are improved based on U-Net, providing spatial information, semantic information, and more. However, CNN-based methods are intractable for establishing long-distance features because of the limitations of their inherent structure.</p>
<p>In the last few years, transformer (Vaswani et al., <xref ref-type="bibr" rid="B27">2017</xref>), which originated in the field of natural language processing (NLP), has shown great potential in a series of visual tasks. The vision transformer (Dosovitskiy et al., <xref ref-type="bibr" rid="B7">2021</xref>; Wang et al., <xref ref-type="bibr" rid="B28">2021</xref>; Chen et al., <xref ref-type="bibr" rid="B3">2022</xref>) is applied directly from NLP to the image classification task and outperforms the CNN-based methods. Transformer and its derived methods demonstrate impressive achievements in a variety of visual tasks. The pure transformer is not appropriate, and the structure of hybrid CNN-transformer methods becomes the model of choice in medical image analysis (He et al., <xref ref-type="bibr" rid="B11">2022</xref>). TransUNet (Chen et al., <xref ref-type="bibr" rid="B4">2021</xref>), the first hybrid architecture in the medical field, extracts the global features of medical images through transformer layers. For organ segmentation, TransUNet realizes excellent results that outperform existing CNN-based methods. In contrast to the cascade structure, TransFuse (Zhang et al., <xref ref-type="bibr" rid="B35">2021a</xref>) utilizes both CNN and transformer in a parallel connection. The above-mentioned methods refine the feature representation of the encoder from different perspectives, while for the decoder they employ the traditional convolutional upsampling method. It&#x00027;s undeniable that the long-range modeling capability of the transformer is very powerful.</p>
<p>Unfortunately, stroke lesion segmentation still faces enormous challenges, as shown in <xref ref-type="fig" rid="F1">Figure 1</xref>, including the difficulty of identifying lesion boundaries. We can see that the locations of the lesions differ due to individual differences among patients and their lifestyle habits. The uncertain location of occurrence and the complex brain structure cause the shape of the lesion to be extremely irregular. Furthermore, the statistical features of focal tissue are not significantly different from those of healthy tissue, making the segmentation of irregular lesion boundaries challenging.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Stroke lesions are distributed in different locations and have extremely irregular sizes and shapes. Furthermore, the similarity of the lesion to the surrounding healthy tissue further increases the difficulty of segmentation.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-17-1259677-g0001.tif"/>
</fig>
<p>Most of the existing methods tend to treat all feature representations of the lesion region uniformly in a regular grid way, ignoring the high-frequency information at the boundaries, which makes the segmentation of the lesion boundary more difficult.</p>
<p>To ameliorate these difficulties, we propose a point-based boundary segmentation method, TransRender, which comprises the transformer as the encoder and the render-based module as the decoder. The transformer-based encoder constructs global features of the input image sequence at several scales. The render-based decoder utilizes a subdivision strategy that adaptively selects an uncertain set of points to recompute the original segmentation. Furthermore, the render module leverages both CNN and transformer features to recover the resolution of the segmentation results, which enriches the local-global features of the deep semantic information. To illustrate the validity of the TransRender, we implement the comparative experiments using different stroke lesion datasets. The experimental results from these datasets suggest that TransRender achieves excellent performance in the lesion segmentation task.</p>
<p>To summarize, our main contributions are as follows:</p>
<list list-type="order">
<list-item><p>We construct a boundary-related network structure for stroke lesion segmentation, called TransRender, by adopting both the multi-scale transformer to build long-distance dependency and render-based decoder to compute the original recovery images.</p></list-item>
<list-item><p>We propose a render-based decoder that is trained to predict uncertain points, allowing the decoder to fine-tune the lesion boundaries.</p></list-item>
<list-item><p>We design multi-level point-to-point supervision to optimize the point selection strategy. The comprehensive experiments are conducted on two MRI-based stroke lesion datasets to confirm the superior performance of the TransRender.</p></list-item>
</list></sec>
<sec id="s2">
<title>2. Related work</title>
<p>We will review the relevant literature from CNN-based methods, hybrid architecture-based methods, and boundary-related methods in this section.</p>
<sec>
<title>2.1. CNN-based methods</title>
<p>In several fields, such as image classification and image segmentation, the CNN methods have gained enormous success (Zhao et al., <xref ref-type="bibr" rid="B37">2021</xref>, <xref ref-type="bibr" rid="B38">2023</xref>; Guo et al., <xref ref-type="bibr" rid="B9">2022</xref>). Traditional segmentation methods generally use convolutional and pooling layers to extract local features and thus perform segmentation (Li et al., <xref ref-type="bibr" rid="B18">2021</xref>). U-Net (Ronneberger et al., <xref ref-type="bibr" rid="B24">2015</xref>) is a popular symmetric structure based on convolution layers. The skip connection serves as a bridge to connect different semantic information, making U-Net suitable for medical image processing tasks. Some studies on stroke attempt to improve the U-Net method to realize accurate lesion segmentation. D-UNet (Zhou Y. et al., <xref ref-type="bibr" rid="B39">2021</xref>) utilizes the dimensional transformation module to extract the spatial information between slices through the combination of 2D detail features and 3D spatial features. The multi-inputs UNet (Zhang et al., <xref ref-type="bibr" rid="B36">2021b</xref>) takes 3D diffeomorphic registration with the original MRI as inputs, providing rich prior knowledge for the subsequent UNet segmentation network. The CNN-based encoder is limited by convolutional operations and still lacks the ability to extract global information. Yang et al. (<xref ref-type="bibr" rid="B32">2019</xref>) proposed a network that adopts DenseUNet as the encoder and uses a long short-term memory module to fuse contextual information on the decoder. The two-stage U-Net (Agnes et al., <xref ref-type="bibr" rid="B1">2022</xref>) proposes a feature combination module to efficiently extract global information. Unfortunately, these methods introduce global features from different perspectives, but do not qualitatively eliminate the limitations of the convolutional inherent receptive fields.</p></sec>
<sec>
<title>2.2. Hybrid architecture-based methods</title>
<p>Transformer has spread from NLP to computer vision since it is excellent at capturing long-distance information and encoding shape representations (Han et al., <xref ref-type="bibr" rid="B10">2022</xref>). The vision transformer (ViT) (Dosovitskiy et al., <xref ref-type="bibr" rid="B7">2021</xref>) is the first structure to be used for image classification tasks and obtains results that exceed the CNN methods. As the interest grows, ViT and its derived methods (Liu et al., <xref ref-type="bibr" rid="B19">2021</xref>) display powerful performance in a series of visual segmentation tasks. Because of the complex structure and tissue intensity similarity of medical images, it is hard for a pure transformer to realize the desired segmentation outcomes. The hybrid architectures of CNN combined with transformer have become the model of choice in the medical field (He et al., <xref ref-type="bibr" rid="B11">2022</xref>). TransUNet (Chen et al., <xref ref-type="bibr" rid="B4">2021</xref>) is the first hybrid structure that is utilized to segment the abdominal organs. TransUNet extracts deep-level features by using stacked convolutional layers and then establishes long-term associations by stacking transformer layers in a cascade way. On the contrary, the BiFusion module (Zhang et al., <xref ref-type="bibr" rid="B35">2021a</xref>) is proposed to integrate the parallel convolutional and transformer branches, and the proposed method achieves excellent performance while being highly efficient. Swin-Unet is proposed by Cao et al. (<xref ref-type="bibr" rid="B2">2023</xref>), combining a Swin transformer with a U-shaped structure. Swin-Unet can capture local semantic features and build long-distance context information. The nnFormer is proposed by Zhou H. Y. et al. (<xref ref-type="bibr" rid="B40">2021</xref>), which optimally combines convolution with a self-attentive mechanism to surpass previous methods on brain tumor segmentation. 
As for the decoder, both of them employ the traditional convolutional upsampling path or transformer layers, which tend to degrade the boundary information due to the uniform computation of the pixels around the edge (Kirillov et al., <xref ref-type="bibr" rid="B16">2020</xref>).</p></sec>
<sec>
<title>2.3. Boundary-related methods</title>
<p>We notice recent works in medical image segmentation that can be related to the proposed method. de Vries et al. (<xref ref-type="bibr" rid="B6">2023</xref>) adopt a general encoder-decoder architecture, while they introduce the multiple cross-attention module to receive the temporal information. Zhu et al. (<xref ref-type="bibr" rid="B41">2023</xref>) proposed a fusion network that extracts edge features from CNN and edge spatial attention blocks, and fuses edge features with semantic features from the transformer. To clarify the structure boundaries, the boundary preserving module (Lee et al., <xref ref-type="bibr" rid="B17">2020</xref>) is proposed to generate a key point map and explore the boundaries of the target object. Kirillov et al. (<xref ref-type="bibr" rid="B16">2020</xref>) proposed a unique idea of considering image segmentation as a rendering issue. The rendering-based approach is effective and qualitative in the instance segmentation and semantic segmentation tasks. In the boundary-rendering network (Huang R. et al., <xref ref-type="bibr" rid="B13">2022</xref>), a point selection module is proposed to concentrate on the area of unclear edge. Moreover, a boundary rendering module is employed to discover the contour information. Some other methods (Chu et al., <xref ref-type="bibr" rid="B5">2020</xref>; Kervadec et al., <xref ref-type="bibr" rid="B14">2021</xref>) design boundary loss functions to mitigate the difficulties of highly unbalanced problems in medical images. However, the existing methods tend to generate over-smooth or inaccurate predictions (Huang R. et al., <xref ref-type="bibr" rid="B13">2022</xref>). We propose an improved render-based decoder and combine it with a transformer-based encoder, which can accurately segment lesions via fine-level details on a grid and global semantic information.</p></sec></sec>
<sec sec-type="methods" id="s3">
<title>3. Methodology</title>
<p>The structure of the TransRender is described in <xref ref-type="fig" rid="F2">Figure 2</xref>. The transformer-based encoder, render-based decoder, and fusion module are the three parts of the proposed network architecture. For each sliced input image, TransRender utilizes a multi-scale transformer as an encoder to establish long-range dependencies between the patch sequences. Then, the render-based decoder recovers the resolution of the segmentation by upsampling strategy with local-global features. Finally, a fusion module is adopted as the postprocessing to integrate the segmentation maps at each level. Furthermore, the proposed method trains renders with several point-based supervisions. We introduce the detailed structure of these three parts in this section.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>The structure of the TransRender, which includes a multi-scale transformer-based encoder, render-based decoder, and a fusion module. Moreover, the orange line, green line, and blue line mean global skip connection, local skip connection, and point-based loss function, respectively.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-17-1259677-g0002.tif"/>
</fig>
<sec>
<title>3.1. The encoder</title>
<p><xref ref-type="fig" rid="F2">Figure 2</xref> gives the structure of the encoder, which mainly consists of several transformer modules and convolutional modules. To encode the hierarchical context information of the input image, we first utilize a hierarchical transformer. With a particular input <italic>X</italic>&#x02208;&#x0211D;<sup><italic>H</italic>&#x000D7;<italic>W</italic>&#x000D7;<italic>C</italic></sup>, we denote its spatial resolution by <italic>H</italic>&#x000D7;<italic>W</italic> and its channel number by <italic>C</italic>, respectively. The MRI image <italic>X</italic> is initially split into a patch sequence <inline-formula><mml:math id="M1"><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>&#x000B7;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msup><mml:mo>|</mml:mo><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>,</mml:mo><mml:mi>N</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> in the linear embedding layer, where the height and width of each patch are <italic>P</italic><sub><italic>H</italic></sub> and <italic>P</italic><sub><italic>W</italic></sub>, and <italic>N</italic> stands for the amount of patches. Then we flatten and reflect these patches to a D-dimensional feature representation via the linear projection:</p>
<disp-formula id="E1"><label>(1)</label><mml:math id="M2"><mml:mrow><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>z</mml:mi></mml:mstyle><mml:mn>0</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy='false'>[</mml:mo><mml:msubsup><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mn>1</mml:mn></mml:msubsup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>E</mml:mi></mml:mstyle><mml:mo>;</mml:mo><mml:msubsup><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mn>2</mml:mn></mml:msubsup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>E</mml:mi></mml:mstyle><mml:mo>;</mml:mo><mml:mn>...</mml:mn><mml:mo>;</mml:mo><mml:msubsup><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mi>N</mml:mi></mml:msubsup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>E</mml:mi></mml:mstyle><mml:mo stretchy='false'>]</mml:mo><mml:mo>,</mml:mo><mml:mtext>s</mml:mtext><mml:mo>.</mml:mo><mml:mtext>t</mml:mtext><mml:mo>.</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>z</mml:mi></mml:mstyle><mml:mn>0</mml:mn></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mi>&#x0211D;</mml:mi><mml:mrow><mml:mi>N</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>D</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>E</mml:mi></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mi>&#x0211D;</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mi>P</mml:mi><mml:mn>2</mml:mn></mml:msup><mml:mo>&#x000B7;</mml:mo><mml:mi>C</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x000D7;</mml:mo><mml:mi>D</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<p>where <italic>z</italic><sub>0</sub> represents the final features, and <italic>E</italic> is the patch embedding projection.</p>
<p>Finally, a positional embedding <inline-formula><mml:math id="M3"><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>N</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>D</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> to be added is significant for the divided patches to integrate positional information. The encoded patch sequence will be fed into the transformer layers. As illustrated in <xref ref-type="fig" rid="F3">Figure 3A</xref>, the cascaded multi-head self-attention (MSA) layer and the multi-layer perception (MLP) layer comprise the transformer, which is computed as:</p>
<disp-formula id="E2"><label>(2)</label><mml:math id="M4"><mml:mrow><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:msup><mml:mi>t</mml:mi><mml:mo>&#x02032;</mml:mo></mml:msup></mml:mstyle><mml:mi>l</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mtext>MSA</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:mi>L</mml:mi><mml:mi>N</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>t</mml:mi></mml:mstyle><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>t</mml:mi></mml:mstyle><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="E3"><label>(3)</label><mml:math id="M5"><mml:mrow><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>t</mml:mi></mml:mstyle><mml:mi>l</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mtext>MLP</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:mi>L</mml:mi><mml:mi>N</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:msup><mml:mi>t</mml:mi><mml:mo>&#x02032;</mml:mo></mml:msup></mml:mstyle><mml:mi>l</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:msup><mml:mi>t</mml:mi><mml:mo>&#x02032;</mml:mo></mml:msup></mml:mstyle><mml:mi>l</mml:mi></mml:msub><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>The structure of three modules: <bold>(A)</bold> Transformer layer composed of MSA, layer normalization, and MLP; <bold>(B)</bold> Conv2D module consist of two stacked of convolutional layers and activation functions; <bold>(C)</bold> Fusion module comprises GAP, FC layer, and <italic>sigmoid</italic> function.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-17-1259677-g0003.tif"/>
</fig>
<p>where <italic>t</italic><sub><italic>l</italic></sub> and <italic>t</italic><sub><italic>l</italic>&#x02212;1</sub> represent the resulting features of the corresponding transformer layers, and <italic>LN</italic>(&#x000B7;) denotes the layer normalization. The MSA is defined as:</p>
<disp-formula id="E4"><label>(4)</label><mml:math id="M6"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mtext class="textrm" mathvariant="normal">MSA</mml:mtext></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mi>Q</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:msqrt><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msqrt></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mtext class="textrm" 
mathvariant="normal">s.t.</mml:mtext><mml:msub><mml:mrow><mml:mi>Q</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>X</mml:mi><mml:msubsup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>X</mml:mi><mml:msubsup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>X</mml:mi><mml:msubsup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>V</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>d</italic> denotes the feature dimension, and <inline-formula><mml:math id="M7"><mml:msub><mml:mrow><mml:mi>Q</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>N</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>q</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula>, <inline-formula><mml:math id="M8"><mml:msub><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>N</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula>, and <inline-formula><mml:math id="M9"><mml:msub><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>N</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>v</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula> are the query, key, and value, respectively. 
The <inline-formula><mml:math id="M10"><mml:msubsup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>,<inline-formula><mml:math id="M11"><mml:msubsup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, and <inline-formula><mml:math id="M12"><mml:msubsup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>V</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> are the weight matrices, and &#x003C3;<sub>1</sub> means the nonlinear function <italic>softmax</italic>. Moreover, patch merging is employed between the two transformer modules, which reduces the spatial resolution of patches and doubles the channel dimension simultaneously.</p>
<p>As mentioned in the previous section, pure transformer architecture is not optimal for the different segmentation tasks. We utilize the convolutional modules additionally to enrich the local representation. In the initial stage of encoding, the undivided input <italic>X</italic>&#x02208;&#x0211D;<sup><italic>H</italic>&#x000D7;<italic>W</italic>&#x000D7;<italic>C</italic></sup> is directly fed into the Conv2D module. The structure of the Conv2D is shown in <xref ref-type="fig" rid="F3">Figure 3B</xref>, which is defined as follows:</p>
<disp-formula id="E5"><label>(5)</label><mml:math id="M13"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:mtext>Conv2D</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x003C3;</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>B</mml:mi><mml:mi>N</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mi>C</mml:mi><mml:mn>3</mml:mn><mml:mn>1</mml:mn></mml:msubsup><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>&#x003C3;</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>B</mml:mi><mml:mi>N</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mi>C</mml:mi><mml:mn>3</mml:mn><mml:mn>1</mml:mn></mml:msubsup><mml:mo stretchy='false'>(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;</mml:mtext><mml:msub><mml:mi>&#x003C3;</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>B</mml:mi><mml:mi>N</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mi>C</mml:mi><mml:mn>3</mml:mn><mml:mn>1</mml:mn></mml:msubsup><mml:mo stretchy='false'>(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:mi>X</mml:mi><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M15"><mml:msubsup><mml:mrow><mml:mi>C</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>&#x000B7;</mml:mo></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> denotes a two-dimensional convolution with the 3 &#x000D7; 3 kernel and the 1 &#x000D7; 1 stride, &#x003C3;<sub>2</sub> means the PReLU activation function, and <italic>BN</italic>(&#x000B7;) represents the batch normalization. TransRender extracts the local features and long-distance dependency of the image at the encoding stage, which will be used by the decoder to perform resolution recovery of the predicted image.</p></sec>
<sec>
<title>3.2. The decoder</title>
<p>Due to the complexity of cerebral structures, the boundaries of stroke lesions are difficult to identify. The traditional CNN methods treat all pixels of the irregular target object uniformly in a convolutional way (Kirillov et al., <xref ref-type="bibr" rid="B16">2020</xref>), either at the lesion boundary or the lesion core. In contrast, the proposed render module first selects the set of uncertain points, extracts the feature representations corresponding to these points, and implements the re-prediction of these uncertain points by using the prediction head. The accurate localization of the lesion boundary is accomplished by further prediction of the selected uncertain points. We take several renders to build a decoder that adaptively predicts points with high uncertainty. The render mainly includes three steps, as shown in <xref ref-type="fig" rid="F4">Figure 4</xref>: point selection strategy, point re-prediction, and point replacement.</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>The structure of render module. With the transformer features from the global skip connection and convolutional features from the local skip connection as input, render adaptively selects key points by point selection strategy and combines point features to make re-prediction.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-17-1259677-g0004.tif"/>
</fig>
<p>First, we introduce the point selection strategy using the last layer of the proposed method as an example. For the given feature map <inline-formula><mml:math id="M16"><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mfrac><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:mo>&#x000D7;</mml:mo><mml:mfrac><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>, the render first upsamples it by a 2 &#x000D7; interpolate function to obtain the initial coarse segmentation <inline-formula><mml:math id="M17"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>H</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>W</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>. The values from [0, 1] on the segmentation <inline-formula><mml:math id="M18"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> represent the possibility of whether the current pixel is a lesion or not. We give the distribution of pixel values and pixel positions in <xref ref-type="fig" rid="F5">Figure 5</xref>. Different colors of points represent different values, where black, orange, and red represent healthy tissue, lesions, and fuzzy boundaries, respectively. 
If the value of the pixel is closer to 0, it is more likely that the current pixel is a background (healthy tissue), and vice versa, if the value is closer to 1, it means a lesion. When the segmentation threshold is set to 0.5, the closer the threshold is, the higher the uncertainty. Although the number of pixel points near the threshold is sparse in <xref ref-type="fig" rid="F5">Figure 5A</xref>, they are essential for the clear localization of the boundary. These values are sorted in descending order for each pixel, which is calculated as follows:</p>
<disp-formula id="E6"><label>(6)</label><mml:math id="M19"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mo>&#x02200;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x02264;</mml:mo><mml:mi>h</mml:mi><mml:mo>&#x02264;</mml:mo><mml:mi>H</mml:mi><mml:mo>,</mml:mo><mml:mo>&#x02200;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x02264;</mml:mo><mml:mi>w</mml:mi><mml:mo>&#x02264;</mml:mo><mml:mi>W</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>The distribution of pixel values and pixel locations: <bold>(A)</bold> The distribution of pixel points in different intervals, with [0.3, 0.7] as the high uncertainty interval; <bold>(B)</bold> The ambiguous pixel points are mainly distributed at the boundary of the lesion core area.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-17-1259677-g0005.tif"/>
</fig>
<p>where <inline-formula><mml:math id="M20"><mml:msubsup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> represents the prediction value at the (<italic>h, w</italic>) location and <italic>n</italic> &#x0003D; <italic>H</italic>&#x000D7;<italic>W</italic> denotes the number of pixels, <italic>M</italic><sub><italic>s</italic></sub> is the feature map derived after sorting, and the point in the <italic>M</italic><sub><italic>s</italic></sub> follows the rule that <inline-formula><mml:math id="M21"><mml:msubsup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> is greater than <inline-formula><mml:math id="M22"><mml:msubsup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:math></inline-formula>. Based on the <italic>M</italic><sub><italic>s</italic></sub>, we construct the uncertain points map <italic>M</italic><sub><italic>u</italic></sub>. It can be obtained as follows:</p>
<disp-formula id="E7"><label>(7)</label><mml:math id="M23"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>u</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>|</mml:mo><mml:msubsup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msubsup><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>h</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>p</italic><sub><italic>thd</italic></sub> represents the threshold value of uncertainty. For the final uncertainty map <italic>M</italic><sub><italic>u</italic></sub>, a smaller value at a pixel means that the segmentation network has more uncertainty in the prediction. That means the smaller the difference between <inline-formula><mml:math id="M24"><mml:msubsup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> and <italic>p</italic><sub><italic>thd</italic></sub>, the larger the uncertainty of the current pixel. To eliminate the strong bias due to the spatial position, the proposed render samples <italic>k</italic>&#x000D7;<italic>N</italic> points across the <italic>M</italic><sub><italic>u</italic></sub> at random. It identifies that the &#x003B2; &#x000D7; <italic>N</italic> points with the highest uncertainty in the set of points are located around the boundary region, as shown in <xref ref-type="fig" rid="F5">Figure 5B</xref>. These points will be important to correct the segmentation at the lesion boundary.</p>
<p>Then, the render module integrates the features based on the selected points, combining the contextual semantic information from the global skip connection and local detail information from the local skip connection. The feature sequences corresponding to the selected points are fed into the mixer layer for point-based re-prediction, where the mixer layer consists of two trainable MLP layers. Finally, the re-predicted points set replaces the high uncertainty points set in the initial segmentation to accomplish the precise localization of the lesion boundary. The structure of the render is shown in <xref ref-type="fig" rid="F4">Figure 4</xref>.</p>
<p>Based on the render module proposed above, we construct the render-based decoder (see in <xref ref-type="fig" rid="F2">Figure 2</xref>), which combines local and global features at multiple scales. Furthermore, we introduce a fusion module at the end of the decoder in <xref ref-type="fig" rid="F3">Figure 3C</xref>, which fuses multiple layers of decoded features. The segmentation of the different levels renders is merged as input <italic>X</italic><sub><italic>r</italic></sub> to perform the following operations:</p>
<disp-formula id="E8"><label>(8)</label><mml:math id="M25"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>A</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>F</mml:mi><mml:mi>C</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>G</mml:mi><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>GAP</italic>(&#x000B7;), <italic>FC</italic>(&#x000B7;), and &#x003C3;<sub>3</sub> denote the global average pooling, the fully connected layer, and the <italic>sigmoid</italic> function, respectively. The fusion module emphasizes segmentation-related information and suppresses irrelevant features in an attentive manner.</p></sec>
<sec>
<title>3.3. Loss function</title>
<p>The multi-scale render decoder adaptively selects the boundary key points, thus improving the segmentation performance. In the training stage, we design a combined loss function from two aspects: segmentation loss and point loss, which is calculated as:</p>
<disp-formula id="E9"><label>(9)</label><mml:math id="M26"><mml:mrow><mml:msub><mml:mi>&#x02112;</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>p</mml:mi><mml:mo>,</mml:mo><mml:mi>g</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x02112;</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>c</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>p</mml:mi><mml:mo>,</mml:mo><mml:mi>g</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:mi>&#x003BB;</mml:mi><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:mrow><mml:msubsup><mml:mi>&#x02112;</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>c</mml:mi><mml:mi>e</mml:mi></mml:mrow><mml:mi>i</mml:mi></mml:msubsup></mml:mrow></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:mi>p</mml:mi><mml:mo>,</mml:mo><mml:mi>g</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<p>where &#x02112;<sub><italic>dice</italic></sub>(<italic>p, g</italic>) indicates segmentation loss and &#x02112;<sub><italic>bce</italic></sub>(<italic>p, g</italic>) is point loss. &#x003BB; represents the weight parameter, and the default setting is &#x003BB; &#x0003D; 0.7. The segmentation loss supervises the network to generate regional details in the whole upsampling recovery process, and point-to-point losses are employed to monitor each render module in the decoder. The weight parameters of the MLP layer in render are dynamically updated when the point selection strategy calculates the point loss between the selected points on ground truth and the points after re-prediction. The two loss functions are calculated as:</p>
<disp-formula id="E10"><label>(10)</label><mml:math id="M27"><mml:mrow><mml:msub><mml:mi>&#x02112;</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>c</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>p</mml:mi><mml:mo>,</mml:mo><mml:mi>g</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x02212;</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mstyle displaystyle='true'><mml:msubsup><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:msubsup><mml:mrow><mml:msub><mml:mi>p</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:msub><mml:mi>g</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:mi>&#x003B4;</mml:mi></mml:mrow></mml:mstyle></mml:mrow><mml:mrow><mml:mstyle displaystyle='true'><mml:msubsup><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:msubsup><mml:mrow><mml:msubsup><mml:mi>p</mml:mi><mml:mi>i</mml:mi><mml:mn>2</mml:mn></mml:msubsup><mml:mo>+</mml:mo><mml:mstyle displaystyle='true'><mml:msubsup><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:msubsup><mml:mrow><mml:msubsup><mml:mi>g</mml:mi><mml:mi>i</mml:mi><mml:mn>2</mml:mn></mml:msubsup><mml:mo>+</mml:mo><mml:mi>&#x003B4;</mml:mi></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="E11"><label>(11)</label><mml:math id="M28"><mml:mrow><mml:msub><mml:mi>&#x02112;</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>c</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>p</mml:mi><mml:mo>,</mml:mo><mml:mi>g</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mo>&#x02212;</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:mi>g</mml:mi><mml:mi>log</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:msup><mml:mi>g</mml:mi><mml:mo>&#x02032;</mml:mo></mml:msup><mml:mi>log</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mi>p</mml:mi><mml:mo>&#x02032;</mml:mo></mml:msup><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<p>where <italic>p</italic> represents the prediction probability and <italic>g</italic> represents the expert annotation. <italic>p</italic>&#x02032; and <italic>g</italic>&#x02032; represent the complementary probabilities of <italic>p</italic> and <italic>g</italic>, respectively.</p></sec></sec>
<sec id="s4">
<title>4. Experiments and configurations</title>
<sec>
<title>4.1. Datasets</title>
<p>Two different stroke lesion segmentation datasets, comprising a total of 490 MRI images, are used to conduct experiments to validate the proposed method. They include brain MRIs of stroke patients in the acute, sub-acute, and post-stroke stages. The details of the two datasets are introduced as follows.</p>
<p>The anatomical tracings of lesions after stroke (ATLAS) is a publicly available dataset that includes 240 MRI images. Each image contains the MRI for the T1-weighted modality and the corresponding lesion annotation. The ischemic stroke lesion segmentation (ISLES2022) is provided for use at the MICCAI 2022 grand challenge, which contains 250 MRI images. In contrast to ATLAS, ISLES2022 contains three different modalities: ADC, DWI, and FLAIR. The original size of ATLAS is 233 &#x000D7; 197 &#x000D7; 189, while the original size of ISLES2022 varies over a wide range. After we slice these 3D MRIs into 2D images, the slices are resized to a uniform resolution of 208 &#x000D7; 176. In <xref ref-type="table" rid="T1">Table 1</xref>, we compare the two datasets in terms of imaging method, data source, modality, number of images, and dataset division.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>The data comparison of ATLAS and ISLES2022 dataset.</p></caption> 
<table frame="box" rules="all">
<thead>
<tr style="background-color:&#x00023;919498;color:&#x00023;ffffff">
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="left"><bold>Imaging method</bold></th>
<th valign="top" align="left"><bold>Data source</bold></th>
<th valign="top" align="left"><bold>Modality</bold></th>
<th valign="top" align="left"><bold>Number of images</bold></th>
<th valign="top" align="left"><bold>Dataset division</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">ATLAS</td>
<td valign="top" align="left">MRI</td>
<td valign="top" align="left">Public</td>
<td valign="top" align="left">T1WI</td>
<td valign="top" align="left">240</td>
<td valign="top" align="left">160/40/40</td>
</tr>
<tr>
<td valign="top" align="left">ISLES2022</td>
<td valign="top" align="left">MRI</td>
<td valign="top" align="left">Public</td>
<td valign="top" align="left">DWI, ADC, FLAIR</td>
<td valign="top" align="left">250</td>
<td valign="top" align="left">168/41/41</td>
</tr>
</tbody>
</table>
</table-wrap></sec>
<sec>
<title>4.2. Configurations</title>
<p>The PyTorch framework and Python are used to carry out the experiments. We adopted AdamW as the optimizer with default parameter settings. The epoch-based early stop strategy is utilized to determine whether the model optimization is complete. Furthermore, the transformer layers are pre-trained on a large-scale image dataset. All experiments are performed on a GeForce RTX 2080 Super with 8 GB memory.</p>
<p>We select common metrics to measure the advantages of TransRender, including DSC, Precision, Recall, and HD to evaluate the similarity between prediction results and lesion labels. We consider the first two metrics, DSC and HD, more significant than the classic F2, Precision, and Recall. The DSC calculates the region similarity, and the HD calculates the boundary similarity between the two inputs.</p></sec>
<sec>
<title>4.3. Experiments</title>
<sec>
<title>4.3.1. Comparison experiment</title>
<p>We compare our TransRender with previous methods: U-Net (Ronneberger et al., <xref ref-type="bibr" rid="B24">2015</xref>), AG U-Net (Schlemper et al., <xref ref-type="bibr" rid="B25">2019</xref>), D-UNet (Zhou Y. et al., <xref ref-type="bibr" rid="B39">2021</xref>), CLCI-Net (Yang et al., <xref ref-type="bibr" rid="B32">2019</xref>), SAN-Net (Yu et al., <xref ref-type="bibr" rid="B33">2023</xref>), TransUNet (Chen et al., <xref ref-type="bibr" rid="B4">2021</xref>), TransFuse (Zhang et al., <xref ref-type="bibr" rid="B35">2021a</xref>), and MLRA-Net (Wu et al., <xref ref-type="bibr" rid="B30">2022</xref>) using the ATLAS dataset to illustrate the efficiency of TransRender. <xref ref-type="table" rid="T2">Table 2</xref> shows the performance comparison, where the experimental result of the proposed TransRender is presented in the last line. Further experiments are implemented on the ISLES2022 to validate the generalizability of the TransRender, as shown in <xref ref-type="table" rid="T3">Table 3</xref>. All of the above experiments employ cross-validation methods to avoid randomness.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>The quantitative comparison of TransRender with the previous eight methods on the ATLAS dataset.</p></caption> 
<table frame="box" rules="all">
<thead>
<tr style="background-color:&#x00023;919498;color:&#x00023;ffffff">
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="left"><bold>DSC (F1) (<italic>%</italic>)</bold></th>
<th valign="top" align="left"><bold>HD (<italic>px</italic>)</bold></th>
<th valign="top" align="left"><bold>F2 (<italic>%</italic>)</bold></th>
<th valign="top" align="left"><bold>Precision (%)</bold></th>
<th valign="top" align="left"><bold>Recall (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">U-Net (Ronneberger et al., <xref ref-type="bibr" rid="B24">2015</xref>)</td>
<td valign="top" align="left">48.34</td>
<td valign="top" align="left">51.35</td>
<td valign="top" align="left">49.50</td>
<td valign="top" align="left">54.45</td>
<td valign="top" align="left">53.68</td>
</tr> <tr>
<td valign="top" align="left">AG U-Net (Schlemper et al., <xref ref-type="bibr" rid="B25">2019</xref>)</td>
<td valign="top" align="left">49.60</td>
<td valign="top" align="left">50.12</td>
<td valign="top" align="left">53.67</td>
<td valign="top" align="left">49.25</td>
<td valign="top" align="left">62.53</td>
</tr> <tr>
<td valign="top" align="left">CLCI-Net (Yang et al., <xref ref-type="bibr" rid="B32">2019</xref>)</td>
<td valign="top" align="left">51.74</td>
<td valign="top" align="left">&#x02013;</td>
<td valign="top" align="left">51.28</td>
<td valign="top" align="left">&#x02013;</td>
<td valign="top" align="left">51.39</td>
</tr> <tr>
<td valign="top" align="left">MI-Net (Zhang et al., <xref ref-type="bibr" rid="B36">2021b</xref>)</td>
<td valign="top" align="left">56.72</td>
<td valign="top" align="left">38.80</td>
<td valign="top" align="left">&#x02013;</td>
<td valign="top" align="left">60.90</td>
<td valign="top" align="left">59.38</td>
</tr> <tr>
<td valign="top" align="left">SAN-Net (Yu et al., <xref ref-type="bibr" rid="B33">2023</xref>)</td>
<td valign="top" align="left">57.11</td>
<td valign="top" align="left">&#x02013;</td>
<td valign="top" align="left">56.23</td>
<td valign="top" align="left">&#x02013;</td>
<td valign="top" align="left">59.77</td>
</tr> <tr>
<td valign="top" align="left">D-UNet (Zhou Y. et al., <xref ref-type="bibr" rid="B39">2021</xref>)</td>
<td valign="top" align="left">53.49</td>
<td valign="top" align="left">&#x02013;</td>
<td valign="top" align="left">&#x02013;</td>
<td valign="top" align="left">63.31</td>
<td valign="top" align="left">52.43</td>
</tr> <tr>
<td valign="top" align="left">TransUNet (Chen et al., <xref ref-type="bibr" rid="B4">2021</xref>)</td>
<td valign="top" align="left">56.23</td>
<td valign="top" align="left">45.44</td>
<td valign="top" align="left">59.64</td>
<td valign="top" align="left">57.15</td>
<td valign="top" align="left">65.95</td>
</tr> <tr>
<td valign="top" align="left">TransFuse (Zhang et al., <xref ref-type="bibr" rid="B35">2021a</xref>)</td>
<td valign="top" align="left">58.18</td>
<td valign="top" align="left">41.56</td>
<td valign="top" align="left"><bold>62.40</bold></td>
<td valign="top" align="left">57.64</td>
<td valign="top" align="left"><bold>70.06</bold></td>
</tr>
<tr>
<td valign="top" align="left">TransRender</td>
<td valign="top" align="left"><bold>59.79</bold></td>
<td valign="top" align="left"><bold>33.98</bold></td>
<td valign="top" align="left">59.38</td>
<td valign="top" align="left"><bold>63.91</bold></td>
<td valign="top" align="left">68.08</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The bold values in the table represent the best results.</p>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>The performance comparison of TransRender with the previous five methods on the ISLES2022 dataset.</p></caption> 
<table frame="box" rules="all">
<thead>
<tr style="background-color:&#x00023;919498;color:&#x00023;ffffff">
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="left"><bold>DSC (F1) (%)</bold></th>
<th valign="top" align="left"><bold>HD (px)</bold></th>
<th valign="top" align="left"><bold>F2 (%)</bold></th>
<th valign="top" align="left"><bold>Precision (%)</bold></th>
<th valign="top" align="left"><bold>Recall (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">U-Net (Ronneberger et al., <xref ref-type="bibr" rid="B24">2015</xref>)</td>
<td valign="top" align="left">82.04</td>
<td valign="top" align="left">36.82</td>
<td valign="top" align="left">81.52</td>
<td valign="top" align="left">85.31</td>
<td valign="top" align="left">81.44</td>
</tr> <tr>
<td valign="top" align="left">AG U-Net (Schlemper et al., <xref ref-type="bibr" rid="B25">2019</xref>)</td>
<td valign="top" align="left">81.45</td>
<td valign="top" align="left">37.01</td>
<td valign="top" align="left">80.99</td>
<td valign="top" align="left">84.70</td>
<td valign="top" align="left">80.98</td>
</tr> <tr>
<td valign="top" align="left">TransUNet (Chen et al., <xref ref-type="bibr" rid="B4">2021</xref>)</td>
<td valign="top" align="left">84.23</td>
<td valign="top" align="left">29.98</td>
<td valign="top" align="left">84.01</td>
<td valign="top" align="left">86.88</td>
<td valign="top" align="left">84.19</td>
</tr> <tr>
<td valign="top" align="left">TransFuse (Zhang et al., <xref ref-type="bibr" rid="B35">2021a</xref>)</td>
<td valign="top" align="left">84.39</td>
<td valign="top" align="left">29.19</td>
<td valign="top" align="left">84.06</td>
<td valign="top" align="left"><bold>87.36</bold></td>
<td valign="top" align="left">84.15</td>
</tr> <tr>
<td valign="top" align="left">MLRA-Net (Wu et al., <xref ref-type="bibr" rid="B30">2022</xref>)</td>
<td valign="top" align="left">84.73</td>
<td valign="top" align="left">29.95</td>
<td valign="top" align="left">84.48</td>
<td valign="top" align="left">87.03</td>
<td valign="top" align="left"><bold>84.70</bold></td>
</tr>
<tr>
<td valign="top" align="left">TransRender</td>
<td valign="top" align="left"><bold>85.37</bold></td>
<td valign="top" align="left"><bold>27.60</bold></td>
<td valign="top" align="left"><bold>84.87</bold></td>
<td valign="top" align="left">86.48</td>
<td valign="top" align="left">83.94</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The bold values in the table represent the best results.</p>
</table-wrap-foot>
</table-wrap></sec>
<sec>
<title>4.3.2. Ablation experiment</title>
<p>Four ablation experiments on decoders are conducted to assess the effectiveness of the render module, as described below: (1) The encoder uses U-Net and traditional convolutional upsampling path as the decoder; (2) The encoder uses U-Net and render-based upsampling path as the decoder; (3) TransRender as the encoder and traditional convolutional upsampling path as the decoder; (4) TransRender as the encoder and render-based upsampling path as the decoder. <xref ref-type="table" rid="T4">Table 4</xref> shows the comparative results.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>The ablation comparison of TransRender on the ATLAS dataset.</p></caption> 
<table frame="box" rules="all">
<thead>
<tr style="background-color:&#x00023;919498;color:&#x00023;ffffff">
<th/>
<th valign="top" align="center"><bold>Encoder</bold></th>
<th valign="top" align="center"><bold>Render</bold></th>
<th valign="top" align="center"><bold>DSC (F1) (%)</bold></th>
<th valign="top" align="center"><bold>HD (px)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">CNN</td>
<td valign="top" align="center">U-Net</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">48.34</td>
<td valign="top" align="center">51.35</td>
</tr>
<tr>
<td/>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">54.13</td>
<td valign="top" align="center">40.71</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">AG U-Net</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">49.60</td>
<td valign="top" align="center">50.12</td>
</tr>
<tr>
<td/>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center"><bold>55.21</bold></td>
<td valign="top" align="center"><bold>38.14</bold></td>
</tr> <tr>
<td valign="top" align="left">Transformer</td>
<td valign="top" align="center">TransUNet</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">56.23</td>
<td valign="top" align="center">45.44</td>
</tr>
<tr>
<td/>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">57.86</td>
<td valign="top" align="center">37.42</td>
</tr>

<tr>
<td/>
<td valign="top" align="center">TransRender</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">58.27</td>
<td valign="top" align="center">37.86</td>
</tr>
<tr>
<td/>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center"><bold>59.79</bold></td>
<td valign="top" align="center"><bold>33.98</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The bold values in the table represent the best results.</p>
</table-wrap-foot>
</table-wrap></sec>
<sec>
<title>4.3.3. Hyper-parameter comparison</title>
<p>The render module automatically selects <italic>k</italic>&#x000D7;<italic>N</italic> points as the uncertain points set to predict. The value of <italic>k</italic> directly affects how many points are selected in network learning and, consequently, the segmentation capacity of the proposed TransRender. We set <italic>k</italic> &#x0003D; 1, 2, 3, 5 in the render module to compare the performance using the ATLAS dataset, respectively. <xref ref-type="table" rid="T6">Table 6</xref> shows the results of this experiment. It is also worthwhile to investigate the value of &#x003B2;, which indicates the different percentiles of points selected as important points. These important &#x003B2; &#x000D7; <italic>N</italic> points are sampled for the features of spatial location, while the other (1&#x02212;&#x003B2;) &#x000D7; <italic>N</italic> points are randomly assigned features. We conduct a comparative experiment to explore the effect on segmentation performance by using &#x003B2; &#x0003D; 0.1, 0.5, 0.6, 0.7, 0.8 on the ATLAS dataset, respectively. The results of this experiment are shown in <xref ref-type="table" rid="T7">Table 7</xref>.</p></sec></sec></sec>
<sec id="s5">
<title>5. Result and discussion</title>
<sec>
<title>5.1. Comparison experiment</title>
<p><xref ref-type="table" rid="T2">Table 2</xref> reports the quantitative results using the ATLAS dataset. Comparative experiments with eight different existing methods are conducted to analyze the segmentation effectiveness of the point-based TransRender. The comparison results indicate that TransRender exceeds the previous methods, with performance gains ranging from 1.61%, 7.58<italic>px</italic>, and 0.60% to 11.45%, 17.37<italic>px</italic>, and 9.46% considering the DSC, HD, and PRE, respectively. The significant improvements demonstrate that applying a render-based decoder to TransRender is better at capturing boundary semantic information than a standard decoder. For the DSC, our method achieves a mean DSC of 59.79%, which is 2.77% higher than that of the second-best TransFuse. We would also like to mention that the difference in the HD metric is considerably large. Our method does not obtain the best performance in terms of F2 and RE, only 59.38 and 68.08%, which are the third- and second-best ranks, respectively. However, we recognize that region overlap (DSC) and boundary distance (HD) are more important measures of agreement between the prediction results and the physician&#x00027;s annotations. Excellent results verify that adaptively predicting selected points can improve lesion segmentation at the boundary.</p>
<p>Furthermore, the qualitative comparisons of the ATLAS dataset are displayed in <xref ref-type="fig" rid="F5">Figure 5</xref>. As we can see from the visualization results, whether the lesion size is large or small, and whether the lesion is located on the left or the right, our method produces visually superior segmentation. We visualize four methods, including U-Net (Ronneberger et al., <xref ref-type="bibr" rid="B24">2015</xref>), AG U-Net (Schlemper et al., <xref ref-type="bibr" rid="B25">2019</xref>), TransUNet (Chen et al., <xref ref-type="bibr" rid="B4">2021</xref>), and TransFuse (Zhang et al., <xref ref-type="bibr" rid="B35">2021a</xref>) to compare visually with the TransRender. The scale, location, and shape of each lesion are different in the selected five brain images. In Case 1, the target object consists of an infarct lesion and multiple emboli, the latter of which are extremely small in size. All methods identify infarct lesions with more or less accuracy, but our TransRender achieves the best regional similarity. For the multiple emboli, only AG U-Net and TransRender locate the lesions, where the latter obtains more correct segmentation and less over-segmentation. The lesion size in Case 2 is small, so U-Net and TransFuse only segment a small part of the lesion or even fail to identify it. The other two previous methods achieve nearly complete correct segmentation, but at the cost of severe over-segmentation. Benefiting from the prediction of the boundary key points by the render module, the proposed method greatly reduces over-segmentation. In Case 3, transformer-based methods display significantly improved segmentation performance compared to CNN-based methods. However, these methods suffer from different degrees of under-segmentation. The TransRender yields precise details of the lesion boundary, with almost no under-segmentation. 
We regard Case 4 in <xref ref-type="fig" rid="F6">Figure 6</xref> as a difficult segmentation issue due to its close proximity to the skull. None of the five methods completely segments the lesion, whereas TransRender correctly segments the most pixels. It is necessary to mention that TransRender suffers from a slight over-segmentation. The complex tissue structure in the area of the focal lesion affects the segmentation performance of all methods. Thus, in Case 5, the prediction results of each method are coarse and discontinuous. TransRender yields less over-segmentation than the transformer-based methods, and less under-segmentation than the CNN-based methods.</p>
<fig id="F6" position="float">
<label>Figure 6</label>
<caption><p>The visual segmentation results of TransRender and the four previous methods on the ATLAS dataset. The red, blue, and green colors indicate correct, insufficient, and excessive segmentation, respectively.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-17-1259677-g0006.tif"/>
</fig>
<p>We further carry out comparisons to validate the performance robustness of the TransRender. The quantitative comparison is reported in <xref ref-type="table" rid="T3">Table 3</xref> between the TransRender and five methods using the ISLES2022 dataset. We can observe that the CNN methods are significantly worse than the transformer methods in terms of the five metrics. MLRA-Net outperforms the fourth-best TransUNet and the third-best TransFuse by 0.50 and 0.34% on the DSC metric, respectively, but it is worse than TransFuse on the HD metric. The proposed method uses a multi-scale transformer as the encoder with render as the decoder, which yields the best scores on the DSC, HD, and F2 metrics. It might be interpreted that render successfully corrects the erroneous segmentation at the lesion boundary.</p>
<p><xref ref-type="fig" rid="F7">Figure 7</xref> displays the qualitative comparison using the ISLES2022 dataset. Four brain images are selected for visualization and comparison, each of which has different modalities, lesion shapes, and locations. In Case 1 and Case 3, all methods segment only parts of the lesion to a greater or lesser degree, while TransRender realizes the best region overlap and boundary similarity. The lesions in Case 2 are multiple emboli, and only the proposed method segments the lesions nearly completely. The excellent results on these two datasets validate the segmentation accuracy of the TransRender for multiple emboli. The existing methods all identify Case 4 as having multiple lesions, and the reason may be that the lesion occurs in the cerebral cortex. TransRender identifies Case 4 as a whole lesion and achieves more correct segmentation.</p>
<fig id="F7" position="float">
<label>Figure 7</label>
<caption><p>The visual segmentation results of TransRender and the four previous methods on the ISLES2022 dataset. The red, blue, and green colors indicate correct, insufficient, and excessive segmentation, respectively.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-17-1259677-g0007.tif"/>
</fig>
<p>Overall, on these two datasets, the proposed TransRender can yield satisfactory segmentation performance, both qualitatively and quantitatively. These results indicate the efficacy and robustness of TransRender for stroke lesion segmentation.</p></sec>
<sec>
<title>5.2. Ablation experiment</title>
<p>Ablation experiments on decoders are conducted to investigate the impact of the render module on lesion segmentation. The comparison results for performance and complexity are presented in <xref ref-type="table" rid="T4">Tables 4</xref>, <xref ref-type="table" rid="T5">5</xref>. When using the render-based decoder, the DSC scores of both U-Net and TransRender are improved, while the HD scores decrease. We carry out experiments with the U-Net or TransRender as encoders, and the classical convolutional upsampling or render modules as decoders, respectively. The DSC and HD using a convolutional upsampling- and render-based decoder are improved from 48.34% and 51.35<italic>px</italic> to 54.13% and 40.71<italic>px</italic>, corresponding to improvements of 11.98 and 20.72%, respectively. With TransRender as the encoder, we employ render as the decoder, which attains superior performance, scoring 59.79% in DSC and 33.98<italic>px</italic> in HD. It is worth noting that by using render as the decoder, the computational complexity and the network parameters are also decreased. These ablation comparisons demonstrate that the proposed render offers a competitive advantage over convolution methods in terms of its ability to process high-frequency information at the boundary.</p>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>The complexity comparison of TransRender and U-Net w/o Render.</p></caption> 
<table frame="box" rules="all">
<thead>
<tr style="background-color:&#x00023;919498;color:&#x00023;ffffff">
<th valign="top" align="left"><bold>Encoder</bold></th>
<th valign="top" align="left"><bold>Render</bold></th>
<th valign="top" align="left"><bold>FLOPs (<italic>G</italic>)</bold></th>
<th valign="top" align="left"><bold>Params (<italic>M</italic>)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">U-Net</td>
<td valign="top" align="left">&#x02013;</td>
<td valign="top" align="left">30.5</td>
<td valign="top" align="left">31.0</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">&#x02713;</td>
<td valign="top" align="left"><bold>15.6</bold></td>
<td valign="top" align="left"><bold>18.9</bold></td>
</tr>
<tr>
<td valign="top" align="left">TransRender</td>
<td valign="top" align="left">&#x02013;</td>
<td valign="top" align="left">118.4</td>
<td valign="top" align="left">43.6</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">&#x02713;</td>
<td valign="top" align="left"><bold>100.1</bold></td>
<td valign="top" align="left"><bold>32.2</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The bold values in the table represent the best results.</p>
</table-wrap-foot>
</table-wrap></sec>
<sec>
<title>5.3. Hyper-parameter comparison</title>
<p>Further comparison experiments are conducted to explore whether the hyper-parameters <italic>k</italic> and &#x003B2; would affect the segmentation performance. <xref ref-type="table" rid="T6">Table 6</xref> presents the comparison results using different numbers of selected points. The number of selected points is expected to match the lesion, given the variation in lesion sizes. When <italic>k</italic> &#x0003D; 3, TransRender gives the best result in all metrics. In the experiments, we set <italic>k</italic> &#x0003D; 3 by default. The performance comparison using the different numbers of important points is shown in <xref ref-type="table" rid="T7">Table 7</xref>. The comparison indicates that there is a significant influence of &#x003B2; values on the segmentation. Note that we set &#x003B2; &#x0003D; 0.1 to suppress the features of important points and highlight the random features of other points. The comparison results indicate that more important points should be selected for feature extraction. As &#x003B2; increases, the segmentation performance becomes more favorable until &#x003B2; &#x0003D; 0.7, after which it declines; this might be due to some point features that mistakenly guide the decoding process. In the other experiments, we set &#x003B2; to 0.7 by default.</p>
<table-wrap position="float" id="T6">
<label>Table 6</label>
<caption><p>Segmentation performance comparison of different initial <italic>k</italic>.</p></caption> 
<table frame="box" rules="all">
<thead>
<tr style="background-color:&#x00023;919498;color:&#x00023;ffffff">
<th valign="top" align="left"><bold>Value of <italic>k</italic></bold></th>
<th valign="top" align="left"><bold>DSC (F1) (<italic>%</italic>)</bold></th>
<th valign="top" align="left"><bold>HD (<italic>px</italic>)</bold></th>
<th valign="top" align="left"><bold>F2 (<italic>%</italic>)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">1</td>
<td valign="top" align="left">59.07</td>
<td valign="top" align="left">34.96</td>
<td valign="top" align="left">58.97</td>
</tr> <tr>
<td valign="top" align="left">2</td>
<td valign="top" align="left">59.27</td>
<td valign="top" align="left">34.77</td>
<td valign="top" align="left">59.21</td>
</tr> <tr>
<td valign="top" align="left">3</td>
<td valign="top" align="left"><bold>59.79</bold></td>
<td valign="top" align="left"><bold>33.98</bold></td>
<td valign="top" align="left"><bold>59.38</bold></td>
</tr>
<tr>
<td valign="top" align="left">5</td>
<td valign="top" align="left">59.26</td>
<td valign="top" align="left">34.85</td>
<td valign="top" align="left">59.17</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The bold values in the table represent the best results.</p>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="T7">
<label>Table 7</label>
<caption><p>Segmentation performance comparison of different &#x003B2;.</p></caption> 
<table frame="box" rules="all">
<thead>
<tr style="background-color:&#x00023;919498;color:&#x00023;ffffff">
<th valign="top" align="left"><bold>Value of &#x003B2;</bold></th>
<th valign="top" align="left"><bold>DSC (F1) (<italic>%</italic>)</bold></th>
<th valign="top" align="left"><bold>HD (<italic>px</italic>)</bold></th>
<th valign="top" align="left"><bold>F2 (<italic>%</italic>)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">0.1</td>
<td valign="top" align="left">55.17</td>
<td valign="top" align="left">45.33</td>
<td valign="top" align="left">54.99</td>
</tr> <tr>
<td valign="top" align="left">0.5</td>
<td valign="top" align="left">58.76</td>
<td valign="top" align="left">38.49</td>
<td valign="top" align="left">57.48</td>
</tr> <tr>
<td valign="top" align="left">0.6</td>
<td valign="top" align="left">59.38</td>
<td valign="top" align="left">36.90</td>
<td valign="top" align="left"><bold>59.61</bold></td>
</tr> <tr>
<td valign="top" align="left">0.7</td>
<td valign="top" align="left"><bold>59.79</bold></td>
<td valign="top" align="left"><bold>33.98</bold></td>
<td valign="top" align="left">59.38</td>
</tr>
<tr>
<td valign="top" align="left">0.8</td>
<td valign="top" align="left">59.02</td>
<td valign="top" align="left">34.57</td>
<td valign="top" align="left">57.24</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The bold values in the table represent the best results.</p>
</table-wrap-foot>
</table-wrap></sec></sec>
<sec sec-type="conclusions" id="s6">
<title>6. Conclusion</title>
<p>In this study, we propose a novel point-based boundary segmentation method for stroke lesions using different MRI images. The TransRender is built on a multi-scale transformer encoder because of its strong ability to establish long-distance dependencies. The render-based decoder implements the non-uniform grid representation, which allows more attention to the precise features at the boundaries. Furthermore, a combined supervision loss is utilized to optimize the point selection of the render. Extensive experiments are conducted using different ischemic stroke datasets to evaluate TransRender. The experimental results indicate that TransRender has a competitive advantage over the existing networks in terms of both accuracy and complexity. Unfortunately, the improved render module is not adequate to achieve accurate segmentation due to the variety of lesions. We may consider the use of other network structures in the future to accomplish the re-prediction of selection points in the render module.</p>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding authors.</p></sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>ZW: Conceptualization, Methodology, Visualization, Writing&#x02014;original draft. XZ: Software, Supervision, Writing&#x02014;review and editing. FL: Investigation, Supervision, Writing&#x02014;review and editing. SW: Formal analysis, Validation, Writing&#x02014;review and editing. JL: Validation, Writing&#x02014;original draft, Writing&#x02014;review and editing.</p></sec>
</body>
<back>
<sec sec-type="funding-information" id="s9">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This work was supported by the National Natural Science Foundation of China (Nos. 62171307 and 62271342) and the Shanxi Province Natural Science Foundation (No. 202103021224113).</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Agnes</surname> <given-names>S. A.</given-names></name> <name><surname>Anitha</surname> <given-names>J.</given-names></name> <name><surname>Solomon</surname> <given-names>A. A.</given-names></name></person-group> (<year>2022</year>). <article-title>Two-stage lung nodule detection framework using enhanced UNet and convolutional LSTM networks in CT images</article-title>. <source>Comput. Biol. Med</source>. <volume>149</volume>:<fpage>106059</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2022.106059</pub-id><pub-id pub-id-type="pmid">36087510</pub-id></citation></ref>
<ref id="B2">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Cao</surname> <given-names>H.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Chen</surname> <given-names>J.</given-names></name> <name><surname>Jiang</surname> <given-names>D.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Tian</surname> <given-names>Q.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>&#x0201C;Swin-unet: Unet-like pure transformer for medical image segmentation,&#x0201D;</article-title> in <source>Lecture Notes in Computer Science</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer Nature Switzerland</publisher-name>), <fpage>205</fpage>&#x02013;<lpage>218</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-031-25066-8_9</pub-id></citation>
</ref>
<ref id="B3">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Zhang</surname> <given-names>K.</given-names></name> <name><surname>Fung</surname> <given-names>K.-M.</given-names></name> <name><surname>Thai</surname> <given-names>T. C.</given-names></name> <name><surname>Moore</surname> <given-names>K.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Recent advances and clinical applications of deep learning in medical image analysis</article-title>. <source>Med. Image Anal</source>. <volume>79</volume>:<fpage>102444</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2022.102444</pub-id><pub-id pub-id-type="pmid">35472844</pub-id></citation></ref>
<ref id="B4">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>J.</given-names></name> <name><surname>Lu</surname> <given-names>Y.</given-names></name> <name><surname>Yu</surname> <given-names>Q.</given-names></name> <name><surname>Luo</surname> <given-names>X.</given-names></name> <name><surname>Adeli</surname> <given-names>E.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>TransUNet: transformers make strong encoders for medical image segmentation</article-title>. <source>arXiv preprint: arXiv:2102.04306</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2102.04306</pub-id></citation>
</ref>
<ref id="B5">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Chu</surname> <given-names>J.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Zhou</surname> <given-names>W.</given-names></name> <name><surname>Shi</surname> <given-names>H.</given-names></name> <name><surname>Cao</surname> <given-names>Y.</given-names></name> <name><surname>Tu</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>&#x0201C;Pay more attention to discontinuity for medical image segmentation,&#x0201D;</article-title> in <source>Medical Image Computing and Computer Assisted Intervention?MICCAI 2020</source> (<publisher-loc>Lima</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>166</fpage>&#x02013;<lpage>175</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-59719-1_17</pub-id></citation>
</ref>
<ref id="B6">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>de Vries</surname> <given-names>L.</given-names></name> <name><surname>Emmer</surname> <given-names>B. J.</given-names></name> <name><surname>Majoie</surname> <given-names>C. B.</given-names></name> <name><surname>Marquering</surname> <given-names>H. A.</given-names></name> <name><surname>Gavves</surname> <given-names>E.</given-names></name></person-group> (<year>2023</year>). <article-title>PerfU-net: baseline infarct estimation from CT perfusion source data for acute ischemic stroke</article-title>. <source>Med. Image Anal</source>. <volume>85</volume>:<fpage>102749</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2023.102749</pub-id><pub-id pub-id-type="pmid">36731276</pub-id></citation></ref>
<ref id="B7">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dosovitskiy</surname> <given-names>A.</given-names></name> <name><surname>Beyer</surname> <given-names>L.</given-names></name> <name><surname>Kolesnikov</surname></name><etal/></person-group>. (<year>2021</year>). <article-title>An image is worth 16x16 words: transformers for image recognition at scale</article-title>. <source>arXiv preprint arXiv:2010.11929</source>.</citation>
</ref>
<ref id="B8">
<citation citation-type="journal"><person-group person-group-type="author"><collab>GBD 2016 lifetime risk of stroke collaborators</collab></person-group> (<year>2018</year>). <article-title>Global, regional, and country-specific lifetime risks of stroke, 1990 and 2016</article-title>. <source>N. Engl. J. Med</source>. <volume>379</volume>, <fpage>2429</fpage>&#x02013;<lpage>2437</lpage>. <pub-id pub-id-type="doi">10.1056/NEJMoa1804492</pub-id></citation>
</ref>
<ref id="B9">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>Y.</given-names></name> <name><surname>Du</surname> <given-names>R.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Xie</surname> <given-names>J.</given-names></name> <name><surname>Ma</surname> <given-names>Z.</given-names></name> <name><surname>Dong</surname> <given-names>Y.</given-names></name></person-group> (<year>2022</year>). <article-title>Learning calibrated class centers for few-shot classification by pair-wise similarity</article-title>. <source>IEEE Trans. Image Process</source>. <volume>31</volume>, <fpage>4543</fpage>&#x02013;<lpage>4555</lpage>. <pub-id pub-id-type="doi">10.1109/TIP.2022.3184813</pub-id><pub-id pub-id-type="pmid">35767479</pub-id></citation></ref>
<ref id="B10">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Han</surname> <given-names>K.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Chen</surname> <given-names>H.</given-names></name> <name><surname>Chen</surname> <given-names>X.</given-names></name> <name><surname>Guo</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>A survey on vision transformer</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell</source>. <volume>44</volume>, <fpage>1</fpage>&#x02013;<lpage>23</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2022.3152247</pub-id></citation>
</ref>
<ref id="B11">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Gan</surname> <given-names>C.</given-names></name> <name><surname>Li</surname> <given-names>Z.</given-names></name> <name><surname>Rekik</surname> <given-names>I.</given-names></name> <name><surname>Yin</surname> <given-names>Z.</given-names></name> <name><surname>Ji</surname> <given-names>W.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Transformers in medical image analysis: a review</article-title>. <source>arXiv preprint: arXiv:2202.12165</source>.</citation>
</ref>
<ref id="B12">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>P.</given-names></name> <name><surname>Li</surname> <given-names>D.</given-names></name> <name><surname>Jiao</surname> <given-names>Z.</given-names></name> <name><surname>Wei</surname> <given-names>D.</given-names></name> <name><surname>Cao</surname> <given-names>B.</given-names></name> <name><surname>Mo</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Common feature learning for brain tumor MRI synthesis by context-aware generative adversarial network</article-title>. <source>Med. Image Anal</source>. <volume>79</volume>:<fpage>102472</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2022.102472</pub-id><pub-id pub-id-type="pmid">35567847</pub-id></citation></ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>R.</given-names></name> <name><surname>Lin</surname> <given-names>M.</given-names></name> <name><surname>Dou</surname> <given-names>H.</given-names></name> <name><surname>Lin</surname> <given-names>Z.</given-names></name> <name><surname>Ying</surname> <given-names>Q.</given-names></name> <name><surname>Jia</surname> <given-names>X.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Boundary-rendering network for breast lesion segmentation in ultrasound images</article-title>. <source>Med. Image Anal</source>. <volume>80</volume>:<fpage>102478</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2022.102478</pub-id><pub-id pub-id-type="pmid">35691144</pub-id></citation></ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kervadec</surname> <given-names>H.</given-names></name> <name><surname>Bouchtiba</surname> <given-names>J.</given-names></name> <name><surname>Desrosiers</surname> <given-names>C.</given-names></name> <name><surname>Granger</surname> <given-names>E.</given-names></name> <name><surname>Dolz</surname> <given-names>J.</given-names></name> <name><surname>Ayed</surname> <given-names>I. B.</given-names></name></person-group> (<year>2021</year>). <article-title>Boundary loss for highly unbalanced segmentation</article-title>. <source>Med. Image Anal</source>. <volume>67</volume>:<fpage>101851</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2020.101851</pub-id><pub-id pub-id-type="pmid">33080507</pub-id></citation></ref>
<ref id="B15">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Khattar</surname> <given-names>S.</given-names></name> <name><surname>Kaur</surname> <given-names>R.</given-names></name></person-group> (<year>2022</year>). <article-title>Computer assisted diagnosis of skin cancer: a survey and future recommendations</article-title>. <source>Comput. Electr. Eng</source>. <volume>104</volume>:<fpage>108431</fpage>. <pub-id pub-id-type="doi">10.1016/j.compeleceng.2022.108431</pub-id></citation>
</ref>
<ref id="B16">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Kirillov</surname> <given-names>A.</given-names></name> <name><surname>Wu</surname> <given-names>Y.</given-names></name> <name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;PointRend: image segmentation as rendering,&#x0201D;</article-title> in <source>2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x02013;<lpage>10</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR42600.2020.00982</pub-id></citation>
</ref>
<ref id="B17">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Lee</surname> <given-names>H. J.</given-names></name> <name><surname>Kim</surname> <given-names>J. U.</given-names></name> <name><surname>Lee</surname> <given-names>S.</given-names></name> <name><surname>Kim</surname> <given-names>H. G.</given-names></name> <name><surname>Ro</surname> <given-names>Y. M.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Structure boundary preserving segmentation for medical image with ambiguous boundary,&#x0201D;</article-title> in <source>2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-loc>Seattle, WA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4817</fpage>&#x02013;<lpage>4826</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR42600.2020.00487</pub-id></citation>
</ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Sun</surname> <given-names>Z.</given-names></name> <name><surname>Xue</surname> <given-names>J.-H.</given-names></name> <name><surname>Ma</surname> <given-names>Z.</given-names></name></person-group> (<year>2021</year>). <article-title>A concise review of recent few-shot meta-learning methods</article-title>. <source>Neurocomputing</source> <volume>456</volume>, <fpage>463</fpage>&#x02013;<lpage>468</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2020.05.114</pub-id></citation>
</ref>
<ref id="B19">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Z.</given-names></name> <name><surname>Lin</surname> <given-names>Y.</given-names></name> <name><surname>Cao</surname> <given-names>Y.</given-names></name> <name><surname>Hu</surname> <given-names>H.</given-names></name> <name><surname>Wei</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;Swin transformer: hierarchical vision transformer using shifted windows,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)</source> (<publisher-loc>Montreal, QC</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>9992</fpage>&#x02013;<lpage>10002</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV48922.2021.00986</pub-id></citation>
</ref>
<ref id="B20">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Matsuo</surname> <given-names>R.</given-names></name> <name><surname>Yamaguchi</surname> <given-names>Y.</given-names></name> <name><surname>Matsushita</surname> <given-names>T.</given-names></name> <name><surname>Hata</surname> <given-names>J.</given-names></name> <name><surname>Kiyuna</surname> <given-names>F.</given-names></name> <name><surname>Fukuda</surname> <given-names>K.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Association between onset-to-door time and clinical outcomes after ischemic stroke</article-title>. <source>Stroke</source> <volume>48</volume>, <fpage>3049</fpage>&#x02013;<lpage>3056</lpage>. <pub-id pub-id-type="doi">10.1161/STROKEAHA.117.018132</pub-id><pub-id pub-id-type="pmid">28974631</pub-id></citation></ref>
<ref id="B21">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Milletari</surname> <given-names>F.</given-names></name> <name><surname>Navab</surname> <given-names>N.</given-names></name> <name><surname>Ahmadi</surname> <given-names>S.-A.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;V-net: fully convolutional neural networks for volumetric medical image segmentation,&#x0201D;</article-title> in <source>2016 Fourth International Conference on 3D Vision (3DV)</source> (<publisher-loc>Stanford</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>565</fpage>&#x02013;<lpage>571</lpage>. <pub-id pub-id-type="doi">10.1109/3DV.2016.79</pub-id></citation>
</ref>
<ref id="B22">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nielsen</surname> <given-names>A.</given-names></name> <name><surname>Hansen</surname> <given-names>M. B.</given-names></name> <name><surname>Tietze</surname> <given-names>A.</given-names></name> <name><surname>Mouridsen</surname> <given-names>K.</given-names></name></person-group> (<year>2018</year>). <article-title>Prediction of tissue outcome and assessment of treatment effect in acute ischemic stroke using deep learning</article-title>. <source>Stroke</source> <volume>49</volume>, <fpage>1394</fpage>&#x02013;<lpage>1401</lpage>. <pub-id pub-id-type="doi">10.1161/STROKEAHA.117.019740</pub-id><pub-id pub-id-type="pmid">29720437</pub-id></citation></ref>
<ref id="B23">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pereira</surname> <given-names>S.</given-names></name> <name><surname>Pinto</surname> <given-names>A.</given-names></name> <name><surname>Alves</surname> <given-names>V.</given-names></name> <name><surname>Silva</surname> <given-names>C. A.</given-names></name></person-group> (<year>2016</year>). <article-title>Brain tumor segmentation using convolutional neural networks in MRI images</article-title>. <source>IEEE Trans. Med. Imaging</source> <volume>35</volume>, <fpage>1240</fpage>&#x02013;<lpage>1251</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2016.2538465</pub-id></citation>
</ref>
<ref id="B24">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Ronneberger</surname> <given-names>O.</given-names></name> <name><surname>Fischer</surname> <given-names>P.</given-names></name> <name><surname>Brox</surname> <given-names>T.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;U-net: convolutional networks for biomedical image segmentation,&#x0201D;</article-title> in <source>Lecture Notes in Computer Science</source> (<publisher-loc>Munich</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>234</fpage>&#x02013;<lpage>241</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-24574-4_28</pub-id></citation>
</ref>
<ref id="B25">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Schlemper</surname> <given-names>J.</given-names></name> <name><surname>Oktay</surname> <given-names>O.</given-names></name> <name><surname>Schaap</surname> <given-names>M.</given-names></name> <name><surname>Heinrich</surname> <given-names>M.</given-names></name> <name><surname>Kainz</surname> <given-names>B.</given-names></name> <name><surname>Glocker</surname> <given-names>B.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Attention gated networks: learning to leverage salient regions in medical images</article-title>. <source>Med. Image Anal</source>. <volume>53</volume>, <fpage>197</fpage>&#x02013;<lpage>207</lpage>. <pub-id pub-id-type="doi">10.1016/j.media.2019.01.012</pub-id><pub-id pub-id-type="pmid">30802813</pub-id></citation></ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tajbakhsh</surname> <given-names>N.</given-names></name> <name><surname>Jeyaseelan</surname> <given-names>L.</given-names></name> <name><surname>Li</surname> <given-names>Q.</given-names></name> <name><surname>Chiang</surname> <given-names>J. N.</given-names></name> <name><surname>Wu</surname> <given-names>Z.</given-names></name> <name><surname>Ding</surname> <given-names>X.</given-names></name></person-group> (<year>2020</year>). <article-title>Embracing imperfect datasets: a review of deep learning solutions for medical image segmentation</article-title>. <source>Med. Image Anal</source>. <volume>63</volume>:<fpage>101693</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2020.101693</pub-id><pub-id pub-id-type="pmid">32289663</pub-id></citation></ref>
<ref id="B27">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Vaswani</surname> <given-names>A.</given-names></name> <name><surname>Shazeer</surname> <given-names>N.</given-names></name> <name><surname>Parmar</surname> <given-names>N.</given-names></name> <name><surname>Uszkoreit</surname> <given-names>J.</given-names></name> <name><surname>Jones</surname> <given-names>L.</given-names></name> <name><surname>Gomez</surname> <given-names>A. N.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>&#x0201C;Attention is all you need,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems, Vol. 30</source> (<publisher-loc>Long Beach, CA</publisher-loc>: <publisher-name>Curran Associates, Inc.</publisher-name>), <fpage>5998</fpage>&#x02013;<lpage>6008</lpage>.</citation>
</ref>
<ref id="B28">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>W.</given-names></name> <name><surname>Chen</surname> <given-names>C.</given-names></name> <name><surname>Ding</surname> <given-names>M.</given-names></name> <name><surname>Yu</surname> <given-names>H.</given-names></name> <name><surname>Zha</surname> <given-names>S.</given-names></name> <name><surname>Li</surname> <given-names>J.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;TransBTS: multimodal brain tumor segmentation using transformer,&#x0201D;</article-title> in <source>Medical Image Computing and Computer Assisted Intervention</source> (<publisher-loc>Strasbourg</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>109</fpage>&#x02013;<lpage>119</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-87193-2_11</pub-id></citation>
</ref>
<ref id="B29">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>J.</given-names></name> <name><surname>Guo</surname> <given-names>D.</given-names></name> <name><surname>Wang</surname> <given-names>L.</given-names></name> <name><surname>Yang</surname> <given-names>S.</given-names></name> <name><surname>Zheng</surname> <given-names>Y.</given-names></name> <name><surname>Shapey</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>TISS-net: brain tumor image synthesis and segmentation using cascaded dual-task networks and error-prediction consistency</article-title>. <source>Neurocomputing</source> <volume>544</volume>:<fpage>126295</fpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2023.126295</pub-id><pub-id pub-id-type="pmid">37528990</pub-id></citation></ref>
<ref id="B30">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>Z.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>F.</given-names></name> <name><surname>Wang</surname> <given-names>S.</given-names></name> <name><surname>Huang</surname> <given-names>L.</given-names></name></person-group> (<year>2022</year>). <article-title>Multi-scale long-range interactive and regional attention network for stroke lesion segmentation</article-title>. <source>Comput. Electr. Eng</source>. <volume>103</volume>:<fpage>108345</fpage>. <pub-id pub-id-type="doi">10.1016/j.compeleceng.2022.108345</pub-id></citation>
</ref>
<ref id="B31">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>Z.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>F.</given-names></name> <name><surname>Wang</surname> <given-names>S.</given-names></name> <name><surname>Huang</surname> <given-names>L.</given-names></name></person-group> (<year>2023</year>). <article-title>W-net: a boundary-enhanced segmentation network for stroke lesions</article-title>. <source>Expert Syst. Appl</source>. <volume>229</volume>:<fpage>120637</fpage>. <pub-id pub-id-type="doi">10.1016/j.eswa.2023.120637</pub-id></citation>
</ref>
<ref id="B32">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>H.</given-names></name> <name><surname>Huang</surname> <given-names>W.</given-names></name> <name><surname>Qi</surname> <given-names>K.</given-names></name> <name><surname>Li</surname> <given-names>C.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>&#x0201C;CLCI-net: cross-level fusion and context inference networks for lesion segmentation of chronic stroke,&#x0201D;</article-title> in <source>Medical Image Computing and Computer Assisted Intervention</source> (<publisher-loc>Shenzhen</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>266</fpage>&#x02013;<lpage>274</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-32248-9_30</pub-id></citation>
</ref>
<ref id="B33">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>W.</given-names></name> <name><surname>Huang</surname> <given-names>Z.</given-names></name> <name><surname>Zhang</surname> <given-names>J.</given-names></name> <name><surname>Shan</surname> <given-names>H.</given-names></name></person-group> (<year>2023</year>). <article-title>SAN-net: learning generalization to unseen sites for stroke lesion segmentation with self-adaptive normalization</article-title>. <source>Comput. Biol. Med</source>. <volume>156</volume>:<fpage>106717</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.106717</pub-id><pub-id pub-id-type="pmid">36878125</pub-id></citation></ref>
<ref id="B34">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yuan</surname> <given-names>Y.</given-names></name> <name><surname>Chao</surname> <given-names>M.</given-names></name> <name><surname>Lo</surname> <given-names>Y.-C.</given-names></name></person-group> (<year>2017</year>). <article-title>Automatic skin lesion segmentation using deep fully convolutional networks with Jaccard distance</article-title>. <source>IEEE Trans. Med. Imaging</source> <volume>36</volume>, <fpage>1876</fpage>&#x02013;<lpage>1886</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2017.2695227</pub-id><pub-id pub-id-type="pmid">28436853</pub-id></citation></ref>
<ref id="B35">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Liu</surname> <given-names>H.</given-names></name> <name><surname>Hu</surname> <given-names>Q.</given-names></name></person-group> (<year>2021a</year>). <article-title>&#x0201C;TransFuse: fusing transformers and CNNs for medical image segmentation,&#x0201D;</article-title> in <source>Medical Image Computing and Computer Assisted Intervention</source> (<publisher-loc>Strasbourg</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>14</fpage>&#x02013;<lpage>24</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-87193-2_2</pub-id></citation>
</ref>
<ref id="B36">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Wu</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Wu</surname> <given-names>E. X.</given-names></name> <name><surname>Tang</surname> <given-names>X.</given-names></name></person-group> (<year>2021b</year>). <article-title>MI-UNet: multi-inputs UNet incorporating brain parcellation for stroke lesion segmentation from T1-weighted magnetic resonance images</article-title>. <source>IEEE J. Biomed. Health Inform</source>. <volume>25</volume>, <fpage>526</fpage>&#x02013;<lpage>535</lpage>. <pub-id pub-id-type="doi">10.1109/JBHI.2020.2996783</pub-id><pub-id pub-id-type="pmid">32750908</pub-id></citation></ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>F.</given-names></name> <name><surname>Zhao</surname> <given-names>W.</given-names></name> <name><surname>Lu</surname> <given-names>H.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Yao</surname> <given-names>L.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name></person-group> (<year>2021</year>). <article-title>Depth-distilled multi-focus image fusion</article-title>. <source>IEEE Trans. Multim</source>. <volume>25</volume>, <fpage>966</fpage>&#x02013;<lpage>978</lpage>. <pub-id pub-id-type="doi">10.1109/TMM.2021.3134565</pub-id></citation>
</ref>
<ref id="B38">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>Q.</given-names></name> <name><surname>Wan</surname> <given-names>Y.</given-names></name> <name><surname>Xu</surname> <given-names>J.</given-names></name> <name><surname>Fang</surname> <given-names>L.</given-names></name></person-group> (<year>2023</year>). <article-title>Cross-modal attention fusion network for RGB-d semantic segmentation</article-title>. <source>Neurocomputing</source> <volume>548</volume>:<fpage>126389</fpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2023.126389</pub-id></citation>
</ref>
<ref id="B39">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>Y.</given-names></name> <name><surname>Huang</surname> <given-names>W.</given-names></name> <name><surname>Dong</surname> <given-names>P.</given-names></name> <name><surname>Xia</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>S.</given-names></name></person-group> (<year>2021</year>). <article-title>D-UNet: a dimension-fusion U shape network for chronic stroke lesion segmentation</article-title>. <source>IEEE/ACM Trans. Comput. Biol. Bioinform</source>. <volume>18</volume>, <fpage>940</fpage>&#x02013;<lpage>950</lpage>. <pub-id pub-id-type="doi">10.1109/TCBB.2019.2939522</pub-id><pub-id pub-id-type="pmid">31502985</pub-id></citation></ref>
<ref id="B40">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>H.-Y.</given-names></name> <name><surname>Guo</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Yu</surname> <given-names>L.</given-names></name> <name><surname>Wang</surname> <given-names>L.</given-names></name> <name><surname>Yu</surname> <given-names>Y.</given-names></name></person-group> (<year>2021</year>). <article-title>nnFormer: interleaved transformer for volumetric segmentation</article-title>. <source>arXiv preprint: arXiv:2109.03201</source>.</citation>
</ref>
<ref id="B41">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>Z.</given-names></name> <name><surname>He</surname> <given-names>X.</given-names></name> <name><surname>Qi</surname> <given-names>G.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Cong</surname> <given-names>B.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name></person-group> (<year>2023</year>). <article-title>Brain tumor segmentation based on the fusion of deep semantics and edge information in multimodal MRI</article-title>. <source>Inform. Fus</source>. <volume>91</volume>, <fpage>376</fpage>&#x02013;<lpage>387</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2022.10.022</pub-id></citation>
</ref>
</ref-list>
</back>
</article>