<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Oncol.</journal-id>
<journal-title>Frontiers in Oncology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Oncol.</abbrev-journal-title>
<issn pub-type="epub">2234-943X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fonc.2024.1469293</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Oncology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>MT-SCnet: multi-scale token divided and spatial-channel fusion transformer network for microscopic hyperspectral image segmentation</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Cao</surname>
<given-names>Xueying</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2797925"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Gao</surname>
<given-names>Hongmin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1941983"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Haoyan</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2337054"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Fei</surname>
<given-names>Shuyu</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2825725"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Xu</surname>
<given-names>Peipei</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1896299"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Zhijian</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>College of Computer Science and Software Engineering, Hohai University</institution>, <addr-line>Nanjing</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Department of Hematology, Nanjing Drum Tower Hospital Clinical College of Nanjing University of Chinese Medicine</institution>, <addr-line>Nanjing</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Department of Hematology, Nanjing University Medical School Affiliated Nanjing Drum Tower Hospital</institution>, <addr-line>Nanjing</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Qingli Li, East China Normal University, China</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Yu-Jie Xiong, Shanghai University of Engineering Sciences, China</p>
<p>Wei Li, Beijing Institute of Technology, China</p>
<p>Qiuhong Cui, Beijing Jiaotong University, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Peipei Xu, <email xlink:href="mailto:xu_peipei0618@163.com">xu_peipei0618@163.com</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>03</day>
<month>12</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>14</volume>
<elocation-id>1469293</elocation-id>
<history>
<date date-type="received">
<day>23</day>
<month>07</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>05</day>
<month>11</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Cao, Gao, Zhang, Fei, Xu and Wang</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Cao, Gao, Zhang, Fei, Xu and Wang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Hybrid architectures based on convolutional neural networks and Transformers effectively capture both the local details and the overall structural context of lesion tissues and cells, achieving highly competitive segmentation results in microscopic hyperspectral image (MHSI) segmentation tasks. However, the fixed tokenization schemes and single-dimensional feature extraction and fusion in existing methods lead to insufficient global feature extraction from hyperspectral pathology images.</p>
</sec>
<sec>
<title>Methods</title>
<p>Based on this, we propose a multi-scale token divided and spatial-channel fusion transformer network (MT-SCnet) for MHSI segmentation. Specifically, we first designed a Multi-Scale Token Divided module. It divides tokens at different scales based on mirror padding and promotes information interaction and fusion between different tokens to obtain more representative features for subsequent global feature extraction. Secondly, a novel spatial-channel fusion transformer was designed to capture richer features from the spatial and channel dimensions and to eliminate the semantic gap between features from different dimensions through a cross-attention fusion block. Additionally, to better restore spatial information, deformable convolutions were introduced in the decoder.</p>
</sec>
<sec>
<title>Results</title>
<p>Experiments on two MHSI datasets demonstrate that MT-SCnet outperforms the comparison methods.</p>
</sec>
<sec>
<title>Discussion</title>
<p>This advance has significant implications for the field of MHSI segmentation. Our code is freely available at <uri xlink:href="https://github.com/sharycao/MT-SCnet">https://github.com/sharycao/MT-SCnet</uri>.</p>
</sec>
</abstract>
<kwd-group>
<kwd>microscopic hyperspectral image</kwd>
<kwd>feature fusion</kwd>
<kwd>multi-scale</kwd>
<kwd>transformer</kwd>
<kwd>deformable convolution</kwd>
</kwd-group>
<counts>
<fig-count count="8"/>
<table-count count="7"/>
<equation-count count="16"/>
<ref-count count="44"/>
<page-count count="13"/>
<word-count count="6297"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Cancer Imaging and Image-directed Interventions</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>The outstanding performance of hyperspectral imaging technology in remote sensing has attracted attention from various domains (<xref ref-type="bibr" rid="B1">1</xref>&#x2013;<xref ref-type="bibr" rid="B3">3</xref>). Some researchers have applied hyperspectral imaging to the medical field, obtaining microscopic hyperspectral images (MHSIs) (<xref ref-type="bibr" rid="B4">4</xref>, <xref ref-type="bibr" rid="B5">5</xref>). Compared to other images, MHSIs offer not only high spatial resolution but also high spectral resolution. These abundant spectral features can reflect the biochemical status of biological tissue cells, providing multi-dimensional information for tissue analysis and diagnosis (<xref ref-type="bibr" rid="B6">6</xref>) and strong support for the early diagnosis and treatment of diseases. The MHSI segmentation task is the initial step in utilizing MHSIs to assist pathologists in diagnosis. Fully and effectively employing spectral-spatial information to segment tissues not only has significant research value but also critical clinical importance.</p>
<p>In early segmentation tasks, researchers focused on extracting spectral features (<xref ref-type="bibr" rid="B7">7</xref>&#x2013;<xref ref-type="bibr" rid="B9">9</xref>) to segment the tissue. To further enhance performance, some researchers employed methods such as Otsu (<xref ref-type="bibr" rid="B10">10</xref>), object-based multiscale analysis (<xref ref-type="bibr" rid="B11">11</xref>), and spatial-spectral density analysis (<xref ref-type="bibr" rid="B12">12</xref>) to extract both spatial and spectral features from MHSIs. With the development of deep learning, encoder-decoder architectures based on convolutional neural networks (CNNs) (<xref ref-type="bibr" rid="B13">13</xref>) have been widely applied to MHSI segmentation tasks. These architectures can automatically extract spectral-spatial information from MHSIs, avoiding the need for complex manually designed features. For example, Sun et&#xa0;al. (<xref ref-type="bibr" rid="B14">14</xref>) designed a cholangiocarcinoma analysis and diagnosis method based on CNNs and proposed a spectral interval convolution and normalization scheme to learn richer spatial-spectral information. Wang et&#xa0;al. (<xref ref-type="bibr" rid="B15">15</xref>) designed a 3D fully convolutional network to extract spatial-spectral features from MHSIs to segment melanoma. Gao et&#xa0;al. (<xref ref-type="bibr" rid="B16">16</xref>) designed a high-level feature channel attention U-Net. Given the exceptional representation learning ability of CNNs, these methods have produced remarkable results. However, limited by the inherent locality of CNNs, they cannot extract long-range context and global semantic features (<xref ref-type="bibr" rid="B17">17</xref>).</p>
<p>Vision Transformer (ViT) (<xref ref-type="bibr" rid="B18">18</xref>) is a structure based on self-attention that possesses powerful capabilities for global context modeling and has achieved excellent performance in various tasks. Naturally, incorporating it into MHSI segmentation has become a key focus of current research. Dai et&#xa0;al. (<xref ref-type="bibr" rid="B19">19</xref>) proposed a segmentation network based on a swin-spec transformer to extract features from both the spatial and spectral dimensions of cholangiocarcinoma hyperspectral images. Wang et&#xa0;al. (<xref ref-type="bibr" rid="B20">20</xref>) designed a dual-modal pathological image cross-attention U-Net, which uses two cascaded multi-head self-attention modules to extract and exchange information between HSI and RGB. The incorporation of global context has contributed to the outstanding performance of these methods. However, these methods typically tokenize based on specific kernel scales, resulting in fixed-size area information within the tokens. This limitation restricts the efficient extraction of subsequent global features. Additionally, most existing methods extract features only from the spatial dimension, which results in insufficient feature extraction. Although some researchers extract features from both the spectral and spatial dimensions, they often overlook the potential semantic gap between features from different dimensions. This may introduce new interference, thereby adversely affecting the model&#x2019;s performance.</p>    <p>To address the aforementioned issues, this paper proposes a novel network called the multi-scale token division and spatial-channel fusion transformer (MT-SCnet) for MHSI segmentation. MT-SCnet aims to extract the spatial-spectral information in MHSIs more efficiently, mainly featuring the Multi-scale Token Division (MSTD) module and the Spatial-Channel Fusion Transformer (SCFormer) block. Specifically, MSTD is designed to exploit the advantages of multi-scale tokens to enrich global dependencies. It utilizes mirror flipping padding to generate feature maps of different spatial sizes and divides tokens at different scales on them. Meanwhile, it promotes information interaction and fusion between tokens for richer and more robust feature information. SCFormer is proposed to exploit global spectral-spatial information more comprehensively. It extracts spectral-spatial information from both the spatial and channel dimensions to obtain more enriched feature representations, and suppresses the semantic gap between features from different dimensions through a cross-attention fusion module (CAF). Furthermore, dense connections are introduced between channel dimensions to facilitate the transfer and interaction of global features across different levels. In addition, to capture local spectral-spatial information, MT-SCnet employs a CNN as the shallow encoder, and employs deformable convolutions (<xref ref-type="bibr" rid="B21">21</xref>) to better restore the spatial dimensions of the feature maps in the decoder. The main contributions are as follows:</p>
<list list-type="order">
<list-item>
<p>We propose MT-SCnet for MHSI segmentation, which more effectively and efficiently captures the spectral-spatial information in MHSIs through multi-scale token division and multi-dimensional feature extraction. The proposed MSTD, SCFormer, and deformable convolutions all play crucial roles in the network, enhancing its overall segmentation performance.</p>
</list-item>
<list-item>
<p>We propose a multi-scale token division module, which enriches global dependencies by capturing multi-scale tokens and promoting fusion between different tokens.</p>
</list-item>
<list-item>
<p>We design a novel spatial-channel fusion transformer block. It conducts a more comprehensive extraction of global features and reduces the semantic gap between features from different dimensions by emphasizing the commonality among them.</p>
</list-item>
<list-item>
<p>The experimental results on Gastric Intraepithelial Neoplasia (GIN) and intestinal metaplasia (IM) MHSI datasets demonstrate that the proposed method achieves competitive results.</p>
</list-item>
</list>
</sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<sec id="s2_1">
<label>2.1</label>
<title>CNN for medical segmentation</title>
<p>The encoder-decoder architectures based on CNNs, such as U-Net (<xref ref-type="bibr" rid="B22">22</xref>) and U-Net++ (<xref ref-type="bibr" rid="B23">23</xref>), have exhibited remarkable performance in medical image segmentation tasks. Researchers have conducted thorough investigations of encoder-decoder architectures and proposed many methods to improve segmentation performance. For instance, some studies introduce residual structures into U-Net to address the issue of network degradation (<xref ref-type="bibr" rid="B24">24</xref>, <xref ref-type="bibr" rid="B25">25</xref>). To obtain a better receptive field and capture more contextual information, researchers have introduced dilated convolutions (<xref ref-type="bibr" rid="B26">26</xref>) and deformable convolutions (<xref ref-type="bibr" rid="B27">27</xref>) into U-Net. Additionally, many studies have utilized attention mechanisms to help models focus on crucial feature information, which can further improve segmentation accuracy (<xref ref-type="bibr" rid="B28">28</xref>). For example, Gao et&#xa0;al. (<xref ref-type="bibr" rid="B29">29</xref>) designed an attention network for the segmentation of cholangiocarcinoma MHSIs. Liu et&#xa0;al. (<xref ref-type="bibr" rid="B30">30</xref>) designed a global context and hybrid attention network for lung segmentation.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Vision Transformer for medical segmentation</title>
<p>Since the outstanding achievements of ViT in 2020, numerous transformer-based methods have emerged in computer vision tasks. These methods divide the input image into patches and treat each patch as a token, which is processed through transformer layers that include self-attention mechanisms and feed-forward networks. Considering the high computational cost of directly processing the entire feature map, Swin-Transformer (<xref ref-type="bibr" rid="B31">31</xref>) divides the input image into non-overlapping windows and applies attention mechanisms independently within each window to reduce computational costs. Based on this, SwinUnet (<xref ref-type="bibr" rid="B32">32</xref>) was proposed and has demonstrated outstanding results in medical image segmentation.</p>
<p>Compared with using a pure Transformer, some researchers have designed hybrid networks that concurrently utilize CNNs and Transformers to achieve higher segmentation performance. TransUnet (<xref ref-type="bibr" rid="B17">17</xref>) was the first to adopt a hybrid architecture based on CNN and Transformer, and achieved excellent segmentation results. Huang et&#xa0;al. (<xref ref-type="bibr" rid="B33">33</xref>) introduced MISSFormer, a network designed to capture more discriminative dependencies and context, with a better ability to integrate global information and local context. To further minimize feature loss during downsampling and enhance the restoration of spatial information during upsampling, Zhang et&#xa0;al. (<xref ref-type="bibr" rid="B34">34</xref>) proposed FDR-TransUNet based on TransUnet. This model introduces an amalgamation of concepts from DenseNet and ResNet in the encoder and upsamples through two independent expanding paths. Zhu et&#xa0;al. (<xref ref-type="bibr" rid="B35">35</xref>) proposed a parallel hybrid architecture that feeds input images concurrently into both CNN and transformer branches, thereby effectively merging spatial detail features with global contextual information.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Multi-scale information extraction for medical segmentation</title>
<p>To address the complex structural variations in biological tissues, multi-scale features are commonly utilized in medical image segmentation tasks. To enhance the quality of feature learning, Lin et&#xa0;al. (<xref ref-type="bibr" rid="B36">36</xref>) proposed a dual-branch Swin-transformer in the encoder to extract multi-scale feature representations. He et&#xa0;al. (<xref ref-type="bibr" rid="B37">37</xref>) developed a block within the encoding pathway that integrates multi-scale information, global features from transformers, and local details from CNNs, thereby enhancing the model&#x2019;s capability for feature representation. Ao et&#xa0;al. (<xref ref-type="bibr" rid="B38">38</xref>) designed a shunt transformer to capture multi-scale features and utilized a pyramid decoder for decoding, effectively harnessing the fine features. In addition, some researchers employ multi-scale feature extraction strategies to bridge the gap between the features in the encoder and decoder, thereby enhancing segmentation accuracy. For example, Fang et&#xa0;al. (<xref ref-type="bibr" rid="B39">39</xref>) designed a pyramid input-output network to compress multi-scale features, reducing the semantic gap between multi-scale features. Sun et&#xa0;al. (<xref ref-type="bibr" rid="B40">40</xref>) developed a multi-scale bridging module between the encoder and decoder to effectively exchange multi-scale context information. Liu et&#xa0;al. (<xref ref-type="bibr" rid="B41">41</xref>) proposed a multi-scale embedding spatial transformer, which effectively captures the global context of images by modeling the spatial relationships between multi-scale and multi-level image patches. To further achieve a refined fusion of global and local features, Heidari et&#xa0;al. (<xref ref-type="bibr" rid="B42">42</xref>) designed multiple multi-scale representations based on Swin Transformer and CNN-based encoders. Furthermore, some researchers have designed feature fusion structures within the decoder to fully decode features at various scales. For example, Yang et&#xa0;al. (<xref ref-type="bibr" rid="B43">43</xref>) proposed a multi-query attention module to fuse the multi-scale features from different levels of the decoder sub-network.</p>
<p>Although previous works have utilized CNNs and Transformers to extract global and local spectral-spatial information and have further enhanced model representation through multi-scale feature extraction, their single tokenization schemes and simplistic fusion of features from different dimensions have limited the models&#x2019; performance potential.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Methods</title>
<p>As shown in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>, MT-SCnet is an encoder-decoder network. The encoder consists of a CNN, MSTD, and three SCFormers to extract local and global spectral-spatial information from MHSIs. In the CNN, we adopted the design of TransUnet (<xref ref-type="bibr" rid="B17">17</xref>) to learn the local context in MHSIs. Next, MSTD is used to learn multi-scale tokens to acquire richer and better feature representations for subsequent global feature extraction. Additionally, three SCFormer layers are employed to learn global spectral-spatial information, and dense connections are used to enhance the utilization of global features. In the decoder, deformable convolutions are introduced to obtain more representative features, thereby better restoring the spatial size. We also employed Principal Component Analysis (PCA) to reduce the dimensionality of the MHSIs, thus mitigating computational costs.</p>
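<p>As a minimal sketch of this preprocessing step (assuming an MHSI cube stored as an H&#xd7;W&#xd7;B array; the number of retained components is illustrative, since it is not specified here), the PCA reduction can be written as follows:</p>
<preformat>
# Minimal sketch of PCA-based spectral reduction for an MHSI cube.
# Assumption: hsi is a NumPy array of shape (H, W, B); n_components=3
# is illustrative only -- the retained dimension is not specified here.
import numpy as np
from sklearn.decomposition import PCA

def reduce_spectral_dim(hsi: np.ndarray, n_components: int = 3) -> np.ndarray:
    h, w, b = hsi.shape
    pixels = hsi.reshape(-1, b)          # flatten to (H*W, B) pixel spectra
    pca = PCA(n_components=n_components)
    reduced = pca.fit_transform(pixels)  # project onto principal components
    return reduced.reshape(h, w, n_components)
</preformat>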
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Architecture of the MT-SCnet. In <bold>(A)</bold>, we show the overall architecture of MT-SCnet. In <bold>(B)</bold>, we present details of the SCFormer.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-14-1469293-g001.tif"/>
</fig>
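<p>For the decoder, a hedged sketch of how a deformable convolution can be applied after upsampling is given below; it uses torchvision.ops.DeformConv2d, and the offset-prediction layer and channel sizes are illustrative assumptions rather than the released implementation:</p>
<preformat>
# Illustrative decoder step with deformable convolution (torchvision.ops).
# The offset-prediction conv and channel sizes are assumptions for this sketch.
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d

class DeformDecoderBlock(nn.Module):
    def __init__(self, in_ch: int, out_ch: int, k: int = 3):
        super().__init__()
        self.up = nn.Upsample(scale_factor=2, mode="bilinear",
                              align_corners=False)
        # predicts a 2D offset for each of the k*k sampling locations
        self.offset = nn.Conv2d(in_ch, 2 * k * k, kernel_size=3, padding=1)
        self.deform = DeformConv2d(in_ch, out_ch, kernel_size=k, padding=k // 2)

    def forward(self, x):
        x = self.up(x)                          # restore spatial size
        return self.deform(x, self.offset(x))   # sample at learned offsets

y = DeformDecoderBlock(64, 32)(torch.randn(1, 64, 28, 28))
assert y.shape == (1, 32, 56, 56)
</preformat>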
<sec id="s3_1">
<label>3.1</label>
<title>MSTD</title>
<p>Existing methods typically use a single-scale tokenization scheme, which limits the efficiency of subsequent global information extraction. Multi-scale information provides an effective way to address significant morphological differences between tissues and to enhance the model&#x2019;s ability to represent details. Therefore, dividing tokens in a more flexible manner to capture multi-scale information, thereby providing richer and more robust features for subsequent global feature extraction, is key to improving segmentation performance. Based on this, MSTD is proposed to perform multi-scale token division and promote information interaction and fusion between tokens.</p>
<p>The specific steps of MSTD are shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>. Assuming <inline-formula>
<mml:math display="inline" id="im1">
<mml:mi>H</mml:mi>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im2">
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> represent the height and width of the feature map <inline-formula>
<mml:math display="inline" id="im3">
<mml:mrow>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> learned through the CNN. Firstly, MSTD performs multi-scale token division based on convolution operations and mirror flipping. It uses a convolution with 2&#xd7;2 kernels and a stride of 2 to obtain tokens <inline-formula>
<mml:math display="inline" id="im4">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Then, mirror flipping padding is employed to expand <inline-formula>
<mml:math display="inline" id="im5">
<mml:mi mathvariant="bold-italic">Z</mml:mi>
</mml:math>
</inline-formula> to the size of 2 <inline-formula>
<mml:math display="inline" id="im6">
<mml:mi>H</mml:mi>
</mml:math>
</inline-formula> &#xd7;2 <inline-formula>
<mml:math display="inline" id="im7">
<mml:mi>W</mml:mi>
</mml:math>
</inline-formula>, and a convolution with 4&#xd7;4 kernels and a stride of 4 is used to obtain tokens <inline-formula>
<mml:math display="inline" id="im8">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Although <inline-formula>
<mml:math display="inline" id="im9">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im10">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> have the same size and quantity, they contain different detailed information due to the different scales used for their division. Therefore, this step enables the model to obtain richer information. Secondly, MSTD promotes information interaction and fusion between tokens to generate more discriminative features. It evenly divides the tokens of each scale into two parts along the channel dimension. This process is demonstrated in <xref ref-type="disp-formula" rid="eq1">Equations 1</xref>, <xref ref-type="disp-formula" rid="eq2">2</xref>:</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Structure of MSTD.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-14-1469293-g002.tif"/>
</fig>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>s</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>s</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im11">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>s</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represents the channel splitting operation. Then, <inline-formula>
<mml:math display="inline" id="im12">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im13">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> are added together to obtain the fused feature <inline-formula>
<mml:math display="inline" id="im14">
<mml:mrow>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. <inline-formula>
<mml:math display="inline" id="im15">
<mml:mrow>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is element-wise multiplied with <inline-formula>
<mml:math display="inline" id="im16">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im17">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, respectively, to learn the weight matrix. Tanh+1 is used as the activation function to map the aforementioned weight matrix to the range [0,2]. In addition, residual connections are introduced to preserve the original information and accelerate convergence. The above process can be represented by <xref ref-type="disp-formula" rid="eq3">Equations 3</xref>&#x2013;<xref ref-type="disp-formula" rid="eq5">5</xref>:</p>
<disp-formula id="eq3">
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq4">
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="bold-italic">A</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq5">
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="bold-italic">A</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im18">
<mml:mrow>
<mml:mi mathvariant="bold-italic">A</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represents the Tanh+1 activation function, and <inline-formula>
<mml:math display="inline" id="im19">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im20">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represent the feature maps after residual connections. Then, <inline-formula>
<mml:math display="inline" id="im21">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im22">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im23">
<mml:mrow>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> are calculated through matrix element-wise multiplication to obtain the pixel-level relationship map. Tanh+1 and residual connections are also employed to enhance feature representation. The above operation enables <inline-formula>
<mml:math display="inline" id="im24">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im25">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im26">
<mml:mrow>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> incorporate feature information from different scales, thereby providing more abundant and robust feature information. Finally, the <inline-formula>
<mml:math display="inline" id="im27">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im28">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> are added together and concatenated with <inline-formula>
<mml:math display="inline" id="im29">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> along the channel dimension. Overall, dividing tokens at different scales and promoting information interaction and fusion between tokens of the same scale and different scales can provide richer and more comprehensive feature representations for subsequent global feature extraction. Assuming <inline-formula>
<mml:math display="inline" id="im30">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes the concatenate operation and <inline-formula>
<mml:math display="inline" id="im31">
<mml:mi mathvariant="bold-italic">F</mml:mi>
</mml:math>
</inline-formula> is the output of MSTD, the above process can be expressed as <xref ref-type="disp-formula" rid="eq6">Equations 6</xref>, <xref ref-type="disp-formula" rid="eq7">7</xref>:</p>
<disp-formula id="eq6">
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="bold-italic">A</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq7">
<label>(7)</label>
<mml:math display="block" id="M7">
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mo>=</mml:mo>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
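<p>A minimal PyTorch sketch of MSTD following <xref ref-type="disp-formula" rid="eq1">Equations 1</xref>&#x2013;<xref ref-type="disp-formula" rid="eq7">7</xref> is given below; reading the mirror flipping padding as reflect padding and the embedding size are our assumptions, so this is illustrative rather than the released implementation:</p>
<preformat>
# Minimal MSTD sketch (Equations 1-7), assuming a PyTorch implementation.
# Reflect padding stands in for "mirror flipping padding"; embed_dim is an
# illustrative, even-valued choice and may differ from the released code.
import torch
import torch.nn as nn
import torch.nn.functional as F

class MSTD(nn.Module):
    def __init__(self, in_ch: int, embed_dim: int):
        super().__init__()
        # two tokenization branches with different kernel scales
        self.tok1 = nn.Conv2d(in_ch, embed_dim, kernel_size=2, stride=2)
        self.tok2 = nn.Conv2d(in_ch, embed_dim, kernel_size=4, stride=4)

    @staticmethod
    def act(x):                  # "Tanh+1" maps weights into [0, 2]
        return torch.tanh(x) + 1.0

    def forward(self, z):
        _, _, h, w = z.shape
        z1 = self.tok1(z)                            # tokens from 2x2 division
        # mirror padding expands z to 2H x 2W before the 4x4 division
        z_pad = F.pad(z, (w // 2, w // 2, h // 2, h // 2), mode="reflect")
        z2 = self.tok2(z_pad)                        # tokens from 4x4 division
        z1_0, z1_1 = z1.chunk(2, dim=1)              # Eq. (1)
        z2_0, z2_1 = z2.chunk(2, dim=1)              # Eq. (2)
        f0 = z1_1 + z2_0                             # Eq. (3)
        f1_1 = self.act(f0 * z1_0) + z1_0            # Eq. (4)
        f2_1 = self.act(f0 * z2_1) + z2_1            # Eq. (5)
        f_0 = self.act(f0 * f1_1 * f2_1) + f0        # Eq. (6)
        return torch.cat([f_0, f1_1 + f2_1], dim=1)  # Eq. (7)
</preformat>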
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>SCFormer</title>
<p>Global features can better capture the contextual information of the overall tissue structure, which is crucial for efficiently utilizing spectral-spatial information to learn tissue characteristics. However, most previous methods rely on learning from a single dimension, leading to insufficient extraction of global spectral-spatial information. Additionally, while some methods learn spectral-spatial information from different dimensions, they simply fuse these features. Considering the semantic gap between features from different dimensions, simple feature fusion may introduce new interference, thereby affecting the final segmentation results. Based on this, SCFormer is proposed, which not only extracts spectral-spatial information from the spatial and channel dimensions through multi-head spatial attention (MSA) and multi-head channel attention (MCA), but also highlights common features between features from different dimensions through CAF to suppress the semantic gap. In addition, dense connections are incorporated between the MCA blocks to enhance feature reuse and improve the learning of features across different levels. As shown in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>, the proposed SCFormer module incorporates MSA, MCA, CAF, and a multilayer perceptron (MLP). MSA and MCA can be represented as <xref ref-type="disp-formula" rid="eq8">Equations 8</xref>&#x2013;<xref ref-type="disp-formula" rid="eq13">13</xref>:</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Structure of SCFormer.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-14-1469293-g003.tif"/>
</fig>
<disp-formula id="eq8">
<label>(8)</label>
<mml:math display="block" id="M8">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>M</mml:mi>
<mml:mi>C</mml:mi>
<mml:msup>
<mml:mi>A</mml:mi>
<mml:mi>j</mml:mi>
</mml:msup>
</mml:mstyle>
<mml:mo>=</mml:mo>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>s</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mi>Q</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mi>j</mml:mi>
</mml:msubsup>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
</mml:mstyle>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:msubsup>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq9">
<label>(9)</label>
<mml:math display="block" id="M9">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>M</mml:mi>
<mml:mi>S</mml:mi>
<mml:msup>
<mml:mi>A</mml:mi>
<mml:mi>j</mml:mi>
</mml:msup>
</mml:mstyle>
<mml:mo>=</mml:mo>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>s</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mi>Q</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mi>j</mml:mi>
</mml:msubsup>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
</mml:mstyle>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:msubsup>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq10">
<label>(10)</label>
<mml:math display="block" id="M10">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:msubsup>
<mml:mi>Q</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mstyle>
<mml:mo>=</mml:mo>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>L</mml:mi>
<mml:mi>N</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">z</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:mi mathvariant="bold-italic">d</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi mathvariant="bold-italic">z</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq11">
<label>(11)</label>
<mml:math display="block" id="M11">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mstyle>
<mml:mo>=</mml:mo>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>L</mml:mi>
<mml:mi>N</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">z</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:mi mathvariant="bold-italic">d</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi mathvariant="bold-italic">z</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq12">
<label>(12)</label>
<mml:math display="block" id="M12">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:msubsup>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mstyle>
<mml:mo>=</mml:mo>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>L</mml:mi>
<mml:mi>N</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">z</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq13">
<label>(13)</label>
<mml:math display="block" id="M13">
<mml:mrow>
<mml:mi mathvariant="bold-italic">d</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi mathvariant="bold-italic">z</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>L</mml:mi>
<mml:mi>N</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">z</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mi mathvariant="bold-italic">z</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im32">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>M</mml:mi>
<mml:mi>C</mml:mi>
<mml:msup>
<mml:mi>A</mml:mi>
<mml:mi>j</mml:mi>
</mml:msup>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im33">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>M</mml:mi>
<mml:mi>S</mml:mi>
<mml:msup>
<mml:mi>A</mml:mi>
<mml:mi>j</mml:mi>
</mml:msup>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula> represent the MCA and MSA in the <inline-formula>
<mml:math display="inline" id="im34">
<mml:mi mathvariant="bold-italic">j</mml:mi>
</mml:math>
</inline-formula>-th SCFormer, respectively. <inline-formula>
<mml:math display="inline" id="im35">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">z</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represents the output feature map of the <inline-formula>
<mml:math display="inline" id="im36">
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>-th SCFormer, <inline-formula>
<mml:math display="inline" id="im37">
<mml:mrow>
<mml:mi mathvariant="bold-italic">d</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represents the dense connection, and <inline-formula>
<mml:math display="inline" id="im38">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>L</mml:mi>
<mml:mi>N</mml:mi>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula> represents the fully connected layer.</p>
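<p>A minimal sketch contrasting the two attention directions is given below; it is single-head for brevity and omits the dense-connection term d(z) of <xref ref-type="disp-formula" rid="eq10">Equations 10</xref>&#x2013;<xref ref-type="disp-formula" rid="eq13">13</xref>, so it illustrates our reading of <xref ref-type="disp-formula" rid="eq8">Equations 8</xref>, <xref ref-type="disp-formula" rid="eq9">9</xref> rather than the released implementation:</p>
<preformat>
# MSA computes token-by-token affinities (an N x N attention map), while
# MCA transposes the tokens so attention runs channel-by-channel (C x C).
# Single-head sketch; the dense-connection term d(z) is omitted.
import torch

def attention(q, k, v, dim_k):
    scores = torch.softmax(q @ k.transpose(-2, -1) / dim_k ** 0.5, dim=-1)
    return scores @ v

tokens = torch.randn(1, 196, 64)        # (batch, N tokens, C channels)

# MSA: attend over tokens
spatial_out = attention(tokens, tokens, tokens, dim_k=64)

# MCA: transpose to (batch, C, N) and attend over channels
t = tokens.transpose(1, 2)
channel_out = attention(t, t, t, dim_k=196).transpose(1, 2)

assert spatial_out.shape == channel_out.shape == tokens.shape
</preformat>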
<p>For the CAF in the <inline-formula>
<mml:math display="inline" id="im39">
<mml:mi mathvariant="bold-italic">j</mml:mi>
</mml:math>
</inline-formula>-th SCFormer, we first perform element-wise multiplication between the <inline-formula>
<mml:math display="inline" id="im40">
<mml:mi mathvariant="bold-italic">j</mml:mi>
</mml:math>
</inline-formula>-th spatial feature and channel feature to highlight their commonalities, paying more attention to common important information while paying less attention to insignificant information. Subsequently, we perform the same operation between the output of the <inline-formula>
<mml:math display="inline" id="im41">
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>-th SCFormer with the <inline-formula>
<mml:math display="inline" id="im42">
<mml:mi mathvariant="bold-italic">j</mml:mi>
</mml:math>
</inline-formula>-th spatial feature and channel feature, respectively, to highlight the commonalities between different hierarchical features. Then, a point-wise summation and the Tanh+1 function are applied. Overall, emphasizing the commonalities between features from different dimensions not only highlights common important features but also reduces redundancy and interference between features, thereby suppressing the semantic gap. This generates a more efficiently fused feature map. Assuming <inline-formula>
<mml:math display="inline" id="im43">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula> as the output of <inline-formula>
<mml:math display="inline" id="im44">
<mml:mi mathvariant="bold-italic">j</mml:mi>
</mml:math>
</inline-formula>-th CAF, <inline-formula>
<mml:math display="inline" id="im45">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im46">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im47">
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> represent the output of <inline-formula>
<mml:math display="inline" id="im48">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>M</mml:mi>
<mml:mi>C</mml:mi>
<mml:msup>
<mml:mi>A</mml:mi>
<mml:mi>j</mml:mi>
</mml:msup>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im49">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>M</mml:mi>
<mml:mi>S</mml:mi>
<mml:msup>
<mml:mi>A</mml:mi>
<mml:mi>j</mml:mi>
</mml:msup>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im50">
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>-th SCFormer, respectively, the above process can be expressed as <xref ref-type="disp-formula" rid="eq14">Equation 14</xref>:</p>
<disp-formula id="eq14">
<label>(14)</label>
<mml:math display="block" id="M14">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mstyle>
<mml:mo>=</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mi mathvariant="bold-italic">A</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mstyle>
<mml:mo>&#xd7;</mml:mo>
<mml:mstyle mathvariant="bold-italic">
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mstyle>
<mml:mo>+</mml:mo>
<mml:mstyle mathvariant="bold-italic">
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mstyle>
<mml:mo>&#xd7;</mml:mo>
<mml:msubsup>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>+</mml:mo>
<mml:mstyle mathvariant="bold-italic">
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mstyle>
<mml:mo>&#xd7;</mml:mo>
<mml:msubsup>
<mml:mi mathvariant="bold-italic">F</mml:mi>
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mstyle>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
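<p>For illustration only, a minimal PyTorch sketch of the CAF fusion in Equation 14 is given below; the module name <monospace>CrossAttentionFusion</monospace> is hypothetical, and taking the activation <italic>A</italic> to be Tanh+1 follows the description above rather than any released code.</p>
<preformat>
import torch
import torch.nn as nn

class CrossAttentionFusion(nn.Module):
    """Hypothetical sketch of the CAF in Equation 14: element-wise
    products highlight commonalities between the channel feature F1,
    the spatial feature F2, and the previous SCFormer output F_prev;
    the activation A is taken to be Tanh + 1."""
    def forward(self, f1, f2, f_prev):
        fused = f1 * f2 + f1 * f_prev + f2 * f_prev
        return torch.tanh(fused) + 1.0

# usage: the three feature maps share one shape, e.g. (batch, tokens, dim)
caf = CrossAttentionFusion()
out = caf(torch.randn(2, 196, 64), torch.randn(2, 196, 64),
          torch.randn(2, 196, 64))
</preformat>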
<p>The MLP consists of two fully connected layers, a GELU activation function, and two dropout layers. Specific parameters follow those of TransUnet (<xref ref-type="bibr" rid="B17">17</xref>).</p>
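<p>As a concrete reading of this description, a minimal sketch follows; the hidden-dimension expansion factor of 4 is an assumption borrowed from common transformer practice, not a value stated here.</p>
<preformat>
import torch.nn as nn

class Mlp(nn.Module):
    """Two fully connected layers with a GELU activation and two
    dropouts, mirroring the MLP described above."""
    def __init__(self, dim, hidden_ratio=4, drop=0.1):
        super().__init__()
        self.fc1 = nn.Linear(dim, dim * hidden_ratio)  # expand
        self.act = nn.GELU()
        self.drop1 = nn.Dropout(drop)
        self.fc2 = nn.Linear(dim * hidden_ratio, dim)  # project back
        self.drop2 = nn.Dropout(drop)

    def forward(self, x):
        return self.drop2(self.fc2(self.drop1(self.act(self.fc1(x)))))
</preformat>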
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Decoder</title>
<p>In the decoder, this paper introduces deformable convolution to adaptively adjust the shape of the receptive field. By flexibly choosing sampling locations to handle various deformations and scale variations, it better restores and refines the details in the feature maps. Compared to using multi-scale features or adding convolutional layers to enhance the decoder&#x2019;s representation capability, deformable convolution improves feature representation by introducing only a small number of offset parameters. This approach enhances segmentation performance while avoiding a significant increase in computational cost. Deformable convolution can be represented as <xref ref-type="disp-formula" rid="eq15">Equation 15</xref>:</p>
<disp-formula id="eq15">
<label>(15)</label>
<mml:math display="block" id="M15">
<mml:mrow>
<mml:mi mathvariant="bold-italic">y</mml:mi>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">k</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mi mathvariant="bold-italic">K</mml:mi>
</mml:msubsup>
<mml:mstyle mathvariant="bold-italic">
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mstyle>
<mml:mo>&#xd7;</mml:mo>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">p</mml:mi>
<mml:mo>+</mml:mo>
<mml:mstyle mathvariant="bold-italic">
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mstyle>
<mml:mo>+</mml:mo>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>&#x394;</mml:mi>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mstyle>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>&#x394;</mml:mi>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im51">
<mml:mi mathvariant="bold-italic">K</mml:mi>
</mml:math>
</inline-formula> denotes the number of sampling points of the convolution kernel. <inline-formula>
<mml:math display="inline" id="im52">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im53">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>&#x394;</mml:mi>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula> represent the preset offset and the learnable offset, respectively. <inline-formula>
<mml:math display="inline" id="im54">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im55">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:mi>&#x394;</mml:mi>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula> correspond to the weight and the modulation scalar of the k-th sampling point, respectively.</p>
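<p>A minimal sketch of a modulated deformable convolution block in the sense of Equation 15 is shown below, built on <monospace>torchvision.ops.DeformConv2d</monospace>; predicting the offsets and modulation scalars with a plain 3&#xd7;3 convolution is a standard design choice assumed here, not a detail taken from the paper.</p>
<preformat>
import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d

class DeformBlock(nn.Module):
    """Deformable k x k convolution with learned offsets (delta p_k)
    and modulation scalars (delta m_k), cf. Equation 15."""
    def __init__(self, in_ch, out_ch, k=3):
        super().__init__()
        # 2*k*k offset channels (x and y per sampling point) + k*k masks
        self.offset_mask = nn.Conv2d(in_ch, 3 * k * k,
                                     kernel_size=k, padding=k // 2)
        self.dconv = DeformConv2d(in_ch, out_ch,
                                  kernel_size=k, padding=k // 2)

    def forward(self, x):
        om = self.offset_mask(x)
        n = om.shape[1] // 3                   # = k*k sampling points
        offset = om[:, : 2 * n]                # learnable offsets
        mask = torch.sigmoid(om[:, 2 * n :])   # modulation in (0, 1)
        return self.dconv(x, offset, mask)

y = DeformBlock(64, 64)(torch.randn(1, 64, 32, 32))
</preformat>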
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Loss function</title>
<p>In the proposed model, the total training loss can be expressed as <xref ref-type="disp-formula" rid="eq16">Equation 16</xref>:</p>
<disp-formula id="eq16">
<label>(16)</label>
<mml:math display="block" id="M16">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mstyle>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="bold">0.7</mml:mn>
<mml:mstyle mathvariant="bold-italic">
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mstyle>
<mml:mo>+</mml:mo>
<mml:mn mathvariant="bold">0.3</mml:mn>
<mml:mfrac>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mi mathvariant="bold-italic">C</mml:mi>
</mml:mfrac>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">c</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mi mathvariant="bold-italic">C</mml:mi>
</mml:msubsup>
<mml:mstyle mathvariant="bold-italic">
<mml:msubsup>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>C</mml:mi>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mi>c</mml:mi>
</mml:msubsup>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im56">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula> represents the Cross-Entropy Loss (CE), <inline-formula>
<mml:math display="inline" id="im57">
<mml:mi mathvariant="bold-italic">C</mml:mi>
</mml:math>
</inline-formula> denotes the number of classes, and <inline-formula>
<mml:math display="inline" id="im58">
<mml:mrow>
<mml:mstyle mathvariant="bold-italic">
<mml:msubsup>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>C</mml:mi>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mi>c</mml:mi>
</mml:msubsup>
</mml:mstyle>
</mml:mrow>
</mml:math>
</inline-formula> represents the Dice Loss (DCE) for class <inline-formula>
<mml:math display="inline" id="im59">
<mml:mi mathvariant="bold-italic">c</mml:mi>
</mml:math>
</inline-formula>. CE measures the alignment between the predicted probabilities and the true labels, making it highly effective for optimizing pixel-level segmentation tasks. However, CE does not account for the spatial relationships between pixels and is sensitive to class imbalance issues. In contrast, DCE measures the overlap between the predicted and true regions, capturing the spatial relationships between pixels and highlighting target regions, thereby addressing the potential shortcomings of CE.</p>
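<p>A minimal sketch of the combined loss in Equation 16 follows; the soft Dice formulation with a smoothing term is a common choice assumed here rather than a detail given in the paper.</p>
<preformat>
import torch
import torch.nn.functional as F

def seg_loss(logits, target, num_classes, smooth=1e-5):
    """L_seg = 0.7 * CE + 0.3 * mean per-class Dice loss (Equation 16).
    logits: (B, C, H, W); target: (B, H, W) integer labels."""
    ce = F.cross_entropy(logits, target)
    probs = torch.softmax(logits, dim=1)
    onehot = F.one_hot(target, num_classes).permute(0, 3, 1, 2).float()
    inter = (probs * onehot).sum(dim=(0, 2, 3))
    union = probs.sum(dim=(0, 2, 3)) + onehot.sum(dim=(0, 2, 3))
    dice = 1.0 - (2.0 * inter + smooth) / (union + smooth)  # per class
    return 0.7 * ce + 0.3 * dice.mean()
</preformat>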
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments</title>
<sec id="s4_1">
<label>4.1</label>
<title>Datasets</title>
<p>To validate the effectiveness of the proposed MT-SCnet, we conducted experiments on two MHSI datasets (<xref ref-type="bibr" rid="B44">44</xref>): the gastric mucosa intestinal metaplasia MHSI dataset (IM) and the gastric intraepithelial neoplasia MHSI dataset (GIN). The IM dataset consists of 412 MHSIs and the GIN dataset consists of 282 MHSIs. Each hyperspectral data cube was acquired with a 10&#xd7; objective lens and contains 40 bands covering the spectral range of 450 to 700 nm with a 6.25 nm spectral resolution. Under the guidance of pathologists, we selected and cropped the original MHSIs to a 512&#xd7;512 spatial resolution. All MHSIs were annotated by pathologists with precancerous regions. In this paper, five-fold cross-validation is used.</p>
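<p>For reference, the five-fold split can be sketched with scikit-learn as below; the fixed random seed and shuffling are illustrative assumptions.</p>
<preformat>
import numpy as np
from sklearn.model_selection import KFold

samples = np.arange(412)  # e.g. indices of the 412 IM MHSIs
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
for fold, (train_idx, test_idx) in enumerate(kfold.split(samples)):
    print(f"fold {fold}: {len(train_idx)} train / {len(test_idx)} test")
</preformat>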
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Implementation and evaluation</title>
<p>MT-SCnet is implemented in PyTorch, and all experiments were conducted on a computer with 32 GB of memory and an Nvidia GeForce RTX 4090. Stochastic gradient descent (SGD) is used for optimization, with the learning rate set to 0.01, momentum to 0.9, weight decay to 1e-6, and the batch size to 4. We used the Dice Loss and Cross-Entropy functions simultaneously as loss functions, with weights of 0.3 and 0.7, respectively. The number of epochs is set to 45 and 60 for the IM and GIN datasets, respectively. The weights with the lowest loss were chosen as the optimal weights for testing. To better assess the performance of the proposed model, four common evaluation metrics are used: Overall Accuracy (OA), Sensitivity, Intersection over Union (IoU), and Dice Similarity Coefficient (DSC).</p>
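<p>Under these settings, the optimizer configuration can be reproduced as sketched below; the 1&#xd7;1 convolution stands in for the full MT-SCnet model, which is not shown here.</p>
<preformat>
import torch
import torch.nn as nn

model = nn.Conv2d(5, 2, kernel_size=1)  # stand-in for MT-SCnet
optimizer = torch.optim.SGD(model.parameters(), lr=0.01,
                            momentum=0.9, weight_decay=1e-6)
# batch size 4; 45 epochs for IM and 60 for GIN
</preformat>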
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Comparison with mainstream methods</title>
<p>To evaluate the segmentation performance of the proposed MT-SCnet, we compared it with six mainstream methods on the IM and GIN datasets: the CNN-based U-Net, Att-Unet, and HLCA-Unet, the transformer-based MISSFormer, and the hybrid CNN-Transformer TransUnet and HiFormer-b.</p>
<sec id="s4_3_1">
<label>4.3.1</label>
<title>IM dataset</title>
<p>As shown in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>, our model achieved the best results, with 94.45% OA, 92.06% Sensitivity, 86.63% IoU, and 92.82% DSC. It outperformed the purely CNN-based U-Net by 1.26% OA, 1.46% Sensitivity, 2.75% IoU, and 1.59% DSC. Furthermore, compared to HLCA-Unet, which is also designed for MHSI segmentation, the proposed method achieves significant improvements across all metrics. In summary, MT-SCnet outperforms the purely CNN-based comparative models. Compared to HiFormer-b, which combines CNN and Transformer architectures and ranks second in segmentation accuracy, MT-SCnet showed improvements of 0.94% OA, 0.29% Sensitivity, 1.94% IoU, and 1.11% DSC. This benefit stems from the more efficient learning of spectral-spatial information from MHSIs through MSTD, SCFormer, and deformable convolutions. Additionally, MT-SCnet has 34.59M parameters and 120.91G FLOPs; compared to HiFormer-b, with 31.69M parameters and 355.77G FLOPs, the proposed method achieves a better balance of accuracy, memory usage, and computational cost. The K, M, G, and T in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref> represent Kilo, Mega, Giga, and Tera, respectively.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Comparison with other methods on IM dataset (%).</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Architecture</th>
<th valign="middle" align="center">OA</th>
<th valign="middle" align="center">Sensitivity</th>
<th valign="middle" align="center">IoU</th>
<th valign="middle" align="center">DSC</th>
<th valign="middle" align="center">Params</th>
<th valign="middle" align="center">FLOPs</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">U-Net</td>
<td valign="middle" align="center">93.19</td>
<td valign="middle" align="center">90.60</td>
<td valign="middle" align="center">83.88</td>
<td valign="middle" align="center">91.23</td>
<td valign="middle" align="center">34.53 M</td>
<td valign="middle" align="center">1.05T</td>
</tr>
<tr>
<td valign="middle" align="center">Att-Unet</td>
<td valign="middle" align="center">93.01</td>
<td valign="middle" align="center">90.46</td>
<td valign="middle" align="center">85.19</td>
<td valign="middle" align="center">91.00</td>
<td valign="middle" align="center">34.88M</td>
<td valign="middle" align="center">1.07T</td>
</tr>
<tr>
<td valign="middle" align="center">HLCA-Unet</td>
<td valign="middle" align="center">89.64</td>
<td valign="middle" align="center">86.05</td>
<td valign="middle" align="center">76.45</td>
<td valign="middle" align="center">86.63</td>
<td valign="middle" align="center">588.84K</td>
<td valign="middle" align="center">62.92G</td>
</tr>
<tr>
<td valign="middle" align="center">MISSFormer</td>
<td valign="middle" align="center">92.44</td>
<td valign="middle" align="center">90.37</td>
<td valign="middle" align="center">82.41</td>
<td valign="middle" align="center">90.33</td>
<td valign="middle" align="center">35.45 M</td>
<td valign="middle" align="center">36.96G</td>
</tr>
<tr>
<td valign="middle" align="center">TransUnet</td>
<td valign="middle" align="center">93.09</td>
<td valign="middle" align="center">90.12</td>
<td valign="middle" align="center">83.62</td>
<td valign="middle" align="center">91.06</td>
<td valign="middle" align="center">100.90 M</td>
<td valign="middle" align="center">201.03G</td>
</tr>
<tr>
<td valign="middle" align="center">Hiformer-b</td>
<td valign="middle" align="center">93.51</td>
<td valign="middle" align="center">91.77</td>
<td valign="middle" align="center">84.69</td>
<td valign="middle" align="center">91.71</td>
<td valign="middle" align="center">31.69M</td>
<td valign="middle" align="center">355.77G</td>
</tr>
<tr>
<td valign="middle" align="center">MT-SCnet</td>
<td valign="middle" align="center">94.45</td>
<td valign="middle" align="center">92.06</td>
<td valign="middle" align="center">86.63</td>
<td valign="middle" align="center">92.82</td>
<td valign="middle" align="center">34.59M</td>
<td valign="middle" align="center">120.91G</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The K, M, G, and T represent Kilo, Mega, Giga, and Tera, respectively.</p>
</table-wrap-foot>
</table-wrap>
<p>To further validate the segmentation performance of MT-SCnet on IM, the segmentation results of all models were visualized. As shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>, the first column displays the false-color images of MHSIs, the second column shows the true labels, and the subsequent columns present the prediction results of the U-Net, Att-Unet, HLCA-Unet, MISSFormer, TransUnet, HiFormer-b, and MT-SCnet networks. As shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>, the prediction maps of MT-SCNet exhibit smoother boundaries and provide a more accurate and comprehensive delineation of precancerous lesion areas. Compared to HiFormer-b, the proposed network more clearly delineates the boundaries of different tissues in densely distributed regions (as shown in the fourth row). In summary, MT-SCNet achieves superior recognition of contiguous regions, producing segmentation results that align more closely with the true labels compared to other models.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Visualization results of each model on MHSIs of IM: <bold>(A)</bold> false color image of hyperspectral images; <bold>(B)</bold> ground truth; <bold>(C)</bold> U-net; <bold>(D)</bold> Att-Unet; <bold>(E)</bold> HLCA-Unet; <bold>(F)</bold> MISSFormer; <bold>(G)</bold> TransUnet; <bold>(H)</bold> Hiformer-b; <bold>(I)</bold> MT-SCnet.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-14-1469293-g004.tif"/>
</fig>
</sec>
<sec id="s4_3_2">
<label>4.3.2</label>
<title>GIN dataset</title>
<p>As presented in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>, MT-SCnet achieves 88.93% OA, 88.58% Sensitivity, 79.75% IoU, and 88.71% DSC on the GIN dataset. Compared to the other methods, the proposed method achieves the highest accuracy in terms of OA, IoU, and DSC. However, its Sensitivity is slightly lower than that of U-Net and HiFormer-b. Given that the cancerous regions predicted by MT-SCnet overlap more closely with the actual regions and yield higher overall prediction accuracy, this suggests that although U-Net and HiFormer-b are more sensitive in detecting cancerous regions, their sensitivity comes at the cost of more false positives. Overall, the proposed method demonstrates superior performance when all aspects are considered comprehensively.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Comparison with other methods on GIN dataset (%).</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Architecture</th>
<th valign="middle" align="center">OA</th>
<th valign="middle" align="center">Sensitivity</th>
<th valign="middle" align="center">IoU</th>
<th valign="middle" align="center">DSC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">U-net</td>
<td valign="top" align="center">88.44</td>
<td valign="top" align="center">88.95</td>
<td valign="top" align="center">79.11</td>
<td valign="top" align="center">88.33</td>
</tr>
<tr>
<td valign="middle" align="center">Att-Unet</td>
<td valign="top" align="center">88.40</td>
<td valign="top" align="center">87.93</td>
<td valign="top" align="center">78.85</td>
<td valign="top" align="center">88.17</td>
</tr>
<tr>
<td valign="middle" align="center">HLCA-Unet</td>
<td valign="top" align="center">85.08</td>
<td valign="top" align="center">82.26</td>
<td valign="top" align="center">73.06</td>
<td valign="top" align="center">84.42</td>
</tr>
<tr>
<td valign="middle" align="center">MISSFormer</td>
<td valign="top" align="center">85.70</td>
<td valign="top" align="center">84.81</td>
<td valign="top" align="center">74.48</td>
<td valign="top" align="center">85.37</td>
</tr>
<tr>
<td valign="middle" align="center">TransUnet</td>
<td valign="top" align="center">86.20</td>
<td valign="top" align="center">87.17</td>
<td valign="top" align="center">75.66</td>
<td valign="top" align="center">86.14</td>
</tr>
<tr>
<td valign="middle" align="center">Hiformer-b</td>
<td valign="top" align="center">88.66</td>
<td valign="top" align="center">89.86</td>
<td valign="top" align="center">79.58</td>
<td valign="top" align="center">88.62</td>
</tr>
<tr>
<td valign="middle" align="center">MT-SCnet</td>
<td valign="top" align="center">88.93</td>
<td valign="top" align="center">88.58</td>
<td valign="top" align="center">79.75</td>
<td valign="top" align="center">88.71</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>We have also conducted visualization of the segmentation results on the GIN dataset. As shown in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>, the first column shows the MHSI false-color images, the second column represents the ground truth, and the subsequent columns sequentially display the results of U-Net, Att-Unet, HLCA-Unet, MISSFormer, TransUnet, HiFormer-b, and MT-SCnet. It is evident that, compared to the other methods, MT-SCnet exhibits fewer misclassifications and omissions, demonstrating more accurate segmentation with results closer to the ground truth. These results further confirm the stronger segmentation performance of MT-SCnet.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Visualization results of each model on MHSIs of GIN: <bold>(A)</bold> false color image of hyperspectral images; <bold>(B)</bold> ground truth; <bold>(C)</bold> U-net; <bold>(D)</bold> Att-Unet; <bold>(E)</bold> HLCA-Unet; <bold>(F)</bold> MISSFormer; <bold>(G)</bold> TransUnet; <bold>(H)</bold> Hiformer-b; <bold>(I)</bold> MT-SCnet.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-14-1469293-g005.tif"/>
</fig>
</sec>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Ablation experiments</title>
<p>We conducted a series of ablation experiments to validate the effectiveness of the different modules in segmentation. We first tested the effectiveness of the MSTD, SCFormer, and deformable convolution in MT-SCnet, and further conducted ablation experiments within SCFormer and MSTD to validate the rationality of each design. All experiments were conducted on the IM dataset. The results are shown in <xref ref-type="table" rid="T3">
<bold>Tables&#xa0;3</bold>
</xref>, <xref ref-type="table" rid="T4">
<bold>4</bold>
</xref>. In the tables, <inline-formula>
<mml:math display="inline" id="im60">
<mml:mo>&#xd7;</mml:mo>
</mml:math>
</inline-formula> indicates that the module was not used, while <inline-formula>
<mml:math display="inline" id="im61">
<mml:mo>&#x221a;</mml:mo>
</mml:math>
</inline-formula> indicates that it was employed.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Ablation study on the proposed components of the MT-SCnet with the IM dataset (%).</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">No.</th>
<th valign="middle" align="center">MSTD</th>
<th valign="middle" align="center">SCFormer</th>
<th valign="middle" align="center">Deformable Convolution</th>
<th valign="middle" align="center">OA</th>
<th valign="middle" align="center">Sensitivity</th>
<th valign="middle" align="center">IoU</th>
<th valign="middle" align="center">DSC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">1</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">92.41</td>
<td valign="top" align="center">90.07</td>
<td valign="top" align="center">82.26</td>
<td valign="top" align="center">90.26</td>
</tr>
<tr>
<td valign="middle" align="center">2</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">92.87</td>
<td valign="top" align="center">91.07</td>
<td valign="top" align="center">83.39</td>
<td valign="top" align="center">90.92</td>
</tr>
<tr>
<td valign="middle" align="center">3</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">93.31</td>
<td valign="top" align="center">91.73</td>
<td valign="top" align="center">84.26</td>
<td valign="top" align="center">91.45</td>
</tr>
<tr>
<td valign="middle" align="center">4</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">93.90</td>
<td valign="top" align="center">91.03</td>
<td valign="top" align="center">85.36</td>
<td valign="top" align="center">92.09</td>
</tr>
<tr>
<td valign="middle" align="center">5</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">93.70</td>
<td valign="top" align="center">93.17</td>
<td valign="top" align="center">85.25</td>
<td valign="top" align="center">92.03</td>
</tr>
<tr>
<td valign="middle" align="center">6</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="middle" align="center">94.45</td>
<td valign="middle" align="center">92.06</td>
<td valign="middle" align="center">86.63</td>
<td valign="middle" align="center">92.82</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>&#xd7; indicates that the module was not used, while &#x2713; indicates that it was employed.</p>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Ablation study on the SCFormer of the MT-SCnet with the IM dataset (%).</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">No.</th>
<th valign="top" align="center">MSA</th>
<th valign="top" align="center">MCA</th>
<th valign="top" align="center">Dense Connect</th>
<th valign="top" align="center">CAF</th>
<th valign="middle" align="center">OA</th>
<th valign="middle" align="center">Sensitivity</th>
<th valign="middle" align="center">IoU</th>
<th valign="middle" align="center">DSC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">1</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">92.41</td>
<td valign="top" align="center">90.07</td>
<td valign="top" align="center">82.26</td>
<td valign="top" align="center">90.26</td>
</tr>
<tr>
<td valign="middle" align="center">2</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">92.56</td>
<td valign="top" align="center">90.29</td>
<td valign="top" align="center">82.60</td>
<td valign="top" align="center">90.46</td>
</tr>
<tr>
<td valign="middle" align="center">3</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">92.71</td>
<td valign="top" align="center">90.18</td>
<td valign="top" align="center">82.86</td>
<td valign="top" align="center">90.62</td>
</tr>
<tr>
<td valign="middle" align="center">4</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">92.39</td>
<td valign="top" align="center">90.28</td>
<td valign="top" align="center">82.25</td>
<td valign="top" align="center">90.25</td>
</tr>
<tr>
<td valign="middle" align="center">5</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="middle" align="center">93.31</td>
<td valign="middle" align="center">91.73</td>
<td valign="middle" align="center">84.26</td>
<td valign="middle" align="center">91.45</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>&#xd7; indicates that the module was not used, while &#x2713; indicates that it was employed.</p>
</table-wrap-foot>
</table-wrap>
<sec id="s4_4_1">
<label>4.4.1</label>
<title>Ablation study on proposed components</title>
<p>
<xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref> shows the ablation results for the proposed components. Experiment 1 is the baseline, which only uses three transformer blocks with MSA. The comparisons between Experiment 1 and Experiment 2, as well as between Experiment 3 and Experiment 5, demonstrate that adding MSTD improves the OA, Sensitivity, IoU, and DSC metrics. This indicates that MSTD increases overall prediction accuracy and enhances the overlap and similarity of the segmented regions. Additionally, comparing Experiment 1 with Experiment 3, as well as Experiment 2 with Experiment 5, shows that integrating SCFormer yields a significant improvement in segmentation performance. Furthermore, the comparisons between Experiment 1 and Experiment 4, as well as between Experiment 5 and Experiment 6, show that deformable convolution further enhances the segmentation accuracy of the model. This is because deformable convolution enhances local detail capture and improves the fusion of deep and shallow features. In summary, compared to the baseline, the proposed model achieved improvements of 2.04% OA, 1.99% Sensitivity, 4.37% IoU, and 2.56% DSC, achieving the best performance. Overall, all of the proposed modules have a beneficial impact on the model&#x2019;s segmentation performance.</p>
</sec>
<sec id="s4_4_2">
<label>4.4.2</label>
<title>Ablation study on SCFormer module</title>
<p>Next, we conducted ablation experiments on SCFormer, and the results are presented in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>. As mentioned above, SCFormer extracts features from both the spatial and channel dimensions via MSA and MCA, and then fuses these features from different dimensions with the CAF. To evaluate the impact of these components on segmentation performance, five experiments were conducted in this section, all performed on the baseline model. The comparison between Experiment 2 and Experiment 3 shows that adding dense connections improves the model&#x2019;s performance. In Experiment 4, MSA was added on the basis of Experiment 3; however, the results showed a certain degree of decline in the OA, Sensitivity, and DSC metrics. This may be because the semantic gap between features from different dimensions causes simple fusion to introduce new redundancy and interference. In Experiment 5, the CAF module was added to Experiment 4 to alleviate the semantic differences between features from different dimensions, resulting in the best performance among all experiments. This demonstrates that CAF can suppress the semantic gap between features from different dimensions by highlighting commonalities between features, thereby enhancing the model&#x2019;s segmentation performance.</p>
</sec>
<sec id="s4_4_3">
<label>4.4.3</label>
<title>Ablation study on MSTD module</title>
<p>Finally, we perform an ablation study on MSTD and present the results in <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref>. In <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref>, MSTD1 denotes multi-scale token division based on mirror padding, and MSTD2 denotes the interaction and fusion of information between different tokens. With the addition of MSTD1, the model achieved improvements of 0.06%, 0.11%, and 0.06% in OA, IoU, and DSC, respectively, while Sensitivity decreased by 0.02%. After adding MSTD2, the model achieved 92.87% OA, 91.07% Sensitivity, 83.39% IoU, and 90.92% DSC, corresponding to increases of 0.4% in OA, 1.02% in Sensitivity, 1.02% in IoU, and 0.6% in DSC compared with using only MSTD1. These results indicate that using MSTD1 alone provides a limited contribution to model accuracy, likely because only two scales are considered during token partitioning, resulting in limited detail capture. However, with the addition of MSTD2, the proposed method shows substantial improvements across all four evaluation metrics. This confirms that promoting interaction and fusion between tokens of different scales to obtain more discriminative features can further improve segmentation performance.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Ablation study results on the key blocks of the MSTD with the IM dataset (%).</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">No.</th>
<th valign="top" align="center">MSTD1</th>
<th valign="top" align="center">MSTD2</th>
<th valign="middle" align="center">OA</th>
<th valign="middle" align="center">Sensitivity</th>
<th valign="middle" align="center">IoU</th>
<th valign="middle" align="center">DSC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">1</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">92.41</td>
<td valign="top" align="center">90.07</td>
<td valign="top" align="center">82.26</td>
<td valign="top" align="center">90.26</td>
</tr>
<tr>
<td valign="middle" align="center">2</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">92.47</td>
<td valign="top" align="center">90.05</td>
<td valign="top" align="center">82.37</td>
<td valign="top" align="center">90.32</td>
</tr>
<tr>
<td valign="middle" align="center">3</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">92.87</td>
<td valign="top" align="center">91.07</td>
<td valign="top" align="center">83.39</td>
<td valign="top" align="center">90.92</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>&#xd7; indicates that the module was not used, while &#x2713; indicates that it was employed.</p>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec id="s4_5">
<label>4.5</label>
<title>The effect of the PCA</title>
<p>Due to the high correlation and similarity among spectral bands in hyperspectral pathology images, researchers typically employ PCA to preprocess these images. To further illustrate the rationale for using PCA, we present the first five bands of the original MHSIs alongside the first five principal components obtained after dimensionality reduction via PCA in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>. The first row shows the original MHSIs, while the second row displays the images after PCA dimensionality reduction. It can be observed that the original spectral bands of the hyperspectral pathology image exhibit high similarity and a significant degree of correlation. After dimensionality reduction using PCA, the features between the principal components show significant variation, with the leading principal components retaining most of the useful information. This dimensionality reduction helps subsequent models focus on the most relevant feature information, thereby enhancing the efficiency and performance of image segmentation tasks.</p>
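<p>For reference, PCA-based band reduction of a hyperspectral cube can be sketched as follows; the scikit-learn API and the (H, W, 40-band) cube layout are assumptions for illustration.</p>
<preformat>
import numpy as np
from sklearn.decomposition import PCA

def pca_reduce(cube, n_components=3):
    """Project an (H, W, B) hyperspectral cube onto its leading
    principal components, returning an (H, W, n_components) image."""
    h, w, b = cube.shape
    flat = cube.reshape(-1, b)  # treat each pixel spectrum as a sample
    comps = PCA(n_components=n_components).fit_transform(flat)
    return comps.reshape(h, w, n_components)

reduced = pca_reduce(np.random.rand(512, 512, 40), n_components=3)
</preformat>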
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Visualization of the first five bands of the original hyperspectral pathology image alongside the first five principal components after dimensionality reduction via PCA.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-14-1469293-g006.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="f7">
<bold>Figures&#xa0;7</bold>
</xref> and <xref ref-type="fig" rid="f8">
<bold>8</bold>
</xref> respectively illustrate the impact of PCA on the final segmentation results for the IM and GIN datasets. In the figures, Experiments 1 through 5 correspond to selecting the first 1 to 5 bands following PCA. The experimental results on the two datasets demonstrate that as the number of bands increases, the performance on the OA, IoU, and DSC metrics initially improves and subsequently declines. This occurs because MHSIs contain not only abundant spatial and spectral information but also redundant and interfering information; increasing the number of bands excessively can introduce the latter, consequently diminishing segmentation accuracy. With the increase in the number of bands, the Sensitivity results initially decrease significantly and then increase slightly, but still remain lower than with fewer PCA bands. Overall, the results on the two datasets show that a moderate increase in the number of bands helps improve segmentation accuracy, but too many bands can degrade the model&#x2019;s segmentation performance.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>The effect of PCA on segmentation results on IM.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-14-1469293-g007.tif"/>
</fig>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>The effect of PCA on segmentation results on GIN.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-14-1469293-g008.tif"/>
</fig>
</sec>
<sec id="s4_6">
<label>4.6</label>
<title>The effect of token scale in MSTD</title>
<p>To study the impact of tokens at different scales on the final segmentation results, we evaluated partitioning combinations using kernel pairs of (2,3), (2,4), (2,5), and (3,5). For example, (2,3) means a 2&#xd7;2 kernel with stride 2 is used to generate 2&#xd7;2 tokens, and a 3&#xd7;3 kernel with stride 3 is used to generate 3&#xd7;3 tokens. The results for IM and GIN are presented in <xref ref-type="table" rid="T6">
<bold>Tables&#xa0;6</bold>
</xref>, <xref ref-type="table" rid="T7">
<bold>7</bold>
</xref>. The experimental results on IM show that the best performance across all metrics occurs when the token scale combination is set to (2,4). For GIN, the (2,4) setting achieves the best results in OA, IoU, and DSC, while yielding the lowest Sensitivity. Additionally, the experimental results on the two datasets indicate that as the token size increases, the segmentation performance generally improves at first and then declines. In summary, we selected (2,4) as the final token partitioning scales in this study.</p>
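<p>To make the (2,4) setting concrete, the two-scale token division can be sketched with strided convolutions as below; using <monospace>nn.Conv2d</monospace> as the tokenizer and the embedding width are illustrative assumptions.</p>
<preformat>
import torch
import torch.nn as nn

dim, embed = 64, 96
to_tok2 = nn.Conv2d(dim, embed, kernel_size=2, stride=2)  # 2x2 tokens
to_tok4 = nn.Conv2d(dim, embed, kernel_size=4, stride=4)  # 4x4 tokens

x = torch.randn(1, dim, 128, 128)
t2 = to_tok2(x).flatten(2).transpose(1, 2)  # (1, 4096, embed)
t4 = to_tok4(x).flatten(2).transpose(1, 2)  # (1, 1024, embed)
</preformat>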
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Effect of division scale in MSTD on IM dataset (%).</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">No.</th>
<th valign="top" align="center">Dense Connect</th>
<th valign="middle" align="center">OA</th>
<th valign="middle" align="center">Sensitivity</th>
<th valign="middle" align="center">IoU</th>
<th valign="middle" align="center">DSC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">1</td>
<td valign="top" align="center">(2,3)</td>
<td valign="top" align="center">94.36</td>
<td valign="top" align="center">91.34</td>
<td valign="top" align="center">86.35</td>
<td valign="top" align="center">92.66</td>
</tr>
<tr>
<td valign="middle" align="center">2</td>
<td valign="top" align="center">(2,4)</td>
<td valign="middle" align="center">94.45</td>
<td valign="middle" align="center">92.06</td>
<td valign="middle" align="center">86.63</td>
<td valign="middle" align="center">92.82</td>
</tr>
<tr>
<td valign="middle" align="center">3</td>
<td valign="top" align="center">(2,5)</td>
<td valign="top" align="center">94.08</td>
<td valign="top" align="center">91.01</td>
<td valign="top" align="center">85.68</td>
<td valign="top" align="center">92.27</td>
</tr>
<tr>
<td valign="middle" align="center">4</td>
<td valign="top" align="center">(3,5)</td>
<td valign="top" align="center">93.37</td>
<td valign="top" align="center">89.74</td>
<td valign="top" align="center">84.07</td>
<td valign="top" align="center">91.33</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T7" position="float">
<label>Table&#xa0;7</label>
<caption>
<p>Effect of division scale in MSTD on GIN dataset (%).</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">No.</th>
<th valign="top" align="center">Dense Connect</th>
<th valign="middle" align="center">OA</th>
<th valign="middle" align="center">Sensitivity</th>
<th valign="middle" align="center">IoU</th>
<th valign="middle" align="center">DSC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">1</td>
<td valign="top" align="center">(2,3)</td>
<td valign="top" align="center">87.32</td>
<td valign="top" align="center">90.79</td>
<td valign="top" align="center">77.96</td>
<td valign="top" align="center">87.58</td>
</tr>
<tr>
<td valign="middle" align="center">2</td>
<td valign="top" align="center">(2,4)</td>
<td valign="top" align="center">88.93</td>
<td valign="top" align="center">88.58</td>
<td valign="top" align="center">79.75</td>
<td valign="top" align="center">88.71</td>
</tr>
<tr>
<td valign="middle" align="center">3</td>
<td valign="top" align="center">(2,5)</td>
<td valign="top" align="center">88.61</td>
<td valign="top" align="center">90.56</td>
<td valign="top" align="center">79.66</td>
<td valign="top" align="center">88.67</td>
</tr>
<tr>
<td valign="middle" align="center">4</td>
<td valign="top" align="center">(3,5)</td>
<td valign="top" align="center">88.42</td>
<td valign="top" align="center">89.45</td>
<td valign="top" align="center">79.18</td>
<td valign="top" align="center">88.36</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s5" sec-type="conclusion">
<label>5</label>
<title>Conclusion</title>
<p>In this study, we introduce a novel network named MT-SCnet for the segmentation of MHSIs. The most significant features of this network are its MSTD and SCFormer components. In MSTD, multi-scale token division and the interaction and fusion of information between different tokens provide richer and more robust feature information for subsequent global feature extraction. In SCFormer, global features are first extracted from both the spatial and channel dimensions, and dense connections are introduced to obtain richer spectral-spatial information. Second, cross attention is used to highlight common important features and reduce redundant information between different dimensions, thereby minimizing the semantic gap between features from different dimensions and further enhancing the model&#x2019;s representation. Additionally, to better decode feature information in MHSIs, deformable convolutions are introduced. Results on two MHSI datasets demonstrate that MT-SCnet exhibits strong performance, outperforming current state-of-the-art segmentation methods. In future studies, we will focus on exploring more flexible token division schemes, such as quadrilateral partitioning, and on how to suppress the semantic gap between local context and global information.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>Requests to access these datasets should be directed to <email xlink:href="mailto:gaohongmin@hhu.edu.cn">gaohongmin@hhu.edu.cn</email>.</p>
</sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>XC: Conceptualization, Methodology, Writing &#x2013; original draft. HG: Supervision, Writing &#x2013; review &amp; editing. HZ: Validation, Writing &#x2013; review &amp; editing. SF: Visualization, Writing &#x2013; review &amp; editing. PX: Formal analysis, Project administration, Writing &#x2013; review &amp; editing. ZW: Supervision, Writing &#x2013; review &amp; editing.</p>
</sec>
<sec id="s8" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. This work was supported by Nanjing Health Science and Technology Development Special Fund Project (YKK22087).</p>
</sec>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s10" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Hong</surname> <given-names>D</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>H</given-names>
</name>
</person-group>. <article-title>Grid network: Feature extraction in anisotropic perspective for hyperspectral image classification</article-title>. <source>IEEE Geosci Remote Sens Letters</source>. (<year>2023</year>) <volume>20</volume>:<fpage>1</fpage>&#x2013;<lpage>5</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/LGRS.2023.3297612</pub-id>
</citation>
</ref>
<ref id="B2">
<label>2</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>L</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>L</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>J</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Bian</surname> <given-names>L</given-names>
</name>
<etal/>
</person-group>. <article-title>D<sup>2</sup>S<sup>2</sup>BoT: dual-dimension spectral-spatial bottleneck transformer for hyperspectral image classification</article-title>. <source>IEEE J Select Topics Appl Earthobservations Remote Sensing</source>. (<year>2024</year>) <volume>17</volume>:<page-range>2655&#x2013;69</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/JSTARS.2023.3342461</pub-id>
</citation>
</ref>
<ref id="B3">
<label>3</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liao</surname> <given-names>D</given-names>
</name>
<name>
<surname>Shi</surname> <given-names>C</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>L</given-names>
</name>
</person-group>. <article-title>A spectral-spatial fusion transformer network for hyperspectral image classification</article-title>. <source>IEEE Trans Geosci Remote Sensing</source>. (<year>2023</year>) <volume>61</volume>:<fpage>1</fpage>&#x2013;<lpage>16</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2023.3286950</pub-id>
</citation>
</ref>
<ref id="B4">
<label>4</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>J</given-names>
</name>
<name>
<surname>Mao</surname> <given-names>X</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Tao</surname> <given-names>X</given-names>
</name>
<name>
<surname>Chu</surname> <given-names>J</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Q</given-names>
</name>
</person-group>. <article-title>Automatic generation of pathological benchmark dataset from hyperspectral images of double stained tissues</article-title>. <source>Optics Laser Technol</source>. (<year>2023</year>) <volume>163</volume>:<elocation-id>109331</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.optlastec.2023.109331</pub-id>
</citation>
</ref>
<ref id="B5">
<label>5</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>Q</given-names>
</name>
<name>
<surname>He</surname> <given-names>X</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>H</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>D</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>F</given-names>
</name>
</person-group>. <article-title>Review of spectral imaging technology in biomedical engineering: achievements and challenges</article-title>. <source>J Biomed Optics</source>. (<year>2013</year>) <volume>18</volume>:<elocation-id>100901</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1117/1.JBO.18.10.100901</pub-id>
</citation>
</ref>
<ref id="B6">
<label>6</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Neittaanmaki-Perttu</surname> <given-names>N</given-names>
</name>
<name>
<surname>Gronroos</surname> <given-names>M</given-names>
</name>
<name>
<surname>Tani</surname> <given-names>T</given-names>
</name>
<name>
<surname>Polonen</surname> <given-names>I</given-names>
</name>
<name>
<surname>Ranki</surname> <given-names>A</given-names>
</name>
<name>
<surname>Saksela</surname> <given-names>O</given-names>
</name>
<etal/>
</person-group>. <article-title>Detecting field cancerization using a hyper-spectral imaging system</article-title>. <source>Lasers Surg Med</source>. (<year>2013</year>) <volume>45</volume>:<page-range>410&#x2013;7</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/lsm.22160</pub-id>
</citation>
</ref>
<ref id="B7">
<label>7</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Lima</surname> <given-names>C</given-names>
</name>
<name>
<surname>Correa</surname> <given-names>L</given-names>
</name>
<name>
<surname>Byrne</surname> <given-names>H</given-names>
</name>
<name>
<surname>Zezell</surname> <given-names>D</given-names>
</name>
</person-group>. <article-title>K-means and Hierarchical Cluster Analysis as segmentation algorithms of FTIR hyperspectral images collected from cutaneous tissue</article-title>. In: <conf-name>2018 SBFoton International Optics and Photonics Conference (SBFoton IOPC)</conf-name>. <publisher-loc>Campinas, Brazil</publisher-loc>: <publisher-name>IEEE</publisher-name>. (<year>2018</year>), pp. <fpage>1</fpage>&#x2013;<lpage>4</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/SBFoton-IOPC.2018.8610920</pub-id>
</citation>
</ref>
<ref id="B8">
<label>8</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Piqueras</surname> <given-names>S</given-names>
</name>
<name>
<surname>Krafft</surname> <given-names>C</given-names>
</name>
<name>
<surname>Beleites</surname> <given-names>C</given-names>
</name>
<name>
<surname>Egodage</surname> <given-names>K</given-names>
</name>
<name>
<surname>von Eggeling</surname> <given-names>F</given-names>
</name>
<name>
<surname>Guntinas Lichius</surname> <given-names>O</given-names>
</name>
<etal/>
</person-group>. <article-title>Combining multiset resolution and segmentation for hyperspectral image analysis of biological tissues</article-title>. <source>Anal Chimica Acta</source>. (<year>2015</year>) <volume>881</volume>:<fpage>24</fpage>&#x2013;<lpage>36</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.aca.2015.04.053</pub-id>
</citation>
</ref>
<ref id="B9">
<label>9</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>Q</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>H</given-names>
</name>
<name>
<surname>Guan</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>L</given-names>
</name>
</person-group>. <article-title>Sublingual vein extraction algorithm based on hyperspectral tongue imaging technology</article-title>. <source>Computer Med Imaging Graph</source>. (<year>2011</year>) <volume>35</volume>:<page-range>179&#x2013;85</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compmedimag.2010.10.001</pub-id>
</citation>
</ref>
<ref id="B10">
<label>10</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>J</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>M</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>M</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>L</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Q</given-names>
</name>
</person-group>. <article-title>Segmentation of pathological features of rat bile duct carcinoma from hyperspectral images</article-title>. In: <conf-name>2018 11th International Congress on Image and Signal Processing, BioMedical Engineering and Informatics (CISP-BMEI)</conf-name>. <publisher-loc>Beijing, China</publisher-loc>: <publisher-name>IEEE</publisher-name>. (<year>2018</year>), pp. <fpage>1</fpage>&#x2013;<lpage>5</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CISP-BMEI.2018.8633189</pub-id>
</citation>
</ref>
<ref id="B11">
<label>11</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>Q</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Q</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>M</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>L</given-names>
</name>
<name>
<surname>Qiu</surname> <given-names>S</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
</person-group>. <article-title>Melanoma and melanocyte identification from hyperspectral pathology images using object-based multiscale analysis</article-title>. <source>Appl Spectrosc</source>. (<year>2018</year>) <volume>72</volume>:<fpage>1538</fpage>&#x2013;<lpage>1547</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1177/0003702818781352</pub-id>
</citation>
</ref>
<ref id="B12">
<label>12</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lv</surname> <given-names>M</given-names>
</name>
<name>
<surname>Li</surname> <given-names>W</given-names>
</name>
<name>
<surname>Tao</surname> <given-names>R</given-names>
</name>
<name>
<surname>Lovell</surname> <given-names>NH</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Tu</surname> <given-names>T</given-names>
</name>
<etal/>
</person-group>. <article-title>Spatial-spectral density peaks-based discriminant analysis for membranous nephropathy classification using microscopic hyperspectral images</article-title>. <source>IEEE J Biomed Health Inform</source>. (<year>2021</year>) <volume>25</volume>:<page-range>3041&#x2013;51</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/JBHI.2021.3050483</pub-id>
</citation>
</ref>
<ref id="B13">
<label>13</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Krizhevsky</surname> <given-names>A</given-names>
</name>
<name>
<surname>Sutskever</surname> <given-names>I</given-names>
</name>
<name>
<surname>Hinton</surname> <given-names>GE</given-names>
</name>
</person-group>. <article-title>ImageNet classification with deep convolutional neural networks</article-title>. <source>Commun ACM</source>. (<year>2017</year>) <volume>60</volume>:<fpage>84</fpage>&#x2013;<lpage>90</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1145/3065386</pub-id>
</citation>
</ref>
<ref id="B14">
<label>14</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname> <given-names>L</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>M</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Q</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>M</given-names>
</name>
<name>
<surname>Wen</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>J</given-names>
</name>
<etal/>
</person-group>. <article-title>Diagnosis of cholangiocarcinoma from microscopic hyperspectral pathological dataset by deep convolution neural networks</article-title>. <source>Methods</source>. (<year>2022</year>) <volume>202</volume>:<fpage>22</fpage>&#x2013;<lpage>30</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ymeth.2021.04.005</pub-id>
</citation>
</ref>
<ref id="B15">
<label>15</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>Q</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>L</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>M</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>M</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>J</given-names>
</name>
<etal/>
</person-group>. <article-title>Identification of melanoma from hyperspectral pathology image using 3D convolutional networks</article-title>. <source>IEEE Trans Med Imaging</source>. (<year>2021</year>) <volume>40</volume>:<page-range>218&#x2013;27</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TMI.2020.3024923</pub-id>
</citation>
</ref>
<ref id="B16">
<label>16</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gao</surname> <given-names>H</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>M</given-names>
</name>
<name>
<surname>Cao</surname> <given-names>X</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Q</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>P</given-names>
</name>
</person-group>. <article-title>A high-level feature channel attention UNet network for cholangiocarcinoma segmentation from microscopy hyperspectral images</article-title>. <source>Mach Vis Appl</source>. (<year>2023</year>) <volume>34</volume>:<fpage>72</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s00138-023-01418-x</pub-id>
</citation>
</ref>
<ref id="B17">
<label>17</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>J</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>Q</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>X</given-names>
</name>
<name>
<surname>Adeli</surname> <given-names>E</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<etal/>
</person-group>. <article-title>TransUNet: Transformers make strong encoders for medical image segmentation</article-title>. <source>arXiv</source>. (<year>2021</year>). <uri xlink:href="https://arxiv.org/abs/2102.04306">https://arxiv.org/abs/2102.04306</uri>.</citation>
</ref>
<ref id="B18">
<label>18</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dosovitskiy</surname> <given-names>A</given-names>
</name>
<name>
<surname>Beyer</surname> <given-names>L</given-names>
</name>
<name>
<surname>Kolesnikov</surname> <given-names>A</given-names>
</name>
<name>
<surname>Weissenborn</surname> <given-names>D</given-names>
</name>
<name>
<surname>Zhai</surname> <given-names>X</given-names>
</name>
<name>
<surname>Unterthiner</surname> <given-names>T</given-names>
</name>
<etal/>
</person-group>. <article-title>An image is worth 16x16 words: Transformers for image recognition at scale</article-title>. <source>arXiv</source>. (<year>2021</year>). <uri xlink:href="https://arxiv.org/abs/2010.11929">https://arxiv.org/abs/2010.11929</uri>.</citation>
</ref>
<ref id="B19">
<label>19</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Dai</surname> <given-names>K</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Qiu</surname> <given-names>S</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>M</given-names>
</name>
<name>
<surname>Li</surname> <given-names>M</given-names>
</name>
<etal/>
</person-group>. <article-title>A generative data augmentation trained by low-quality annotations for cholangiocarcinoma hyperspectral image segmentation</article-title>. In: <conf-name>2023 International Joint Conference on Neural Networks (IJCNN)</conf-name>. <publisher-loc>Gold Coast, Australia</publisher-loc>: <publisher-name>IEEE</publisher-name>. (<year>2023</year>), pp. <page-range>1&#x2013;9</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/IJCNN54540.2023.10191749</pub-id>
</citation>
</ref>
<ref id="B20">
<label>20</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>B</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>C</given-names>
</name>
<name>
<surname>Vonsky</surname> <given-names>MS</given-names>
</name>
<name>
<surname>Mitrofanova</surname> <given-names>LB</given-names>
</name>
<etal/>
</person-group>. <article-title>CrossU-Net: Dual-modality cross-attention U-Net for segmentation of precancerous lesions in gastric cancer</article-title>. <source>Comput Med Imaging Graph</source>. (<year>2024</year>) <volume>112</volume>:<elocation-id>102339</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compmedimag.2024.102339</pub-id>
</citation>
</ref>
<ref id="B21">
<label>21</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname> <given-names>X</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>H</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>S</given-names>
</name>
<name>
<surname>Dai</surname> <given-names>J</given-names>
</name>
</person-group>. <article-title>Deformable ConvNets v2: More deformable, better results</article-title>. In: <conf-name>2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Long Beach, CA, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>. (<year>2019</year>). pp. <page-range>9300&#x2013;8</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2019.00953</pub-id>
</citation>
</ref>
<ref id="B22">
<label>22</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ronneberger</surname> <given-names>O</given-names>
</name>
<name>
<surname>Fischer</surname> <given-names>P</given-names>
</name>
<name>
<surname>Brox</surname> <given-names>T</given-names>
</name>
</person-group>. <article-title>U-net: Convolutional networks for biomedical image segmentation</article-title>. <source>arXiv</source>. (<year>2015</year>). <uri xlink:href="https://arxiv.org/abs/1505.04597">https://arxiv.org/abs/1505.04597</uri>.</citation>
</ref>
<ref id="B23">
<label>23</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Siddiquee</surname> <given-names>MMR</given-names>
</name>
<name>
<surname>Tajbakhsh</surname> <given-names>N</given-names>
</name>
<name>
<surname>Liang</surname> <given-names>J</given-names>
</name>
</person-group>. <article-title>UNet++: A nested U-Net architecture for medical image segmentation</article-title>. In: <conf-name>Deep Learning in Medical Image Analysis and Multimodal Learning for Clinical Decision Support: 4th International Workshop, DLMIA 2018, and 8th International Workshop, ML-CDS 2018, Held in Conjunction with MICCAI 2018</conf-name>. <publisher-loc>Granada, Spain</publisher-loc>: <publisher-name>Springer</publisher-name>. (<year>2018</year>). pp. <fpage>3</fpage>&#x2013;<lpage>11</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-030-00889-5_1</pub-id>
</citation>
</ref>
<ref id="B24">
<label>24</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Feng</surname> <given-names>T</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>C</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>X</given-names>
</name>
<name>
<surname>Fan</surname> <given-names>H</given-names>
</name>
<name>
<surname>Zeng</surname> <given-names>K</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Z</given-names>
</name>
</person-group>. <article-title>URNet: A U-Net based residual network for image dehazing</article-title>. <source>Appl Soft Comput</source>. (<year>2021</year>) <volume>102</volume>:<elocation-id>106884</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.asoc.2020.106884</pub-id>
</citation>
</ref>
<ref id="B25">
<label>25</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>S</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>J</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>J</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>N</given-names>
</name>
<name>
<surname>Shang</surname> <given-names>Y</given-names>
</name>
<etal/>
</person-group>. <article-title>Segmenting nailfold capillaries using an improved U-net network</article-title>. <source>Microvasc Res</source>. (<year>2020</year>) <volume>130</volume>:<elocation-id>104011</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.mvr.2020.104011</pub-id>
</citation>
</ref>
<ref id="B26">
<label>26</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xie</surname> <given-names>X</given-names>
</name>
<name>
<surname>Pan</surname> <given-names>X</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>W</given-names>
</name>
<name>
<surname>An</surname> <given-names>J</given-names>
</name>
</person-group>. <article-title>A context hierarchical integrated network for medical image segmentation</article-title>. <source>Comput Electr Eng</source>. (<year>2022</year>) <volume>101</volume>:<elocation-id>108029</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compeleceng.2022.108029</pub-id>
</citation>
</ref>
<ref id="B27">
<label>27</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ghamsarian</surname> <given-names>N</given-names>
</name>
<name>
<surname>Wolf</surname> <given-names>S</given-names>
</name>
<name>
<surname>Zinkernagel</surname> <given-names>M</given-names>
</name>
<name>
<surname>Schoeffmann</surname> <given-names>K</given-names>
</name>
<name>
<surname>Sznitman</surname> <given-names>R</given-names>
</name>
</person-group>. <article-title>DeepPyramid+: medical image segmentation using pyramid view fusion and deformable pyramid reception</article-title>. <source>Int J Comput Assist Radiol Surg</source>. (<year>2024</year>) <volume>19</volume>:<page-range>851&#x2013;9</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11548-023-03046-2</pub-id>
</citation>
</ref>
<ref id="B28">
<label>28</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Oktay</surname> <given-names>O</given-names>
</name>
<name>
<surname>Schlemper</surname> <given-names>J</given-names>
</name>
<name>
<surname>Folgoc</surname> <given-names>LL</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>M</given-names>
</name>
<name>
<surname>Heinrich</surname> <given-names>M</given-names>
</name>
<name>
<surname>Misawa</surname> <given-names>K</given-names>
</name>
<etal/>
</person-group>. <article-title>Attention U-Net: Learning where to look for the pancreas</article-title>. <source>arXiv</source>. (<year>2018</year>). <uri xlink:href="https://arxiv.org/abs/1804.03999">https://arxiv.org/abs/1804.03999</uri>.</citation>
</ref>
<ref id="B29">
<label>29</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gao</surname> <given-names>E</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>H</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>C</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>M</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>W</given-names>
</name>
<etal/>
</person-group>. <article-title>Automatic multi-tissue segmentation in pancreatic pathological images with selected multi-scale attention network</article-title>. <source>Comput Biol Med</source>. (<year>2022</year>) <volume>151</volume>:<elocation-id>106228</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compbiomed.2022.106228</pub-id>
</citation>
</ref>
<ref id="B30">
<label>30</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>H</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>S</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>J</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>G</given-names>
</name>
<etal/>
</person-group>. <article-title>GCHA-Net: Global context and hybrid attention network for automatic liver segmentation</article-title>. <source>Comput Biol Med</source>. (<year>2023</year>) <volume>152</volume>:<elocation-id>106352</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compbiomed.2022.106352</pub-id>
</citation>
</ref>
<ref id="B31">
<label>31</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Cao</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>H</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z</given-names>
</name>
<etal/>
</person-group>. <article-title>Swin transformer: Hierarchical vision transformer using shifted windows</article-title>. <source>arXiv</source>. (<year>2021</year>). <uri xlink:href="https://arxiv.org/abs/2103.14030">https://arxiv.org/abs/2103.14030</uri>.</citation>
</ref>
<ref id="B32">
<label>32</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cao</surname> <given-names>H</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>J</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>D</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X</given-names>
</name>
<name>
<surname>Tian</surname> <given-names>Q</given-names>
</name>
<etal/>
</person-group>. <article-title>Swin-Unet: Unet-like pure transformer for medical image segmentation</article-title>. <source>arXiv</source>. (<year>2021</year>). <uri xlink:href="https://arxiv.org/abs/2105.05537">https://arxiv.org/abs/2105.05537</uri>.</citation>
</ref>
<ref id="B33">
<label>33</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname> <given-names>X</given-names>
</name>
<name>
<surname>Deng</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Li</surname> <given-names>D</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>X</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>Y</given-names>
</name>
</person-group>. <article-title>MISSFormer: An effective transformer for 2D medical image segmentation</article-title>. <source>IEEE Trans Med Imaging</source>. (<year>2023</year>) <volume>42</volume>:<page-range>1484&#x2013;94</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TMI.2022.3230943</pub-id>
</citation>
</ref>
<ref id="B34">
<label>34</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>C</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>S</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>W</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>P</given-names>
</name>
</person-group>. <article-title>FDR-TransUNet: A novel encoder-decoder architecture with vision transformer for improved medical image segmentation</article-title>. <source>Comput Biol Med</source>. (<year>2023</year>) <volume>169</volume>:<elocation-id>107858</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.107858</pub-id>
</citation>
</ref>
<ref id="B35">
<label>35</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname> <given-names>J</given-names>
</name>
<name>
<surname>Sheng</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Cui</surname> <given-names>H</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>J</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>J</given-names>
</name>
<name>
<surname>Xi</surname> <given-names>H</given-names>
</name>
</person-group>. <article-title>Cross pyramid transformer makes U-Net stronger in medical image segmentation</article-title>. <source>Biomed Signal Process Control</source>. (<year>2023</year>) <volume>86</volume>:<elocation-id>105361</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.bspc.2023.105361</pub-id>
</citation>
</ref>
<ref id="B36">
<label>36</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>A</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>B</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>G</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>D</given-names>
</name>
</person-group>. <article-title>DS-TransUNet: Dual Swin transformer U-Net for medical image segmentation</article-title>. <source>IEEE Trans Instrum Meas</source>. (<year>2022</year>) <volume>71</volume>:<fpage>1</fpage>&#x2013;<lpage>15</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TIM.2022.3178991</pub-id>
</citation>
</ref>
<ref id="B37">
<label>37</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>A</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>K</given-names>
</name>
<name>
<surname>Li</surname> <given-names>T</given-names>
</name>
<name>
<surname>Du</surname> <given-names>C</given-names>
</name>
<name>
<surname>Xia</surname> <given-names>S</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>H</given-names>
</name>
</person-group>. <article-title>H2Former: An efficient hierarchical hybrid transformer for medical image segmentation</article-title>. <source>IEEE Trans Med Imaging</source>. (<year>2023</year>) <volume>42</volume>:<page-range>2763&#x2013;75</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TMI.2023.3264513</pub-id>
</citation>
</ref>
<ref id="B38">
<label>38</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ao</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Shi</surname> <given-names>W</given-names>
</name>
<name>
<surname>Ji</surname> <given-names>B</given-names>
</name>
<name>
<surname>Miao</surname> <given-names>Y</given-names>
</name>
<name>
<surname>He</surname> <given-names>W</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>Z</given-names>
</name>
</person-group>. <article-title>MS-TCNet: An effective Transformer-CNN combined network using multi-scale feature learning for 3D medical image segmentation</article-title>. <source>Comput Biol Med</source>. (<year>2024</year>) <volume>170</volume>:<elocation-id>108057</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.108057</pub-id>
</citation>
</ref>
<ref id="B39">
<label>39</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fang</surname> <given-names>X</given-names>
</name>
<name>
<surname>Yan</surname> <given-names>P</given-names>
</name>
</person-group>. <article-title>Multi-organ segmentation over partially labeled datasets with multi-scale feature abstraction</article-title>. <source>IEEE Trans Med Imaging</source>. (<year>2020</year>) <volume>39</volume>:<page-range>3619&#x2013;29</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TMI.2020.3001036</pub-id>
</citation>
</ref>
<ref id="B40">
<label>40</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Dai</surname> <given-names>D</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Q</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>S</given-names>
</name>
<name>
<surname>Lian</surname> <given-names>C</given-names>
</name>
</person-group>. <article-title>MSCA-Net: Multi-scale contextual attention network for skin lesion segmentation</article-title>. <source>Pattern Recognit</source>. (<year>2023</year>) <volume>139</volume>:<elocation-id>109524</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.patcog.2023.109524</pub-id>
</citation>
</ref>
<ref id="B41">
<label>41</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Xin</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>D</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>T</given-names>
</name>
</person-group>. <article-title>MESTrans: Multi-scale embedding spatial transformer for medical image segmentation</article-title>. <source>Comput Methods Programs Biomed</source>. (<year>2023</year>) <volume>233</volume>:<elocation-id>107493</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.cmpb.2023.107493</pub-id>
</citation>
</ref>
<ref id="B42">
<label>42</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Heidari</surname> <given-names>M</given-names>
</name>
<name>
<surname>Kazerouni</surname> <given-names>A</given-names>
</name>
<name>
<surname>Soltany</surname> <given-names>M</given-names>
</name>
<name>
<surname>Azad</surname> <given-names>R</given-names>
</name>
<name>
<surname>Aghdam</surname> <given-names>EK</given-names>
</name>
<name>
<surname>Cohen-Adad</surname> <given-names>J</given-names>
</name>
<etal/>
</person-group>. <article-title>HiFormer: hierarchical multi-scale representations using transformers for medical image segmentation</article-title>. In: <conf-name>2023 IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)</conf-name>; <conf-date>2023 Jan 3-7</conf-date>; <publisher-loc>Waikoloa, HI, USA</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2023</year>). pp. <page-range>6191&#x2013;201</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/WACV56688.2023.00614</pub-id>
</citation>
</ref>
<ref id="B43">
<label>43</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>L</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>L</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X</given-names>
</name>
</person-group>. <article-title>MMViT-Seg: A lightweight transformer and CNN fusion network for COVID-19 segmentation</article-title>. <source>Comput Methods Programs Biomed</source>. (<year>2023</year>) <volume>230</volume>:<elocation-id>107348</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.cmpb.2023.107348</pub-id>
</citation>
</ref>
<ref id="B44">
<label>44</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>B</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Q</given-names>
</name>
</person-group>. <article-title>A hyperspectral dataset of precancerous lesions in gastric cancer and benchmarks for pathological diagnosis</article-title>. <source>J Biophotonics</source>. (<year>2022</year>) <volume>15</volume>:<elocation-id>e202200163</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/jbio.202200163</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>