<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Oncol.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Oncology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Oncol.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2234-943X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fonc.2026.1734345</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>SpectraNet: a novel model for polyp segmentation leveraging a spectral-guided mixture of functional experts</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Liu</surname><given-names>Zhong</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Ling</surname><given-names>Jing</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3259167/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project-administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Anorectal Surgery, Jiangyan Hospital Affiliated to Nanjing University of Chinese Medicine</institution>, <city>Taizhou</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff2"><label>2</label><institution>Department of Spleen and Stomach Diseases, Jiangyan Hospital Affiliated to Nanjing University of Chinese Medicine</institution>, <city>Taizhou</city>,&#xa0;<country country="cn">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Jing Ling, <email xlink:href="mailto:157770425@qq.com">157770425@qq.com</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-18">
<day>18</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>16</volume>
<elocation-id>1734345</elocation-id>
<history>
<date date-type="received">
<day>28</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>26</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>26</day>
<month>11</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Liu and Ling.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Liu and Ling</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-18">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Automated and precise polyp segmentation from colonoscopy images is critical for the early diagnosis of colorectal cancer. However, this task is challenged by the ambiguous and low-contrast boundaries of polyps, which often blend with the surrounding mucosa. To address this, we propose SpectraNet, a novel hybrid-domain enhancement network for high-precision polyp segmentation. Our model is built on an encoder-decoder architecture with two core innovations integrated into its skip connections: (1) a Spectral-Guided Boundary Enhancement (SGBE) module that operates in the frequency domain to recover and sharpen indistinct boundary information by enhancing the phase spectrum of features, and (2) a Function-Specialized Mixture-of-Experts (FS-MoE) module that adaptively refines features for diverse polyp morphologies using a set of heterogeneous, function-specific experts. Extensive experiments on our curated PolypSegDataset and two public benchmarks (Kvasir-SEG and CVC-ClinicDB) demonstrate that our method consistently outperforms a wide range of state-of-the-art models. SpectraNet achieves superior performance in key segmentation metrics, and produces qualitatively more accurate segmentation masks with precise boundary definitions.</p>
</abstract>
<kwd-group>
<kwd>deep learning</kwd>
<kwd>foundation model</kwd>
<kwd>frequency domain enhancement</kwd>
<kwd>mixture of experts</kwd>
<kwd>polyp segmentation</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="6"/>
<table-count count="7"/>
<equation-count count="15"/>
<ref-count count="60"/>
<page-count count="14"/>
<word-count count="7526"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Gastrointestinal Cancers: Colorectal Cancer</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Colorectal cancer (CRC) poses a significant threat to global health, consistently ranking as one of the leading causes of cancer-related mortality worldwide Center et&#xa0;al. (<xref ref-type="bibr" rid="B1">1</xref>) Ladabaum et&#xa0;al. (<xref ref-type="bibr" rid="B2">2</xref>). The early detection and removal of adenomatous polyps during colonoscopy remains the most effective strategy for preventing CRC progression Brenner et&#xa0;al. (<xref ref-type="bibr" rid="B3">3</xref>) Miller and Knight (<xref ref-type="bibr" rid="B4">4</xref>). Central to this preventative measure is the accurate delineation, or segmentation, of polyps, which provides crucial morphological information for clinical assessment Guachi et&#xa0;al. (<xref ref-type="bibr" rid="B5">5</xref>). However, manual polyp detection is demanding, time-consuming, and highly dependent on the clinician&#x2019;s experience, which can lead to significant miss rates, especially for subtle yet clinically important flat or depressed lesions Viscaino et&#xa0;al. (<xref ref-type="bibr" rid="B6">6</xref>). To address these limitations, developing robust Computer-Aided Diagnosis (CAD) systems for automated and precise polyp segmentation is of paramount clinical importance for improving diagnostic accuracy and assisting therapeutic decision-making (<xref ref-type="bibr" rid="B7">7</xref>).</p>
<p>In response to this need, deep learning-based methods have become the standard for automated polyp segmentation Li et&#xa0;al. (<xref ref-type="bibr" rid="B8">8</xref>) Ji et&#xa0;al. (<xref ref-type="bibr" rid="B9">9</xref>) Huo et&#xa0;al. (<xref ref-type="bibr" rid="B10">10</xref>) Gupta and Mishra (<xref ref-type="bibr" rid="B11">11</xref>) Qayoom et&#xa0;al. (<xref ref-type="bibr" rid="B12">12</xref>). Architectures based on Convolutional Neural Networks (CNNs), exemplified by the U-Net and its numerous variants, have demonstrated considerable success by learning hierarchical feature representations from images Akbari et&#xa0;al. (<xref ref-type="bibr" rid="B13">13</xref>) Yeung et&#xa0;al. (<xref ref-type="bibr" rid="B14">14</xref>) Sun et&#xa0;al. (<xref ref-type="bibr" rid="B15">15</xref>). More recently, Transformer-based models have been introduced to this domain, leveraging self-attention mechanisms to capture long-range dependencies and global contextual information more effectively than their CNN counterparts Duc et&#xa0;al. (<xref ref-type="bibr" rid="B16">16</xref>) Dong et&#xa0;al. (<xref ref-type="bibr" rid="B17">17</xref>) Jha et&#xa0;al. (<xref ref-type="bibr" rid="B18">18</xref>) Shao et&#xa0;al. (<xref ref-type="bibr" rid="B19">19</xref>). Despite variations in architectural design, these state-of-the-art models share a common operational paradigm: they perform inference entirely within the spatial domain, aiming to identify polyp boundaries by learning complex relationships between neighboring pixels and regions.</p>
<p>Despite these advances, the intrinsic visual characteristics of polyps present a persistent challenge Qayoom et&#xa0;al. (<xref ref-type="bibr" rid="B12">12</xref>) Mei et&#xa0;al. (<xref ref-type="bibr" rid="B20">20</xref>). Lesions frequently exhibit indistinct boundaries, low contrast against the surrounding mucosa, and substantial variation in size, shape, and texture Liu et&#xa0;al. (<xref ref-type="bibr" rid="B21">21</xref>) Tajbakhsh et&#xa0;al. (<xref ref-type="bibr" rid="B22">22</xref>). However, relying solely on spatial information limits the model&#x2019;s ability to detect such subtle lesions. In clinical colonoscopy, the visual difference between a flat polyp and healthy mucosa is often negligible. For standard deep learning models (CNNs), which detect objects by looking for sharp changes in pixel intensity (gradients), these low-contrast areas are essentially &#x2018;invisible&#x2019;. To address this, we look beyond the spatial pixels and advocate for a paradigm shift to the Frequency Domain. In signal processing, an image can be decomposed into its &#x2018;amplitude&#x2019; spectrum (intensity energy) and &#x2018;phase&#x2019; spectrum (structural information). While the amplitude spectrum largely corresponds to overall contrast and can be susceptible to lighting variations Bracewell (<xref ref-type="bibr" rid="B23">23</xref>), the phase spectrum robustly encodes the structural &#x2018;skeleton&#x2019; of the object, such as edges and contours Shanmugam et&#xa0;al. (<xref ref-type="bibr" rid="B24">24</xref>) Nawab et&#xa0;al. (<xref ref-type="bibr" rid="B25">25</xref>). We hypothesize that by explicitly enhancing this phase information, we can recover critical boundary details that are attenuated in the spatial domain, effectively making the &#x2018;invisible&#x2019; boundaries visible again.</p>
<p>To this end, we introduce SpectraNet, a novel framework engineered for high-fidelity medical image segmentation. Built upon a frozen vision foundation model and fine-tuned with lightweight, parameter-efficient adapters, SpectraNet&#x2019;s core innovation lies in a hybrid-domain enhancement unit strategically placed within its skip connections. This unit first employs a Spectral-Guided Boundary Enhancement (SGBE) module to recover critical boundary integrity by operating directly in the frequency domain. Subsequently, a Function-Specialized Mixture-of-Experts (FS-MoE) module performs content-aware feature refinement in the spatial domain to accommodate the vast morphological diversity of polyps.</p>
<p>The main contributions of this paper are summarized as follows:</p>
<list list-type="bullet">
<list-item>
<p>We propose SpectraNet for polyp segmentation. It deploys a hybrid-domain enhancement strategy within the skip connections of a parameter-efficiently adapted foundation model. This approach synergistically combines frequency-domain boundary recovery with spatial-domain adaptive refinement to generate highly discriminative multi-scale features.</p></list-item>
<list-item>
<p>We introduce the Spectral-Guided Boundary Enhancement (SGBE) module, a novel component that explicitly enhances the feature phase spectrum to restore high-frequency structural details. This method directly counteracts the inherent limitations of spatial convolutions in detecting low-contrast and ill-defined edges.</p></list-item>
<list-item>
<p>We design the Function-Specialized Mixture-of-Experts (FS-MoE) module, an adaptive mechanism employing a compact set of heterogeneous experts, each meticulously designed for a distinct function (i.e., edge detection, multi-scale texture analysis, and context aggregation). A dynamic gating network routes features for tailored processing, significantly improving the model&#x2019;s robustness and generalization across diverse polyp morphologies.</p></list-item>
<list-item>
<p>To facilitate more robust evaluation and future research, we introduce PolypSegDataset, a new high-quality benchmark for polyp segmentation. Extensive experiments on this benchmark and several public ones validate that our SpectraNet consistently outperforms previous methods to establish a new state-of-the-art, with its superiority being particularly pronounced on metrics sensitive to fine-grained boundary details.</p></list-item>
</list>
</sec>
<sec id="s2">
<label>2</label>
<title>Related works</title>
<sec id="s2_1">
<label>2.1</label>
<title>Traditional methods for polyp segmentation</title>
<p>Early research into automated polyp segmentation primarily relied on traditional, hand-crafted feature-based approaches Pogorelov et&#xa0;al. (<xref ref-type="bibr" rid="B26">26</xref>). These methods typically targeted low-level visual cues such as color, texture, and shape to distinguish polyps from the surrounding colonic mucosa. Common techniques included color-space analysis, local binary patterns (LBP) for texture description, and edge detection algorithms to identify polyp boundaries Mamonov et&#xa0;al. (<xref ref-type="bibr" rid="B27">27</xref>) Maghsoudi (<xref ref-type="bibr" rid="B28">28</xref>) Rahim et&#xa0;al. (<xref ref-type="bibr" rid="B29">29</xref>). While these methods laid important groundwork, they were often sensitive to variations in illumination, viewpoint, and polyp morphology. Their reliance on manually engineered features limited their generalization capabilities, making it difficult to achieve robust performance across the wide spectrum of polyp appearances seen in clinical practice.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Deep learning methods for polyp segmentation</title>
<p>The advent of deep learning, particularly Convolutional Neural Networks (CNNs) Akbari et&#xa0;al. (<xref ref-type="bibr" rid="B13">13</xref>) Brandao et&#xa0;al. (<xref ref-type="bibr" rid="B30">30</xref>) He et&#xa0;al. (<xref ref-type="bibr" rid="B31">31</xref>) Simonyan and Zisserman (<xref ref-type="bibr" rid="B32">32</xref>) Cai et&#xa0;al. (<xref ref-type="bibr" rid="B33">33</xref>) Tomar et&#xa0;al. (<xref ref-type="bibr" rid="B34">34</xref>) Zhang et&#xa0;al. (<xref ref-type="bibr" rid="B35">35</xref>) Sengar et&#xa0;al. (<xref ref-type="bibr" rid="B36">36</xref>) Singh and Sengar (<xref ref-type="bibr" rid="B37">37</xref>) Khan et&#xa0;al. (<xref ref-type="bibr" rid="B38">38</xref>), marked a paradigm shift in medical image segmentation. The U-Net architecture, with its seminal encoder-decoder structure and skip connections, became a foundational model, demonstrating remarkable success in preserving both high-level semantic context and fine-grained spatial details Ronneberger et&#xa0;al. (<xref ref-type="bibr" rid="B39">39</xref>) Zhou et&#xa0;al. (<xref ref-type="bibr" rid="B40">40</xref>). This spurred the development of numerous variants, such as UNet++ Zhou et&#xa0;al. (<xref ref-type="bibr" rid="B40">40</xref>) and ResUNet Zhang et&#xa0;al. (<xref ref-type="bibr" rid="B41">41</xref>), which introduced innovations like nested skip pathways and residual connections to further improve feature representation.</p>
<p>More recently, models have been designed specifically to address the unique challenges of polyp segmentation. PraNet Fan et&#xa0;al. (<xref ref-type="bibr" rid="B42">42</xref>), for instance, introduced a parallel reverse attention network to explicitly model boundaries and regions, achieving a significant performance leap. Concurrently, the success of Vision Transformers Dosovitskiy et&#xa0;al. (<xref ref-type="bibr" rid="B43">43</xref>) in capturing global dependencies led to the development of hybrid models like TransUNet Chen et&#xa0;al. (<xref ref-type="bibr" rid="B44">44</xref>) Khan et&#xa0;al. (<xref ref-type="bibr" rid="B45">45</xref>), which combine the strengths of both CNNs and Transformers Vaswani et&#xa0;al. (<xref ref-type="bibr" rid="B46">46</xref>). The latest research trend involves leveraging large-scale, pre-trained foundation models, such as the Segment Anything Model (SAM) Kirillov et&#xa0;al. (<xref ref-type="bibr" rid="B47">47</xref>), as powerful backbones. These models provide robust, generalized feature extraction capabilities that can be fine-tuned for specialized medical tasks like polyp segmentation, representing the current state-of-the-art and the context in which our work is positioned Li et&#xa0;al. (<xref ref-type="bibr" rid="B48">48</xref>, <xref ref-type="bibr" rid="B49">49</xref>).</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Methods</title>
<sec id="s3_1">
<label>3.1</label>
<title>Overall framework</title>
<p>The overall architecture of our proposed SpectraNet is depicted in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>. The network takes a colonoscopy image <italic>I</italic> &#x2208; <inline-formula>
<mml:math display="inline" id="im1"><mml:mi>&#x211d;</mml:mi></mml:math></inline-formula><sup>3&#xd7;</sup><italic><sup>H</sup></italic><sup>&#xd7;</sup><italic><sup>W</sup></italic> as input and produces a pixel-level polyp segmentation probability map <italic>G</italic> &#x2208; <inline-formula>
<mml:math display="inline" id="im2"><mml:mi>&#x211d;</mml:mi></mml:math></inline-formula><italic><sup>H</sup></italic><sup>&#xd7;</sup><italic><sup>W</sup></italic>. SpectraNet is composed of three primary components: (1) a frozen SAM2 Ravi et&#xa0;al. (<xref ref-type="bibr" rid="B50">50</xref>) backbone with a lightweight trainable adapter that extracts robust, multi-scale visual features; (2) a hybrid-domain enhancement unit within the skip connections that first sharpens indistinct boundaries using the Spectral-Guided Boundary Enhancement (SGBE) Module and then adaptively refines features with the Function-Specialized Mixture-of-Experts (FS-MoE) Module; and (3) a progressive refinement decoder that hierarchically fuses the enhanced features from different scales to reconstruct a high-resolution, boundary-precise segmentation mask, supported by multi-level deep supervision.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>The overall architecture of the proposed SpectraNet. The model is composed of three primary components: (1) a frozen SAM2 encoder enhanced with lightweight trainable adapters for multi-scale feature extraction; (2) a hybrid-domain enhancement unit in the skip connections, featuring a Spectral-Guided Boundary Enhancement (SGBE) module to sharpen boundaries in the frequency domain and a Function-Specialized Mixture-of-Experts (FS-MoE) module for adaptive feature refinement; and (3) a Progressive Refinement Decoder (PRD) that hierarchically fuses the enhanced features, guided by multilevel deep supervision, to produce the final, precise segmentation map.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-16-1734345-g001.tif">
<alt-text content-type="machine-generated">Flowchart diagram illustrates a deep learning model architecture for image segmentation, starting with an input image, followed by patch embedding, hierarchical blocks, SGBE and FS-MoE modules, a progressive refinement decoder, and finally producing a segmentation mask prediction compared to ground truth. Component blocks are visually separated, with some blocks marked as learnable or frozen, and include labeled layers such as Adapter, Attention, and FFT.</alt-text>
</graphic></fig>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Encoder</title>
<p>Automated polyp segmentation presents a significant challenge, as the visual characteristics of polyps&#x2014;particularly flat or early-stage lesions&#x2014;often lack strong, defining features, blending subtly with the surrounding healthy mucosa. To capture these nuanced patterns, a powerful and robust feature extractor is required. To this end, we employ the Hiera Ryali et&#xa0;al. (<xref ref-type="bibr" rid="B51">51</xref>) encoder from SAM2 as our foundational feature extractor. The encoder processes an input image <inline-formula>
<mml:math display="inline" id="im3"><mml:mrow><mml:mi>I</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> and generates a four-level feature pyramid <inline-formula>
<mml:math display="inline" id="im4"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mi>L</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>B</mml:mi><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mi>L</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>L</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mi>L</mml:mi></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> for <inline-formula>
<mml:math display="inline" id="im5"><mml:mrow><mml:mi>L</mml:mi><mml:mo>=</mml:mo><mml:mo>{</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#xa0;</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mo>&#xa0;</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mo>&#xa0;</mml:mo><mml:mn>4</mml:mn><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula>. These feature maps have progressively decreasing spatial resolutions (<italic>H<sub>L</sub></italic> = <italic>H/</italic>2<italic><sup>L</sup></italic><sup>+ 1</sup>, <italic>W<sub>L</sub></italic> = <italic>W/</italic>2<italic><sup>L</sup></italic><sup>+ 1</sup>) and increasing channel dimensions (<italic>C<sub>L</sub></italic> = {144, 288, 576, 1152}), capturing a rich hierarchy of representations from fine-grained textures to abstract semantics.</p>
<p>To specialize the powerful, general-purpose features of SAM2 for the medical domain, we introduce a lightweight, trainable adapter into each Hiera block of the encoder, inspired by Houlsby et&#xa0;al. (<xref ref-type="bibr" rid="B52">52</xref>) Qiu et&#xa0;al. (<xref ref-type="bibr" rid="B53">53</xref>). The adapter, which consists of a linear layer for down-sampling, a GeLU activation function, another linear layer for up-sampling, and a final GeLU activation, processes the feature map <italic>F<sub>L</sub></italic> to generate a task-specific adaptation vector &#x394;<italic>F<sub>L</sub></italic>. This vector is then integrated back into the feature map via a residual connection before the main attention block, producing an adapted feature <inline-formula>
<mml:math display="inline" id="im6"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>L</mml:mi><mml:mo>"</mml:mo></mml:msubsup></mml:mrow></mml:math></inline-formula> as defined in <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>:</p>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>L</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mi>L</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:mtext>&#x394;</mml:mtext><mml:msub><mml:mi>F</mml:mi><mml:mi>L</mml:mi></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>This in-place adaptive mechanism allows our model to inject polyp-specific priors directly into the feature extraction hierarchy, effectively steering the general encoder to become a specialized extractor finely tuned for discerning subtle pathological tissues. By keeping the original SAM2 encoder weights frozen and only training the lightweight adapters, we preserve the model&#x2019;s strong generalized representations while minimizing additional training overhead and reducing the risk of overfitting on smaller medical datasets.</p>
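<p>For clarity, the following is a minimal PyTorch sketch of such a bottleneck adapter; the module name, the bottleneck ratio, and the exact insertion point within each Hiera block are illustrative assumptions rather than the exact published implementation.</p>
<code language="python">import torch
import torch.nn as nn

class Adapter(nn.Module):
    """Lightweight bottleneck adapter: linear down-projection, GeLU,
    linear up-projection, GeLU; its output is added residually (Equation 1)."""
    def __init__(self, dim, bottleneck_ratio=0.25):
        super().__init__()
        hidden = int(dim * bottleneck_ratio)  # bottleneck width is an assumption
        self.down = nn.Linear(dim, hidden)
        self.up = nn.Linear(hidden, dim)
        self.act = nn.GELU()

    def forward(self, f_l):
        # f_l: (B, H_L, W_L, C_L) token features from a Hiera block
        delta = self.act(self.up(self.act(self.down(f_l))))
        return f_l + delta  # F'_L = F_L + delta F_L
</code>
<p>In this scheme, only the adapter parameters would be registered as trainable, for example by setting requires_grad to False on all original SAM2 encoder weights before training.</p>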
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Spectral-guided boundary enhancement module</title>
<p>A primary difficulty in polyp segmentation is the ambiguous and low-contrast nature of polyp boundaries, which challenges convolutional networks that rely on local spatial gradients. However, the structural information that defines these boundaries, while subtle in the spatial domain, is more robustly encoded in the phase component of a signal&#x2019;s frequency spectrum. Therefore, we design the SGBE module to operate directly in the frequency domain, amplifying this crucial structural information and enhancing the boundary representation, as shown in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>.</p>
<p>The module takes the adapted feature map <inline-formula>
<mml:math display="inline" id="im7"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>L</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msubsup></mml:mrow></mml:math></inline-formula> from the encoder as input. First, a 1 &#xd7; 1 convolutional layer projects the features into a uniform channel dimension (<italic>d</italic> = 64), resulting in the feature map <italic>X<sub>L</sub></italic>. We then transition to the frequency domain by applying a 2D Fast Fourier Transform (FFT). The FFT decomposes the feature map into its amplitude spectrum <inline-formula>
<mml:math display="inline" id="im8"><mml:mrow><mml:msub><mml:mi>A</mml:mi><mml:mi>L</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>B</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>d</mml:mi><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mi>L</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>L</mml:mi></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> and phase spectrum <inline-formula>
<mml:math display="inline" id="im9"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>L</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>B</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>d</mml:mi><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mi>L</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>L</mml:mi></mml:msub></mml:mrow></mml:msup><mml:mo>.</mml:mo></mml:mrow></mml:math></inline-formula> The amplitude spectrum primarily encodes the energy of spatial frequencies, which corresponds to low-level image statistics like contrast and brightness. The phase spectrum, conversely, is critically important as it preserves the high-frequency structural information that defines the precise spatial location of object boundaries and edges. Both spectra are then passed through parallel enhancement branches, each composed of a sequence of convolutional, BatchNorm, and ReLU layers, to learn enhancement residuals, <inline-formula>
<mml:math display="inline" id="im10"><mml:mrow><mml:mtext>&#x394;</mml:mtext><mml:msub><mml:mi>A</mml:mi><mml:mi>L</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im11"><mml:mrow><mml:mtext>&#x394;</mml:mtext><mml:msub><mml:mi>P</mml:mi><mml:mi>L</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>. These are added element-wise to the original spectra to yield the enhanced versions, <inline-formula>
<mml:math display="inline" id="im12"><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>A</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>L</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im13"><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>P</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>L</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, as formulated in <xref ref-type="disp-formula" rid="eq2">Equations 2</xref>, <xref ref-type="disp-formula" rid="eq3">3</xref>, respectively:</p>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>A</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>L</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>A</mml:mi><mml:mi>L</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:mtext>&#x394;</mml:mtext><mml:msub><mml:mi>A</mml:mi><mml:mi>L</mml:mi></mml:msub></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq3"><label>(3)</label>
<mml:math display="block" id="M3"><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>P</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>L</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mi>L</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:mtext>&#x394;</mml:mtext><mml:msub><mml:mi>P</mml:mi><mml:mi>L</mml:mi></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>The enhanced amplitude and phase spectra are recombined to form an enhanced complex frequency tensor. Subsequently, an Inverse FFT (IFFT) is applied to transform this representation back into the spatial domain, yielding the boundary-enhanced feature map <inline-formula>
<mml:math display="inline" id="im14"><mml:mrow><mml:msubsup><mml:mi>X</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mtext>enh</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>. To maintain training stability and preserve the original feature context, the final output of the module, <inline-formula>
<mml:math display="inline" id="im15"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mtext>SGBE</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>, is formed by integrating the enhancement with the input features via a learnable, weighted residual connection as defined in <xref ref-type="disp-formula" rid="eq4">Equation 4</xref>:</p>
<disp-formula id="eq4"><label>(4)</label>
<mml:math display="block" id="M4"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mtext>SGBE</mml:mtext></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mi>L</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:mi>&#x3b1;</mml:mi><mml:mo>&#xb7;</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mtext>enh</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>&#x3b1;</italic> is a learnable scalar parameter that adaptively controls the contribution of the frequency-domain enhancement. This allows the network to dynamically balance spatial feature fidelity with spectral boundary refinement.</p>
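<p>The computation in Equations 2&#x2013;4 can be sketched in PyTorch as follows; the depth and width of the two enhancement branches, and the channel-first feature layout, are assumptions, while the FFT/IFFT flow and the learnable residual weight follow the description above.</p>
<code language="python">import torch
import torch.nn as nn

class SGBE(nn.Module):
    """Spectral-Guided Boundary Enhancement: enhance the amplitude and phase
    spectra with small residual branches, then return to the spatial domain."""
    def __init__(self, in_ch, d=64):
        super().__init__()
        self.proj = nn.Conv2d(in_ch, d, kernel_size=1)  # project to uniform width d
        def branch():
            return nn.Sequential(
                nn.Conv2d(d, d, 3, padding=1), nn.BatchNorm2d(d), nn.ReLU(inplace=True),
                nn.Conv2d(d, d, 3, padding=1), nn.BatchNorm2d(d), nn.ReLU(inplace=True))
        self.amp_branch = branch()    # learns delta A_L
        self.phase_branch = branch()  # learns delta P_L
        self.alpha = nn.Parameter(torch.zeros(1))  # learnable residual weight

    def forward(self, f_l):
        # f_l: channel-first feature map (B, C, H_L, W_L); a permute from the
        # encoder's token layout is assumed.
        x = self.proj(f_l)                               # X_L, shape (B, d, H_L, W_L)
        spec = torch.fft.fft2(x, norm="ortho")           # 2D FFT
        amp, phase = torch.abs(spec), torch.angle(spec)  # A_L, P_L
        amp = amp + self.amp_branch(amp)                 # Equation 2
        phase = phase + self.phase_branch(phase)         # Equation 3
        enhanced = torch.polar(amp, phase)               # recombine into a complex tensor
        x_enh = torch.fft.ifft2(enhanced, norm="ortho").real
        return x + self.alpha * x_enh                    # Equation 4
</code>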
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Function-specialized mixture-of-experts module</title>
<p>The significant morphological diversity of colorectal polyps, which vary widely in size, shape, and surface texture, demands a highly adaptive feature refinement strategy. To address this challenge, we introduce the Function-Specialized Mixture-of-Experts (FS-MoE) Module. Inspired by the success of MoE designs Zhou et&#xa0;al. (<xref ref-type="bibr" rid="B54">54</xref>), our approach utilizes a dynamic routing mechanism to create a content-aware feature processing pipeline. Unlike conventional MoE systems that use homogeneous experts, our FS-MoE employs a compact set of <italic>heterogeneous experts</italic>, each of which is meticulously designed for a distinct and complementary function. The module processes the input feature map <italic>F<sub>L</sub></italic><sup>SGBE</sup> through three parallel, function-specialized expert branches: the Sobel Edge Expert, the Multi-Scale Texture Expert, and the Dilated Context Expert, as shown in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Detailed architecture of the proposed function-specialized mixture-of-experts (FS-MoE) module. An input feature map is processed in parallel by three heterogeneous experts, each designed for a distinct function: (1) a Sobel Edge Expert to enhance high-frequency boundary information, (2) a Multi-Scale Texture Expert to capture varied surface patterns, and (3) a Dilated Context Expert to aggregate broad contextual information. A lightweight gating network dynamically computes weights to adaptively fuse the outputs of the three experts, tailoring the feature refinement for diverse polyp morphologies.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-16-1734345-g002.tif">
<alt-text content-type="machine-generated">Diagram illustrating a neural network architecture with three modules labeled Sobel Edge Expert, Multi-Scale Texture Expert, and Dilated Context Expert, each receiving input, and a Gating Network combining their outputs before progressing through additional network layers.</alt-text>
</graphic></fig>
<p>The Sobel Edge Expert is designed to explicitly capture and enhance high-frequency boundary information. It consists of a lightweight, depthwise 3&#xd7;3 convolution where the kernels, <inline-formula>
<mml:math display="inline" id="im16"><mml:mrow><mml:msub><mml:mi>K</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>e</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>3</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>, are initialized with classic Sobel operators Kittler (<xref ref-type="bibr" rid="B55">55</xref>). This provides a strong inductive bias for edge detection, which is subsequently fine-tuned during training. The operation is formally defined in <xref ref-type="disp-formula" rid="eq5">Equation 5</xref>:</p>
<disp-formula id="eq5"><label>(5)</label>
<mml:math display="block" id="M5"><mml:mrow><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>e</mml:mi><mml:mi>d</mml:mi><mml:mi>g</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>F</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>&#x3c3;</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mtext>BN</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:mi>F</mml:mi><mml:msub><mml:mo>*</mml:mo><mml:mrow><mml:mi>d</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mi>K</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>e</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where &#x2217;<italic><sub>dw</sub></italic> denotes the depthwise convolution operation, BN is BatchNorm, and <italic>&#x3c3;</italic> is the GeLU activation function.</p>
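<p>For illustration, the Sobel-initialized depthwise convolution of Equation 5 could be constructed as in the sketch below; assigning the horizontal and vertical Sobel kernels to alternating channels is an assumption made only for this example.</p>
<code language="python">import torch
import torch.nn as nn

def make_sobel_edge_expert(d):
    """Depthwise 3x3 convolution whose kernels start from classic Sobel
    operators, followed by BatchNorm and GeLU (Equation 5)."""
    conv = nn.Conv2d(d, d, kernel_size=3, padding=1, groups=d, bias=False)
    sobel_x = torch.tensor([[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]])
    sobel_y = sobel_x.t()
    with torch.no_grad():
        for c in range(d):  # alternate horizontal/vertical kernels per channel (assumption)
            conv.weight[c, 0] = sobel_x if c % 2 == 0 else sobel_y
    return nn.Sequential(conv, nn.BatchNorm2d(d), nn.GELU())
</code>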
<p>The Multi-Scale Texture Expert aims to capture varied surface patterns at multiple scales. It employs a three-branch parallel design where the input feature <italic>F</italic> is first projected into a low-dimensional space, <inline-formula>
<mml:math display="inline" id="im17"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mo>&#x2192;</mml:mo><mml:mi>d</mml:mi><mml:mo stretchy="false">/</mml:mo><mml:mn>8</mml:mn></mml:mrow></mml:msubsup><mml:mo stretchy="false">(</mml:mo><mml:mi>F</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. The three branches then operate on <inline-formula>
<mml:math display="inline" id="im18"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> to produce outputs <italic>B</italic><sub>1</sub><italic>, B</italic><sub>2</sub><italic>, B</italic><sub>3</sub> with varying receptive fields, as formulated in <xref ref-type="disp-formula" rid="eq6">Equation 6</xref>:</p>
<disp-formula id="eq6"><label>(6)</label>
<mml:math display="block" id="M6"><mml:mrow><mml:msub><mml:mi>B</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq14">
<mml:math display="block" id="M7"><mml:mrow><mml:msub><mml:mi>B</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq15">
<mml:math display="block" id="M8"><mml:mrow><mml:msub><mml:mi>B</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>5</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>5</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>As shown in <xref ref-type="disp-formula" rid="eq7">Equation 7</xref>, these outputs are concatenated and fused via a final 1 &#xd7; 1 convolution to restore the original channel dimension <italic>d</italic>, producing a rich, multi-scale texture representation:</p>
<disp-formula id="eq7"><label>(7)</label>
<mml:math display="block" id="M9"><mml:mrow><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>x</mml:mi><mml:mi>t</mml:mi><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>F</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mo stretchy="false">/</mml:mo><mml:mn>8</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>3</mml:mn><mml:mo>&#x2192;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msubsup><mml:mo stretchy="false">(</mml:mo><mml:mtext>Concat</mml:mtext><mml:mo stretchy="false">[</mml:mo><mml:msub><mml:mi>B</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>B</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>B</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo stretchy="false">]</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>To efficiently aggregate broad contextual information, the Dilated Context Expert utilizes a depthwise separable convolution. This two-stage process involves a 3 &#xd7; 3 depthwise convolution with an enlarged dilation rate (dil = 2) to widen the receptive field, followed by a 1 &#xd7; 1 pointwise convolution to combine channel features, as expressed in <xref ref-type="disp-formula" rid="eq8">Equation 8</xref>:</p>
<disp-formula id="eq8"><label>(8)</label>
<mml:math display="block" id="M10"><mml:mrow><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>x</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>F</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mtext>pw</mml:mtext></mml:mrow></mml:msubsup><mml:mo stretchy="false">(</mml:mo><mml:mi>&#x3c3;</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mtext>BN</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>3</mml:mn></mml:mrow><mml:mrow><mml:mtext>dw</mml:mtext><mml:mo>,</mml:mo><mml:mtext>&#x2004; dil</mml:mtext><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo stretchy="false">(</mml:mo><mml:mi>F</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>The core of the module is its adaptive gating mechanism. A lightweight gating network, <inline-formula>
<mml:math display="inline" id="im19"><mml:mrow><mml:mi>G</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mo>&#xb7;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, processes the input feature <italic>F<sub>L</sub></italic><sup>SGBE</sup> to dynamically generate a set of scalar weights <inline-formula>
<mml:math display="inline" id="im20"><mml:mrow><mml:mi>w</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mn>3</mml:mn></mml:msup></mml:mrow></mml:math></inline-formula>. As formulated in <xref ref-type="disp-formula" rid="eq9">Equation 9</xref>, the refined feature map, <inline-formula>
<mml:math display="inline" id="im21"><mml:mrow><mml:msubsup><mml:mi>X</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mtext>ref</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>, is obtained by a weighted sum of the expert outputs, <inline-formula>
<mml:math display="inline" id="im22"><mml:mrow><mml:msub><mml:mi>E</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mo>&#xb7;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>:</p>
<disp-formula id="eq9"><label>(9)</label>
<mml:math display="block" id="M11"><mml:mrow><mml:msubsup><mml:mi>X</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mtext>ref</mml:mtext></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mn>3</mml:mn></mml:munderover><mml:msub><mml:mi>w</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi>E</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>F</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mtext>SGBE</mml:mtext></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>Finally, a residual connection with a learnable scalar parameter, <italic>&#x3b2;</italic>, is applied to form the final output, <inline-formula>
<mml:math display="inline" id="im23"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mtext>FS</mml:mtext><mml:mo>&#x2212;</mml:mo><mml:mtext>MoE</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> according to <xref ref-type="disp-formula" rid="eq10">Equation 10</xref>:</p>
<disp-formula id="eq10"><label>(10)</label>
<mml:math display="block" id="M12"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mtext>FS</mml:mtext><mml:mo>&#x2212;</mml:mo><mml:mtext>MoE</mml:mtext></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msubsup><mml:mi>F</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mtext>SGBE</mml:mtext></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:mi>&#x3b2;</mml:mi><mml:mo>&#xb7;</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mtext>ref</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:math>
</disp-formula>
<p>This adaptive fusion empowers our network to intelligently tailor its feature refinement strategy, improving its ability to accurately segment polyps across their wide spectrum of visual presentations.</p>
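<p>Putting the experts together, the adaptive fusion of Equations 9, 10 can be sketched as follows. The gating design shown here (global average pooling followed by a linear layer and a softmax) is an assumption consistent with a lightweight gate; edge_expert may be the Sobel expert sketched above, while texture_expert and context_expert follow Equations 7, 8.</p>
<code language="python">import torch
import torch.nn as nn

class FSMoE(nn.Module):
    """Function-Specialized Mixture-of-Experts: three heterogeneous experts
    fused by dynamically generated scalar weights (Equations 9, 10)."""
    def __init__(self, d, edge_expert, texture_expert, context_expert):
        super().__init__()
        self.experts = nn.ModuleList([edge_expert, texture_expert, context_expert])
        # Lightweight gate: pooled descriptor mapped to three scalar weights.
        self.gate = nn.Sequential(
            nn.AdaptiveAvgPool2d(1), nn.Flatten(),
            nn.Linear(d, 3), nn.Softmax(dim=-1))
        self.beta = nn.Parameter(torch.zeros(1))  # learnable residual weight

    def forward(self, f_sgbe):
        w = self.gate(f_sgbe)                                   # (B, 3)
        outs = [expert(f_sgbe) for expert in self.experts]
        x_ref = sum(w[:, j].view(-1, 1, 1, 1) * outs[j] for j in range(3))  # Equation 9
        return f_sgbe + self.beta * x_ref                       # Equation 10
</code>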
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>Progressive refinement decoder</title>
<p>Although our enhanced skip connections provide rich, multi-scale feature representations, effectively fusing them to reconstruct a precise segmentation mask is non-trivial. Directly merging features from different scales can lead to coarse boundaries or the dilution of semantic information. To address this, we employ a Progressive Refinement Decoder (PRD), inspired by top-down fusion approaches proven effective in dense prediction tasks. The decoder is designed to hierarchically aggregate the enhanced multi-scale features, progressively recovering fine spatial details while preserving high-level semantic context.</p>
<p>The decoder reconstructs the final prediction through a sequence of refinement stages, beginning with the deepest feature map from the FS-MoE module, <inline-formula>
<mml:math display="inline" id="im24"><mml:mrow><mml:msub><mml:mi>D</mml:mi><mml:mn>4</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mi>F</mml:mi><mml:mn>4</mml:mn><mml:mrow><mml:mtext>FS</mml:mtext><mml:mo>&#x2212;</mml:mo><mml:mtext>MoE</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>. For each subsequent decoding stage <italic>i</italic> &#x2208; {3, 2, 1}, a Progressive Refinement Module (PRM) integrates the upsampled features from the deeper stage, <italic>D<sub>i</sub></italic><sub>+ 1</sub>, with the corresponding enhanced skip connection feature, <inline-formula>
<mml:math display="inline" id="im25"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mtext>FS</mml:mtext><mml:mo>&#x2212;</mml:mo><mml:mtext>MoE</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>. This integration process is formally defined in <xref ref-type="disp-formula" rid="eq11">Equation 11</xref>:</p>
<disp-formula id="eq11"><label>(11)</label>
<mml:math display="block" id="M13"><mml:mrow><mml:msub><mml:mi>D</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mtext>&#x3a6;</mml:mtext><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mtext>Concat</mml:mtext><mml:mo stretchy="false">[</mml:mo><mml:mtext>Upsample</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mi>F</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mtext>FS</mml:mtext><mml:mo>&#x2212;</mml:mo><mml:mtext>MoE</mml:mtext></mml:mrow></mml:msubsup><mml:mo stretchy="false">]</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where Upsample(&#xb7;) denotes a 2&#xd7; bilinear upsampling operation and &#x3a6;<italic><sub>i</sub></italic>(&#xb7;) is a refinement block composed of a 3 &#xd7; 3 convolution, BatchNorm, and a GeLU activation. This fusion process yields a series of intermediate decoder features {<italic>D</italic><sub>3</sub><italic>, D</italic><sub>2</sub><italic>, D</italic><sub>1</sub>} with progressively increasing spatial resolution.</p>
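<p>A single decoding stage of Equation 11 then corresponds to the following sketch; the uniform channel width d is an assumption carried over from the skip-connection modules.</p>
<code language="python">import torch
import torch.nn as nn
import torch.nn.functional as F

class PRM(nn.Module):
    """Progressive Refinement Module: upsample the deeper decoder feature,
    concatenate it with the enhanced skip feature, then refine (Equation 11)."""
    def __init__(self, d=64):
        super().__init__()
        self.refine = nn.Sequential(                 # Phi_i
            nn.Conv2d(2 * d, d, kernel_size=3, padding=1),
            nn.BatchNorm2d(d),
            nn.GELU())

    def forward(self, d_deep, f_skip):
        up = F.interpolate(d_deep, scale_factor=2, mode="bilinear", align_corners=False)
        return self.refine(torch.cat([up, f_skip], dim=1))
</code>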
<p>To guide the learning process at all scales, a prediction head, <inline-formula>
<mml:math display="inline" id="im26"><mml:mrow><mml:msub><mml:mi>H</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mo>&#xb7;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, is applied to the output of each decoder stage to produce an auxiliary segmentation map, <italic>out<sub>i</sub></italic>. Each head consists of a 1 &#xd7; 1 convolution followed by a sigmoid activation function. These auxiliary maps (<italic>out</italic><sub>4</sub><italic>, out</italic><sub>3</sub><italic>, out</italic><sub>2</sub>) are upsampled to the original input resolution and used for deep supervision, as detailed in the following section. The final, primary segmentation map, <italic>out</italic><sub>1</sub>, is produced from the feature map of the last and highest-resolution decoder stage, <italic>D</italic><sub>1</sub>.</p>
<p>This progressive, top-down refinement structure ensures that strong semantic guidance from the deeper layers is effectively propagated and fused with the detailed, boundary-rich features at shallower layers. This process enables the network to achieve both a globally consistent understanding of the polyp and pixel-level precision in the final segmentation mask.</p>
</sec>
<sec id="s3_6">
<label>3.6</label>
<title>Multi-scale supervision</title>
<p>To ensure robust feature learning across all scales, the network is trained using a deep supervision strategy where a loss is applied to the output of each of the four decoder stages. This total loss function, <italic>L</italic><sub>total</sub>, is formulated in <xref ref-type="disp-formula" rid="eq12">Equation 12</xref> as a weighted sum of the loss from the primary output (<italic>out</italic><sub>1</sub>) and the three auxiliary outputs (<italic>out</italic><sub>2</sub><italic>, out</italic><sub>3</sub><italic>, out</italic><sub>4</sub>):</p>
<disp-formula id="eq12"><label>(12)</label>
<mml:math display="block" id="M14"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mtext>total</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mtext>seg</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:msub><mml:mi>t</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mi>G</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow><mml:mn>4</mml:mn></mml:munderover><mml:mrow><mml:msub><mml:mi>&#x3bb;</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:mstyle><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mtext>seg</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mtext>Upsample</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:msub><mml:mi>t</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mi>G</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>G</italic> is the ground-truth polyp mask and <italic>&#x3bb;<sub>i</sub></italic> are loss weights that balance the gradients from different scales. Based on empirical evaluation, we set the weights for the auxiliary outputs to <italic>&#x3bb;<sub>i</sub></italic> = 0.1 for <italic>i</italic> &#x2208; {2, 3, 4}.</p>
<p>Following Dong et&#xa0;al. (<xref ref-type="bibr" rid="B17">17</xref>) Fan et&#xa0;al. (<xref ref-type="bibr" rid="B42">42</xref>), our core segmentation loss, <italic>L</italic><sub>seg</sub>, is a hybrid loss composed of the sum of Binary Cross-Entropy Loss (<italic>L</italic><sub>BCE</sub>) and Dice Loss (<italic>L</italic><sub>Dice</sub>), as shown in <xref ref-type="disp-formula" rid="eq13">Equation 13</xref>:</p>
<disp-formula id="eq13"><label>(13)</label>
<mml:math display="block" id="M15"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mtext>seg</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mtext>BCE</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mtext>Dice</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>The <italic>L</italic><sub>BCE</sub> term enforces pixel-level correctness, while the <italic>L</italic><sub>Dice</sub> term improves performance on imbalanced classes by maximizing the spatial overlap between the predicted mask and the ground truth.</p>
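<p>A minimal sketch of the hybrid loss and the deep-supervision objective of Equations 12, 13 is given below; the Dice smoothing constant and the reduction over the batch are assumptions for illustration.</p>
<code language="python">import torch.nn.functional as F

def seg_loss(pred, target, eps=1.0):
    """L_seg = L_BCE + L_Dice on a sigmoid probability map (Equation 13)."""
    bce = F.binary_cross_entropy(pred, target)
    inter = (pred * target).sum(dim=(-2, -1))
    denom = pred.sum(dim=(-2, -1)) + target.sum(dim=(-2, -1))
    dice = 1.0 - (2.0 * inter + eps) / (denom + eps)
    return bce + dice.mean()

def total_loss(outs, gt, lambdas=(0.1, 0.1, 0.1)):
    """L_total over the primary map outs[0] and the auxiliary maps outs[1:],
    upsampled to the ground-truth resolution (Equation 12)."""
    loss = seg_loss(outs[0], gt)
    for lam, out in zip(lambdas, outs[1:]):
        out = F.interpolate(out, size=gt.shape[-2:], mode="bilinear", align_corners=False)
        loss = loss + lam * seg_loss(out, gt)
    return loss
</code>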
<p>This deep supervision scheme ensures that the intermediate layers of the decoder are explicitly guided toward producing semantically correct feature maps. By enforcing consistency across multiple scales, the model learns to effectively align high-resolution details from shallower layers with the robust semantic context from deeper layers, leading to more accurate and coherent polyp segmentation.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiment and results</title>
<sec id="s4_1">
<label>4.1</label>
<title>Datasets</title>
<p>To rigorously evaluate our proposed model, we curated a high-quality segmentation dataset named the PolypSegDataset. This dataset comprises 1,302 images containing a total of 1,342 meticulously annotated polyp instances. A key characteristic of this dataset is its strong focus on large, clinically significant polyps, which constitute the vast majority of the samples. Furthermore, the dataset is highly standardized, with all images sharing a uniform resolution of 560 &#xd7; 480 pixels, which facilitates direct and fair model comparison. The high fidelity of the ground-truth masks makes this dataset particularly well-suited for evaluating boundary-level segmentation accuracy.</p>
<p>To provide a comprehensive overview of the dataset&#x2019;s characteristics, we conducted a detailed statistical analysis, with the key statistics summarized in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>. Regarding the polyp area distribution, the data is overwhelmingly dominated by large polyps (<italic>&gt;</italic> 5000 pixels), which account for a striking 87.1% of all instances. In contrast, small polyps (<italic>&lt;</italic> 1000 pixels) are nearly absent at only 0.2%. This distinct composition makes the PolypSegDataset an ideal benchmark for developing and testing models on well-developed, clinically significant lesions rather than incipient polyps.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Statistical analysis of the polyp area distribution within the curated PolypSegDataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Metric</th>
<th valign="middle" align="left">Category/statistic</th>
<th valign="middle" align="center">Value/percentage</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left" rowspan="3">Polyp Size Distribution</td>
<td valign="middle" align="left">Small (<italic>&lt;</italic> 1000 pixels)</td>
<td valign="middle" align="center">0.2%</td>
</tr>
<tr>
<td valign="middle" align="left">Medium (1000 &#x2212; 5000 pixels)</td>
<td valign="middle" align="center">12.7%</td>
</tr>
<tr>
<td valign="middle" align="left">Large (<italic>&gt;</italic> 5000 pixels)</td>
<td valign="middle" align="center">87.1%</td>
</tr>
</tbody>
</table>
</table-wrap>
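<p>As a point of reference for how the statistics in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref> can be reproduced, the following is a minimal sketch that measures the pixel area of each connected polyp instance in a binary ground-truth mask and assigns it to a size category. The use of <monospace>scikit-image</monospace> connected components and the handling of the boundary value at exactly 5,000 pixels are assumptions, not part of the dataset release.</p>
<code language="python">
from skimage.measure import label, regionprops

def size_category(area_px):
    """Bucket a polyp instance by pixel area using the Table 1 thresholds;
    the boundary value at exactly 5,000 pixels is an assumption."""
    if area_px >= 5000:
        return "large"
    if area_px >= 1000:
        return "medium"
    return "small"

def mask_instance_areas(mask):
    """Pixel area of each connected polyp instance in a binary NumPy mask."""
    labeled = label(mask.astype(bool))
    return [int(r.area) for r in regionprops(labeled)]

def area_distribution(masks):
    """Tally the size-category counts over a list of ground-truth masks."""
    counts = {"small": 0, "medium": 0, "large": 0}
    for m in masks:
        for area in mask_instance_areas(m):
            counts[size_category(area)] += 1
    return counts
</code>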
<p>Annotation Quality. <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref> provides insight into the quality and precision of the ground-truth masks. The distribution of the number of vertices per annotation has a high average of 38.79. This metric underscores the meticulous detail with which the polyp boundaries were traced, moving far beyond simple bounding boxes or coarse outlines. Such high-precision annotation is critical for the rigorous evaluation of segmentation algorithms, especially for assessing boundary accuracy.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Analysis of the annotation precision in the PolypSegDataset, illustrated by the distribution of the number of vertices used per ground-truth mask. The data shows a high average of 38.79 vertices per annotation, which underscores the meticulous detail and high fidelity of the boundary tracing. This high-precision annotation makes the dataset particularly suitable for the rigorous evaluation of boundary-level segmentation accuracy.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-16-1734345-g003.tif">
<alt-text content-type="machine-generated">Three vertical bar charts show distributions: left, shapes per image peaks at six; center, vertices per shape peaks near thirty-five; right, annotation areas skewed left, peaking at smaller pixel counts and tapering toward larger areas.</alt-text>
</graphic></fig>
<p>In addition to our curated PolypSegDataset, we also evaluate our model&#x2019;s performance on two widely used public benchmarks to ensure a comprehensive and fair comparison. We use CVC-ClinicDB Bernal et&#xa0;al. (<xref ref-type="bibr" rid="B56">56</xref>), a standard dataset containing 612 images with a variety of polyp types, and Kvasir-SEG Jha et&#xa0;al. (<xref ref-type="bibr" rid="B57">57</xref>), a larger and more diverse dataset consisting of 1,000 annotated images. Unlike our dataset, these public benchmarks feature a broader range of polyp sizes, shapes, and imaging conditions, allowing us to assess the generalizability and robustness of our proposed method.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Implementation details</title>
<p>We implement our proposed model using the PyTorch framework and conduct all experiments on a single NVIDIA RTX 4090 GPU. The encoder backbone is initialized with the weights of SAM2-Hiera-Large pre-trained on the SA-1B dataset. For all three datasets&#x2014;PolypSegDataset, CVC-ClinicDB, and Kvasir-SEG&#x2014;we perform a stratified split into training (80%), validation (10%), and testing (10%) sets, using a fixed random seed of 42 to ensure the reproducibility of our results. The specific splits for the training, validation, and testing sets are 1041/130/131 for PolypSegDataset, 489/61/62 for CVC-ClinicDB, and 800/100/100 for Kvasir-SEG, respectively.</p>
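<p>A minimal sketch of the reproducible 80%/10%/10% split is given below. The use of <monospace>scikit-learn</monospace> and the choice of a per-image polyp-size label as the stratification variable are illustrative assumptions; only the split ratios and the random seed of 42 are fixed above.</p>
<code language="python">
from sklearn.model_selection import train_test_split

def split_80_10_10(samples, strata, seed=42):
    """Stratified 80/10/10 split with a fixed seed. `strata` is assumed to be a
    per-image categorical label (e.g., polyp-size category); the paper does not
    specify the stratification variable."""
    train, rest, _, y_rest = train_test_split(
        samples, strata, test_size=0.2, random_state=seed, stratify=strata)
    val, test, _, _ = train_test_split(
        rest, y_rest, test_size=0.5, random_state=seed, stratify=y_rest)
    return train, val, test
</code>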
<p>During the training phase, all input images are resized to a uniform resolution of 352 &#xd7; 352 pixels. We train the model with a batch size of 12 for a total of 100 epochs. We employ the AdamW optimizer with an initial learning rate of 3 &#xd7; 10<sup>&#x2212;4</sup> and a weight decay of 1 &#xd7; 10<sup>&#x2212;5</sup>. To ensure training stability, a learning rate warmup strategy is utilized for the first 20 epochs. Furthermore, to prevent overfitting and select the best-performing model checkpoint, we implement an early stopping mechanism that monitors the segmentation loss on the validation set.</p>
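<p>The optimization setup can be sketched as follows. The linear warmup shape and the early-stopping patience are illustrative assumptions, as only the optimizer, learning rate, weight decay, warmup length, and the use of validation-loss early stopping are specified above.</p>
<code language="python">
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR

def build_optimizer(model, warmup_epochs=20):
    """AdamW with the reported hyper-parameters; only trainable (non-frozen)
    parameters are passed to the optimizer. The warmup shape is an assumption."""
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = AdamW(trainable, lr=3e-4, weight_decay=1e-5)

    def lr_lambda(epoch):
        # Linear warmup over the first `warmup_epochs`, then a constant rate.
        if epoch >= warmup_epochs:
            return 1.0
        return (epoch + 1) / warmup_epochs

    scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda)
    return optimizer, scheduler

class EarlyStopping:
    """Stop when the validation segmentation loss has not improved for
    `patience` epochs (the patience value is an assumption)."""
    def __init__(self, patience=10):
        self.patience, self.best, self.bad_epochs = patience, float("inf"), 0

    def step(self, val_loss):
        improved = self.best - val_loss > 0
        if improved:
            self.best, self.bad_epochs = val_loss, 0
        else:
            self.bad_epochs += 1
        return self.bad_epochs >= self.patience  # True means stop training
</code>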
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Evaluation metrics and compared methods</title>
<p>To provide a comprehensive and multi-faceted evaluation of segmentation performance, we employ eight widely-used metrics: mean Dice Coefficient (mDice), mean Intersection over Union (mIoU), Mean Absolute Error (MAE), S-measure (<italic>S<sub>&#x3b1;</sub></italic>), weighted F-measure <inline-formula>
<mml:math display="inline" id="im27"><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>F</mml:mi><mml:mi>&#x3b2;</mml:mi><mml:mi>w</mml:mi></mml:msubsup><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:math></inline-formula> mean E-measure <inline-formula>
<mml:math display="inline" id="im28"><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>m</mml:mi><mml:msub><mml:mi>E</mml:mi><mml:mi>&#x3be;</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:math></inline-formula> mean Sensitivity (meanSen), and mean Specificity (meanSpe).</p>
<p>Among these, mDice and mIoU are region-based similarity metrics that evaluate the overlap between the predicted mask and the ground truth, primarily assessing the overall accuracy of the segmented object. MAE offers a direct pixel-by-pixel comparison by calculating the average absolute difference between the continuous prediction map and the binary ground-truth mask. To evaluate the model&#x2019;s classification performance on foreground and background pixels respectively, we report meanSen and meanSpe. Sensitivity measures the model&#x2019;s ability to correctly identify polyp pixels (true positives), while Specificity measures its ability to correctly identify background pixels (true negatives). We also include three advanced metrics that capture more complex, human perception-aligned aspects of segmentation quality. The weighted <inline-formula>
<mml:math display="inline" id="im29"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>&#x3b2;</mml:mi><mml:mi>w</mml:mi></mml:msubsup></mml:mrow></mml:math></inline-formula> balances precision and recall using non-uniform weights to better match visual assessment. The <italic>S<sub>&#x3b1;</sub></italic> evaluates the structural similarity between the prediction and the ground truth at both the object and region levels. Finally, the <inline-formula>
<mml:math display="inline" id="im30"><mml:mrow><mml:mi>m</mml:mi><mml:msub><mml:mi>E</mml:mi><mml:mi>&#x3be;</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> simultaneously captures both image-level statistics and local pixel-level matching.</p>
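<p>For the region- and pixel-level metrics, a minimal per-image computation is sketched below; <italic>S<sub>&#x3b1;</sub></italic>, the weighted F-measure, and the E-measure follow their original reference implementations and are omitted from the sketch. The binarization threshold of 0.5 and the numerical stabilizer are assumptions.</p>
<code language="python">
import numpy as np

def basic_metrics(prob, gt, thr=0.5):
    """Per-image Dice, IoU, MAE, sensitivity, and specificity from a continuous
    prediction map `prob` in [0, 1] and a binary ground-truth mask `gt`."""
    pred = (prob >= thr).astype(np.float64)
    gt = gt.astype(np.float64)
    tp = (pred * gt).sum()
    fp = (pred * (1 - gt)).sum()
    fn = ((1 - pred) * gt).sum()
    tn = ((1 - pred) * (1 - gt)).sum()
    eps = 1e-8
    return {
        "dice": 2 * tp / (2 * tp + fp + fn + eps),
        "iou": tp / (tp + fp + fn + eps),
        "mae": np.abs(prob - gt).mean(),  # uses the continuous map, not the binarized one
        "sensitivity": tp / (tp + fn + eps),
        "specificity": tn / (tn + fp + eps),
    }
</code>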
<p>To benchmark the performance of our proposed model, we conduct a comprehensive comparison against eight state-of-the-art and representative models from the field of semantic and medical image segmentation. The selected methods include classic architectures such as U-Net Ronneberger et&#xa0;al. (<xref ref-type="bibr" rid="B39">39</xref>), UNet++ Zhou et&#xa0;al. (<xref ref-type="bibr" rid="B40">40</xref>), ResUNet Zhang et&#xa0;al. (<xref ref-type="bibr" rid="B41">41</xref>), and DeepLabV3 Chen et&#xa0;al. (<xref ref-type="bibr" rid="B58">58</xref>), as well as more recent Transformer-based models like TransUNet Chen et&#xa0;al. (<xref ref-type="bibr" rid="B44">44</xref>). We also compare against several models designed specifically for polyp segmentation, including PraNet Fan et&#xa0;al. (<xref ref-type="bibr" rid="B42">42</xref>), Polyp-PVT Dong et&#xa0;al. (<xref ref-type="bibr" rid="B17">17</xref>), and CTNet Xiao et&#xa0;al. (<xref ref-type="bibr" rid="B59">59</xref>). For a fair and direct comparison, we utilize the publicly available, open-source implementations for all baseline methods.</p>
</sec>
<sec id="s4_4" sec-type="results">
<label>4.4</label>
<title>Results</title>
<sec id="s4_4_1" sec-type="results">
<label>4.4.1</label>
<title>Results analysis for Kvasir-SEG</title>
<p>The quantitative results of our model and eight other state-of-the-art methods on the Kvasir-SEG dataset are presented in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>. Our proposed method demonstrates superior performance, achieving the best results on five of the eight key evaluation metrics. Specifically, our model attains the highest scores in mIoU (85.03%), mDice (90.46%), meanSpe (98.40%), and weighted F-measure <inline-formula>
<mml:math display="inline" id="im32"><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>F</mml:mi><mml:mi>&#x3b2;</mml:mi><mml:mi>w</mml:mi></mml:msubsup><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> (89.53%), while also recording the lowest (best) MAE (0.0313). Compared to the strongest baseline, CTNet, our model shows a notable improvement in the critical region-based similarity metrics of mDice and mIoU. Furthermore, our method achieves highly competitive, second-best results on the structural similarity metrics <italic>S<sub>&#x3b1;</sub></italic> (92.47%) and <inline-formula>
<mml:math display="inline" id="im33"><mml:mrow><mml:mi>m</mml:mi><mml:msub><mml:mi>E</mml:mi><mml:mi>&#x3be;</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> (94.52%). While some methods like Polyp-PVT exhibit slightly higher sensitivity, our model&#x2019;s leading specificity score indicates a robust ability to minimize false positives by correctly identifying background regions.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Quantitative comparison of our proposed method against eight state-of-the-art models on the Kvasir-SEG test set.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Method</th>
<th valign="middle" align="center">Publication</th>
<th valign="middle" align="center">MAE&#x2193;</th>
<th valign="middle" align="center"><italic>S<sub>&#x3b1;</sub></italic>&#x2191;</th>
<th valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im31"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>&#x3b2;</mml:mi><mml:mi>w</mml:mi></mml:msubsup></mml:mrow></mml:math></inline-formula>&#x2191;</th>
<th valign="middle" align="center"><italic>mE<sub>&#x3be;</sub></italic>&#x2191;</th>
<th valign="middle" align="center">meanSen&#x2191;</th>
<th valign="middle" align="center">meanSpe&#x2191;</th>
<th valign="middle" align="center">mDice&#x2191;</th>
<th valign="middle" align="center">mIoU&#x2191;</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">U-Net</td>
<td valign="middle" align="center">MICCAI 2015</td>
<td valign="middle" align="center">0.0446</td>
<td valign="middle" align="center">0.8820</td>
<td valign="middle" align="center">0.8132</td>
<td valign="middle" align="center">0.9024</td>
<td valign="middle" align="center">0.8866</td>
<td valign="middle" align="center">0.9739</td>
<td valign="middle" align="center">0.8470</td>
<td valign="middle" align="center">0.7735</td>
</tr>
<tr>
<td valign="middle" align="left">UNet++</td>
<td valign="middle" align="center">TMI 2019</td>
<td valign="middle" align="center">0.0432</td>
<td valign="middle" align="center">0.8869</td>
<td valign="middle" align="center">0.8326</td>
<td valign="middle" align="center">0.9116</td>
<td valign="middle" align="center">0.8851</td>
<td valign="middle" align="center">0.9761</td>
<td valign="middle" align="center">0.8562</td>
<td valign="middle" align="center">0.7871</td>
</tr>
<tr>
<td valign="middle" align="left">DeepLab V3</td>
<td valign="middle" align="center">CVPR 2017</td>
<td valign="middle" align="center">0.0444</td>
<td valign="middle" align="center">0.8826</td>
<td valign="middle" align="center">0.8111</td>
<td valign="middle" align="center">0.9096</td>
<td valign="middle" align="center">0.8888</td>
<td valign="middle" align="center">0.9718</td>
<td valign="middle" align="center">0.8478</td>
<td valign="middle" align="center">0.7667</td>
</tr>
<tr>
<td valign="middle" align="left">ResUNet</td>
<td valign="middle" align="center">CVPR 2016</td>
<td valign="middle" align="center">0.0827</td>
<td valign="middle" align="center">0.7808</td>
<td valign="middle" align="center">0.6578</td>
<td valign="middle" align="center">0.8262</td>
<td valign="middle" align="center">0.7683</td>
<td valign="middle" align="center">0.9579</td>
<td valign="middle" align="center">0.7103</td>
<td valign="middle" align="center">0.6029</td>
</tr>
<tr>
<td valign="middle" align="left">PraNet</td>
<td valign="middle" align="center">MICCAI 2020</td>
<td valign="middle" align="center">0.0378</td>
<td valign="middle" align="center">0.8915</td>
<td valign="middle" align="center">0.8481</td>
<td valign="middle" align="center">0.9171</td>
<td valign="middle" align="center">0.8651</td>
<td valign="middle" align="center"><underline>0.9829</underline></td>
<td valign="middle" align="center">0.8630</td>
<td valign="middle" align="center">0.7950</td>
</tr>
<tr>
<td valign="middle" align="left">Polyp-PVT</td>
<td valign="middle" align="center">AIR 2023</td>
<td valign="middle" align="center">0.0337</td>
<td valign="middle" align="center">0.9076</td>
<td valign="middle" align="center">0.8717</td>
<td valign="middle" align="center">0.9368</td>
<td valign="middle" align="center"><underline>0.9284</underline></td>
<td valign="middle" align="center">0.9774</td>
<td valign="middle" align="center">0.8909</td>
<td valign="middle" align="center">0.8296</td>
</tr>
<tr>
<td valign="middle" align="left">TransUNet</td>
<td valign="middle" align="center">MIA 2024</td>
<td valign="middle" align="center">0.0361</td>
<td valign="middle" align="center">0.9194</td>
<td valign="middle" align="center">0.8691</td>
<td valign="middle" align="center">0.9394</td>
<td valign="middle" align="center">0.9166</td>
<td valign="middle" align="center">0.9776</td>
<td valign="middle" align="center"><underline>0.8992</underline></td>
<td valign="middle" align="center">0.8396</td>
</tr>
<tr>
<td valign="middle" align="left">CTNet</td>
<td valign="middle" align="center">TCYB 2024</td>
<td valign="middle" align="center"><underline>0.0319</underline></td>
<td valign="middle" align="center"><bold>0.9263</bold></td>
<td valign="middle" align="center"><underline>0.8794</underline></td>
<td valign="middle" align="center"><bold>0.9468</bold></td>
<td valign="middle" align="center"><bold>0.9305</bold></td>
<td valign="middle" align="center">0.9767</td>
<td valign="middle" align="center">0.8969</td>
<td valign="middle" align="center"><underline>0.8412</underline></td>
</tr>
<tr>
<td valign="middle" align="left">Ours</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"><bold>0.0313</bold></td>
<td valign="middle" align="center"><underline>0.9247</underline></td>
<td valign="middle" align="center"><bold>0.8953</bold></td>
<td valign="middle" align="center"><underline>0.9452</underline></td>
<td valign="middle" align="center">0.9052</td>
<td valign="middle" align="center"><bold>0.9840</bold></td>
<td valign="middle" align="center"><bold>0.9046</bold></td>
<td valign="middle" align="center"><bold>0.8503</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>We report performance across eight different evaluation metrics. The best results are highlighted in bold, and the second-best results are underlined.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>In addition to the quantitative metrics, <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref> provides a qualitative comparison of the segmentation results from different models on representative images from the Kvasir-SEG test set. The visual results consistently illustrate that our model produces more precise and complete segmentation masks that more accurately adhere to the ground-truth boundaries. In challenging cases involving ambiguous edges, low contrast between the polyp and the surrounding mucosa, or irregular shapes, our method demonstrates enhanced robustness. It successfully maintains the integrity of the polyp structure while other methods, such as PraNet or U-Net, may yield incomplete masks or struggle with fine boundary details. This visual evidence corroborates the quantitative findings, highlighting our model&#x2019;s improved ability to handle the complexities of polyp segmentation.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Qualitative comparison of segmentation results on Kvasir-SEG dataset. The green contour represents the ground-truth (GT) boundary. The model&#x2019;s prediction is visualized as a semi-transparent yellow heatmap, where opacity indicates confidence. The final predicted boundary, after applying a 0.5 threshold, is shown as a white contour.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-16-1734345-g004.tif">
<alt-text content-type="machine-generated">Grid of colonoscopy images displays progressive annotation of polyps using different colored overlays and outline markings, illustrating the step-by-step segmentation and identification of abnormal growths in the gastrointestinal tract for medical analysis.</alt-text>
</graphic></fig>
</sec>
<sec id="s4_4_2">
<label>4.4.2</label>
<title>CVC-ClinicDB</title>
<p><xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref> presents the comparative results on the standard CVC-ClinicDB benchmark. On this highly competitive dataset, our model demonstrates state-of-the-art performance on the core metrics for segmentation accuracy. Specifically, our method achieves the highest mDice (93.16%) and mIoU (88.02%), which are the primary indicators of segmentation quality. It also records the best meanSpe (99.10%), highlighting its excellent ability to avoid false positives. While other methods like PraNet and Polyp-PVT show strong performance on certain perceptual metrics such as MAE and <italic>S<sub>&#x3b1;</sub></italic>, our model&#x2019;s superiority in mDice and mIoU suggests it produces the most spatially accurate and reliable segmentation masks overall. The qualitative results in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref> further support these findings, where our model consistently generates smooth and precise boundaries that closely adhere to the ground truth.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Quantitative comparison of our proposed method against eight state-of-the-art models on the CVC-ClinicDB test set.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Method</th>
<th valign="middle" align="center">Publication</th>
<th valign="middle" align="center">MAE&#x2193;</th>
<th valign="middle" align="center"><italic>S<sub>&#x3b1;</sub></italic>&#x2191;</th>
<th valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im34"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>&#x3b2;</mml:mi><mml:mi>w</mml:mi></mml:msubsup></mml:mrow></mml:math></inline-formula>&#x2191;</th>
<th valign="middle" align="center">mE&#x3be;&#x2191;</th>
<th valign="middle" align="center">meanSen&#x2191;</th>
<th valign="middle" align="center">meanSpe&#x2191;</th>
<th valign="middle" align="center">mDice&#x2191;</th>
<th valign="middle" align="center">mIoU&#x2191;</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">U-Net</td>
<td valign="middle" align="center">MICCAI 2015</td>
<td valign="middle" align="center">0.0116</td>
<td valign="middle" align="center">0.9348</td>
<td valign="middle" align="center">0.9018</td>
<td valign="middle" align="center">0.9575</td>
<td valign="middle" align="center">0.9209</td>
<td valign="middle" align="center">0.9899</td>
<td valign="middle" align="center">0.9043</td>
<td valign="middle" align="center">0.8540</td>
</tr>
<tr>
<td valign="middle" align="left">UNet++</td>
<td valign="middle" align="center">TMI 2019</td>
<td valign="middle" align="center"><underline>0.0105</underline></td>
<td valign="middle" align="center">0.9418</td>
<td valign="middle" align="center">0.9069</td>
<td valign="middle" align="center">0.9679</td>
<td valign="middle" align="center">0.9261</td>
<td valign="middle" align="center">0.9900</td>
<td valign="middle" align="center">0.9165</td>
<td valign="middle" align="center">0.8663</td>
</tr>
<tr>
<td valign="middle" align="left">DeepLab V3</td>
<td valign="middle" align="center">CVPR 2017</td>
<td valign="middle" align="center">0.0114</td>
<td valign="middle" align="center">0.9412</td>
<td valign="middle" align="center">0.9138</td>
<td valign="middle" align="center">0.9755</td>
<td valign="middle" align="center">0.9312</td>
<td valign="middle" align="center">0.9902</td>
<td valign="middle" align="center">0.9203</td>
<td valign="middle" align="center">0.8610</td>
</tr>
<tr>
<td valign="middle" align="left">ResUNet</td>
<td valign="middle" align="center">CVPR 2016</td>
<td valign="middle" align="center">0.0357</td>
<td valign="middle" align="center">0.8580</td>
<td valign="middle" align="center">0.7173</td>
<td valign="middle" align="center">0.8989</td>
<td valign="middle" align="center">0.8415</td>
<td valign="middle" align="center">0.9696</td>
<td valign="middle" align="center">0.7839</td>
<td valign="middle" align="center">0.7054</td>
</tr>
<tr>
<td valign="middle" align="left">PraNet</td>
<td valign="middle" align="center">MICCAI 2020</td>
<td valign="middle" align="center"><bold>0.0093</bold></td>
<td valign="middle" align="center"><underline>0.9532</underline></td>
<td valign="middle" align="center"><underline>0.9293</underline></td>
<td valign="middle" align="center"><bold>0.9802</bold></td>
<td valign="middle" align="center">0.9506</td>
<td valign="middle" align="center"><underline>0.9909</underline></td>
<td valign="middle" align="center">0.9306</td>
<td valign="middle" align="center">0.8724</td>
</tr>
<tr>
<td valign="middle" align="left">Polyp-PVT</td>
<td valign="middle" align="center">AIR 2023</td>
<td valign="middle" align="center">0.0117</td>
<td valign="middle" align="center"><bold>0.9537</bold></td>
<td valign="middle" align="center"><bold>0.9295</bold></td>
<td valign="middle" align="center">0.9703</td>
<td valign="middle" align="center"><underline>0.9558</underline></td>
<td valign="middle" align="center">0.9898</td>
<td valign="middle" align="center"><underline>0.9307</underline></td>
<td valign="middle" align="center">0.8687</td>
</tr>
<tr>
<td valign="middle" align="left">TransUNet</td>
<td valign="middle" align="center">MIA 2024</td>
<td valign="middle" align="center">0.0131</td>
<td valign="middle" align="center">0.9416</td>
<td valign="middle" align="center">0.9002</td>
<td valign="middle" align="center">0.9638</td>
<td valign="middle" align="center"><bold>0.9661</bold></td>
<td valign="middle" align="center">0.9849</td>
<td valign="middle" align="center">0.9126</td>
<td valign="middle" align="center">0.8585</td>
</tr>
<tr>
<td valign="middle" align="left">CTNet</td>
<td valign="middle" align="center">TCYB 2024</td>
<td valign="middle" align="center">0.0132</td>
<td valign="middle" align="center">0.9525</td>
<td valign="middle" align="center">0.8815</td>
<td valign="middle" align="center">0.9726</td>
<td valign="middle" align="center">0.9448</td>
<td valign="middle" align="center">0.9855</td>
<td valign="middle" align="center">0.9278</td>
<td valign="middle" align="center"><underline>0.8727</underline></td>
</tr>
<tr>
<td valign="middle" align="left">Ours</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">0.0108</td>
<td valign="middle" align="center">0.9461</td>
<td valign="middle" align="center">0.9263</td>
<td valign="middle" align="center"><underline>0.9773</underline></td>
<td valign="middle" align="center">0.9504</td>
<td valign="middle" align="center"><bold>0.9910</bold></td>
<td valign="middle" align="center"><bold>0.9316</bold></td>
<td valign="middle" align="center"><bold>0.8802</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>We report performance across eight different evaluation metrics. The best results are highlighted in bold, and the second-best results are underlined.</p></fn>
</table-wrap-foot>
</table-wrap>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Qualitative comparison of segmentation results on CVC-ClinicDB dataset. The green contour represents the ground-truth (GT) boundary. The model&#x2019;s prediction is visualized as a semi-transparent yellow heatmap, where opacity indicates confidence. The final predicted boundary, after applying a 0.5 threshold, is shown as a white contour.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-16-1734345-g005.tif">
<alt-text content-type="machine-generated">Grid of endoscopic images showing multiple rows, each featuring a sequence of medical images where regions of interest are progressively outlined or filled in green and yellow, likely indicating segmentation of abnormalities or polyps for comparative analysis across different methods.</alt-text>
</graphic></fig>
</sec>
<sec id="s4_4_3">
<label>4.4.3</label>
<title>PolypSegDataset</title>
<p>The performance of all methods on our curated PolypSegDataset is detailed in <xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref>. The results demonstrate the clear superiority of our proposed model on this benchmark, as our method achieves state-of-the-art performance on seven out of the eight evaluation metrics, often by a significant margin. It secures the top scores for mIoU (88.62%), mDice (93.27%), meanSen (94.34%), all three perceptual metrics (<inline-formula>
<mml:math display="inline" id="im35"><mml:mrow><mml:msub><mml:mi>S</mml:mi><mml:mi>&#x3b1;</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im36"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>&#x3b2;</mml:mi><mml:mi>w</mml:mi></mml:msubsup></mml:mrow></mml:math></inline-formula>, and <inline-formula>
<mml:math display="inline" id="im37"><mml:mrow><mml:mi>m</mml:mi><mml:msub><mml:mi>E</mml:mi><mml:mi>&#x3be;</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>), and achieves the lowest MAE (0.0060). Given that PolypSegDataset is characterized by large polyps with high-quality boundary annotations, this strong performance validates our architecture&#x2019;s effectiveness in precisely segmenting well-defined, clinically significant lesions. The qualitative examples provided in <xref ref-type="fig" rid="f6"><bold>Figure&#xa0;6</bold></xref> corroborate these quantitative results, highlighting our model&#x2019;s ability to generate exceptionally clean and accurate segmentation masks where the predicted contour almost perfectly overlaps with the ground-truth boundary.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Quantitative comparison of our proposed method against eight state-of-the-art models on the PolypSegDataset test set.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Method</th>
<th valign="middle" align="center">Publication</th>
<th valign="middle" align="center">MAE&#x2193;</th>
<th valign="middle" align="center"><italic>S<sub>&#x3b1;</sub></italic>&#x2191;</th>
<th valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im39"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>&#x3b2;</mml:mi><mml:mi>w</mml:mi></mml:msubsup></mml:mrow></mml:math></inline-formula>&#x2191;</th>
<th valign="middle" align="center">mE&#x3be;&#x2191;</th>
<th valign="middle" align="center">meanSen&#x2191;</th>
<th valign="middle" align="center">meanSpe&#x2191;</th>
<th valign="middle" align="center">mDice&#x2191;</th>
<th valign="middle" align="center">mIoU&#x2191;</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">U-Net</td>
<td valign="middle" align="center">MICCAI 2015</td>
<td valign="middle" align="center">0.0095</td>
<td valign="middle" align="center">0.9279</td>
<td valign="middle" align="center">0.8751</td>
<td valign="middle" align="center">0.9401</td>
<td valign="middle" align="center">0.8837</td>
<td valign="middle" align="center">0.9930</td>
<td valign="middle" align="center">0.8793</td>
<td valign="middle" align="center">0.8261</td>
</tr>
<tr>
<td valign="middle" align="left">UNet++</td>
<td valign="middle" align="center">TMI 2019</td>
<td valign="middle" align="center">0.0090</td>
<td valign="middle" align="center">0.9401</td>
<td valign="middle" align="center">0.8994</td>
<td valign="middle" align="center">0.9567</td>
<td valign="middle" align="center">0.8993</td>
<td valign="middle" align="center">0.9933</td>
<td valign="middle" align="center">0.9004</td>
<td valign="middle" align="center">0.8502</td>
</tr>
<tr>
<td valign="middle" align="left">DeepLab V3</td>
<td valign="middle" align="center">CVPR 2017</td>
<td valign="middle" align="center">0.0089</td>
<td valign="middle" align="center">0.9394</td>
<td valign="middle" align="center">0.8937</td>
<td valign="middle" align="center">0.9656</td>
<td valign="middle" align="center">0.9120</td>
<td valign="middle" align="center">0.9925</td>
<td valign="middle" align="center">0.9036</td>
<td valign="middle" align="center">0.8415</td>
</tr>
<tr>
<td valign="middle" align="left">ResUNet</td>
<td valign="middle" align="center">CVPR 2016</td>
<td valign="middle" align="center">0.0272</td>
<td valign="middle" align="center">0.8557</td>
<td valign="middle" align="center">0.7394</td>
<td valign="middle" align="center">0.8959</td>
<td valign="middle" align="center">0.8265</td>
<td valign="middle" align="center">0.9793</td>
<td valign="middle" align="center">0.7741</td>
<td valign="middle" align="center">0.6804</td>
</tr>
<tr>
<td valign="middle" align="left">PraNet</td>
<td valign="middle" align="center">MICCAI 2020</td>
<td valign="middle" align="center"><underline>0.0066</underline></td>
<td valign="middle" align="center"><underline>0.9570</underline></td>
<td valign="middle" align="center">0.9272</td>
<td valign="middle" align="center"><underline>0.9779</underline></td>
<td valign="middle" align="center">0.9336</td>
<td valign="middle" align="center"><underline>0.9935</underline></td>
<td valign="middle" align="center">0.9203</td>
<td valign="middle" align="center">0.8613</td>
</tr>
<tr>
<td valign="middle" align="left">Polyp-PVT</td>
<td valign="middle" align="center">AIR 2023</td>
<td valign="middle" align="center">0.0083</td>
<td valign="middle" align="center">0.9455</td>
<td valign="middle" align="center">0.9046</td>
<td valign="middle" align="center">0.9734</td>
<td valign="middle" align="center">0.9339</td>
<td valign="middle" align="center">0.9902</td>
<td valign="middle" align="center">0.9201</td>
<td valign="middle" align="center">0.8634</td>
</tr>
<tr>
<td valign="middle" align="left">TransUNet</td>
<td valign="middle" align="center">MIA 2024</td>
<td valign="middle" align="center">0.0082</td>
<td valign="middle" align="center">0.9506</td>
<td valign="middle" align="center">0.9133</td>
<td valign="middle" align="center">0.9748</td>
<td valign="middle" align="center"><underline>0.9422</underline></td>
<td valign="middle" align="center">0.9917</td>
<td valign="middle" align="center">0.9216</td>
<td valign="middle" align="center"><underline>0.8690</underline></td>
</tr>
<tr>
<td valign="middle" align="left">CTNet</td>
<td valign="middle" align="center">TCYB 2024</td>
<td valign="middle" align="center">0.0074</td>
<td valign="middle" align="center">0.9568</td>
<td valign="middle" align="center"><underline>0.9219</underline></td>
<td valign="middle" align="center">0.9773</td>
<td valign="middle" align="center">0.8982</td>
<td valign="middle" align="center"><bold>0.9940</bold></td>
<td valign="middle" align="center"><underline>0.9221</underline></td>
<td valign="middle" align="center">0.8689</td>
</tr>
<tr>
<td valign="middle" align="left">Ours</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"><bold>0.0060</bold></td>
<td valign="middle" align="center"><bold>0.9606</bold></td>
<td valign="middle" align="center"><bold>0.9274</bold></td>
<td valign="middle" align="center"><bold>0.9788</bold></td>
<td valign="middle" align="center"><bold>0.9434</bold></td>
<td valign="middle" align="center">0.9930</td>
<td valign="middle" align="center"><bold>0.9327</bold></td>
<td valign="middle" align="center"><bold>0.8862</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>We report performance across eight different evaluation metrics. The best results are highlighted in bold, and the second-best results are underlined.</p></fn>
</table-wrap-foot>
</table-wrap>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Qualitative comparison of segmentation results on PolypSegDataset. The green contour represents the ground-truth (GT) boundary. The model&#x2019;s prediction is visualized as a semi-transparent yellow heatmap, where opacity indicates confidence. The final predicted boundary, after applying a 0.5 threshold, is shown as a white contour.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-16-1734345-g006.tif">
<alt-text content-type="machine-generated">Grid of endoscopic images depicting colon polyp segmentation. Each row shows a sequence for a different polyp, with initial images, corresponding segmentation masks, and progressive overlays of yellow masks highlighting polyp boundaries.</alt-text>
</graphic></fig>
</sec>
<sec id="s4_4_4">
<label>4.4.4</label>
<title>Ablation study</title>
<p>To validate the effectiveness and individual contributions of our key proposed components, we conduct a comprehensive ablation study on the PolypSegDataset. We establish a strong baseline using the SAM2 encoder with our trainable adapter and the progressive refinement decoder. We then incrementally add our two main contributions: the Function-Specialized Mixture-of-Experts (FS-MoE) module and the Spectral-Guided Boundary Enhancement (SGBE) module. The results of this study are summarized in <xref ref-type="table" rid="T5"><bold>Table&#xa0;5</bold></xref>.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Ablation study on the PolypSegDataset dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Method</th>
<th valign="middle" align="center">MAE&#x2193;</th>
<th valign="middle" align="center"><italic>S<sub>&#x3b1;</sub></italic>&#x2191;</th>
<th valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im38"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>&#x3b2;</mml:mi><mml:mi>w</mml:mi></mml:msubsup></mml:mrow></mml:math></inline-formula>&#x2191;</th>
<th valign="middle" align="center"><italic>mE<sub>&#x3be;</sub></italic>&#x2191;</th>
<th valign="middle" align="center">meanSen&#x2191;</th>
<th valign="middle" align="center">meanSpe&#x2191;</th>
<th valign="middle" align="center">mDice&#x2191;</th>
<th valign="middle" align="center">mIoU&#x2191;</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">SAM2+Adapter</td>
<td valign="middle" align="center">0.0084</td>
<td valign="middle" align="center">0.9463</td>
<td valign="middle" align="center">0.9058</td>
<td valign="middle" align="center">0.9687</td>
<td valign="middle" align="center">0.9021</td>
<td valign="middle" align="center">0.9892</td>
<td valign="middle" align="center">0.9176</td>
<td valign="middle" align="center">0.8558</td>
</tr>
<tr>
<td valign="middle" align="left">+ FS-MoE Module</td>
<td valign="middle" align="center">0.0079</td>
<td valign="middle" align="center">0.9489</td>
<td valign="middle" align="center">0.9127</td>
<td valign="middle" align="center">0.9718</td>
<td valign="middle" align="center">0.9264</td>
<td valign="middle" align="center">0.9903</td>
<td valign="middle" align="center">0.9229</td>
<td valign="middle" align="center">0.8589</td>
</tr>
<tr>
<td valign="middle" align="left">+ SGBE Module</td>
<td valign="middle" align="center">0.0073</td>
<td valign="middle" align="center">0.9498</td>
<td valign="middle" align="center">0.9186</td>
<td valign="middle" align="center">0.9751</td>
<td valign="middle" align="center">0.9378</td>
<td valign="middle" align="center">0.9917</td>
<td valign="middle" align="center">0.9274</td>
<td valign="middle" align="center">0.8611</td>
</tr>
<tr>
<td valign="middle" align="left">SpetraNet</td>
<td valign="middle" align="center"><bold>0.0070</bold></td>
<td valign="middle" align="center"><bold>0.9504</bold></td>
<td valign="middle" align="center"><bold>0.9231</bold></td>
<td valign="middle" align="center"><bold>0.9768</bold></td>
<td valign="middle" align="center"><bold>0.9432</bold></td>
<td valign="middle" align="center"><bold>0.9930</bold></td>
<td valign="middle" align="center"><bold>0.9302</bold></td>
<td valign="middle" align="center"><bold>0.8627</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The table shows the performance improvement by progressively adding different modules to the baseline (SAM2+Adapter). The FS-MoE and SGBE modules both contribute to consistent gains across all metrics, and the full model (SpectraNet) achieves the best overall performance.</p></fn>
<fn>
<p>The best results are highlighted in bold.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>As shown in the table, starting from the baseline, the integration of the FS-MoE module yields consistent performance gains across all eight metrics. This confirms the benefit of its adaptive, function-specialized feature refinement for handling diverse polyp appearances. Similarly, adding the SGBE module to the baseline also results in substantial improvements across the board, which highlights the critical role of frequency-domain enhancement in improving boundary definition and overall segmentation accuracy. Our full model, SpectraNet, which combines both modules, achieves the best performance on all metrics. The synergistic effect of spectral boundary enhancement and adaptive spatial refinement leads to the highest mIoU (86.27%) and mDice (93.02%) scores. This step-by-step analysis clearly demonstrates that both of our proposed modules are effective and contribute positively to the final performance of the network.</p>
</sec>
<sec id="s4_4_5">
<label>4.4.5</label>
<title>Generalization capability evaluation</title>
<p>To rigorously assess the generalization capability of SpectraNet and ensure it does not overfit to specific data distributions, we conducted a cross-dataset evaluation. In this setting, we trained the model on a composite dataset merging Kvasir-SEG and CVC-ClinicDB, and then directly evaluated its performance on an unseen dataset: CVC-ColonDB Bernal et&#xa0;al. (<xref ref-type="bibr" rid="B60">60</xref>). This testing scenario is particularly challenging as CVC-ColonDB contains polyps with highly diverse appearances and imaging conditions distinct from the training set.</p>
<p>The quantitative results are summarized in <xref ref-type="table" rid="T6"><bold>Table&#xa0;6</bold></xref>. As observed, SpectraNet maintains robust performance on the unseen domain, achieving a mDice of 73.85% and a mIoU of 64.92%. Compared to state-of-the-art methods, our model demonstrates superior generalization ability, consistently outperforming representative methods such as Polyp-PVT and CTNet. This confirms that the proposed Spectral-Guided Boundary Enhancement (SGBE) and Function-Specialized Mixture-of-Experts (FS-MoE) modules facilitate the learning of intrinsic, invariant polyp features (such as structural phase information) rather than memorizing dataset-specific biases.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Generalization performance evaluation.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Method</th>
<th valign="middle" align="center">mDice</th>
<th valign="middle" align="center">MIoU</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">U-Net</td>
<td valign="middle" align="center">0.6425</td>
<td valign="middle" align="center">0.5618</td>
</tr>
<tr>
<td valign="middle" align="left">Polyp-PVT</td>
<td valign="middle" align="center">0.6982</td>
<td valign="middle" align="center">0.6105</td>
</tr>
<tr>
<td valign="middle" align="left">CTNet</td>
<td valign="middle" align="center">0.7156</td>
<td valign="middle" align="center">0.6289</td>
</tr>
<tr>
<td valign="middle" align="left"><bold>SpectraNet (Ours)</bold></td>
<td valign="middle" align="center"><bold>0.7385</bold></td>
<td valign="middle" align="center"><bold>0.6492</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Models are trained on the combined Kvasir-SEG and CVC-ClinicDB datasets and tested on the unseen CVC-ColonDB dataset. The best results are highlighted in bold.</p></fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s4_4_6">
<label>4.4.6</label>
<title>Complexity analysis</title>
<p>To evaluate the computational efficiency of the proposed model, we compared SpectraNet with three representative state-of-the-art methods: PraNet, Polyp-PVT, and CTNet. The evaluation metrics include the total number of parameters (Total Params), the number of trainable parameters (Trainable Params), computational complexity (GFLOPs), and inference speed (Frames Per Second, FPS). All measurements were conducted on a single NVIDIA RTX 4090 GPU with an input resolution of 352 &#xd7; 352.</p>
<p>The results are presented in <xref ref-type="table" rid="T7"><bold>Table&#xa0;7</bold></xref>. SpectraNet has a significantly larger total parameter count (214.90 M) and computational cost (125.076 GFLOPs) than the lightweight baselines. This is expected, as our architecture is built upon the heavy SAM2 foundation model to leverage its robust feature extraction capabilities. However, a key advantage of our design is its use of <italic>Parameter-Efficient Fine-Tuning</italic>. By freezing the heavy backbone and training only the lightweight adapters and decoder heads, SpectraNet requires only 2.75 M trainable parameters, reducing the number of trained parameters by over 90% compared to PraNet (32.55 M) and CTNet (44.29 M).</p>
<table-wrap id="T7" position="float">
<label>Table&#xa0;7</label>
<caption>
<p>Computational complexity analysis.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Model</th>
<th valign="middle" align="center">Total params (M)</th>
<th valign="middle" align="center">Trainable (M)</th>
<th valign="middle" align="center">GFLOPs</th>
<th valign="middle" align="center">FPS</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">PraNet</td>
<td valign="middle" align="center">32.55</td>
<td valign="middle" align="center">32.55</td>
<td valign="middle" align="center">13.150</td>
<td valign="middle" align="center">61.29</td>
</tr>
<tr>
<td valign="middle" align="left">PolypPVT</td>
<td valign="middle" align="center">25.11</td>
<td valign="middle" align="center">25.11</td>
<td valign="middle" align="center">10.018</td>
<td valign="middle" align="center">67.93</td>
</tr>
<tr>
<td valign="middle" align="left">CTNet</td>
<td valign="middle" align="center">44.29</td>
<td valign="middle" align="center">44.29</td>
<td valign="middle" align="center">15.413</td>
<td valign="middle" align="center">35.93</td>
</tr>
<tr>
<td valign="middle" align="left">SpectraNet (Ours)</td>
<td valign="middle" align="center">214.90</td>
<td valign="middle" align="center">2.75</td>
<td valign="middle" align="center">125.076</td>
<td valign="middle" align="center">15.04</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>We compare the model parameters (Total and Trainable), computational cost (GFLOPs), and inference speed (FPS) with state-of-the-art methods. Note that SpectraNet utilizes a frozen backbone strategy, resulting in minimal trainable parameters.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>In terms of inference speed, SpectraNet achieves 15.04 FPS. While this is lower than the lightweight models, it represents a favorable trade-off between computational cost and the performance gains demonstrated in the previous sections (e.g., +2&#x2013;3% mDice). Furthermore, 15 FPS is generally considered sufficient to provide near real-time feedback in clinical colonoscopy workflows, where the priority is often the precision of the segmentation mask rather than ultra-high frame rates.</p>
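<p>The parameter counts and inference speed reported in <xref ref-type="table" rid="T7"><bold>Table&#xa0;7</bold></xref> can be measured with a short routine of the following form; the warm-up and timing iteration counts are assumptions, and GFLOPs are typically obtained with an external profiler rather than computed here.</p>
<code language="python">
import time
import torch

def complexity_report(model, input_size=(1, 3, 352, 352), warmup=10, runs=100):
    """Count total/trainable parameters and measure inference FPS at the
    352 x 352 input resolution used in the paper."""
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device).eval()
    x = torch.randn(*input_size, device=device)
    with torch.no_grad():
        for _ in range(warmup):  # warm-up iterations before timing
            model(x)
        if device.type == "cuda":
            torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(runs):
            model(x)
        if device.type == "cuda":
            torch.cuda.synchronize()
    fps = runs / (time.perf_counter() - start)
    return {"total_params_M": total / 1e6,
            "trainable_params_M": trainable / 1e6,
            "fps": fps}
</code>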
</sec>
</sec>
</sec>
<sec id="s5" sec-type="conclusions">
<label>5</label>
<title>Conclusion</title>
<p>In this paper, we addressed the challenging problem of high-precision polyp segmentation, focusing on the critical issue of ambiguous boundary definition. We introduced SpectraNet, a novel encoder-decoder architecture that integrates a unique hybrid-domain enhancement strategy. Our approach leverages a Spectral-Guided Boundary Enhancement (SGBE) module to explicitly amplify structural details in the frequency domain, effectively sharpening the representation of polyp boundaries. This is complemented by a Function-Specialized Mixture-of-Experts (FS-MoE) module, which provides an adaptive mechanism to apply targeted feature refinement based on the specific characteristics of each polyp.</p>
<p>Comprehensive evaluations conducted across three distinct datasets confirmed the effectiveness of our design. On our curated, high-quality PolypSegDataset, as well as on the standard public benchmarks Kvasir-SEG and CVC-ClinicDB, SpectraNet consistently outperformed a suite of state-of-the-art segmentation models. The quantitative results highlighted our model&#x2019;s superiority in achieving higher mIoU and mDice scores, while the qualitative comparisons demonstrated its ability to generate more accurate and complete masks with finer boundary details. The success of our approach validates the significant potential of integrating frequency-domain analysis and adaptive, function-specialized processing into deep learning frameworks for medical image segmentation. Future work may include extending this architecture to other medical imaging modalities where boundary ambiguity is a key challenge, such as tumor segmentation in MRI or CT scans, and exploring model compression techniques to facilitate real-time clinical application.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>The datasets presented in this article are not readily available because the PolypSegDataset was collected at Jiangyan Hospital Affiliated to Nanjing University of Chinese Medicine and is not publicly available due to ethical restrictions and patient privacy concerns outlined in the Institutional Review Board approval (Protocol No. 2025-008-003). A de-identified subset of PolypSegDataset supporting the findings of this study is available from the corresponding author, J.L. (Jing Ling), upon reasonable request. Requests to access the datasets should be directed to Jing Ling, <email xlink:href="mailto:157770425@qq.com">157770425@qq.com</email>.</p></sec>
<sec id="s7" sec-type="ethics-statement">
<title>Ethics statement</title>
<p>The studies involving humans were approved by Jiangyan Hospital Affiliated to Nanjing University of Chinese Medicine. The studies were conducted in accordance with the local legislation and institutional requirements. Written informed consent for participation was not required from the participants or the participants&#x2019; legal guardians/next of kin in accordance with the national legislation and institutional requirements.</p></sec>
<sec id="s8" sec-type="author-contributions">
<title>Author contributions</title>
<p>ZL: Validation, Visualization, Writing &#x2013; review &amp; editing, Writing&#xa0;&#x2013; original draft, Investigation, Methodology, Conceptualization, Software. JL: Conceptualization, Writing &#x2013; review &amp; editing, Supervision, Data curation, Resources, Project administration.</p></sec>
<ack>
<title>Acknowledgments</title>
<p>The authors thank the Medical Ethics Committee of Jiangyan Hospital Affiliated to Nanjing University of Chinese Medicine for their ethical oversight and approval of this study. We also acknowledge the participants for their voluntary involvement and consent to use their data in this research.</p>
</ack>
<sec id="s10" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s11" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s12" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Center</surname> <given-names>MM</given-names></name>
<name><surname>Jemal</surname> <given-names>A</given-names></name>
<name><surname>Smith</surname> <given-names>RA</given-names></name>
<name><surname>Ward</surname> <given-names>E</given-names></name>
</person-group>. 
<article-title>Worldwide variations in colorectal cancer</article-title>. <source>CA Cancer J Clin</source>. (<year>2009</year>) <volume>59</volume>:<page-range>366&#x2013;78</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.3322/caac.20038</pub-id>, PMID: <pub-id pub-id-type="pmid">19897840</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<label>2</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ladabaum</surname> <given-names>U</given-names></name>
<name><surname>Dominitz</surname> <given-names>JA</given-names></name>
<name><surname>Kahi</surname> <given-names>C</given-names></name>
<name><surname>Schoen</surname> <given-names>RE</given-names></name>
</person-group>. 
<article-title>Strategies for colorectal cancer screening</article-title>. <source>Gastroenterology</source>. (<year>2020</year>) <volume>158</volume>:<page-range>418&#x2013;32</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1053/j.gastro.2019.06.043</pub-id>, PMID: <pub-id pub-id-type="pmid">31394083</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<label>3</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Brenner</surname> <given-names>H</given-names></name>
<name><surname>Chang-Claude</surname> <given-names>J</given-names></name>
<name><surname>Jansen</surname> <given-names>L</given-names></name>
<name><surname>Seiler</surname> <given-names>CM</given-names></name>
<name><surname>Hoffmeister</surname> <given-names>M</given-names></name>
</person-group>. 
<article-title>Role of colonoscopy and polyp characteristics in colorectal cancer after colonoscopic polyp detection: a population-based case&#x2013;control study</article-title>. <source>Ann Internal Med</source>. (<year>2012</year>) <volume>157</volume>:<page-range>225&#x2013;32</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.7326/0003-4819-157-4-201208210-00002</pub-id>, PMID: <pub-id pub-id-type="pmid">22910933</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<label>4</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Miller</surname> <given-names>SF</given-names></name>
<name><surname>Knight</surname> <given-names>AR</given-names></name>
</person-group>. 
<article-title>The early detection of colorectal cancer</article-title>. <source>Cancer</source>. (<year>1977</year>) <volume>40</volume>:<page-range>945&#x2013;9</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/1097-0142(197708)40:2&lt;945::AID-CNCR2820400253&gt;3.0.CO;2-F</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<label>5</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Guachi</surname> <given-names>L</given-names></name>
<name><surname>Guachi</surname> <given-names>R</given-names></name>
<name><surname>Bini</surname> <given-names>F</given-names></name>
<name><surname>Marinozzi</surname> <given-names>F</given-names></name>
</person-group>. 
<article-title>Automatic colorectal segmentation with convolutional neural network</article-title>. <source>Comput-Aided Des Appl</source>. (<year>2019</year>) <volume>16</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.14733/cadaps.2019.836-845</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<label>6</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Viscaino</surname> <given-names>M</given-names></name>
<name><surname>Bustos</surname> <given-names>JT</given-names></name>
<name><surname>Munoz</surname> <given-names>P</given-names></name>
<name><surname>Cheein</surname> <given-names>CA</given-names></name>
<name><surname>Cheein</surname> <given-names>FA</given-names></name>
</person-group>. 
<article-title>Artificial intelligence for the early detection of colorectal cancer: A comprehensive review of its advantages and misconceptions</article-title>. <source>World J Gastroenterol</source>. (<year>2021</year>) <volume>27</volume>:<fpage>6399</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3748/wjg.v27.i38.6399</pub-id>, PMID: <pub-id pub-id-type="pmid">34720530</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<label>7</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Jiaxing</surname> <given-names>Z</given-names></name>
<name><surname>Hao</surname> <given-names>T</given-names></name>
</person-group>. 
<article-title>Sam2 for image and video segmentation: A comprehensive survey</article-title>. <source>arXiv</source>. (<year>2025</year>).
</mixed-citation>
</ref>
<ref id="B8">
<label>8</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>S</given-names></name>
<name><surname>Ren</surname> <given-names>Y</given-names></name>
<name><surname>Yu</surname> <given-names>Y</given-names></name>
<name><surname>Jiang</surname> <given-names>Q</given-names></name>
<name><surname>He</surname> <given-names>X</given-names></name>
<name><surname>Li</surname> <given-names>H</given-names></name>
</person-group>. 
<article-title>A survey of deep learning algorithms for colorectal polyp segmentation</article-title>. <source>Neurocomputing</source>. (<year>2025</year>) <volume>614</volume>:<fpage>128767</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.neucom.2024.128767</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<label>9</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ji</surname> <given-names>G-P</given-names></name>
<name><surname>Xiao</surname> <given-names>G</given-names></name>
<name><surname>Chou</surname> <given-names>Y-C</given-names></name>
<name><surname>Fan</surname> <given-names>D-P</given-names></name>
<name><surname>Zhao</surname> <given-names>K</given-names></name>
<name><surname>Chen</surname> <given-names>G</given-names></name>
<etal/>
</person-group>. 
<article-title>Video polyp segmentation: A deep learning perspective</article-title>. <source>Mach Intell Res</source>. (<year>2022</year>) <volume>19</volume>:<page-range>531&#x2013;49</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11633-022-1371-y</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<label>10</label>
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Huo</surname> <given-names>J</given-names></name>
<name><surname>Xiao</surname> <given-names>R</given-names></name>
<name><surname>Zheng</surname> <given-names>H</given-names></name>
<name><surname>Liu</surname> <given-names>Y</given-names></name>
<name><surname>Ourselin</surname> <given-names>S</given-names></name>
<name><surname>Sparks</surname> <given-names>R</given-names></name>
</person-group>. (<year>2024</year>). 
<article-title>Matchseg: Towards better segmentation via reference image matching</article-title>, in: <source>Proceedings of the 2024 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)</source>, pp. <page-range>2068&#x2013;73</page-range>. <publisher-loc>Piscataway</publisher-loc>: 
<publisher-name>IEEE</publisher-name>.
</mixed-citation>
</ref>
<ref id="B11">
<label>11</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gupta</surname> <given-names>M</given-names></name>
<name><surname>Mishra</surname> <given-names>A</given-names></name>
</person-group>. 
<article-title>A systematic review of deep learning based image segmentation to detect polyp</article-title>. <source>Artif Intell Rev</source>. (<year>2024</year>) <volume>57</volume>:<fpage>7</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10462-023-10621-1</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<label>12</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Qayoom</surname> <given-names>A</given-names></name>
<name><surname>Xie</surname> <given-names>J</given-names></name>
<name><surname>Ali</surname> <given-names>H</given-names></name>
</person-group>. 
<article-title>Polyp segmentation in medical imaging: challenges, approaches and future directions</article-title>. <source>Artif Intell Rev</source>. (<year>2025</year>) <volume>58</volume>:<fpage>169</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10462-025-11173-2</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<label>13</label>
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Akbari</surname> <given-names>M</given-names></name>
<name><surname>Mohrekesh</surname> <given-names>M</given-names></name>
<name><surname>Nasr-Esfahani</surname> <given-names>E</given-names></name>
<name><surname>Soroushmehr</surname> <given-names>SR</given-names></name>
<name><surname>Karimi</surname> <given-names>N</given-names></name>
<name><surname>Samavi</surname> <given-names>S</given-names></name>
<etal/>
</person-group>. (<year>2018</year>). 
<article-title>Polyp segmentation in colonoscopy images using fully convolutional network</article-title>, in: <conf-name>Proceedings of the 40th Annual International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC)</conf-name>, <publisher-loc>Piscataway</publisher-loc>. pp. <fpage>69</fpage>&#x2013;<lpage>72</lpage>. 
<publisher-name>IEEE</publisher-name>. PMID: <pub-id pub-id-type="pmid">30440343</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<label>14</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yeung</surname> <given-names>M</given-names></name>
<name><surname>Sala</surname> <given-names>E</given-names></name>
<name><surname>Sch&#xf6;nlieb</surname> <given-names>C-B</given-names></name>
<name><surname>Rundo</surname> <given-names>L</given-names></name>
</person-group>. 
<article-title>Focus u-net: A novel dual attention-gated cnn for polyp segmentation during colonoscopy</article-title>. <source>Comput Biol Med</source>. (<year>2021</year>) <volume>137</volume>:<fpage>104815</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compbiomed.2021.104815</pub-id>, PMID: <pub-id pub-id-type="pmid">34507156</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<label>15</label>
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Sun</surname> <given-names>X</given-names></name>
<name><surname>Zhang</surname> <given-names>P</given-names></name>
<name><surname>Wang</surname> <given-names>D</given-names></name>
<name><surname>Cao</surname> <given-names>Y</given-names></name>
<name><surname>Liu</surname> <given-names>B</given-names></name>
</person-group>. (<year>2019</year>). 
<article-title>Colorectal polyp segmentation by u-net with dilation convolution</article-title>, in: <conf-name>Proceedings of the 18th IEEE International Conference on Machine Learning and Applications (ICMLA)</conf-name>, <publisher-loc>Piscataway</publisher-loc>. pp. <page-range>851&#x2013;8</page-range>. 
<publisher-name>IEEE</publisher-name>.
</mixed-citation>
</ref>
<ref id="B16">
<label>16</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Duc</surname> <given-names>NT</given-names></name>
<name><surname>Oanh</surname> <given-names>NT</given-names></name>
<name><surname>Thuy</surname> <given-names>NT</given-names></name>
<name><surname>Triet</surname> <given-names>TM</given-names></name>
<name><surname>Dinh</surname> <given-names>VS</given-names></name>
</person-group>. 
<article-title>Colonformer: An efficient transformer based method for colon polyp segmentation</article-title>. <source>IEEE Access</source>. (<year>2022</year>) <volume>10</volume>:<page-range>80575&#x2013;86</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2022.3195241</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<label>17</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Dong</surname> <given-names>B</given-names></name>
<name><surname>Wang</surname> <given-names>W</given-names></name>
<name><surname>Fan</surname> <given-names>D-P</given-names></name>
<name><surname>Li</surname> <given-names>J</given-names></name>
<name><surname>Fu</surname> <given-names>H</given-names></name>
<name><surname>Shao</surname> <given-names>L</given-names></name>
</person-group>. 
<article-title>Polyp-pvt: Polyp segmentation with pyramid vision transformers</article-title>. <source>arXiv</source>. (<year>2021</year>).
</mixed-citation>
</ref>
<ref id="B18">
<label>18</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Jha</surname> <given-names>D</given-names></name>
<name><surname>Tomar</surname> <given-names>NK</given-names></name>
<name><surname>Sharma</surname> <given-names>V</given-names></name>
<name><surname>Bagci</surname> <given-names>U</given-names></name>
</person-group>. 
<article-title>Transnetr: transformer-based residual network for polyp segmentation with multi-center out-of-distribution testing</article-title>. In: <source>Medical Imaging with Deep Learning</source>. 
<publisher-name>PMLR</publisher-name> (<year>2024</year>). p. <page-range>1372&#x2013;84</page-range>.
</mixed-citation>
</ref>
<ref id="B19">
<label>19</label>
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Shao</surname> <given-names>H</given-names></name>
<name><surname>Zhang</surname> <given-names>Y</given-names></name>
<name><surname>Hou</surname> <given-names>Q</given-names></name>
</person-group>. (<year>2024</year>). 
<article-title>Polyper: Boundary sensitive polyp segmentation</article-title>, in: <conf-name>Proceedings of the AAAI conference on artificial intelligence</conf-name>, Vol. <volume>38</volume>. <publisher-loc>Palo Alto, CA</publisher-loc>: 
<publisher-name>AAAI Press</publisher-name>. pp. <page-range>4731&#x2013;9</page-range>.
</mixed-citation>
</ref>
<ref id="B20">
<label>20</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Mei</surname> <given-names>J</given-names></name>
<name><surname>Zhou</surname> <given-names>T</given-names></name>
<name><surname>Huang</surname> <given-names>K</given-names></name>
<name><surname>Zhang</surname> <given-names>Y</given-names></name>
<name><surname>Zhou</surname> <given-names>Y</given-names></name>
<name><surname>Wu</surname> <given-names>Y</given-names></name>
<etal/>
</person-group>. 
<article-title>A survey on deep learning for polyp segmentation: Techniques, challenges and future trends</article-title>. <source>Visual Intell</source>. (<year>2025</year>) <volume>3</volume>:<fpage>1</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s44267-024-00071-w</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<label>21</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>Z</given-names></name>
<name><surname>Zheng</surname> <given-names>S</given-names></name>
<name><surname>Sun</surname> <given-names>X</given-names></name>
<name><surname>Zhu</surname> <given-names>Z</given-names></name>
<name><surname>Zhao</surname> <given-names>Y</given-names></name>
<name><surname>Yang</surname> <given-names>X</given-names></name>
<etal/>
</person-group>. 
<article-title>The devil is in the boundary: Boundary-enhanced polyp segmentation</article-title>. <source>IEEE Trans Circuits Syst Video Technol</source>. (<year>2024</year>) <volume>34</volume>:<page-range>5414&#x2013;23</page-range>.
</mixed-citation>
</ref>
<ref id="B22">
<label>22</label>
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Tajbakhsh</surname> <given-names>N</given-names></name>
<name><surname>Chi</surname> <given-names>C</given-names></name>
<name><surname>Gurudu</surname> <given-names>SR</given-names></name>
<name><surname>Liang</surname> <given-names>J</given-names></name>
</person-group>. (<year>2014</year>). 
<article-title>Automatic polyp detection from learned boundaries</article-title>, in: <conf-name>Proceedings of the 2014 IEEE 11th International Symposium on Biomedical Imaging (ISBI)</conf-name>, <publisher-loc>Piscataway</publisher-loc>: 
<publisher-name>IEEE</publisher-name>. pp. <fpage>97</fpage>&#x2013;<lpage>100</lpage>.
</mixed-citation>
</ref>
<ref id="B23">
<label>23</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Bracewell</surname> <given-names>RN</given-names></name>
</person-group>. 
<article-title>The fourier transform</article-title>. <source>Sci Am</source>. (<year>1989</year>) <volume>260</volume>:<fpage>86</fpage>&#x2013;<lpage>95</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/scientificamerican0689-86</pub-id>, PMID: <pub-id pub-id-type="pmid">2727659</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<label>24</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Shanmugam</surname> <given-names>KS</given-names></name>
<name><surname>Dickey</surname> <given-names>FM</given-names></name>
<name><surname>Green</surname> <given-names>JA</given-names></name>
</person-group>. 
<article-title>An optimal frequency domain filter for edge detection in digital pictures</article-title>. <source>IEEE Trans Pattern Anal Mach Intell</source>. (<year>1979</year>), <fpage>37</fpage>&#x2013;<lpage>49</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.1979.4766874</pub-id>, PMID: <pub-id pub-id-type="pmid">21868829</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<label>25</label>
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Nawab</surname> <given-names>H</given-names></name>
<name><surname>Oppenheim</surname> <given-names>A</given-names></name>
<name><surname>Lim</surname> <given-names>J</given-names></name>
</person-group>. (<year>1981</year>). 
<article-title>Improved spectral subtraction for signal restoration</article-title>, in: <conf-name>Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP &#x2019;81)</conf-name>, <publisher-loc>Piscataway</publisher-loc>: 
<publisher-name>IEEE</publisher-name>. Vol. <volume>6</volume>. pp. <page-range>1105&#x2013;8</page-range>.
</mixed-citation>
</ref>
<ref id="B26">
<label>26</label>
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Pogorelov</surname> <given-names>K</given-names></name>
<name><surname>Ostroukhova</surname> <given-names>O</given-names></name>
<name><surname>Jeppsson</surname> <given-names>M</given-names></name>
<name><surname>Espeland</surname> <given-names>H</given-names></name>
<name><surname>Griwodz</surname> <given-names>C</given-names></name>
<name><surname>De Lange</surname> <given-names>T</given-names></name>
<etal/>
</person-group>. (<year>2018</year>). 
<article-title>Deep learning and hand-crafted feature based approaches for polyp detection in medical videos</article-title>, in: <conf-name>Proceedings of the 2018 IEEE 31st International Symposium on Computer-Based Medical Systems (CBMS)</conf-name>, <publisher-loc>Piscataway</publisher-loc>: 
<publisher-name>IEEE</publisher-name>. pp. <page-range>381&#x2013;6</page-range>.
</mixed-citation>
</ref>
<ref id="B27">
<label>27</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Mamonov</surname> <given-names>AV</given-names></name>
<name><surname>Figueiredo</surname> <given-names>IN</given-names></name>
<name><surname>Figueiredo</surname> <given-names>PN</given-names></name>
<name><surname>Tsai</surname> <given-names>Y-HR</given-names></name>
</person-group>. 
<article-title>Automated polyp detection in colon capsule endoscopy</article-title>. <source>IEEE Trans Med Imaging</source>. (<year>2014</year>) <volume>33</volume>:<page-range>1488&#x2013;502</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TMI.2014.2314959</pub-id>, PMID: <pub-id pub-id-type="pmid">24710829</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<label>28</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Maghsoudi</surname> <given-names>OH</given-names></name>
</person-group>. 
<article-title>Superpixel based segmentation and classification of polyps in wireless capsule endoscopy</article-title>. In: <source>2017 IEEE Signal Processing in Medicine and Biology Symposium (SPMB)</source>. <publisher-loc>Piscataway, NJ</publisher-loc>: 
<publisher-name>IEEE</publisher-name> (<year>2017</year>). p. <fpage>1</fpage>&#x2013;<lpage>4</lpage>.
</mixed-citation>
</ref>
<ref id="B29">
<label>29</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Rahim</surname> <given-names>T</given-names></name>
<name><surname>Usman</surname> <given-names>MA</given-names></name>
<name><surname>Shin</surname> <given-names>SY</given-names></name>
</person-group>. 
<article-title>A survey on contemporary computer-aided tumor, polyp, and ulcer detection methods in wireless capsule endoscopy imaging</article-title>. <source>Comput Med Imaging Graphics</source>. (<year>2020</year>) <volume>85</volume>:<fpage>101767</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compmedimag.2020.101767</pub-id>, PMID: <pub-id pub-id-type="pmid">32966967</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<label>30</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Brandao</surname> <given-names>P</given-names></name>
<name><surname>Zisimopoulos</surname> <given-names>O</given-names></name>
<name><surname>Mazomenos</surname> <given-names>E</given-names></name>
<name><surname>Ciuti</surname> <given-names>G</given-names></name>
<name><surname>Bernal</surname> <given-names>J</given-names></name>
<name><surname>Visentini-Scarzanella</surname> <given-names>M</given-names></name>
<etal/>
</person-group>. 
<article-title>Towards a computed-aided diagnosis system in colonoscopy: automatic polyp segmentation using convolution neural networks</article-title>. <source>J Med Robot Res</source>. (<year>2018</year>) <volume>3</volume>:<fpage>1840002</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1142/S2424905X18400020</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<label>31</label>
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>He</surname> <given-names>K</given-names></name>
<name><surname>Zhang</surname> <given-names>X</given-names></name>
<name><surname>Ren</surname> <given-names>S</given-names></name>
<name><surname>Sun</surname> <given-names>J</given-names></name>
</person-group>. (<year>2016</year>). 
<article-title>Deep residual learning for image recognition</article-title>, in: <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>, <publisher-loc>Piscataway, NJ</publisher-loc>: 
<publisher-name>IEEE</publisher-name>. pp. <page-range>770&#x2013;8</page-range>.
</mixed-citation>
</ref>
<ref id="B32">
<label>32</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Simonyan</surname> <given-names>K</given-names></name>
<name><surname>Zisserman</surname> <given-names>A</given-names></name>
</person-group>. 
<article-title>Very deep convolutional networks for large-scale image recognition</article-title>. <source>arXiv</source>. (<year>2014</year>).
</mixed-citation>
</ref>
<ref id="B33">
<label>33</label>
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Cai</surname> <given-names>L</given-names></name>
<name><surname>Wu</surname> <given-names>M</given-names></name>
<name><surname>Chen</surname> <given-names>L</given-names></name>
<name><surname>Bai</surname> <given-names>W</given-names></name>
<name><surname>Yang</surname> <given-names>M</given-names></name>
<name><surname>Lyu</surname> <given-names>S</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). 
<article-title>Using guided self-attention with local information for polyp segmentation</article-title>, in: <conf-name>Lecture Notes in Computer Science (LNCS), International Conference on Medical Image Computing and Computer-Assisted Intervention (MICCAI 2022)</conf-name>, <publisher-loc>Cham</publisher-loc>: 
<publisher-name>Springer</publisher-name>. pp. <page-range>629&#x2013;38</page-range>.
</mixed-citation>
</ref>
<ref id="B34">
<label>34</label>
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Tomar</surname> <given-names>NK</given-names></name>
<name><surname>Jha</surname> <given-names>D</given-names></name>
<name><surname>Bagci</surname> <given-names>U</given-names></name>
<name><surname>Ali</surname> <given-names>S</given-names></name>
</person-group>. (<year>2022</year>). 
<article-title>Tganet: Text-guided attention for improved polyp segmentation</article-title>, in: <conf-name>Lecture Notes in Computer Science (LNCS), International Conference on Medical Image Computing and Computer-Assisted Intervention (MICCAI 2022)</conf-name>, <publisher-loc>Cham</publisher-loc>: 
<publisher-name>Springer</publisher-name>. pp. <page-range>151&#x2013;60</page-range>. PMID: <pub-id pub-id-type="pmid">36780239</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<label>35</label>
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>R</given-names></name>
<name><surname>Lai</surname> <given-names>P</given-names></name>
<name><surname>Wan</surname> <given-names>X</given-names></name>
<name><surname>Fan</surname> <given-names>D-J</given-names></name>
<name><surname>Gao</surname> <given-names>F</given-names></name>
<name><surname>Wu</surname> <given-names>X-J</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). 
<article-title>Lesion-aware dynamic kernel for polyp segmentation</article-title>, in: <conf-name>Lecture Notes in Computer Science (LNCS), International Conference on Medical Image Computing and Computer-Assisted Intervention (MICCAI 2022)</conf-name>, <publisher-loc>Cham</publisher-loc>: 
<publisher-name>Springer</publisher-name>. pp. <fpage>99</fpage>&#x2013;<lpage>109</lpage>.
</mixed-citation>
</ref>
<ref id="B36">
<label>36</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Sengar</surname> <given-names>SS</given-names></name>
<name><surname>Meulengracht</surname> <given-names>C</given-names></name>
<name><surname>Boesen</surname> <given-names>MP</given-names></name>
<name><surname>Overgaard</surname> <given-names>AF</given-names></name>
<name><surname>Gudbergsen</surname> <given-names>H</given-names></name>
<name><surname>Nybing</surname> <given-names>JD</given-names></name>
<etal/>
</person-group>. 
<article-title>Multi-planar 3d knee mri segmentation via unet inspired architectures</article-title>. <source>Int J Imaging Syst Technol</source>. (<year>2023</year>) <volume>33</volume>:<page-range>985&#x2013;98</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/ima.22836</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<label>37</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Singh</surname> <given-names>O</given-names></name>
<name><surname>Sengar</surname> <given-names>SS</given-names></name>
</person-group>. 
<article-title>Betternet: An efficient cnn architecture with residual learning and attention for precision polyp segmentation</article-title>. <source>arXiv</source>. (<year>2024</year>).
</mixed-citation>
</ref>
<ref id="B38">
<label>38</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Khan</surname> <given-names>M</given-names></name>
<name><surname>Fu</surname> <given-names>S</given-names></name>
<name><surname>Ullah</surname> <given-names>I</given-names></name>
</person-group>. 
<article-title>Attention-guided asymmetric multiscale polyp segmentation network</article-title>. <source>IEEE Trans Instrum Meas</source>. (<year>2025</year>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TIM.2025.3550626</pub-id>
</mixed-citation>
</ref>
<ref id="B39">
<label>39</label>
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Ronneberger</surname> <given-names>O</given-names></name>
<name><surname>Fischer</surname> <given-names>P</given-names></name>
<name><surname>Brox</surname> <given-names>T</given-names></name>
</person-group>. (<year>2015</year>). 
<article-title>U-net: Convolutional networks for biomedical image segmentation</article-title>, in: <conf-name>International Conference on Medical Image Computing and Computer-Assisted Intervention (MICCAI 2015)</conf-name>, <publisher-loc>Cham</publisher-loc>: 
<publisher-name>Springer</publisher-name>. pp. <page-range>234&#x2013;41</page-range>.
</mixed-citation>
</ref>
<ref id="B40">
<label>40</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Zhou</surname> <given-names>Z</given-names></name>
<name><surname>Rahman Siddiquee</surname> <given-names>MM</given-names></name>
<name><surname>Tajbakhsh</surname> <given-names>N</given-names></name>
<name><surname>Liang</surname> <given-names>J</given-names></name>
</person-group>. 
<article-title>Unet++: A nested u-net architecture for medical image segmentation</article-title>. In: <source>International workshop on deep learning in medical image analysis</source>. <publisher-loc>Cham</publisher-loc>: 
<publisher-name>Springer</publisher-name> (<year>2018</year>). p. <fpage>3</fpage>&#x2013;<lpage>11</lpage>. PMID: <pub-id pub-id-type="pmid">32613207</pub-id>
</mixed-citation>
</ref>
<ref id="B41">
<label>41</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>Z</given-names></name>
<name><surname>Liu</surname> <given-names>Q</given-names></name>
<name><surname>Wang</surname> <given-names>Y</given-names></name>
</person-group>. 
<article-title>Road extraction by deep residual u-net</article-title>. <source>IEEE Geosci Remote Sens Lett</source>. (<year>2018</year>) <volume>15</volume>:<page-range>749&#x2013;53</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/LGRS.2018.2802944</pub-id>
</mixed-citation>
</ref>
<ref id="B42">
<label>42</label>
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Fan</surname> <given-names>D-P</given-names></name>
<name><surname>Ji</surname> <given-names>G-P</given-names></name>
<name><surname>Zhou</surname> <given-names>T</given-names></name>
<name><surname>Chen</surname> <given-names>G</given-names></name>
<name><surname>Fu</surname> <given-names>H</given-names></name>
<name><surname>Shen</surname> <given-names>J</given-names></name>
<etal/>
</person-group>. (<year>2020</year>). 
<article-title>Pranet: Parallel reverse attention network for polyp segmentation</article-title>, in: <conf-name>International conference on medical image computing and computer-assisted intervention</conf-name>, <publisher-loc>Cham</publisher-loc>: 
<publisher-name>Springer</publisher-name>. pp. <page-range>263&#x2013;73</page-range>.
</mixed-citation>
</ref>
<ref id="B43">
<label>43</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Dosovitskiy</surname> <given-names>A</given-names></name>
<name><surname>Beyer</surname> <given-names>L</given-names></name>
<name><surname>Kolesnikov</surname> <given-names>A</given-names></name>
<name><surname>Weissenborn</surname> <given-names>D</given-names></name>
<name><surname>Zhai</surname> <given-names>X</given-names></name>
<name><surname>Unterthiner</surname> <given-names>T</given-names></name>
<etal/>
</person-group>. 
<article-title>An image is worth 16x16 words: Transformers for image recognition at scale</article-title>. <source>arXiv</source>. (<year>2020</year>).
</mixed-citation>
</ref>
<ref id="B44">
<label>44</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Chen</surname> <given-names>J</given-names></name>
<name><surname>Mei</surname> <given-names>J</given-names></name>
<name><surname>Li</surname> <given-names>X</given-names></name>
<name><surname>Lu</surname> <given-names>Y</given-names></name>
<name><surname>Yu</surname> <given-names>Q</given-names></name>
<name><surname>Wei</surname> <given-names>Q</given-names></name>
<etal/>
</person-group>. 
<article-title>Transunet: Rethinking the u-net architecture design for medical image segmentation through the lens of transformers</article-title>. <source>Med Image Anal</source>. (<year>2024</year>) <volume>97</volume>:<fpage>103280</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.media.2024.103280</pub-id>, PMID: <pub-id pub-id-type="pmid">39096845</pub-id>
</mixed-citation>
</ref>
<ref id="B45">
<label>45</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Khan</surname> <given-names>M</given-names></name>
<name><surname>Ullah</surname> <given-names>I</given-names></name>
<name><surname>Khan</surname> <given-names>N</given-names></name>
<name><surname>Hussain</surname> <given-names>S</given-names></name>
<name><surname>Khattak</surname> <given-names>MI</given-names></name>
</person-group>. 
<article-title>Adpnet: Attention-driven dual-path network for automated polyp segmentation in colonoscopy</article-title>. <source>Image Vision Comput</source>. (<year>2025</year>), <fpage>105648</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.imavis.2025.105648</pub-id>
</mixed-citation>
</ref>
<ref id="B46">
<label>46</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Vaswani</surname> <given-names>A</given-names></name>
<name><surname>Shazeer</surname> <given-names>N</given-names></name>
<name><surname>Parmar</surname> <given-names>N</given-names></name>
<name><surname>Uszkoreit</surname> <given-names>J</given-names></name>
<name><surname>Jones</surname> <given-names>L</given-names></name>
<name><surname>Gomez</surname> <given-names>AN</given-names></name>
<etal/>
</person-group>. 
<article-title>Attention is all you need</article-title>. <source>Adv Neural Inf Process Syst</source>. (<year>2017</year>) <volume>30</volume>.
</mixed-citation>
</ref>
<ref id="B47">
<label>47</label>
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Kirillov</surname> <given-names>A</given-names></name>
<name><surname>Mintun</surname> <given-names>E</given-names></name>
<name><surname>Ravi</surname> <given-names>N</given-names></name>
<name><surname>Mao</surname> <given-names>H</given-names></name>
<name><surname>Rolland</surname> <given-names>C</given-names></name>
<name><surname>Gustafson</surname> <given-names>L</given-names></name>
<etal/>
</person-group>. (<year>2023</year>). 
<article-title>Segment anything</article-title>, in: <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>,  <publisher-loc>Piscataway, NJ</publisher-loc>: 
<publisher-name>IEEE</publisher-name>. pp. <page-range>4015&#x2013;26</page-range>.
</mixed-citation>
</ref>
<ref id="B48">
<label>48</label>
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>H</given-names></name>
<name><surname>Zhang</surname> <given-names>D</given-names></name>
<name><surname>Yao</surname> <given-names>J</given-names></name>
<name><surname>Han</surname> <given-names>L</given-names></name>
<name><surname>Li</surname> <given-names>Z</given-names></name>
<name><surname>Han</surname> <given-names>J</given-names></name>
</person-group>. (<year>2024</year>). 
<article-title>Asps: Augmented segment anything model for polyp segmentation</article-title>, in: <conf-name>International Conference on Medical Image Computing and Computer-Assisted Intervention (MICCAI 2024)</conf-name>, <publisher-loc>Cham</publisher-loc>: 
<publisher-name>Springer</publisher-name>. pp. <page-range>118&#x2013;28</page-range>.
</mixed-citation>
</ref>
<ref id="B49">
<label>49</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>Y</given-names></name>
<name><surname>Hu</surname> <given-names>M</given-names></name>
<name><surname>Yang</surname> <given-names>X</given-names></name>
</person-group>. 
<article-title>Polyp-sam: Transfer sam for polyp segmentation</article-title>. In: <source>Medical imaging 2024: computer-aided diagnosis</source>, vol. <volume>12927</volume>. <publisher-loc>Bellingham, WA</publisher-loc>: 
<publisher-name>SPIE</publisher-name> (<year>2024</year>). p. <page-range>749&#x2013;54</page-range>.
</mixed-citation>
</ref>
<ref id="B50">
<label>50</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ravi</surname> <given-names>N</given-names></name>
<name><surname>Gabeur</surname> <given-names>V</given-names></name>
<name><surname>Hu</surname> <given-names>Y-T</given-names></name>
<name><surname>Hu</surname> <given-names>R</given-names></name>
<name><surname>Ryali</surname> <given-names>C</given-names></name>
<name><surname>Ma</surname> <given-names>T</given-names></name>
<etal/>
</person-group>. 
<article-title>Sam 2: Segment anything in images and videos</article-title>. <source>arXiv</source>. (<year>2024</year>).
</mixed-citation>
</ref>
<ref id="B51">
<label>51</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Ryali</surname> <given-names>C</given-names></name>
<name><surname>Hu</surname> <given-names>Y-T</given-names></name>
<name><surname>Bolya</surname> <given-names>D</given-names></name>
<name><surname>Wei</surname> <given-names>C</given-names></name>
<name><surname>Fan</surname> <given-names>H</given-names></name>
<name><surname>Huang</surname> <given-names>P-Y</given-names></name>
<etal/>
</person-group>. 
<article-title>Hiera: A hierarchical vision transformer without the bells-and-whistles</article-title>. In: <source>International Conference on Machine Learning (ICML)</source>. 
<publisher-name>PMLR</publisher-name> (<year>2023</year>). p. <page-range>29441&#x2013;54</page-range>.
</mixed-citation>
</ref>
<ref id="B52">
<label>52</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Houlsby</surname> <given-names>N</given-names></name>
<name><surname>Giurgiu</surname> <given-names>A</given-names></name>
<name><surname>Jastrzebski</surname> <given-names>S</given-names></name>
<name><surname>Morrone</surname> <given-names>B</given-names></name>
<name><surname>De Laroussilhe</surname> <given-names>Q</given-names></name>
<name><surname>Gesmundo</surname> <given-names>A</given-names></name>
<etal/>
</person-group>. 
<article-title>Parameter-efficient transfer learning for nlp</article-title>. In: <source>International Conference on Machine Learning (ICML)</source>. 
<publisher-name>PMLR</publisher-name> (<year>2019</year>). p. <page-range>2790&#x2013;9</page-range>.
</mixed-citation>
</ref>
<ref id="B53">
<label>53</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Qiu</surname> <given-names>Z</given-names></name>
<name><surname>Hu</surname> <given-names>Y</given-names></name>
<name><surname>Li</surname> <given-names>H</given-names></name>
<name><surname>Liu</surname> <given-names>J</given-names></name>
</person-group>. 
<article-title>Learnable ophthalmology sam</article-title>. <source>arXiv</source>. (<year>2023</year>).
</mixed-citation>
</ref>
<ref id="B54">
<label>54</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhou</surname> <given-names>Y</given-names></name>
<name><surname>Lei</surname> <given-names>T</given-names></name>
<name><surname>Liu</surname> <given-names>H</given-names></name>
<name><surname>Du</surname> <given-names>N</given-names></name>
<name><surname>Huang</surname> <given-names>Y</given-names></name>
<name><surname>Zhao</surname> <given-names>V</given-names></name>
<etal/>
</person-group>. 
<article-title>Mixture-of-experts with expert choice routing</article-title>. <source>Adv Neural Inf Process Syst</source>. (<year>2022</year>) <volume>35</volume>:<page-range>7103&#x2013;14</page-range>.
</mixed-citation>
</ref>
<ref id="B55">
<label>55</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Kittler</surname> <given-names>J</given-names></name>
</person-group>. 
<article-title>On the accuracy of the sobel edge detector</article-title>. <source>Image Vision Comput</source>. (<year>1983</year>) <volume>1</volume>:<fpage>37</fpage>&#x2013;<lpage>42</lpage>.
</mixed-citation>
</ref>
<ref id="B56">
<label>56</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Bernal</surname> <given-names>J</given-names></name>
<name><surname>S&#xe1;nchez</surname> <given-names>FJ</given-names></name>
<name><surname>Fern&#xe1;ndez-Esparrach</surname> <given-names>G</given-names></name>
<name><surname>Gil</surname> <given-names>D</given-names></name>
<name><surname>Rodr&#xed;guez</surname> <given-names>C</given-names></name>
<name><surname>Vilari&#xf1;o</surname> <given-names>F</given-names></name>
</person-group>. 
<article-title>Wm-dova maps for accurate polyp highlighting in colonoscopy: Validation vs. saliency maps from physicians</article-title>. <source>Comput Med Imaging Graphics</source>. (<year>2015</year>) <volume>43</volume>:<fpage>99</fpage>&#x2013;<lpage>111</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compmedimag.2015.02.007</pub-id>, PMID: <pub-id pub-id-type="pmid">25863519</pub-id>
</mixed-citation>
</ref>
<ref id="B57">
<label>57</label>
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Jha</surname> <given-names>D</given-names></name>
<name><surname>Smedsrud</surname> <given-names>PH</given-names></name>
<name><surname>Riegler</surname> <given-names>MA</given-names></name>
<name><surname>Halvorsen</surname> <given-names>P</given-names></name>
<name><surname>De Lange</surname> <given-names>T</given-names></name>
<name><surname>Johansen</surname> <given-names>D</given-names></name>
<etal/>
</person-group>. (<year>2019</year>). 
<article-title>Kvasir-seg: A segmented polyp dataset</article-title>, in: <conf-name>Lecture Notes in Computer Science (LNCS), International Conference on Multimedia Modeling (MMM 2019)</conf-name>, <publisher-loc>Cham</publisher-loc>: 
<publisher-name>Springer</publisher-name>. pp. <page-range>451&#x2013;62</page-range>.
</mixed-citation>
</ref>
<ref id="B58">
<label>58</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Chen</surname> <given-names>L-C</given-names></name>
<name><surname>Papandreou</surname> <given-names>G</given-names></name>
<name><surname>Schroff</surname> <given-names>F</given-names></name>
<name><surname>Adam</surname> <given-names>H</given-names></name>
</person-group>. 
<article-title>Rethinking atrous convolution for semantic image segmentation</article-title>. <source>arXiv</source>. (<year>2017</year>).
</mixed-citation>
</ref>
<ref id="B59">
<label>59</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Xiao</surname> <given-names>B</given-names></name>
<name><surname>Hu</surname> <given-names>J</given-names></name>
<name><surname>Li</surname> <given-names>W</given-names></name>
<name><surname>Pun</surname> <given-names>C-M</given-names></name>
<name><surname>Bi</surname> <given-names>X</given-names></name>
</person-group>. 
<article-title>Ctnet: Contrastive transformer network for polyp segmentation</article-title>. <source>IEEE Trans Cybern</source>. (<year>2024</year>) <volume>54</volume>:<page-range>5040&#x2013;53</page-range>. PMID: <pub-id pub-id-type="pmid">38470573</pub-id>
</mixed-citation>
</ref>
<ref id="B60">
<label>60</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Bernal</surname> <given-names>J</given-names></name>
<name><surname>S&#xe1;nchez</surname> <given-names>J</given-names></name>
<name><surname>Vilarino</surname> <given-names>F</given-names></name>
</person-group>. 
<article-title>Towards automatic polyp detection with a polyp appearance model</article-title>. <source>Pattern Recognit</source>. (<year>2012</year>) <volume>45</volume>:<page-range>3166&#x2013;82</page-range>.
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3067621">Sharib Ali</ext-link>, University of Leeds, United Kingdom</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1376257">Sandeep Singh Sengar</ext-link>, Cardiff Metropolitan University, United Kingdom</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3266947">Mukhtiar Khan</ext-link>, National Taiwan University, Taiwan</p></fn>
</fn-group>
</back>
</article>