<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article article-type="research-article" dtd-version="1.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Physiol.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Physiology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Physiol.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-042X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1792357</article-id>
<article-id pub-id-type="doi">10.3389/fphys.2026.1792357</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>CRC-Former: frequency-domain adaptive swin-transformer for colorectal cancer histopathology classification</article-title>
<alt-title alt-title-type="left-running-head">Chen et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fphys.2026.1792357">10.3389/fphys.2026.1792357</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Chen</surname>
<given-names>Lei</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Li</surname>
<given-names>Chenguang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Meng</surname>
<given-names>Fanqi</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Tai</surname>
<given-names>Jiandong</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1907939"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review and editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Wang</surname>
<given-names>Kun</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review and editing</role>
</contrib>
</contrib-group>
<aff id="aff1">
<label>1</label>
<institution>Department of Colorectal and Anal Surgery, General Surgery Center, The First Hospital of Jilin University</institution>, <city>Jilin</city>, <country country="CN">China</country>
</aff>
<aff id="aff2">
<label>2</label>
<institution>Department of Ophthalmology, The First Hospital of Jilin University</institution>, <city>Jilin</city>, <country country="CN">China</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Kun Wang, <email xlink:href="mailto:wangkun0626@jlu.edu.cn">wangkun0626@jlu.edu.cn</email>
</corresp>
<fn fn-type="equal" id="fn001">
<label>&#x2020;</label>
<p>These authors have contributed equally to this work</p>
</fn>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-24">
<day>24</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>17</volume>
<elocation-id>1792357</elocation-id>
<history>
<date date-type="received">
<day>20</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>01</day>
<month>02</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>06</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Chen, Li, Meng, Tai and Wang.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Chen, Li, Meng, Tai and Wang</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-24">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Colorectal cancer (CRC) diagnosis from whole-slide histopathology images remains challenging due to pronounced tissue heterogeneity, multi-scale morphological variations, and the subtle nature of early neoplastic changes. While deep learning models have shown promise, conventional architectures struggle to simultaneously capture fine-grained texture cues and global architectural context, often overlooking diagnostically critical frequency-domain signatures.</p>
</sec>
<sec>
<title>Methods</title>
<p>To address these limitations, we propose CRC-Former, a novel hybrid architecture that synergistically integrates frequency-aware representation learning with efficient cross-scale sequence modeling. Specifically, CRC-Former introduces two key components: (i) a Frequency-aware Global-Local Transformer Block (FGT), which decomposes features via Haar wavelet transform and applies orientation-specific sliding-window attention in distinct subbands to enhance sensitivity to multi-directional pathological textures; and (ii) a Cross-Scale Mamba Block (CSM), which leverages selective state-space modeling to fuse hierarchical features across resolutions with linear complexity.</p>
</sec>
<sec>
<title>Results</title>
<p>Evaluated on the large-scale Chaoyang CRC dataset, CRC-Former achieves state-of-the-art performance, outperforming strong baselines.</p>
</sec>
<sec>
<title>Discussion</title>
<p>Our work demonstrates that explicit integration of signal processing priors with modern sequence modeling offers a powerful paradigm for robust, interpretable, and scalable computational pathology.</p>
</sec>
</abstract>
<kwd-group>
<kwd>colorectal cancer</kwd>
<kwd>Haar wavelet transform</kwd>
<kwd>histopathology image classification</kwd>
<kwd>state-space model</kwd>
<kwd>swin-transformer</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was supported by Jilin Province Medical and Health Talent Special Program (Grant number: JLSRCZX2026-42) and Excellent Physician A-position Fund of the First Hospital of Jilin University (Grant number: A1824).</funding-statement>
</funding-group>
<counts>
<fig-count count="5"/>
<table-count count="2"/>
<equation-count count="12"/>
<ref-count count="36"/>
<page-count count="00"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Computational Physiology and Medicine</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Colorectal cancer (CRC) ranks as the third most prevalent malignancy and a leading cause of cancer-related death globally, with early detection playing a decisive role in therapeutic planning and long-term survival <xref ref-type="bibr" rid="B3">Anusha and Reddy (2025)</xref>, <xref ref-type="bibr" rid="B5">Attallah (2025)</xref>. Histopathological examination of hematoxylin-and-eosin-stained whole-slide images (WSIs) remains the clinical gold standard for diagnosis, offering critical insights into tumor morphology, architectural organization, and cellular atypia. However, manual interpretation of gigapixel-scale WSIs is labor-intensive, inherently subjective, and susceptible to inter-observer variability&#x2014;challenges further amplified by the pronounced tissue heterogeneity, multi-scale morphological diversity ranging from subcellular nuclear pleomorphism to glandular disarray, and subtle histological signatures of early neoplasia that characterize CRC progression. While deep learning has emerged as a powerful tool for computational pathology, conventional convolutional neural networks (CNNs) <xref ref-type="bibr" rid="B4">Anwar et al. (2018)</xref>, <xref ref-type="bibr" rid="B32">Xu et al. (2019)</xref>, <xref ref-type="bibr" rid="B2">Alzubaidi et al. (2021)</xref> are fundamentally constrained by their local receptive fields, limiting their capacity to model long-range spatial dependencies essential for contextualizing focal dysplastic changes within broader tissue architecture. Vision Transformers <xref ref-type="bibr" rid="B10">Dosovitskiy (2020)</xref>, <xref ref-type="bibr" rid="B17">Liu et al. (2021)</xref>, <xref ref-type="bibr" rid="B30">Wang et al. (2021)</xref>, <xref ref-type="bibr" rid="B31">Wu et al. 
(2025)</xref> alleviate this limitation through global self-attention but introduce prohibitive quadratic computational complexity and often overlook diagnostically rich frequency-domain cues such as orientation-specific textures, edge sharpness, and structural regularity that are highly informative yet frequently suppressed in purely spatial pipelines. As illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref>, colorectal cancer diagnosis demands precise discrimination among visually similar yet clinically distinct entities: normal mucosa, serrated polyps (often precursors), adenomas (benign neoplasms), and invasive adenocarcinomas. The challenge lies in detecting subtle, localized deviations from mild nuclear atypia in early adenomas to complex glandular disarray in poorly differentiated carcinomas, that are easily overlooked by models relying solely on global features. Recent studies have highlighted the value of wavelet-based decomposition in medical imaging, where fixed transforms like Haar wavelets <xref ref-type="bibr" rid="B15">Haar (1909)</xref> provide an interpretable, shift-sensitive prior for disentangling multi-resolution features: low-frequency components capture coarse tissue layout, while high-frequency subbands explicitly encode horizontal, vertical, and diagonal edges corresponding to biologically meaningful structures including crypt alignment, stromal invasion, and nuclear membranes. Concurrently, state-space models such as Mamba <xref ref-type="bibr" rid="B13">Gu and Dao (2023)</xref>, <xref ref-type="bibr" rid="B23">Ma et al. (2024)</xref>, <xref ref-type="bibr" rid="B27">Ruan and Xiang (2024)</xref>, <xref ref-type="bibr" rid="B19">Liu J. et al. (2024)</xref> offer a promising alternative to attention mechanisms by enabling selective, data-dependent propagation of information across long sequences with linear complexity, which is ideal for fusing hierarchical features from multi-scale pathology representations. 
Motivated by these complementary advances, we propose CRC-Former, a novel hybrid architecture that synergistically integrates frequency-aware representation learning and efficient cross-scale sequence modeling. Specifically, CRC-Former introduces two key innovations: first, a Frequency-aware Global-Local Transformer Block (FGT), which decomposes intermediate features via Haar wavelet transform and applies orientation-adaptive sliding-window attention within distinct subbands to enhance sensitivity to multi-directional pathological textures; and second, a Cross-Scale Mamba Block (CSM), which leverages selective state-space dynamics to fuse features across all spatial resolutions in a context-aware and computationally efficient manner. Together, these modules enable CRC-Former to simultaneously capture fine-grained cytoarchitectural anomalies and global tissue-level abnormalities, thereby addressing the core challenges of robust and scalable CRC classification in digital pathology.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Representative H&#x0026;E-stained histopathology patches from the four diagnostic categories in the Chaoyang dataset, representing (from left to right): normal mucosa, serrated lesion, adenocarcinoma, and adenoma. The increasing architectural and cytological atypia illustrate the challenges in automated CRC classification.</p>
</caption>
<graphic xlink:href="fphys-17-1792357-g001.tif">
<alt-text content-type="machine-generated">Four pairs of histopathology slides display stained colon tissue under a microscope, labeled as normal, serrated, adenocarcinoma, and adenoma. Each category shows characteristic glandular and cellular structures relevant to the diagnosis.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<p>Convolutional neural networks (CNNs) have historically formed the backbone of computer-aided diagnosis (CAD) systems in medical imaging <xref ref-type="bibr" rid="B24">Masood et al. (2020)</xref>. Architectures such as ResNet <xref ref-type="bibr" rid="B16">He et al. (2016)</xref> and its many derivatives have been widely adopted for tasks ranging from anatomical structure classification to pathological lesion identification <xref ref-type="bibr" rid="B8">Cheema et al. (2019)</xref>. Their strength lies in hierarchical feature learning and efficient extraction of local spatial patterns. Nevertheless, CNNs are inherently constrained by their localized receptive fields, limiting their ability to model long-range contextual interactions or exploit frequency-domain characteristics. To address these limitations, Vision Transformers (ViTs) <xref ref-type="bibr" rid="B10">Dosovitskiy (2020)</xref> introduced a paradigm shift by leveraging self-attention mechanisms to capture global dependencies across image patches. Subsequent variants like the Swin Transformer <xref ref-type="bibr" rid="B17">Liu et al. (2021)</xref> further enhanced practicality through hierarchical feature representation and localized window-based attention, balancing global context with computational feasibility. However, their self-attention operation incurs quadratic computational complexity with respect to sequence length. Moreover, conventional ViT architectures predominantly operate in the spatial domain and do not explicitly incorporate or leverage frequency-domain information, which could offer complementary cues to improve both model robustness and interpretability. More recently, State Space Models (SSMs) have emerged as a scalable alternative for modeling long-range dependencies with linear computational complexity <xref ref-type="bibr" rid="B14">Gu et al. (2021)</xref>, <xref ref-type="bibr" rid="B13">Gu and Dao (2023)</xref>. 
Building on this foundation, frameworks such as Mamba and their vision-specific adaptations have demonstrated competitive performance as visual backbones <xref ref-type="bibr" rid="B19">Liu J. et al. (2024)</xref>, <xref ref-type="bibr" rid="B21">Liu Y. et al. (2024)</xref>. These models efficiently integrate global contextual information while maintaining favorable computational properties, making them particularly well-suited for large-scale, high-resolution medical imaging tasks. Their capacity to jointly achieve efficiency, expressiveness, and scalability positions SSM-based architectures as promising candidates for advancing histopathology image classification.</p>
</sec>
<sec sec-type="materials|methods" id="s3">
<label>3</label>
<title>Materials and methods</title>
<sec id="s3-1">
<label>3.1</label>
<title>Datasets</title>
<p>We evaluate our method on the Chaoyang Dataset <xref ref-type="bibr" rid="B35">Zhu et al. (2021)</xref>, a large-scale collection of whole-slide images (WSIs) retrospectively curated from routine clinical practice at Beijing Chaoyang Hospital. Representative non-overlapping patches of size <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mn>512</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>512</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> pixels were extracted from diagnostically relevant regions under the supervision of board-certified pathologists. The dataset comprises four clinically significant classes: Normal mucosa, Serrated lesions (including hyperplastic polyps and sessile serrated adenomas), Adenoma (tubular/tubulovillous), and Adenocarcinoma. In total, 6,160 annotated patches are included, distributed as follows: 1,816 Normal, 1,163 Serrated, 2,244 Adenocarcinoma, and 937 Adenoma. To ensure fair comparison, we adopt the standard train and test partition: the training set contains 1,111 Normal, 842 Serrated, 1,404 Adenocarcinoma, and 664 Adenoma samples (total &#x3d; 4,021); the test set includes 705 Normal, 321 Serrated, 840 Adenocarcinoma, and 273 Adenoma samples (total &#x3d; 2,139). No overlap exists between training and test slides at the patient level, thereby mitigating data leakage and enabling assessment of generalization to unseen individuals. All experiments are conducted on this standardized split. <xref ref-type="fig" rid="F1">Figure 1</xref> shows representative samples of the Chaoyang dataset.</p>
</sec>
<sec id="s3-2">
<label>3.2</label>
<title>Preliminaries: Haar wavelet transform</title>
<p>The Haar wavelet transform <xref ref-type="bibr" rid="B15">Haar (1909)</xref> provides a computationally efficient, orthogonal multiresolution decomposition that is particularly well-suited for capturing localized intensity discontinuities&#x2014;such as cell boundaries, nuclear membranes, and glandular edges&#x2014;common in histopathological images. Given a 1D discrete signal <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>J</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> (with <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mi>J</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="double-struck">N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>), the Haar transform recursively computes approximation (scaling) coefficients <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and detail (wavelet) coefficients <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> at scale <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0,1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>J</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> via <xref ref-type="disp-formula" rid="e1">Equations 1</xref>, <xref ref-type="disp-formula" rid="e2">2</xref>:<disp-formula id="e1">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>n</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
<disp-formula id="e2">
<mml:math id="m8">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>n</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>where <inline-formula id="inf7">
<mml:math id="m9">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>J</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes the original signal, and <inline-formula id="inf8">
<mml:math id="m10">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0,1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. The inverse transform reconstructs <inline-formula id="inf9">
<mml:math id="m11">
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> exactly from <inline-formula id="inf10">
<mml:math id="m12">
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
<mml:mo>&#x222a;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>J</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula>.</p>
<p>For 2D images <inline-formula id="inf11">
<mml:math id="m13">
<mml:mrow>
<mml:mi mathvariant="bold">X</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> (assuming <inline-formula id="inf12">
<mml:math id="m14">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are powers of two for simplicity), the 2D Haar transform applies the 1D decomposition along rows and columns successively. At each level, it yields four subbands <inline-formula id="inf13">
<mml:math id="m15">
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold">A</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">H</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">V</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">D</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf14">
<mml:math id="m16">
<mml:mrow>
<mml:mi mathvariant="bold">A</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the low-frequency approximation (capturing coarse tissue architecture), while <inline-formula id="inf15">
<mml:math id="m17">
<mml:mrow>
<mml:mi mathvariant="bold">H</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf16">
<mml:math id="m18">
<mml:mrow>
<mml:mi mathvariant="bold">V</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf17">
<mml:math id="m19">
<mml:mrow>
<mml:mi mathvariant="bold">D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> represent horizontal, vertical, and diagonal detail coefficients, respectively&#x2014;encoding fine-scale texture, edge orientation, and structural irregularities. This explicit separation of spatial frequencies enables pathology-aware feature disentanglement: for instance, dysplastic nuclei often manifest as high-magnitude responses in diagonal/high-frequency bands, whereas tumor-stroma interfaces are reflected in vertical/horizontal edges. Owing to its simplicity, invertibility, and sensitivity to abrupt intensity changes, the Haar wavelet serves as an effective prior for modeling the multi-scale heterogeneity inherent in colorectal cancer histology.</p>
</sec>
<sec id="s3-3">
<label>3.3</label>
<title>Overview of CRC-Former</title>
<p>We present CRC-Former, a hierarchical deep architecture designed for whole-slide image (WSI) classification in colorectal cancer pathology. As illustrated in <xref ref-type="fig" rid="F2">Figure 2a</xref>, the network processes an input histopathology image <inline-formula id="inf18">
<mml:math id="m20">
<mml:mrow>
<mml:mi mathvariant="bold">X</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> through a multi-stage pipeline that progressively extracts and refines multi-scale, frequency-aware contextual features. The pipeline begins with a Patch Embedding layer that partitions <inline-formula id="inf19">
<mml:math id="m21">
<mml:mrow>
<mml:mi mathvariant="bold">X</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> into non-overlapping patches of size <inline-formula id="inf20">
<mml:math id="m22">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, followed by linear projection to obtain initial patch embeddings. These are then processed by a sequence of four Frequency-aware Global&#x2013;Local Transformer Blocks (FGT Blocks), each operating at successively coarser spatial resolutions (<inline-formula id="inf21">
<mml:math id="m23">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>4</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf22">
<mml:math id="m24">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>8</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf23">
<mml:math id="m25">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>16</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf24">
<mml:math id="m26">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>32</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>). Between consecutive FGT blocks, a Patch Merging operation reduces spatial dimensionality while doubling the channel depth&#x2014;enabling efficient hierarchical feature abstraction. Each FGT block integrates Haar wavelet decomposition to decompose local patch representations into multi-frequency subbands, which are then fused via hybrid attention mechanisms to capture both global context and orientation-sensitive texture cues. After the final FGT stage, the high-level, low-resolution feature map is fed into a Cross-Scale Mamba Block (CSM Block), which serves as a global aggregator. Unlike conventional transformers or CNNs, the CSM Block leverages selective state-space modeling to dynamically fuse information across all previous resolution levels&#x2014;effectively integrating fine-grained textural details from early stages with coarse semantic patterns from deeper layers. This enables the model to reason about long-range tissue organization while preserving discriminative local morphological signals. Finally, a global average pooling operation is applied, followed by a fully connected Classification Head to produce the predicted class probabilities. The entire architecture is end-to-end trainable and maintains linear computational complexity with respect to sequence length, making it scalable to gigapixel WSIs without sacrificing representational power.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>
<bold>(a)</bold> Overview of the proposed CRC-Former: a hierarchical architecture with four Frequency-aware Global-Local Transformer (FGT) blocks and a Cross-Scale Mamba (CSM) block for global aggregation. <bold>(b)</bold> CSM block detail: fuses multi-scale features via downsampling, S6 processing, and element-wise addition to the finest-scale output.</p>
</caption>
<graphic xlink:href="fphys-17-1792357-g002.tif">
<alt-text content-type="machine-generated">Figure contains two labeled sections, (a) and (b). Section (a) is a flowchart illustrating a deep learning pipeline for image processing, beginning with a histology image, followed by patch embedding, alternating FGT blocks and patch merging, then a CSM block, classification head, and output. Section (b) details the CSM block, showing multi-scale feature extraction, down-sampling, an S6 process, element-wise addition, and output. Color legend identifies CSM Block in green, FGT Block in blue, and the element-wise addition symbol.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3-4">
<label>3.4</label>
<title>Cross-Scale Mamba Block</title>
<p>The Cross-Scale Mamba Block (CSM, <xref ref-type="fig" rid="F2">Figure 2b</xref>) serves as the global aggregation module of CRC-Former, designed to fuse multi-resolution features extracted by the preceding four FGT stages into a unified, context-aware representation. Formally, let <inline-formula id="inf25">
<mml:math id="m27">
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denote the feature maps output by the FGT blocks at resolutions <inline-formula id="inf26">
<mml:math id="m28">
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf27">
<mml:math id="m29">
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf28">
<mml:math id="m30">
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf29">
<mml:math id="m31">
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>, respectively, where each <inline-formula id="inf30">
<mml:math id="m32">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. To enable cross-scale interaction, all four feature maps are first downsampled via bilinear interpolation (or strided convolution) to match the coarsest spatial resolution <inline-formula id="inf31">
<mml:math id="m33">
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>, yielding (<xref ref-type="disp-formula" rid="e3">Equation 3</xref>):<disp-formula id="e3">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>DownSample</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1,2,3,4</mml:mn>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>where <inline-formula id="inf32">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. These aligned representations are then concatenated along the channel dimension, which is given by <xref ref-type="disp-formula" rid="e4">Equation 4</xref>:<disp-formula id="e4">
<mml:math id="m36">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cat</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>The concatenated tensor <inline-formula id="inf33">
<mml:math id="m37">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cat</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is reshaped into a sequence <inline-formula id="inf34">
<mml:math id="m38">
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf35">
<mml:math id="m39">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x22c5;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula> denotes the number of spatial locations, and <inline-formula id="inf36">
<mml:math id="m40">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the total channel depth. This sequence is then processed by a Bidirectional S6 Layer (Bi-Mamba), which applies two parallel selective state-space models&#x2014;one forward and one backward&#x2014;to capture long-range dependencies in both spatial directions. The output sequence <inline-formula id="inf37">
<mml:math id="m41">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is reshaped back into a 3D tensor <inline-formula id="inf38">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>mamba</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>To preserve the identity of the finest-scale representation while incorporating cross-scale context, we extract from <inline-formula id="inf39">
<mml:math id="m43">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>mamba</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> only the channels corresponding to the original <inline-formula id="inf40">
<mml:math id="m44">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (i.e., the last <inline-formula id="inf41">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> channels), denoted as <inline-formula id="inf42">
<mml:math id="m46">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>mamba</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. This is added element-wise to the original <inline-formula id="inf43">
<mml:math id="m47">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to form the final output (<xref ref-type="disp-formula" rid="e5">Equation 5</xref>):<disp-formula id="e5">
<mml:math id="m48">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>out</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>mamba</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<p>This design ensures that the CSM block enhances the discriminative power of the coarsest-scale features through globally aware, cross-resolution context modeling&#x2014;without disrupting the hierarchical structure or introducing excessive computational overhead. The use of Bi-Mamba enables efficient, linear-complexity aggregation across all scales, making the CSM block particularly suitable for histopathology analysis.</p>
</sec>
<sec id="s3-5">
<label>3.5</label>
<title>Frequency-aware Global-Local Transformer Block</title>
<p>The Frequency-aware Global-Local Transformer Block (FGT, <xref ref-type="fig" rid="F3">Figure 3</xref>) is the core building block of CRC-Former&#x2019;s backbone, designed to jointly model global contextual dependencies and multi-directional local texture patterns via Haar wavelet-guided attention. As depicted in <xref ref-type="fig" rid="F3">Figure 3</xref>, each FGT block processes an input feature map <inline-formula id="inf44">
<mml:math id="m49">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>in</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> at a fixed spatial resolution <inline-formula id="inf45">
<mml:math id="m50">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and produces an output <inline-formula id="inf46">
<mml:math id="m51">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>out</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> through a hierarchical frequency decomposition and fusion pipeline.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Architecture of the proposed Frequency-aware Global-Local Transformer (FGT) block. It decomposes input features via Haar wavelet transform, applies orientation- and scale-specific multi-head self-attention (e.g., <inline-formula id="inf47">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>-MSA for global context, <inline-formula id="inf48">
<mml:math id="m53">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>-MSA for horizontal textures), then reconstructs fused features via inverse transform.</p>
</caption>
<graphic xlink:href="fphys-17-1792357-g003.tif">
<alt-text content-type="machine-generated">Diagram illustrating the architecture of an FGT Block for feature transformation in medical imaging, showing sequential operations including input and output feature images, wavelet and inverse wavelet transforms, four types of multi-head self-attention modules, layer normalization, and feedforward networks, with an accompanying legend for module abbreviations.</alt-text>
</graphic>
</fig>
<p>First, the input feature map undergoes Layer Normalization, followed by a Haar Wavelet Transform that decomposes it into four orthogonal subbands, which is given by <xref ref-type="disp-formula" rid="e6">Equation 6</xref>:<disp-formula id="e6">
<mml:math id="m54">
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>WT</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>in</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>where <inline-formula id="inf49">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represents the low-frequency approximation (coarse tissue structure), while <inline-formula id="inf50">
<mml:math id="m56">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> encode horizontal, vertical, and diagonal high-frequency details (e.g., glandular edges, nuclear borders), respectively. Each subband is then processed independently by a dedicated multi-head self-attention (MSA) module with spatially constrained windowing (<xref ref-type="fig" rid="F4">Figure 4</xref>):<list list-type="bullet">
<list-item>
<p>
<inline-formula id="inf51">
<mml:math id="m57">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>-MSA: Applies big-window MSA over the entire <inline-formula id="inf52">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to capture long-range global context;</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf53">
<mml:math id="m59">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">H</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>-MSA: Applies horizontal-stripe MSA along rows of <inline-formula id="inf54">
<mml:math id="m60">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to enhance sensitivity to horizontally oriented textures;</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf55">
<mml:math id="m61">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">V</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>-MSA: Applies vertical-stripe MSA along columns of <inline-formula id="inf56">
<mml:math id="m62">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to emphasize vertically oriented structures;</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf57">
<mml:math id="m63">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">S</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>-MSA: Applies small-window MSA over <inline-formula id="inf58">
<mml:math id="m64">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to preserve fine-grained, localized textural anomalies.</p>
</list-item>
</list>
</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Detailed illustration of the four specialized MSA variants. The Window-Shifting and Stripe-Shifting operations are visualized via red dashed arrows, demonstrating how shifted windows/stripes enable cross-window communication, which is critical for maintaining connectivity across partitioned regions without introducing extra parameters. <bold>(a)</bold> WB-MSA. <bold>(b)</bold> WS-MSA. <bold>(c)</bold> SV-MSA. <bold>(d)</bold> SH-MSA.</p>
</caption>
<graphic xlink:href="fphys-17-1792357-g004.tif">
<alt-text content-type="machine-generated">Diagram with four labeled panels compares block shifting mechanisms: (a) big-window with window-shifting, (b) small-window with window-shifting, (c) vertical-stripe shifting, and (d) horizontal-stripe shifting. Each contains arrows and highlighted blocks indicating shift operations.</alt-text>
</graphic>
</fig>
<p>These frequency-specific attention outputs are concatenated and passed through an Inverse Wavelet Transform to reconstruct a fused feature map <inline-formula id="inf59">
<mml:math id="m65">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>fused</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. A residual connection from the original <inline-formula id="inf60">
<mml:math id="m66">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>in</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is added before applying Layer Normalization and a Feedforward Network (FFN), yielding the final output of the process (<xref ref-type="disp-formula" rid="e7">Equation 7</xref>):<disp-formula id="e7">
<mml:math id="m67">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>out</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>FFN</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mtext>LN</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>fused</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>in</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
<p>To further enhance representational capacity, the FGT block is stacked <inline-formula id="inf61">
<mml:math id="m68">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> times within each stage, enabling progressive refinement of both global architecture and local texture cues. The use of stripe attention (denoted by prefix &#x201c;S&#x201d;) ensures efficient computation while preserving orientation-sensitive discriminative power, critical for distinguishing subtle histopathological phenotypes in colorectal cancer.</p>
</sec>
</sec>
<sec sec-type="results" id="s4">
<label>4</label>
<title>Results</title>
<sec id="s4-1">
<label>4.1</label>
<title>Implementation details</title>
<p>To ensure a fair and reproducible comparison with prior work <xref ref-type="bibr" rid="B20">Liu S. et al. (2024)</xref>, we strictly adhere to the same training protocol throughout our experiments. Input patches are resized to <inline-formula id="inf62">
<mml:math id="m69">
<mml:mrow>
<mml:mn>224</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>224</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> pixels, a standard resolution widely used in vision-based histopathology analysis. During preprocessing, we apply only minimal augmentation: random horizontal flipping, and channel-wise normalization using ImageNet-derived statistics (<inline-formula id="inf63">
<mml:math id="m70">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0.485</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0.456</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0.406</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf64">
<mml:math id="m71">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0.229</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0.224</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0.225</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>) <xref ref-type="bibr" rid="B9">Deng et al. (2009)</xref>. No advanced augmentation strategies are employed, thereby isolating architectural improvements from data manipulation effects. The model is optimized using the Adam optimizer <xref ref-type="bibr" rid="B1">Adam (2014)</xref> with an initial learning rate of <inline-formula id="inf65">
<mml:math id="m72">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The learning rate is decayed over time via cosine annealing without restarts, where the period parameter <inline-formula id="inf66">
<mml:math id="m73">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is set to 10 epochs. Training proceeds for a fixed budget of 300 epochs with a batch size of 32, striking a balance between memory constraints and gradient stability. All experiments are implemented in PyTorch <xref ref-type="bibr" rid="B25">Paszke et al. (2019)</xref> and executed on a single NVIDIA A100 GPU with 40 GB of memory. We report results averaged over three independent runs with different random seeds to account for stochastic variability.</p>
</sec>
<sec id="s4-2">
<label>4.2</label>
<title>Evaluation metrics</title>
<p>To ensure a comprehensive and robust assessment, we employ multiple complementary evaluation metrics. Specifically, we report five standard classification metrics: Accuracy (Acc) <xref ref-type="bibr" rid="B26">Powers (2020)</xref>, AUC <xref ref-type="bibr" rid="B7">Bradley (1997)</xref>, Precision, Recall <xref ref-type="bibr" rid="B6">Baeza-Yates et al. (1999)</xref>, and F1 score <xref ref-type="bibr" rid="B28">Sokolova and Lapalme (2009)</xref>. Accuracy reflects the overall proportion of correctly classified samples and remains a fundamental indicator in diagnostic tasks. Precision and Recall, widely used in medical AI, assess different aspects of predictive quality: Precision measures the fraction of true positive predictions among all samples predicted as positive, while Recall quantifies the model&#x2019;s ability to identify all actual positive cases. Since these two metrics often exhibit a trade-off&#x2014;improving one may degrade the other&#x2014;we further adopt the F1 score, defined as their harmonic mean, to provide a balanced evaluation of classification performance. In addition, we evaluate model discriminability using the receiver operating characteristic (ROC) curve and the corresponding area under the curve (AUC), which offer threshold-invariant measures of diagnostic capability across all classes. These evaluation metrics are summarized as the following <xref ref-type="disp-formula" rid="e8">Equations 8</xref>&#x2013;<xref ref-type="disp-formula" rid="e12">12</xref>:<disp-formula id="e8">
<mml:math id="m74">
<mml:mrow>
<mml:mtext>Accuracy</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mtext>ACC</mml:mtext>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>TN</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>TN</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>
<disp-formula id="e9">
<mml:math id="m75">
<mml:mrow>
<mml:mtext>Precision</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FP</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
<disp-formula id="e10">
<mml:math id="m76">
<mml:mrow>
<mml:mtext>Recall</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>
<disp-formula id="e11">
<mml:math id="m77">
<mml:mrow>
<mml:mtext>F</mml:mtext>
<mml:mn>1</mml:mn>
<mml:mtext>&#x2009;Score</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#x22c5;</mml:mo>
<mml:mtext>Precision</mml:mtext>
<mml:mo>&#x22c5;</mml:mo>
<mml:mtext>Recall</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>Precision</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>Recall</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>
<disp-formula id="e12">
<mml:math id="m78">
<mml:mrow>
<mml:mtext>AUC</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mtext>positiveClass</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mtext>rank</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>M</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>where <inline-formula id="inf67">
<mml:math id="m79">
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the number of positive samples, <inline-formula id="inf68">
<mml:math id="m80">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the number of negative samples, and <inline-formula id="inf69">
<mml:math id="m81">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>rank</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the rank of sample <inline-formula id="inf70">
<mml:math id="m82">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> based on the model&#x2019;s predicted probability (sorted in descending order).</p>
</sec>
<sec id="s4-3">
<label>4.3</label>
<title>Experimental results and analysis</title>
<p>
<xref ref-type="table" rid="T1">Table 1</xref> presents a comprehensive comparison of our proposed CRC-Former against the representative architectures including ResNet101 <xref ref-type="bibr" rid="B16">He et al. (2016)</xref>, EfficientNet <xref ref-type="bibr" rid="B29">Tan and Le (2019)</xref>, ViT-B <xref ref-type="bibr" rid="B10">Dosovitskiy (2020)</xref>, Swin-S <xref ref-type="bibr" rid="B17">Liu et al. (2021)</xref>, ConvNext <xref ref-type="bibr" rid="B18">Liu et al. (2022)</xref>, InceptionNext <xref ref-type="bibr" rid="B33">Yu et al. (2024)</xref>, TransXNet <xref ref-type="bibr" rid="B22">Lou et al. (2025)</xref>, BiFormer <xref ref-type="bibr" rid="B36">Zhu et al. (2023)</xref>, GroupMixFormer <xref ref-type="bibr" rid="B12">Ge et al. (2023)</xref>, Eff-CTM <xref ref-type="bibr" rid="B20">Liu S. et al. (2024)</xref>, MedMamba <xref ref-type="bibr" rid="B34">Yue and Li (2024)</xref> and SBTAYLOR-KAN <xref ref-type="bibr" rid="B11">Fatema et al. (2025)</xref>, across five standard classification metrics: Accuracy, F1 score, Precision, Recall, and AUC. As shown, CRC-Former achieves state-of-the-art performance on the Chaoyang colorectal histopathology dataset, outperforming all baseline models in every metric.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Quantitative performance comparison results with twelve classic SOTA methods on the Chaoyang dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Model</th>
<th align="center">Year</th>
<th align="center">Accuracy(%) <inline-formula id="inf71">
<mml:math id="m83">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">F1(%) <inline-formula id="inf72">
<mml:math id="m84">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">Precision(%) <inline-formula id="inf73">
<mml:math id="m85">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">Recall(%) <inline-formula id="inf74">
<mml:math id="m86">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">AUC(%) <inline-formula id="inf75">
<mml:math id="m87">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">ResNet101</td>
<td align="center">2016</td>
<td align="center">83.92</td>
<td align="center">77.32</td>
<td align="center">79.61</td>
<td align="center">76.48</td>
<td align="center">85.46</td>
</tr>
<tr>
<td align="left">EfficientNet-B0</td>
<td align="center">2019</td>
<td align="center">84.71</td>
<td align="center">79.57</td>
<td align="center">80.33</td>
<td align="center">78.94</td>
<td align="center">86.86</td>
</tr>
<tr>
<td align="left">ViT-B</td>
<td align="center">2021</td>
<td align="center">81.53</td>
<td align="center">76.02</td>
<td align="center">77.59</td>
<td align="center">74.94</td>
<td align="center">84.22</td>
</tr>
<tr>
<td align="left">Swin-S</td>
<td align="center">2021</td>
<td align="center">85.13</td>
<td align="center">80.29</td>
<td align="center">81.09</td>
<td align="center">78.93</td>
<td align="center">87.44</td>
</tr>
<tr>
<td align="left">ConvNext-S</td>
<td align="center">2022</td>
<td align="center">78.35</td>
<td align="center">71.89</td>
<td align="center">71.55</td>
<td align="center">72.42</td>
<td align="center">82.62</td>
</tr>
<tr>
<td align="left">InceptionNext</td>
<td align="center">2023</td>
<td align="center">84.81</td>
<td align="center">79.70</td>
<td align="center">80.23</td>
<td align="center">79.25</td>
<td align="center">87.05</td>
</tr>
<tr>
<td align="left">TransXNet</td>
<td align="center">2023</td>
<td align="center">84.53</td>
<td align="center">79.17</td>
<td align="center">80.20</td>
<td align="center">78.43</td>
<td align="center">86.60</td>
</tr>
<tr>
<td align="left">BiFormer</td>
<td align="center">2023</td>
<td align="center">83.12</td>
<td align="center">76.28</td>
<td align="center">78.60</td>
<td align="center">75.91</td>
<td align="center">85.03</td>
</tr>
<tr>
<td align="left">GroupMixFormer</td>
<td align="center">2023</td>
<td align="center">85.09</td>
<td align="center">79.78</td>
<td align="center">80.65</td>
<td align="center">79.24</td>
<td align="center">87.07</td>
</tr>
<tr>
<td align="left">Eff-CTM</td>
<td align="center">2024</td>
<td align="center">86.30</td>
<td align="center">81.87</td>
<td align="center">81.69</td>
<td align="center">82.16</td>
<td align="center">88.82</td>
</tr>
<tr>
<td align="left">MedMamba</td>
<td align="center">2024</td>
<td align="center">85.01</td>
<td align="center">79.67</td>
<td align="center">80.59</td>
<td align="center">79.20</td>
<td align="center">86.99</td>
</tr>
<tr>
<td align="left">SBTAYLOR-KAN</td>
<td align="center">2025</td>
<td align="center">84.62</td>
<td align="center">79.22</td>
<td align="center">80.15</td>
<td align="center">78.83</td>
<td align="center">86.53</td>
</tr>
<tr>
<td align="left">Ours (CRC-Former)</td>
<td align="center">&#x2013; &#x2013;</td>
<td align="center">
<bold>87.42</bold>
</td>
<td align="center">
<bold>83.11</bold>
</td>
<td align="center">
<bold>82.84</bold>
</td>
<td align="center">
<bold>83.33</bold>
</td>
<td align="center">
<bold>89.67</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The best results are highlighted in bold.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>Specifically, CRC-Former attains an accuracy of 87.42%, surpassing the previous best (Eff-CTM, 86.30%) by &#x2b;1.12 percentage points. More importantly, it demonstrates superior discriminative power through its balanced high scores across all metrics: 83.11% F1 score, 82.84% precision, 83.33% recall, and 89.67% AUC. Notably, the model&#x2019;s high recall (83.33%) indicates strong sensitivity to malignant tissue regions&#x2014;including early adenomas and poorly differentiated carcinomas&#x2014;while its precision (82.84%) reflects reliable suppression of false positives (e.g., misclassifying inflamed or hyperplastic mucosa as cancer). The macro-averaged F1 score further confirms robustness across all four diagnostic classes, critical for real-world deployment where class imbalance is common. Compared to recent transformer-based approaches such as BiFormer (83.12% Acc) and GroupMixFormer (85.09% Acc), CRC-Former&#x2019;s consistent gains suggest that its hybrid design&#x2014;integrating frequency-aware attention and cross-scale state-space modeling&#x2014;is particularly effective at capturing the multi-scale, heterogeneous texture patterns characteristic of colorectal neoplasia. Unlike CNNs or pure transformers, which often prioritize either local edges or global context, CRC-Former explicitly decomposes tissue morphology into interpretable frequency subbands via Haar wavelets, then fuses them adaptively using Mamba&#x2019;s selective state-space mechanism. This enables the model to simultaneously resolve fine nuclear atypia (high-frequency bands) and glandular architectural distortion (low-frequency bands)&#x2014;features that are often missed by conventional architectures.</p>
<p>Moreover, the elevated 89.67% average AUC score (the ROC curves and AUC scores of the four specific classes in the dataset are shown in <xref ref-type="fig" rid="F5">Figure 5</xref>) underscores the model&#x2019;s ability to maintain high discriminative power across varying decision thresholds&#x2014;a key requirement for clinical screening systems aiming to minimize both false negatives (missed cancers) and false positives (unnecessary biopsies). When viewed holistically, CRC-Former&#x2019;s dominance across all metrics validates its capacity to generalize beyond simple patch-level classification: it learns clinically meaningful representations that align with pathologist reasoning&#x2014;balancing specificity, sensitivity, and interpretability.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>ROC curves and AUC values of the four different classes in the dataset. CRC-Former achieves strong performance across all four classes: Normal (AUC &#x3d; 0.9112), Serrated lesion (AUC &#x3d; 0.8155), Adenoma (AUC &#x3d; 0.8892), and Adenocarcinoma (AUC &#x3d; 0.9709).</p>
</caption>
<graphic xlink:href="fphys-17-1792357-g005.tif">
<alt-text content-type="machine-generated">Four side-by-side ROC curve charts compare classification performance for Normal, Serrated, Adenocarcinoma, and Adenoma categories. The area under the curve values are 0.9112, 0.8155, 0.9709, and 0.8892, respectively.</alt-text>
</graphic>
</fig>
<p>In summary, these results demonstrate that CRC-Former not only advances the state of the art in colorectal cancer classification but also offers a scalable, architecture-driven solution tailored to the unique challenges of gigapixel histopathology. Its integration of signal processing priors with modern sequence modeling opens new avenues for developing robust, efficient, and explainable AI tools for digital pathology.</p>
</sec>
<sec id="s4-4">
<label>4.4</label>
<title>Ablation study</title>
<p>To validate the effectiveness of our architectural components&#x2014;and specifically to assess whether attention in the frequency domain offers advantages over conventional spatial-domain sliding-window mechanisms&#x2014;we conduct an ablation study using the standard Swin Transformer Small (Swin-S) <xref ref-type="bibr" rid="B17">Liu et al. (2021)</xref> as the baseline. Swin-S employs spatially localized, window-based self-attention and serves as a strong representative of current state-of-the-art hierarchical vision backbones.</p>
<p>As shown in <xref ref-type="table" rid="T2">Table 2</xref>, the Swin-S baseline achieves 85.13% accuracy and 87.44% AUC on the Chaoyang dataset. Replacing its spatial sliding-window attention with our Frequency-aware Global&#x2013;Local Transformer Block (FGT)&#x2014;which applies orientation-specific sliding windows (<inline-formula id="inf76">
<mml:math id="m88">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf77">
<mml:math id="m89">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>) and multi-scale windows (<inline-formula id="inf78">
<mml:math id="m90">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf79">
<mml:math id="m91">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>) in the Haar wavelet subbands&#x2014;yields a notable improvement to 86.69% accuracy and 89.01% AUC (Model <inline-formula id="inf80">
<mml:math id="m92">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>). This gain demonstrates that decomposing features into frequency subbands and applying structure-aware attention within each band enables more discriminative modeling of histopathological textures (e.g., horizontal crypt alignment in <inline-formula id="inf81">
<mml:math id="m93">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, vertical stromal invasion in <inline-formula id="inf82">
<mml:math id="m94">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>) than uniform spatial windows. Separately, augmenting the Swin-S backbone with the Cross-Scale Mamba Block (CSM)&#x2014;which fuses features from all four stages via selective state-space modeling&#x2014;improves performance to 86.05% accuracy and 88.62% AUC (Model <inline-formula id="inf83">
<mml:math id="m95">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>), confirming the value of efficient, long-range cross-resolution context aggregation. The full CRC-Former, integrating both FGT and CSM, achieves the best results: 87.42% accuracy, 83.11% F1 score, and 89.67% AUC. Crucially, the consistent superiority of FGT over Swin-S provides direct evidence that frequency-domain sliding-window attention is more effective than its spatial counterpart for capturing the multi-orientation, multi-scale morphological signatures of colorectal neoplasia. The complementary gains from CSM further indicate that enhanced local representation must be coupled with global cross-scale reasoning to maximize diagnostic performance. This ablation study not only quantifies the contribution of each module but also establishes a key insight: leveraging wavelet-based frequency decomposition as an inductive bias for attention design leads to more pathology-aware feature learning than purely spatial mechanisms&#x2014;a finding with broader implications for vision transformers in the field of medical image analysis.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Ablation study of the proposed FGT and CSM modules in CRC-Former on the Chaoyang dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Model</th>
<th align="center">Swin</th>
<th align="center">FGT</th>
<th align="center">CSM</th>
<th align="center">Accuracy(%) <inline-formula id="inf84">
<mml:math id="m96">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">F1(%) <inline-formula id="inf85">
<mml:math id="m97">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">Precision(%) <inline-formula id="inf86">
<mml:math id="m98">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">Recall(%) <inline-formula id="inf87">
<mml:math id="m99">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">AUC(%) <inline-formula id="inf88">
<mml:math id="m100">
<mml:mrow>
<mml:mi>&#x2191;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Baseline</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2717;</td>
<td align="center">&#x2717;</td>
<td align="center">85.13</td>
<td align="center">80.29</td>
<td align="center">81.09</td>
<td align="center">78.93</td>
<td align="center">87.44</td>
</tr>
<tr>
<td align="left">
<inline-formula id="inf89">
<mml:math id="m101">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2717;</td>
<td align="center">&#x2713;</td>
<td align="center">86.05</td>
<td align="center">80.96</td>
<td align="center">81.91</td>
<td align="center">80.91</td>
<td align="center">88.62</td>
</tr>
<tr>
<td align="left">
<inline-formula id="inf90">
<mml:math id="m102">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2717;</td>
<td align="center">86.69</td>
<td align="center">81.70</td>
<td align="center">81.56</td>
<td align="center">82.47</td>
<td align="center">89.01</td>
</tr>
<tr>
<td align="left">CRC-Former</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">
<bold>87.42</bold>
</td>
<td align="center">
<bold>83.11</bold>
</td>
<td align="center">
<bold>82.84</bold>
</td>
<td align="center">
<bold>83.33</bold>
</td>
<td align="center">
<bold>89.67</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The best results are highlighted in bold.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec sec-type="discussion" id="s5">
<label>5</label>
<title>Discussion</title>
<p>CRC-Former holds significant clinical potential as an AI-powered decision support tool in colorectal cancer pathology: by delivering rapid, accurate, and interpretable classification of whole-slide images&#x2014;including critical precancerous (adenoma, serrated) and malignant (adenocarcinoma) lesions&#x2014;it can assist pathologists in reducing diagnostic variability, accelerating turnaround time, and improving early detection rates, particularly in settings with limited expert resources. Its consistent performance across lesion types and compatibility with digital pathology workflows position it as a scalable solution for standardizing CRC diagnosis and enhancing screening quality in real-world clinical practice. Our results demonstrate that CRC-Former achieves state-of-the-art performance in colorectal cancer classification on the Chaoyang histopathology dataset, significantly outperforming both CNN- and transformer-based baselines across all evaluation metrics. This success stems from a deliberate architectural shift: rather than treating histopathology images as generic visual data, we embed domain-specific priors&#x2014;namely, multi-scale texture heterogeneity and frequency-domain discriminability&#x2014;directly into the model&#x2019;s inductive bias. The integration of Haar wavelet decomposition with orientation-aware sliding-window attention (FGT) enables the model to resolve diagnostically critical patterns&#x2014;such as crypt distortion, nuclear pleomorphism, and stromal invasion&#x2014;that are often lost in spatial-only representations. Meanwhile, the Cross-Scale Mamba Block (CSM) provides an efficient mechanism for long-range contextual reasoning without the quadratic overhead of self-attention, making the architecture scalable to high-resolution pathology workflows. Notably, the ablation study provides compelling evidence that frequency-domain modeling is not merely an auxiliary enhancement but a core enabler of performance gains. 
The consistent superiority of FGT over Swin-S&#x2014;a strong spatial baseline&#x2014;validates our hypothesis that pathological textures are better characterized in spectral subbands than in raw pixel space. This insight challenges the prevailing paradigm in medical vision transformers, which largely operate in the spatial domain, and suggests that hybrid signal-processing&#x2013;deep-learning approaches may offer a more principled path toward clinically robust AI. From a clinical perspective, CRC-Former&#x2019;s balanced precision and recall (82.84% and 83.33%, respectively) indicate low rates of both false positives and false negatives&#x2014;critical for minimizing unnecessary biopsies and missed cancers. In summary, CRC-Former exemplifies a new design philosophy for computational pathology: one that unifies classical signal analysis with modern sequence modeling to build systems that are not only accurate but also efficient, interpretable, and aligned with medical domain knowledge.</p>
</sec>
<sec sec-type="conclusion" id="s6">
<label>6</label>
<title>Conclusion</title>
<p>We propose CRC-Former, a frequency-aware architecture for colorectal cancer classification in histopathology images. Departing from spatial-only attention, CRC-Former integrates Haar wavelet&#x2013;based multi-scale spectral attention (FGT block) and selective state-space modeling for cross-resolution fusion (CSM block). On the Chaoyang dataset, it achieves state-of-the-art accuracy (87.42%) and AUC (89.67%), outperforming CNNs, ViTs, and hybrids. Ablations confirm the benefit of frequency-domain modeling, demonstrating the value of wavelet-based inductive bias in medical vision. This work bridges signal processing and deep learning for more efficient, interpretable pathology AI.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>LC: Writing &#x2013; original draft. CL: Writing &#x2013; original draft. FM: Writing &#x2013; original draft. JT: Writing &#x2013; review and editing. KW: Writing &#x2013; review and editing.</p>
</sec>
<ack>
<title>Acknowledgements</title>
<p>The authors sincerely acknowledge the generous support from the General Surgery Center, the First Hospital of Jilin University, for providing the necessary equipment and funding for this project.</p>
</ack>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s11">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kingma</surname>
<given-names>D. P.</given-names>
</name>
<name>
<surname>Ba</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Adam: a method for stochastic optimization</article-title>. <source>arXiv Preprint arXiv:1412.6980</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1412.6980</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alzubaidi</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Humaidi</surname>
<given-names>A. J.</given-names>
</name>
<name>
<surname>Al-Dujaili</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Duan</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Al-Shamma</surname>
<given-names>O.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Review of deep learning: concepts, cnn architectures, challenges, applications, future directions</article-title>. <source>J. Big Data</source> <volume>8</volume>, <fpage>53</fpage>. <pub-id pub-id-type="doi">10.1186/s40537-021-00444-8</pub-id>
<pub-id pub-id-type="pmid">33816053</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Anusha</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Reddy</surname>
<given-names>D. S.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Fusion of classical and deep learning features with incremental learning for improved classification of lung and colon cancer</article-title>. <source>Sci. Rep.</source> <volume>15</volume>, <fpage>40894</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-025-24734-w</pub-id>
<pub-id pub-id-type="pmid">41258424</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Anwar</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Majid</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Qayyum</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Awais</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Alnowami</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Khan</surname>
<given-names>M. K.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Medical image analysis using convolutional neural networks: a review</article-title>. <source>J. Medical Systems</source> <volume>42</volume>, <fpage>226</fpage>. <pub-id pub-id-type="doi">10.1007/s10916-018-1088-1</pub-id>
<pub-id pub-id-type="pmid">30298337</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Attallah</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Lung and colon cancer classification using multiscale deep features integration of compact convolutional neural networks and feature selection</article-title>. <source>Technologies</source> <volume>13</volume>, <fpage>54</fpage>. <pub-id pub-id-type="doi">10.3390/technologies13020054</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Baeza-Yates</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ribeiro-Neto</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>1999</year>). <source>Modern information retrieval</source>, <volume>463</volume>. <publisher-loc>New York</publisher-loc>: <publisher-name>ACM press</publisher-name>.</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bradley</surname>
<given-names>A. P.</given-names>
</name>
</person-group> (<year>1997</year>). <article-title>The use of the area under the roc curve in the evaluation of machine learning algorithms</article-title>. <source>Pattern Recognition</source> <volume>30</volume>, <fpage>1145</fpage>&#x2013;<lpage>1159</lpage>. <pub-id pub-id-type="doi">10.1016/s0031-3203(96)00142-2</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cheema</surname>
<given-names>M. N.</given-names>
</name>
<name>
<surname>Nazir</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sheng</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Qin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>D. D.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Liver extraction using residual convolution neural networks from low-dose ct images</article-title>. <source>IEEE Trans. Biomed. Eng.</source> <volume>66</volume>, <fpage>2641</fpage>&#x2013;<lpage>2650</lpage>. <pub-id pub-id-type="doi">10.1109/TBME.2019.2894123</pub-id>
<pub-id pub-id-type="pmid">30668449</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Deng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Socher</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>L.-J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Fei-Fei</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2009</year>). &#x201c;<article-title>Imagenet: a large-scale hierarchical image database</article-title>,&#x201d; in <conf-name>2009 IEEE conference on computer vision and pattern recognition (Ieee)</conf-name>, <fpage>248</fpage>&#x2013;<lpage>255</lpage>.</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dosovitskiy</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>An image is worth 16x16 words: transformers for image recognition at scale</article-title>. <source>arXiv Preprint arXiv:2010.11929</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2010.11929</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fatema</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Mohammed</surname>
<given-names>E. A.</given-names>
</name>
<name>
<surname>Sehra</surname>
<given-names>S. S.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Taylor-series expanded kolmogorov-arnold network for medical imaging classification</article-title>. <source>arXiv Preprint arXiv:2509.13687</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2509.13687</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ge</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Tong</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Advancing vision transformers with group-mix attention</article-title>. <source>arXiv Preprint arXiv:2311.15157</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2311.15157</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gu</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Dao</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Mamba: linear-time sequence modeling with selective state spaces</article-title>. <source>arXiv Preprint arXiv:2312.00752</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2312.00752</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gu</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Goel</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>R&#xe9;</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Efficiently modeling long sequences with structured state spaces</article-title>. <source>arXiv Preprint arXiv:2111.00396</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2111.00396</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Haar</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>1909</year>). <article-title>
<italic>Zur Theorie der orthogonalen Funktionensysteme</italic> (Georg-August-Universit&#xe4;t, G&#xf6;ttingen)</article-title>.</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Deep residual learning for image recognition</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>, <fpage>770</fpage>&#x2013;<lpage>778</lpage>.</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). &#x201c;<article-title>Swin transformer: hierarchical vision transformer using shifted windows</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>, <fpage>10012</fpage>&#x2013;<lpage>10022</lpage>.</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Mao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>C.-Y.</given-names>
</name>
<name>
<surname>Feichtenhofer</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Darrell</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>A convnet for the 2020s</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, <fpage>11976</fpage>&#x2013;<lpage>11986</lpage>.</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>H.-Y.</given-names>
</name>
<name>
<surname>Xi</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Swin-umamba: mamba-based unet with imagenet-based pretraining</article-title>. <source>arXiv Preprint arXiv:2402.03302</source>. <pub-id pub-id-type="doi">10.1007/978-3-031-72114-4_59</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yue</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>An efficient medical image classification network based on multi-branch cnn, token grouping transformer and mixer mlp</article-title>. <source>Appl. Soft Comput.</source> <volume>153</volume>, <fpage>111323</fpage>. <pub-id pub-id-type="doi">10.1016/j.asoc.2024.111323</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Vmamba: visual state space model</article-title>. <source>Adv. Neural Information Processing Systems</source> <volume>37</volume>, <fpage>103031</fpage>&#x2013;<lpage>103063</lpage>. <pub-id pub-id-type="doi">10.52202/079017-3273</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lou</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>H.-Y.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Transxnet: learning both global and local dynamics with a dual dynamic token mixer for visual recognition</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst</source>. <pub-id pub-id-type="doi">10.1109/TNNLS.2025.3550979</pub-id>
<pub-id pub-id-type="pmid">40178959</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>U-mamba: enhancing long-range dependency for biomedical image segmentation</article-title>. <source>arXiv Preprint arXiv:2401.04722</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2401.04722</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Masood</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sheng</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Automated decision support system for lung cancer detection and classification via enhanced rfcn with multilayer fusion rpn</article-title>. <source>IEEE Trans. Industrial Inf.</source> <volume>16</volume>, <fpage>7791</fpage>&#x2013;<lpage>7801</lpage>. <pub-id pub-id-type="doi">10.1109/tii.2020.2972918</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Paszke</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gross</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Massa</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Lerer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bradbury</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chanan</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Pytorch: an imperative style, high-performance deep learning library</article-title>. <source>Adv. Neural Information Processing Systems</source> <volume>32</volume>. <pub-id pub-id-type="doi">10.48550/arXiv.1912.01703</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Powers</surname>
<given-names>D. M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Evaluation: from precision, recall and f-measure to roc, informedness, markedness and correlation</article-title>. <source>arXiv Preprint arXiv:2010.16061</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2010.16061</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ruan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Xiang</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Vm-unet: vision mamba unet for medical image segmentation</article-title>. <source>arXiv Preprint arXiv:2402.02491</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2402.02491</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sokolova</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Lapalme</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>A systematic analysis of performance measures for classification tasks</article-title>. <source>Inf. Processing and Management</source> <volume>45</volume>, <fpage>427</fpage>&#x2013;<lpage>437</lpage>. <pub-id pub-id-type="doi">10.1016/j.ipm.2009.03.002</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Tan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Efficientnet: rethinking model scaling for convolutional neural networks</article-title>,&#x201d; in <conf-name>International conference on machine learning</conf-name> (<publisher-loc>Long Beach, CA</publisher-loc>: <publisher-name>PMLR (Proceedings of Machine Learning Research)</publisher-name>), <fpage>6105</fpage>&#x2013;<lpage>6114</lpage>.</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>D.-P.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). &#x201c;<article-title>Pyramid vision transformer: a versatile backbone for dense prediction without convolutions</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>, <fpage>568</fpage>&#x2013;<lpage>578</lpage>.</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Xing</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Cai</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2025</year>). &#x201c;<article-title>Fsa-net: fractal-driven synergistic anatomy-aware network for segmenting white line of toldt in laparoscopic images</article-title>,&#x201d; in <conf-name>International Conference on Medical Image Computing and Computer-Assisted Intervention</conf-name> (<publisher-name>Springer</publisher-name>), <fpage>289</fpage>&#x2013;<lpage>299</lpage>.</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Overfitting remedy by sparsifying regularization on fully-connected layers of cnns</article-title>. <source>Neurocomputing</source> <volume>328</volume>, <fpage>69</fpage>&#x2013;<lpage>74</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2018.03.080</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Inceptionnext: when inception meets convnext</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, <fpage>5672</fpage>&#x2013;<lpage>5683</lpage>.</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yue</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Medmamba: vision mamba for medical image classification</article-title>. <source>arXiv Preprint arXiv:2403.03849</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2403.03849</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Peng</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Hard sample aware noise robust learning for histopathology image classification</article-title>. <source>IEEE Transactions on Medical Imaging</source> <volume>41</volume>, <fpage>881</fpage>&#x2013;<lpage>894</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2021.3125459</pub-id>
<pub-id pub-id-type="pmid">34735341</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ke</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Lau</surname>
<given-names>R. W.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Biformer: vision transformer with bi-level routing attention</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, <fpage>10323</fpage>&#x2013;<lpage>10333</lpage>.</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1105187/overview">Feng Gao</ext-link>, The Sixth Affiliated Hospital of Sun Yat-sen University, China</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3361290/overview">Zijin Zhang</ext-link>, Chinese Academy of Sciences (CAS), China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3361311/overview">Yiping Zhou</ext-link>, State Grid Information and Communications Industry Group Co., Ltd., China</p>
</fn>
</fn-group>
</back>
</article>