<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Environ. Sci.</journal-id>
<journal-title>Frontiers in Environmental Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Environ. Sci.</abbrev-journal-title>
<issn pub-type="epub">2296-665X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1648562</article-id>
<article-id pub-id-type="doi">10.3389/fenvs.2025.1648562</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Environmental Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>CoastVisionNet: transformer with integrated spatial-channel attention for coastal land cover classification</article-title>
<alt-title alt-title-type="left-running-head">Yang et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fenvs.2025.1648562">10.3389/fenvs.2025.1648562</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Yang</surname>
<given-names>Li</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Yijun</surname>
<given-names>Liu</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Deng</surname>
<given-names>Wenhao</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3103348/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>School of Integrated Circuits</institution>, <institution>Guangdong University of Technology</institution>, <addr-line>Guangzhou</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>School of Computer Science</institution>, <institution>Xi&#x0027;an University of Technology</institution>, <addr-line>Xi&#x0027;an</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1477641/overview">Changchun Huang</ext-link>, Nanjing Normal University, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2961357/overview">Yue Zhao</ext-link>, Xidian University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3108772/overview">Ahmed Gomaa</ext-link>, Egypt-Japan University of Science and Technology Faculty of Engineering, Egypt</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Liu Yijun, <email>ckfkoym511548@outlook.com</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>21</day>
<month>08</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>13</volume>
<elocation-id>1648562</elocation-id>
<history>
<date date-type="received">
<day>17</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>30</day>
<month>07</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Yang, Yijun and Deng.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Yang, Yijun and Deng</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>The rapid advancement of satellite sensing technologies and the growing need for high-resolution environmental intelligence have highlighted coastal land cover classification as a vital yet challenging task in remote sensing. Coastal zones, being highly dynamic and spatially heterogeneous, require sophisticated semantic modeling strategies that account for both spectral variability and spatial morphology. While traditional convolutional neural networks and fixed-resolution transformer models have made notable strides, they often struggle to generalize across varying topographies and spectral distributions. These limitations stem from rigid spatial encoding schemes, insufficient spectral differentiation, and a lack of dynamic reasoning capabilities.</p>
</sec>
<sec>
<title>Methods</title>
<p>To overcome these challenges, we introduce CoastVisionNet, a transformer-based framework with integrated spatial-channel attention tailored for coastal land cover classification. The system builds on a robust theoretical foundation and is structured around three components: a novel Spectral-Topographic Encoding Network (STEN) for dual-path spectral and morphological representation, a geometry-aware self-attention for cross-modal feature fusion, and a Spectrum-Guided Semantic Modulation (SGSM) strategy for adaptive inference. STEN captures fine-grained spectral gradients and terrain-aware vector fields, enabling the model to preserve topological and spectral consistency across heterogeneous coastal scenes. SGSM enhances generalization by incorporating spectrum-conditioned priors, uncertainty-aware regularization, and curriculum-based spectral reweighting.</p>
</sec>
<sec>
<title>Results</title>
<p>Extensive experiments on diverse coastal satellite datasets demonstrate that CoastVisionNet significantly outperforms existing baselines in classification accuracy, especially in out-of-distribution regions and under varying imaging conditions.</p>
</sec>
<sec>
<title>Discussion</title>
<p>Furthermore, the model exhibits high transferability across different sensors and temporal snapshots, making it well-suited for the complex, evolving nature of coastal environments. This work aligns strongly with emerging priorities in intelligent remote sensing, offering a scalable, interpretable, and physically grounded framework for next-generation coastal monitoring.</p>
</sec>
</abstract>
<kwd-group>
<kwd>coastal land cover classification</kwd>
<kwd>spectral-topographic fusion</kwd>
<kwd>spatial-channel attention</kwd>
<kwd>semantic modulation</kwd>
<kwd>remote sensing transformer</kwd>
</kwd-group>
<counts>
<page-count count="19"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Environmental Informatics and Remote Sensing</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Coastal zones play a pivotal role in both ecological sustainability and economic development, yet they are highly susceptible to environmental changes and human activities (<xref ref-type="bibr" rid="B31">Lv et al., 2024</xref>). Accurate classification of coastal land cover is not only essential for monitoring coastal erosion, habitat change, and urban expansion, but also supports coastal zone management and environmental planning (<xref ref-type="bibr" rid="B11">Chen et al., 2025</xref>). With the increasing availability of high-resolution satellite imagery, there is a growing demand for advanced computational methods that can effectively exploit both spatial and spectral information (<xref ref-type="bibr" rid="B17">Frost et al., 2025</xref>). Therefore, developing a robust and generalizable model for coastal land cover classification is not only necessary, but also timely. Such a model should be capable of adapting to diverse coastal settings, reducing classification noise, and enhancing the interpretability of results for practical applications (<xref ref-type="bibr" rid="B46">Touvron et al., 2021</xref>).</p>
<p>The development of coastal land cover classification has progressed through three major stages, each addressing specific limitations of its predecessors. Early coastal land cover classification relied heavily on rule-based systems and shallow statistical models such as support vector machines (SVMs), random forests (RF), and k-nearest neighbors (k-NN) (<xref ref-type="bibr" rid="B48">Wang et al., 2022</xref>). While these methods offered interpretability and modest success by using spectral indices and handcrafted features, they lacked flexibility and struggled to generalize across heterogeneous coastal landscapes (<xref ref-type="bibr" rid="B45">Tian et al., 2020</xref>). Their reliance on rigid threshold rules and limited spatial awareness often led to misclassification in regions with subtle spectral gradients or overlapping land cover types (<xref ref-type="bibr" rid="B51">Yang et al., 2021</xref>). This motivated a shift toward deep learning models, especially convolutional neural networks (CNNs), which introduced hierarchical feature extraction and improved spatial modeling (<xref ref-type="bibr" rid="B20">Hong et al., 2020</xref>). However, CNNs still suffer from limited receptive fields and difficulties in capturing long-range dependencies (<xref ref-type="bibr" rid="B41">Sun et al., 2022</xref>). To address these challenges, transformer-based architectures have recently emerged as powerful alternatives capable of modeling global contextual relationships and cross-spectral interactions, which are particularly important in coastal regions with complex spatial dynamics (<xref ref-type="bibr" rid="B37">Rao et al., 2021</xref>). With increasing emphasis on robust and scalable solutions, recent work has turned to end-to-end learning systems capable of jointly modeling spectral and spatial information (<xref ref-type="bibr" rid="B32">Mai et al., 2021</xref>). 
Deep learning models, especially convolutional neural networks (CNNs), have been successful in automatically learning multi-level features from raw imagery, capturing spatial hierarchies and complex spectral relationships (<xref ref-type="bibr" rid="B26">Li et al., 2020</xref>). Their success in image classification tasks made them suitable for remote sensing applications, including coastal land cover mapping (<xref ref-type="bibr" rid="B6">Bhojanapalli et al., 2021</xref>). However, standard CNNs have limitations in capturing long-range dependencies due to their local receptive fields. Recent developments in vision transformers (ViTs) offer a compelling alternative by modeling global context through self-attention mechanisms (<xref ref-type="bibr" rid="B3">Azizi et al., 2021</xref>). Nevertheless, vanilla transformers may overlook important local features (<xref ref-type="bibr" rid="B52">Zhang et al., 2020</xref>). To address this, we propose CoastVisionNet, a transformer-based architecture enhanced with integrated spatial-channel attention modules. These modules allow the model to simultaneously focus on meaningful spatial regions and spectral channels, enhancing feature discriminability and robustness (<xref ref-type="bibr" rid="B24">Kim et al., 2022</xref>). By fusing local and global contextual cues, CoastVisionNet bridges the gap between CNNs and transformers, offering a powerful framework for coastal land cover classification.</p>
<p>Based on the aforementioned limitations of early rule-based systems, statistical models, and conventional deep learning methods, we propose CoastVisionNet, a novel architecture that combines transformer-based global reasoning with spatial-channel attention mechanisms to achieve precise and interpretable coastal land cover classification. This approach is motivated by the need to capture both global context and fine-grained local details, which are essential for distinguishing among spectrally similar classes in heterogeneous coastal environments. By introducing integrated attention across spatial dimensions and spectral channels, our model enhances feature saliency and suppresses background noise, thus enabling more accurate boundary delineation and class discrimination. Furthermore, CoastVisionNet is designed to be lightweight and adaptable, making it suitable for large-scale and real-time coastal monitoring tasks. The model architecture is validated across multiple benchmark datasets, demonstrating consistent improvements over state-of-the-art baselines in terms of classification accuracy, spatial consistency, and computational efficiency.</p>
<p>The proposed method has several key advantages:<list list-type="simple">
<list-item>
<p>&#x2022; CoastVisionNet introduces a novel spatial-channel attention module within a transformer framework to enhance multi-dimensional feature representation.</p>
</list-item>
<list-item>
<p>&#x2022; The method integrates global and local information for improved generalization across diverse coastal regions, offering high accuracy, multi-scenario adaptability, and strong robustness to noise.</p>
</list-item>
<list-item>
<p>&#x2022; Experimental results show that CoastVisionNet outperforms existing CNN and transformer models in overall accuracy and boundary precision across three benchmark coastal datasets.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s2">
<title>2 Related work</title>
<sec id="s2-1">
<title>2.1 Transformer models in remote sensing</title>
<p>Transformer-based architectures have increasingly gained traction in remote sensing tasks due to their ability to model long-range dependencies and contextual relationships in spatial data (<xref ref-type="bibr" rid="B21">Hong et al., 2021</xref>; <xref ref-type="bibr" rid="B27">Li et al., 2025</xref>; <xref ref-type="bibr" rid="B10">Chen et al., 2024</xref>). Traditional convolutional neural networks (CNNs), while effective at capturing local patterns, often struggle with learning global representations, which are critical in analyzing high-resolution remote sensing imagery (<xref ref-type="bibr" rid="B38">Roy et al., 2022</xref>). Vision Transformers (ViTs), initially proposed for natural image classification, have been adapted for remote sensing tasks, demonstrating competitive or superior performance compared to CNN counterparts (<xref ref-type="bibr" rid="B23">Khan et al., 2020</xref>; <xref ref-type="bibr" rid="B42">Tanaka et al., 2023</xref>; <xref ref-type="bibr" rid="B19">He et al., 2024</xref>). One major adaptation involves the incorporation of hierarchical structures and locality inductive biases into transformer models to address the high computational cost and lack of inherent translation equivariance (<xref ref-type="bibr" rid="B57">Zhu et al., 2020</xref>). Other approaches like TransUNet integrate transformer encoders with CNN-based decoders, combining the global context modeling of transformers with the detailed spatial resolution capabilities of CNNs (<xref ref-type="bibr" rid="B9">Chen L. et al., 2021</xref>). In the context of land cover classification, transformers have shown notable performance in semantic segmentation tasks, where precise delineation of land types is required. Remote sensing datasets often encompass diverse and complex landscapes, making the modeling of global relationships essential for accurate classification (<xref ref-type="bibr" rid="B2">Ashtiani et al., 2021</xref>). 
Multi-scale and multi-modal transformers have been developed to leverage information from various spectral bands and resolutions, further improving classification accuracy (<xref ref-type="bibr" rid="B33">Masana et al., 2020</xref>). Furthermore, hybrid models that combine CNNs with transformers have been introduced to mitigate the limitations of pure transformer models in spatial feature extraction. These models often use CNNs to extract initial low-level features, followed by transformers to model the interrelations across spatial patches. This synergy has led to improved performance in tasks such as change detection, object detection, and land use classification. Research has also explored domain-specific adaptations, such as employing transformers for hyperspectral image classification, where the spectral dimension introduces additional complexity (<xref ref-type="bibr" rid="B39">Sheykhmousa et al., 2020</xref>). Transformers&#x2019; ability to handle sequential data makes them particularly suited for capturing spectral-spatial correlations. CoastVisionNet builds upon this trajectory by embedding a transformer backbone tailored for coastal land cover segmentation, suggesting the potential benefits of leveraging transformer architectures in domains characterized by complex spatial dynamics and heterogeneous features. The choice of transformers aligns with the growing trend of adopting attention-based models in remote sensing, particularly where global context and feature interactions significantly impact classification outcomes (<xref ref-type="bibr" rid="B56">Zheng et al., 2022</xref>).</p>
</sec>
<sec id="s2-2">
<title>2.2 Attention mechanisms for image segmentation</title>
<p>Attention mechanisms have revolutionized deep learning-based image segmentation by enhancing a model&#x2019;s ability to focus on relevant spatial and channel-wise features. In semantic segmentation, accurately classifying each pixel in an image necessitates the discrimination of subtle contextual differences across regions, a task well-suited for attention-enhanced architectures (<xref ref-type="bibr" rid="B34">Mascarenhas and Agarwal, 2021</xref>). Spatial attention mechanisms guide the model to emphasize significant regions in an image, effectively acting as a soft spatial mask. This is particularly useful in land cover classification, where certain areas, such as water bodies or vegetation, may occupy only a small portion of the image yet are crucial for accurate segmentation. Spatial attention enhances feature maps by weighting the importance of each spatial location based on its relevance to the task (<xref ref-type="bibr" rid="B53">Zhang et al., 2022</xref>). Channel attention, on the other hand, focuses on reweighting the importance of each feature channel. In convolutional neural networks, different channels encode different semantic information. Channel attention modules, such as the Squeeze-and-Excitation (SE) block, dynamically adjust the contribution of each channel, enabling the model to prioritize more informative features. This has been particularly useful in tasks requiring fine-grained recognition and classification (<xref ref-type="bibr" rid="B12">Dai and Gao, 2021</xref>). More advanced architectures combine spatial and channel attention to simultaneously refine spatial and semantic features. Dual attention mechanisms, like those used in the Dual Attention Network (DANet), allow for modeling both spatial dependencies and channel interrelationships, enhancing segmentation accuracy across complex scenes. 
Other techniques include attention gates in encoder-decoder frameworks, which selectively propagate information through the network hierarchy, improving feature localization (<xref ref-type="bibr" rid="B43">Taori et al., 2020</xref>). The integration of attention mechanisms with transformers further amplifies their benefits. Transformers inherently utilize self-attention, which models all pairwise interactions between elements, offering a holistic view of the input (<xref ref-type="bibr" rid="B36">Peng et al., 2022</xref>). However, integrating explicit spatial and channel attention modules allows for finer control over the learned features and enhances interpretability. In the context of CoastVisionNet, the incorporation of integrated spatial-channel attention modules is crucial. Coastal regions are characterized by high spatial heterogeneity and diverse land cover types, such as beaches, mangroves, urban zones, and agricultural fields. A combined attention approach enables the model to focus on salient spatial patterns and important feature channels that distinguish these categories. This dual attention strategy enhances the discriminative power of the network, leading to more accurate and context-aware segmentation results (<xref ref-type="bibr" rid="B4">Bazi et al., 2021</xref>).</p>
</sec>
<sec id="s2-3">
<title>2.3 Coastal land cover classification techniques</title>
<p>Coastal land cover classification is a critical component of environmental monitoring, urban planning, and disaster management (<xref ref-type="bibr" rid="B21">Hong et al., 2021</xref>). These regions are characterized by dynamic landscapes influenced by natural and anthropogenic factors, necessitating robust methods for accurate classification (<xref ref-type="bibr" rid="B15">Dong et al., 2022</xref>). Traditional classification approaches relied on pixel-based methods using spectral indices and machine learning algorithms such as support vector machines (SVMs) and random forests, often limited by their inability to capture spatial context (<xref ref-type="bibr" rid="B7">Chen C.-F. et al., 2021</xref>). With the advent of deep learning, convolutional neural networks (CNNs) have become the dominant approach for land cover classification, offering superior performance through hierarchical feature extraction (<xref ref-type="bibr" rid="B35">Maur&#xed;cio et al., 2023</xref>). However, these models often require large annotated datasets and may struggle with classifying small or irregularly shaped objects typical of coastal environments. Recent developments have introduced multi-scale and multi-temporal methods to address the temporal and spatial variability in coastal regions (<xref ref-type="bibr" rid="B30">Liu et al., 2024</xref>). These methods leverage time-series data to capture seasonal changes and long-term trends, enhancing the model&#x2019;s ability to distinguish between classes that exhibit similar spectral signatures but differ temporally. Incorporating elevation data and ancillary information, such as LiDAR or radar imagery, has further improved classification outcomes by providing additional contextual cues (<xref ref-type="bibr" rid="B29">Liu et al., 2023b</xref>). Moreover, object-based image analysis (OBIA) has emerged as an effective strategy, segmenting imagery into meaningful objects rather than individual pixels. 
OBIA combined with deep learning facilitates the integration of spatial, spectral, and contextual information, resulting in more coherent and accurate classifications. The use of remote sensing data from various platforms, including Sentinel-2, Landsat, and UAVs, offers diverse spatial and spectral resolutions, which can be exploited through data fusion techniques. These techniques merge information from multiple sources, enhancing the richness of input data and improving classification accuracy (<xref ref-type="bibr" rid="B28">Liu et al., 2023a</xref>). CoastVisionNet contributes to this evolving landscape by introducing a transformer-based model designed for coastal land cover classification. Its architecture incorporates spatial-channel attention mechanisms, tailored to the unique challenges of coastal environments (<xref ref-type="bibr" rid="B49">Wang et al., 2024</xref>; <xref ref-type="bibr" rid="B54">Zhao et al., 2022</xref>; <xref ref-type="bibr" rid="B14">Deng et al., 2024</xref>). This model addresses the limitations of prior approaches by capturing long-range dependencies, emphasizing relevant spatial regions, and adapting to the heterogeneous nature of coastal land types. The design of CoastVisionNet reflects a synthesis of advances in deep learning, attention mechanisms, and remote sensing, positioning it as a state-of-the-art solution for coastal land cover analysis.</p>
</sec>
</sec>
<sec sec-type="methods" id="s3">
<title>3 Methods</title>
<sec id="s3-1">
<title>3.1 Overview</title>
<p>Remote sensing has long served as a pivotal modality for a wide spectrum of scientific and industrial applications, ranging from environmental monitoring to urban planning, from agricultural forecasting to military reconnaissance. The remarkable advancements in sensor technology and the advent of high-resolution, multi-spectral, and temporally-rich satellite imagery have posed both immense opportunities and significant challenges for automated analysis. At the heart of these challenges lies the fundamental issue of developing scalable, generalizable, and semantically interpretable models that can effectively reason over spatial and spectral heterogeneity. This paper introduces a novel methodology designed to address these foundational requirements.</p>
<p>Our methodological framework is built around a unified architecture for remote sensing scene understanding, designed to integrate symbolic formalization, architectural novelty, and strategic data alignment. To this end, the following three components constitute the core contributions of this work: an abstract formulation of remote sensing interpretation under the umbrella of structured representation learning, a newly introduced model architecture that harnesses hierarchical representations for multi-scale spectral reasoning, and a domain-specific inference strategy that enables dynamic modulation of semantic priors and spectral dependencies. Together, these three pillars allow the framework to address major limitations in existing remote sensing pipelines, particularly in their scalability to unseen distributions, their rigidity in encoding multisource semantics, and their lack of adaptive reasoning mechanisms.</p>
<p>The first core component of the proposed approach lies in the formalization of the remote sensing problem space. Remote sensing imagery is intrinsically high-dimensional, temporally sparse, and spatially redundant. Furthermore, the semantic classes present in satellite images are entangled across scales and often exhibit intra-class variability and inter-class ambiguity. In <xref ref-type="sec" rid="s3-2">Section 3.2</xref>, we provide a comprehensive mathematical formalism of the remote sensing domain. This involves constructing the input-output space through rigorous notations of spectral vectors, semantic categories, spatial neighborhoods, and inter-band covariance. More critically, we establish a set of transformation invariances and stochastic process assumptions which guide the formulation of the underlying inference problem. These include band-shift invariance, translation equivariance, and latent topological priors&#x2014;all of which underpin the design of downstream modules. Informed by the abstract formulation, <xref ref-type="sec" rid="s3-3">Section 3.3</xref> introduces a new model, which we term Spectral-Topographic Encoding Network (STEN). Unlike conventional convolutional or attention-based approaches that operate in a fixed-resolution space, our model dynamically adapts to both spectral density and spatial topology. The model leverages a dual-path encoding scheme: one path captures local spectral gradient fields using multi-scale depth-wise convolutions, while the other path models topographic contours and edge distributions using a variational vector field decomposition. These two modalities are then fused through a geometry-aware self-attention mechanism that learns spectral co-occurrence patterns conditioned on topographic continuity. By decoupling the representation of spectral semantics and spatial morphology, STEN not only achieves better class separability but also significantly enhances generalization to out-of-distribution samples. 
Furthermore, the model includes a recursive encoding layer that iteratively refines feature maps based on residual inter-band entropy, a technique inspired by information bottleneck theory. The final component is presented in <xref ref-type="sec" rid="s3-4">Section 3.4</xref>, wherein we propose an inference strategy referred to as Spectrum-Guided Semantic Modulation (SGSM). The motivation behind SGSM stems from the observation that remote sensing categories are often not mutually exclusive but rather spectrum-dependent. For instance, urban infrastructure and barren land may share similar spectral signatures under certain illumination and seasonal conditions. SGSM introduces a context-sensitive inference pipeline that modulates semantic predictions via a learned spectrum-attention gate. This gate dynamically adjusts the decision boundaries based on inter-band correlation coefficients and ambient reflectance priors. The strategy also incorporates a curriculum-inspired mechanism for spectral augmentation, wherein the training regime selectively emphasizes hard-to-distinguish spectra during early epochs and gradually incorporates easier spectra as training stabilizes. SGSM integrates an uncertainty-aware regularization term in the optimization objective, which penalizes semantically inconsistent predictions across neighboring spectral bands. Through the combination of structured problem formulation, tailored model architecture, and domain-aware strategy, our method achieves state-of-the-art performance on multiple remote sensing benchmarks. These include land cover classification, scene parsing, and object segmentation tasks across a diverse set of satellite platforms and resolutions. More importantly, the unified framework facilitates transferability across different regions and imaging conditions, a crucial property for real-world deployment.</p>
</sec>
<sec id="s3-2">
<title>3.2 Preliminaries</title>
<p>Remote sensing data encapsulate a highly structured and hierarchical form of information, composed of spectral, spatial, and temporal components. To formulate our methodology rigorously, we first provide a formal mathematical description of the remote sensing problem space. Our objective is to establish a unified symbolic foundation that guides the construction of learning objectives, model architectures, and semantic strategies. This section introduces a notational framework for remote sensing image representation, defines key invariances and structural assumptions, and builds a structured inference formulation for downstream semantic tasks such as classification, segmentation, and scene interpretation.</p>
<p>Let <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mi mathvariant="script">I</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mo>&#x2192;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denote a remote sensing image defined on a spatial domain <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mo>&#x2282;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">Z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where each pixel <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is associated with a <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-dimensional spectral vector <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. Each element <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> of <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> corresponds to the reflectance value in the <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th spectral band.</p>
<p>We define a global image tensor as <xref ref-type="disp-formula" rid="e1">Equation 1</xref>:<disp-formula id="e1">
<mml:math id="m9">
<mml:mrow>
<mml:mi mathvariant="script">T</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x2223;</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where <inline-formula id="inf9">
<mml:math id="m10">
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf10">
<mml:math id="m11">
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denote the image height and width, respectively.</p>
<p>To encode the local spatial context, we consider a square neighborhood <inline-formula id="inf11">
<mml:math id="m12">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> of radius <inline-formula id="inf12">
<mml:math id="m13">
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> centered at pixel <inline-formula id="inf13">
<mml:math id="m14">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> <xref ref-type="disp-formula" rid="e2">Equation 2</xref>:<disp-formula id="e2">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mo stretchy="false">&#x2223;</mml:mo>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>x</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x221e;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>The concatenated spatial-spectral neighborhood feature of pixel <inline-formula id="inf14">
<mml:math id="m16">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is defined as <xref ref-type="disp-formula" rid="e3">Equation 3</xref>:<disp-formula id="e3">
<mml:math id="m17">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>vec</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>r</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
<p>Let <inline-formula id="inf15">
<mml:math id="m18">
<mml:mrow>
<mml:mi mathvariant="script">Y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> be the set of semantic categories such as vegetation, water, urban, and barren land. Each pixel <inline-formula id="inf16">
<mml:math id="m19">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is associated with a (possibly latent) label <inline-formula id="inf17">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="script">Y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>Define the decision function as <xref ref-type="disp-formula" rid="e4">Equation 4</xref>:<disp-formula id="e4">
<mml:math id="m21">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>r</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2192;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>where <inline-formula id="inf18">
<mml:math id="m22">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes the set of learnable parameters, and <inline-formula id="inf19">
<mml:math id="m23">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is the <inline-formula id="inf20">
<mml:math id="m24">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-dimensional probability simplex <xref ref-type="disp-formula" rid="e5">Equation 5</xref>:<disp-formula id="e5">
<mml:math id="m25">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mi mathvariant="bold">p</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mspace width="0.3333em"/>
<mml:mo stretchy="false">&#x2223;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<p>Spectral vectors of homogeneous land cover regions lie on low-dimensional manifolds embedded in <inline-formula id="inf21">
<mml:math id="m26">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. Formally, for a semantic class <inline-formula id="inf22">
<mml:math id="m27">
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="script">Y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, there exists a manifold <inline-formula id="inf23">
<mml:math id="m28">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2282;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> such that <xref ref-type="disp-formula" rid="e6">Equation 6</xref>:<disp-formula id="e6">
<mml:math id="m29">
<mml:mrow>
<mml:mo>&#x2200;</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>c</mml:mi>
<mml:mo>&#x21d2;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3f5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>where <inline-formula id="inf24">
<mml:math id="m30">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3f5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x223c;</mml:mo>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represents Gaussian perturbation due to noise and atmospheric distortion.</p>
<p>Let <inline-formula id="inf25">
<mml:math id="m31">
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x22c3;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> be the global spectral manifold. Then, the embedding function <inline-formula id="inf26">
<mml:math id="m32">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> maps <inline-formula id="inf27">
<mml:math id="m33">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to a latent representation <inline-formula id="inf28">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> such that <xref ref-type="disp-formula" rid="e7">Equation 7</xref>:<disp-formula id="e7">
<mml:math id="m35">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
<mml:mo>:</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2192;</mml:mo>
<mml:mi mathvariant="script">H</mml:mi>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi mathvariant="script">H</mml:mi>
<mml:mo>&#x2282;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>d</mml:mi>
<mml:mo>&#x226a;</mml:mo>
<mml:mi>B</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>and ideally preserves geodesic distances <xref ref-type="disp-formula" rid="e8">Equation 8</xref>:<disp-formula id="e8">
<mml:math id="m36">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>dist</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="script">H</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2248;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mtext>dist</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>
</p>
<p>Let <inline-formula id="inf29">
<mml:math id="m37">
<mml:mrow>
<mml:mi mathvariant="bold">C</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denote the spectral covariance matrix over the entire image <xref ref-type="disp-formula" rid="e9">Equation 9</xref>:<disp-formula id="e9">
<mml:math id="m38">
<mml:mrow>
<mml:mi mathvariant="bold">C</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x22a4;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>where <inline-formula id="inf30">
<mml:math id="m39">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the mean spectrum.</p>
<p>We define a redundancy penalty operator as <xref ref-type="disp-formula" rid="e10">Equation 10</xref>:<disp-formula id="e10">
<mml:math id="m40">
<mml:mrow>
<mml:mi mathvariant="script">R</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">C</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2260;</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:mfenced open="|" close="|">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>to capture redundant information among spectral bands.</p>
<p>A key property of remote sensing is spatial translation equivariance <xref ref-type="disp-formula" rid="e11">Equation 11</xref>:<disp-formula id="e11">
<mml:math id="m41">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mspace width="1em"/>
<mml:mo>&#x2200;</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">Z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;when&#x2009;</mml:mtext>
<mml:mi mathvariant="script">T</mml:mi>
<mml:mtext>&#x2009;is&#x2009;homogeneous.</mml:mtext>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>
</p>
<p>This justifies the application of convolutional or locally-shared architectures. However, edge effects and topographic variance often violate this assumption locally, motivating the use of adaptive filters.</p>
</sec>
<sec id="s3-3">
<title>3.3 Spectral-topographic encoding network (STEN)</title>
<p>We present the Spectral-Topographic Encoding Network (STEN), a hybrid architecture that integrates spectral analysis and topographic pattern learning to enhance semantic representation of multi-band imaging data. STEN is built upon three core innovations: a residual spectral encoder for capturing cross-band dependencies, a differential topographic encoder to extract spatial-geometric cues, and a transformer-based fusion mechanism that aligns the heterogeneous modalities for robust feature learning.</p>
<p>The architecture follows a four-stage hierarchical structure with progressively reduced spatial resolution and increased channel dimensionality. Each stage includes patch embedding, convolutional blocks, and residual connections. For better clarity and visual alignment, the overall architectural pipeline, including the stage-wise spatial and channel dimensions, is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. The detailed internal structure of the STEN module is illustrated in <xref ref-type="fig" rid="F2">Figure 2</xref>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Schematic diagram of the proposed Spectral-Topographic Encoding Network (STEN). The architecture integrates multi-stage components: EEG preprocessing, spectral encoding with residual filtering, topographic encoding using differential vector gradients, and fusion via a cross-modality transformer. It includes signal embedding, convolutional extraction, and recursive refinement with entropy guidance to enable robust spectral-topographic representation for downstream classification or segmentation tasks.</p>
</caption>
<graphic xlink:href="fenvs-13-1648562-g001.tif">
<alt-text content-type="machine-generated">Diagram illustrating a neural network architecture for processing EEG data, divided into three sections. The left section shows steps from raw EEG input to predicted noise, including differential topographic encoding and layer normalization. The middle section includes convolution layers and a CBAM block. The right section features a DiTBlock with components like scale, layer normalization, and cross-modality fusion transformer. Each section contains labeled processes and connections.</alt-text>
</graphic>
</fig>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Overview of the Spectral-Topographic Encoding Network (STEN). The pipeline consists of multiple stages including patch embedding, residual spectral encoding, differential topographic encoding, cross-modal transformer fusion, and the final classification head. The input and output dimensions at each stage are annotated to clarify the spatial and spectral transformations across the network. This diagram highlights the coarse-to-fine structural flow and the progressive feature refinement throughout the network.</p>
</caption>
<graphic xlink:href="fenvs-13-1648562-g002.tif">
<alt-text content-type="machine-generated">Diagram illustrating a machine learning model. An image of an elephant is processed through an Image Encoder, outputting dimensions 196 by 1024. Simultaneously, the text &#x22;a photo of an elephant&#x22; is processed through a Text Encoder, also resulting in 1024 dimensions. Both outputs are combined in a Cross-Modal Transformer, producing dimensions 196 by 512, which are then processed by an MLP to produce an output of 196 dimensions.</alt-text>
</graphic>
</fig>
<sec id="s3-3-1">
<title>3.3.1 Residual spectral encoding</title>
<p>The task of capturing the spectral variations across different spatial locations in high-dimensional input data is crucial for understanding the underlying patterns within multi-band imagery (as shown in <xref ref-type="fig" rid="F3">Figure 3</xref>).</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Schematic diagram of the Residual Spectral Encoding module. The encoder comprises four hierarchical stages, each consisting of patch embedding, convolutional blocks, and residual connections. The spatial resolution is progressively reduced while the channel dimension increases, enabling efficient multi-level spectral feature extraction. The final embedding supports robust representation learning for classification tasks.</p>
</caption>
<graphic xlink:href="fenvs-13-1648562-g003.tif">
<alt-text content-type="machine-generated">Diagram of Residual Spectral Encoding showcasing a neural network architecture. It has an input of size 224x224 processed through four stages. Each stage includes a Patch Embed layer followed by Conv Blocks. The output sizes and channels reduce from 56x56, C=72 in Stage 1 to 7x7, C=576 in Stage 4, ending with a classification layer.</alt-text>
</graphic>
</fig>
<p>Given the input tensor <inline-formula id="inf31">
<mml:math id="m42">
<mml:mrow>
<mml:mi mathvariant="script">T</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf32">
<mml:math id="m43">
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf33">
<mml:math id="m44">
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> represent the height and width of the image, and <inline-formula id="inf34">
<mml:math id="m45">
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes the number of spectral bands, we aim to capture fine-grained spectral features while preserving spatial coherence. To achieve this, we employ a depth-wise residual filtering technique, which has proven effective in extracting local spectral patterns while maintaining computational efficiency. We perform residual filtering across each spectral band, as detailed by the following recursive formulation <xref ref-type="disp-formula" rid="e12">Equation 12</xref>:<disp-formula id="e12">
<mml:math id="m46">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msubsup>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2217;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>L</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>where <inline-formula id="inf35">
<mml:math id="m47">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> represents the depth-wise filter applied to the <inline-formula id="inf36">
<mml:math id="m48">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th spectral band at layer <inline-formula id="inf37">
<mml:math id="m49">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf38">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the extraction function for the <inline-formula id="inf39">
<mml:math id="m51">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th band at spatial location <inline-formula id="inf40">
<mml:math id="m52">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf41">
<mml:math id="m53">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes a nonlinear activation function, typically ReLU or LeakyReLU, to introduce nonlinearity into the network. The depth-wise convolution allows for more efficient computation by operating independently on each spectral band, ensuring that the model captures both spectral correlations and spatial dependencies with minimal computational overhead. This recursive process allows the network to refine feature representations at each layer, building increasingly abstract features that better capture the spectral nuances across the image.</p>
<p>This multi-layered approach enables the model to progressively refine the spectral information, accounting for both local and global spectral dependencies. The final spectral embedding, <inline-formula id="inf42">
<mml:math id="m54">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, for each spatial coordinate <inline-formula id="inf43">
<mml:math id="m55">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is obtained by concatenating the features across all layers of the spectral encoder (<xref ref-type="disp-formula" rid="e13">Equation 13</xref>):<disp-formula id="e13">
<mml:math id="m56">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>Concat</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>where <inline-formula id="inf44">
<mml:math id="m57">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the cumulative dimensionality of the spectral representation after concatenation. This spectral embedding <inline-formula id="inf45">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> captures the rich spectral variation at each spatial location, enabling subsequent modules to utilize these embeddings for more advanced tasks, such as classification or segmentation. Importantly, the residual connections within the depth-wise filtering process help mitigate the vanishing gradient problem, ensuring that deeper layers can retain important low-level features while learning more abstract high-level representations. The use of residual connections not only aids in training deeper models but also facilitates the preservation of low-level features that are crucial for discriminating between similar spectral patterns, such as those encountered among spectrally similar coastal land cover classes. These residual connections also improve the stability of the network by allowing gradient flow through the network layers without significant loss of information. Furthermore, by maintaining a hierarchical structure of spectral representations, the network is better equipped to handle spectral variations caused by noise, changes in imaging conditions, or other real-world challenges commonly encountered in remote sensing tasks.</p>
</sec>
<sec id="s3-3-2">
<title>3.3.2 Differential topographic encoding</title>
<p>To effectively capture the spatial morphology of an image, we leverage a differential approach to encoding topographic structures using vector field gradients and geometric invariants. The primary goal of this encoding process is to represent local and global terrain characteristics such as edges, contours, and texture gradients that are crucial for understanding spatial relationships in images.</p>
<p>For a given spatial coordinate <inline-formula id="inf46">
<mml:math id="m59">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> within the image, we define the local gradient field <inline-formula id="inf47">
<mml:math id="m60">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as the set of differences between the feature values at <inline-formula id="inf48">
<mml:math id="m61">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and its neighboring pixels <inline-formula id="inf49">
<mml:math id="m62">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf50">
<mml:math id="m63">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes the immediate neighborhood of pixel <inline-formula id="inf51">
<mml:math id="m64">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. This vector field captures the rate of change in the spatial features (<xref ref-type="disp-formula" rid="e14">Equation 14</xref>):<disp-formula id="e14">
<mml:math id="m65">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x2223;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>where <inline-formula id="inf52">
<mml:math id="m66">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf53">
<mml:math id="m67">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represent the feature values at the spatial locations <inline-formula id="inf54">
<mml:math id="m68">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf55">
<mml:math id="m69">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, respectively, and <inline-formula id="inf56">
<mml:math id="m70">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the gradient between them. This gradient field reflects how rapidly and in which direction the image features (e.g., texture, elevation, color) are changing around each point, similar to the slope of terrain in topographic maps. It helps capture object boundaries, edge directions, and subtle transitions across regions. To further characterize the spatial structure and capture more complex geometric properties of the image, we compute the divergence and curl of the gradient field. The divergence represents the net &#x201c;outflow&#x201d; of the feature signal at each pixel <inline-formula id="inf57">
<mml:math id="m71">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and can be thought of as a measure of local expansion or compression in the image. It is computed by summing, over all neighbors, the dot product of the gradient <inline-formula id="inf58">
<mml:math id="m72">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and a unit vector <inline-formula id="inf59">
<mml:math id="m73">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">u</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> along the direction from <inline-formula id="inf60">
<mml:math id="m74">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to its neighbor <inline-formula id="inf61">
<mml:math id="m75">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="disp-formula" rid="e15">Equation 15</xref>):<disp-formula id="e15">
<mml:math id="m76">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>div</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">u</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(15)</label>
</disp-formula>where <inline-formula id="inf62">
<mml:math id="m77">
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes the dot product. Intuitively, a large positive divergence indicates that feature values are spreading out from the center (e.g., a sandy beach fanning out), while a negative value implies convergence (e.g., contours enclosing a dense object). This helps highlight blob-like structures or areas with concentrated intensity. The curl, on the other hand, captures the rotational tendency of the local gradient field. It measures how much the feature vectors tend to circulate around a point (<xref ref-type="disp-formula" rid="e16">Equation 16</xref>):<disp-formula id="e16">
<mml:math id="m78">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>curl</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2033;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="script">C</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:mi>det</mml:mi>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2033;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(16)</label>
</disp-formula>where <inline-formula id="inf63">
<mml:math id="m79">
<mml:mrow>
<mml:mi mathvariant="script">C</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes the set of all possible oriented cycles around the pixel <inline-formula id="inf64">
<mml:math id="m80">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and the determinant of the two gradient vectors provides a scalar estimate of local rotation. This is useful for detecting circular or curvilinear structures&#x2014;such as water eddies, sand dunes, or curved roads&#x2014;and enhances the model&#x2019;s ability to identify objects with rotational symmetry or loop-like boundaries. To summarize the topographic characteristics at each spatial coordinate <inline-formula id="inf65">
<mml:math id="m81">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, we combine the computed divergence, curl, and the magnitude of the gradient, <inline-formula id="inf66">
<mml:math id="m82">
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, along with orientation information <inline-formula id="inf67">
<mml:math id="m83">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> derived from the principal direction of the gradient. These features are then passed through a nonlinear function <inline-formula id="inf68">
<mml:math id="m84">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, typically a multilayer perceptron (MLP), to generate a compact topographic descriptor (<xref ref-type="disp-formula" rid="e17">Equation 17</xref>):<disp-formula id="e17">
<mml:math id="m85">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>div</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mtext>curl</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(17)</label>
</disp-formula>
</p>
<p>Here, <inline-formula id="inf69">
<mml:math id="m86">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> encapsulates the geometric and structural properties of the local neighborhood. It encodes the &#x201c;flow,&#x201d; &#x201c;rotation,&#x201d; and &#x201c;directionality&#x201d; of features at each location&#x2014;akin to how a human perceives shape and texture transitions. This enriched topographic signal is then fused with spectral features, providing the model with a comprehensive understanding of spatial geometry and object layout.</p>
</sec>
<sec id="s3-3-3">
<title>3.3.3 Cross-modality fusion transformer</title>
<p>The fusion of spectral and topographic features is a critical step in enhancing the representation of both spatial morphology and spectral semantics. In this process, the topographic features are used to query the spectral features, while the spectral features are utilized for both the key and value components in the attention mechanism. This allows for adaptive alignment of the two types of information. The query, key, and value projections are defined as follows (<xref ref-type="disp-formula" rid="e18">Equation 18</xref>):<disp-formula id="e18">
<mml:math id="m87">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(18)</label>
</disp-formula>where <inline-formula id="inf70">
<mml:math id="m88">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf71">
<mml:math id="m89">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf72">
<mml:math id="m90">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the query, key, and value vectors, respectively, generated by linearly transforming the topographic and spectral embeddings using the learned weight matrices <inline-formula id="inf73">
<mml:math id="m91">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf74">
<mml:math id="m92">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf75">
<mml:math id="m93">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. These transformations ensure that the attention mechanism can effectively capture cross-modal correlations.</p>
<p>The attention score between the query <inline-formula id="inf76">
<mml:math id="m94">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and the key <inline-formula id="inf77">
<mml:math id="m95">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> for a neighboring spatial location <inline-formula id="inf78">
<mml:math id="m96">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is computed using the softmax-normalized scaled dot-product formula (<xref ref-type="disp-formula" rid="e19">Equation 19</xref>):<disp-formula id="e19">
<mml:math id="m97">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>exp</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2033;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>exp</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2033;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(19)</label>
</disp-formula>where <inline-formula id="inf79">
<mml:math id="m98">
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes the dot product, <inline-formula id="inf80">
<mml:math id="m99">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the dimension of the key vectors, and <inline-formula id="inf81">
<mml:math id="m100">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes the set of neighboring locations around <inline-formula id="inf82">
<mml:math id="m101">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. The attention score <inline-formula id="inf83">
<mml:math id="m102">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> reflects the relevance of the spectral feature at <inline-formula id="inf84">
<mml:math id="m103">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> with respect to the topographic feature at <inline-formula id="inf85">
<mml:math id="m104">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>Once the attention weights are computed, the fused feature representation at each spatial location is obtained by taking a weighted sum of the value vectors <inline-formula id="inf86">
<mml:math id="m105">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> from the neighboring pixels (<xref ref-type="disp-formula" rid="e20">Equation 20</xref>):<disp-formula id="e20">
<mml:math id="m106">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(20)</label>
</disp-formula>where <inline-formula id="inf87">
<mml:math id="m107">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the fused feature, capturing the combined influence of both spectral and topographic features. The fused features are then concatenated with the original spectral and topographic descriptors and passed through an MLP to further refine the representation. The updated representation <inline-formula id="inf88">
<mml:math id="m108">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">Z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is calculated as follows (<xref ref-type="disp-formula" rid="e21">Equation 21</xref>):<disp-formula id="e21">
<mml:math id="m109">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">Z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mtext>MLP</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(21)</label>
</disp-formula>where <inline-formula id="inf89">
<mml:math id="m110">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the original spectral descriptor, and the residual connection ensures that the spectral information is preserved during the fusion process.</p>
<p>To further enhance the quality of the fused representation, we introduce an entropy-guided recursive refinement process. This refinement process emphasizes informative features while filtering out redundant or noisy patterns, which is particularly important in real-world data with varying quality and artifacts. We first compute the local spectral entropy <inline-formula id="inf90">
<mml:math id="m111">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which measures the uncertainty or unpredictability of the spectral distribution at each pixel (<xref ref-type="disp-formula" rid="e22">Equation 22</xref>):<disp-formula id="e22">
<mml:math id="m112">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
<mml:mi>log</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(22)</label>
</disp-formula>where <inline-formula id="inf91">
<mml:math id="m113">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> represents the value of the <inline-formula id="inf92">
<mml:math id="m114">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th spectral component at pixel <inline-formula id="inf93">
<mml:math id="m115">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and the sum is over all spectral bands. The entropy <inline-formula id="inf94">
<mml:math id="m116">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> serves as a measure of the diversity in the spectral information at <inline-formula id="inf95">
<mml:math id="m117">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, with higher entropy indicating more uncertainty.</p>
<p>Next, we reweight the fused descriptor <inline-formula id="inf96">
<mml:math id="m118">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">Z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> using a sigmoid function applied to the entropy value (<xref ref-type="disp-formula" rid="e23">Equation 23</xref>):<disp-formula id="e23">
<mml:math id="m119">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">Z</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">Z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>&#x3b3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>sigmoid</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(23)</label>
</disp-formula>where <inline-formula id="inf97">
<mml:math id="m120">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf98">
<mml:math id="m121">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are learned parameters, and <inline-formula id="inf99">
<mml:math id="m122">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represents the attention factor that modulates the influence of each pixel based on its spectral entropy. To refine the representation over multiple iterations, we apply recursive updates to the descriptor (<xref ref-type="disp-formula" rid="e24">Equation 24</xref>):<disp-formula id="e24">
<mml:math id="m123">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">Z</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">Z</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3c8;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">Z</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">Z</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(24)</label>
</disp-formula>
</p>
<p>Here, <inline-formula id="inf100">
<mml:math id="m124">
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a spectral-gated context aggregator that updates the refined descriptor <inline-formula id="inf101">
<mml:math id="m125">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">Z</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> at each step, incorporating contextual information from neighboring pixels. This recursive refinement process helps to enhance the feature quality and reduce noise, leading to a more accurate and reliable fused representation, which is essential for downstream tasks such as classification or segmentation.</p>
<p>The architecture of the attention module adopts a standard cross-attention mechanism enhanced for spectral-spatial fusion. Let <inline-formula id="inf102">
<mml:math id="m126">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denote the topographic embedding at location <inline-formula id="inf103">
<mml:math id="m127">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf104">
<mml:math id="m128">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denote the spectral embedding. The attention module computes the query, key, and value projections (<xref ref-type="disp-formula" rid="e25">Equation 25</xref>):<disp-formula id="e25">
<mml:math id="m129">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(25)</label>
</disp-formula>where <inline-formula id="inf105">
<mml:math id="m130">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf106">
<mml:math id="m131">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are learned projection matrices. The attention score between pixel <inline-formula id="inf107">
<mml:math id="m132">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and its neighbor <inline-formula id="inf108">
<mml:math id="m133">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is calculated via the scaled dot product (<xref ref-type="disp-formula" rid="e26">Equation 26</xref>):<disp-formula id="e26">
<mml:math id="m134">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>exp</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2033;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>exp</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2033;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(26)</label>
</disp-formula>The fused representation is then computed as <xref ref-type="disp-formula" rid="e27">Equation 27</xref>:<disp-formula id="e27">
<mml:math id="m135">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(27)</label>
</disp-formula>This intermediate output <inline-formula id="inf109">
<mml:math id="m136">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is concatenated with the original descriptors and passed through an MLP with a residual connection (<xref ref-type="disp-formula" rid="e28">Equation 28</xref>):<disp-formula id="e28">
<mml:math id="m137">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>MLP</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(28)</label>
</disp-formula>
</p>
<p>The residual link ensures spectral consistency. The attention module is repeated across layers and supports multi-head extension if needed.</p>
<p>To quantify the efficiency of the proposed STEN module, we calculate its computational complexity in terms of floating-point operations (FLOPs), as reported in <xref ref-type="table" rid="T1">Table 1</xref>. Under an input resolution of <inline-formula id="inf110">
<mml:math id="m138">
<mml:mrow>
<mml:mn>224</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>224</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> with 13 spectral bands, STEN introduces a total of 3.72 GFLOPs, comprising 1.92 GFLOPs for the spectral gradient path and 1.80 GFLOPs for the topographic descriptor path. This accounts for only 7.4% of the full model&#x2019;s 50.3 GFLOPs, confirming that the added geometric encoding comes at a modest computational cost. Compared with Swin-Unet and SpectralFormer, which require 54.8 and 58.1 GFLOPs, respectively, CoastVisionNet maintains competitive efficiency while providing enhanced spatial reasoning.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Efficiency comparison of CoastVisionNet and transformer-based models.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Model/Module</th>
<th align="center">Params (M)</th>
<th align="center">FLOPs (G)</th>
<th align="center">Inference time (ms)</th>
<th align="center">STEN contribution (%)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Swin-Unet</td>
<td align="center">54.9</td>
<td align="center">54.8</td>
<td align="center">44.3</td>
<td align="center">0.0</td>
</tr>
<tr>
<td align="left">SpectralFormer</td>
<td align="center">52.6</td>
<td align="center">58.1</td>
<td align="center">47.1</td>
<td align="center">0.0</td>
</tr>
<tr>
<td align="left">CoastVisionNet (Full)</td>
<td align="center">47.2</td>
<td align="center">50.3</td>
<td align="center">32.5</td>
<td align="center">7.4</td>
</tr>
<tr>
<td align="left">STEN: Spectral Gradient Path</td>
<td align="center">2.1</td>
<td align="center">1.92</td>
<td align="center">4.2</td>
<td align="center">3.8</td>
</tr>
<tr>
<td align="left">STEN: Topographic Descriptor</td>
<td align="center">1.8</td>
<td align="center">1.80</td>
<td align="center">3.9</td>
<td align="center">3.6</td>
</tr>
<tr>
<td align="left">STEN Total</td>
<td align="center">3.9</td>
<td align="center">3.72</td>
<td align="center">8.1</td>
<td align="center">7.4</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s3-4">
<title>3.4 Spectrum-guided semantic modulation (SGSM)</title>
<p>Remote sensing imagery presents profound spectral diversity and spatial ambiguity, often exacerbated by domain shifts across sensors, seasons, and geographies. To address this, we propose a Spectrum-Guided Semantic Modulation (SGSM) strategy that dynamically adjusts STEN&#x2019;s internal behavior during training and inference. SGSM achieves adaptivity through three core mechanisms: spectral prior encoding, uncertainty-aware semantic gating, and a spectrum-driven curriculum scheduler (as shown in <xref ref-type="fig" rid="F4">Figure 4</xref>).</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Schematic diagram of the Spectrum-Guided Semantic Modulation (SGSM) module. The architecture integrates spectral prior encoding, uncertainty-aware semantic gating, and a spectrum-driven curriculum scheduler. It begins with layer normalization and prior-guided encoding, followed by semantic calibration using depth-wise convolution and attention-weighted modulation. The process enhances spectral adaptivity and semantic reliability under uncertain or domain-shifted input conditions.</p>
</caption>
<graphic xlink:href="fenvs-13-1648562-g004.tif">
<alt-text content-type="machine-generated">Flowchart of a deep learning process, showing the sequence of operations: LN (Layer Normalization), Spectral Prior Encoding, Uncertainty-Aware Semantic Gating, Split, Reshape, DW Conv (Depthwise Convolution), Flatten, and Spectral Curriculum Scheduling. Operations include element-wise multiplication, matrix multiplication, and element-wise addition, with data dimensions indicated at various stages.</alt-text>
</graphic>
</fig>
<p>It is important to note that both the Spectral-Topographic Encoding Network (STEN) and the Spectrum-Guided Semantic Modulation (SGSM) modules are employed during both training and inference. STEN operates as the core feature extraction backbone in all phases, while SGSM is integrated into the prediction head to refine outputs via spectral priors and confidence-based gating. These modules are fully differentiable and impact gradient flow during training, and during inference, they retain their functionality to enhance robustness and adaptivity under unseen or noisy spectral distributions.</p>
<sec id="s3-4-1">
<title>3.4.1 Spectral prior encoding</title>
<p>In remote sensing tasks, each pixel <inline-formula id="inf111">
<mml:math id="m139">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is associated with a high-dimensional spectral vector <inline-formula id="inf112">
<mml:math id="m140">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf113">
<mml:math id="m141">
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes the number of spectral bands. While deep models like STEN are capable of learning complex nonlinear mappings, they often underutilize explicit spectral priors that reflect the physical and statistical structure of class-specific reflectance patterns (as shown in <xref ref-type="fig" rid="F5">Figure 5</xref>).</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Schematic diagram of the Spectral Prior Encoding module. The pipeline employs a sequence of convolutional operations and intermediate feature (IF) nodes to extract hierarchical spectral features. Each convolutional block progressively reduces spatial dimensions while enriching spectral semantics, enabling effective modeling of class-conditional reflectance distributions and statistical priors for downstream uncertainty-aware reasoning.</p>
</caption>
<graphic xlink:href="fenvs-13-1648562-g005.tif">
<alt-text content-type="machine-generated">Flowchart illustrating the spectral prior encoding process. It begins with a 3D block labeled &#x22;K-11x1, S=2&#x22; and proceeds through multiple layers of convolution and pooling, labeled with various kernel sizes and strides. Stages include MaxPool and Conv layers, resulting in output with sizes denoted as Tx96x59x59 up to Tx256x60x6. The diagram uses purple and green blocks to denote different processing stages and illustrates transformation through an encoding network.</alt-text>
</graphic>
</fig>
<p>To address this, we introduce a spectral prior encoding mechanism that models the class-conditional distribution of spectral observations using classical statistical estimators, which are then integrated into downstream decision-making through differentiable operations. Given a labeled dataset <inline-formula id="inf114">
<mml:math id="m142">
<mml:mrow>
<mml:mi mathvariant="script">D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x2223;</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, we compute the empirical spectral mean vector for each semantic class <inline-formula id="inf115">
<mml:math id="m143">
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> as <xref ref-type="disp-formula" rid="e29">Equation 29</xref>:<disp-formula id="e29">
<mml:math id="m144">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mtext>where&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">&#x2223;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(29)</label>
</disp-formula>serving as the central prototype of class <inline-formula id="inf116">
<mml:math id="m145">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in spectral space. We estimate the sample covariance matrix <inline-formula id="inf117">
<mml:math id="m146">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> over <inline-formula id="inf118">
<mml:math id="m147">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which captures inter-band correlations and accounts for class-specific spectral variability. The resulting spectral prior for class <inline-formula id="inf119">
<mml:math id="m148">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is modeled as a multivariate Gaussian distribution (<xref ref-type="disp-formula" rid="e30">Equation 30</xref>):<disp-formula id="e30">
<mml:math id="m149">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
<mml:mo>;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(30)</label>
</disp-formula>which assigns a probabilistic score to each spectral vector based on how likely it is to have been generated by class <inline-formula id="inf120">
<mml:math id="m150">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> under the empirical statistics.</p>
<p>During inference or training, we assess the alignment of a test sample <inline-formula id="inf121">
<mml:math id="m151">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> with each class prior by computing the squared Mahalanobis distance (<xref ref-type="disp-formula" rid="e31">Equation 31</xref>):<disp-formula id="e31">
<mml:math id="m152">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x22a4;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(31)</label>
</disp-formula>which measures the spectral deviation of <inline-formula id="inf122">
<mml:math id="m153">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> from the class center <inline-formula id="inf123">
<mml:math id="m154">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> while accounting for band-wise variance and covariance. Unlike Euclidean distance, the Mahalanobis metric naturally adapts to class-specific spread and orientation in the spectral manifold, yielding more discriminative priors. To transform these distances into a soft prior distribution over classes, we apply a softmax-like normalization (<xref ref-type="disp-formula" rid="e32">Equation 32</xref>):<disp-formula id="e32">
<mml:math id="m155">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>exp</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>exp</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(32)</label>
</disp-formula>where <inline-formula id="inf124">
<mml:math id="m156">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> is an inverse-temperature parameter controlling the sharpness of the prior distribution. A high <inline-formula id="inf125">
<mml:math id="m157">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> enforces confident assignments based on minimal distance, while lower values yield smoother priors. This normalized prior <inline-formula id="inf126">
<mml:math id="m158">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> reflects the likelihood of pixel <inline-formula id="inf127">
<mml:math id="m159">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> belonging to class <inline-formula id="inf128">
<mml:math id="m160">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> purely based on its spectrum and without requiring any supervision from the model&#x2019;s semantic head.</p>
<p>These spectral priors serve multiple roles in downstream modules: they act as regularization targets to align predicted class probabilities, serve as gating signals in modulation layers, and facilitate interpretability by grounding predictions in physically meaningful reflectance statistics. Furthermore, because the priors are class-conditional and data-driven, they provide robustness against distribution shifts by encoding the inherent geometry of the spectral domain independently of spatial features or visual noise. The integration of <inline-formula id="inf129">
<mml:math id="m161">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> into the broader inference pipeline thus bridges statistical modeling and deep representation learning, allowing SGSM to better regulate semantic predictions under uncertain or ambiguous conditions.</p>
</sec>
<sec id="s3-4-2">
<title>3.4.2 Uncertainty-aware semantic gating</title>
<p>Deep semantic models like STEN often produce overconfident predictions in regions with weak visual cues or ambiguous spectral evidence. To address this, we introduce an uncertainty-aware semantic gating mechanism that adaptively fuses model predictions with spectrum-derived priors based on pixel-level confidence. This dynamic adjustment improves robustness by down-weighting unreliable model outputs and enhancing the role of physical spectral structure in decision-making. Given the STEN output <inline-formula id="inf130">
<mml:math id="m162">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> at pixel <inline-formula id="inf131">
<mml:math id="m163">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf132">
<mml:math id="m164">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the predicted probability of class <inline-formula id="inf133">
<mml:math id="m165">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, we first compute the predictive entropy (<xref ref-type="disp-formula" rid="e33">Equation 33</xref>):<disp-formula id="e33">
<mml:math id="m166">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">U</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(33)</label>
</disp-formula>which reflects the model&#x2019;s predictive uncertainty at <inline-formula id="inf134">
<mml:math id="m167">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. Higher entropy values indicate that the model is less confident in its prediction, signaling the need for auxiliary correction. In parallel, from the spectral prior encoding module, we retrieve <inline-formula id="inf135">
<mml:math id="m168">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>&#x2014;the normalized likelihood that pixel <inline-formula id="inf136">
<mml:math id="m169">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> belongs to class <inline-formula id="inf137">
<mml:math id="m170">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> based on Mahalanobis distance from class-conditional reflectance priors. We then define a gating function <inline-formula id="inf138">
<mml:math id="m171">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> for each class (<xref ref-type="disp-formula" rid="e34">Equation 34</xref>):<disp-formula id="e34">
<mml:math id="m172">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>sigmoid</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>log</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">U</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(34)</label>
</disp-formula>where <inline-formula id="inf139">
<mml:math id="m173">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf140">
<mml:math id="m174">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are hyperparameters that balance the influence of spectral prior confidence and model uncertainty. Intuitively, when uncertainty <inline-formula id="inf141">
<mml:math id="m175">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">U</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is high, or the spectrum strongly supports a particular class, the gate favors <inline-formula id="inf142">
<mml:math id="m176">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>; otherwise, it preserves the model&#x2019;s original semantic output. The adjusted per-class prediction becomes (<xref ref-type="disp-formula" rid="e35">Equation 35</xref>):<disp-formula id="e35">
<mml:math id="m177">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(35)</label>
</disp-formula>which represents a convex combination of the model prediction and the spectral prior. This strategy mitigates the propagation of unreliable predictions while retaining discriminative knowledge when confidence is high. To ensure <inline-formula id="inf143">
<mml:math id="m178">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">p</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> yields a valid probability distribution, we apply a normalization step (<xref ref-type="disp-formula" rid="e36">Equation 36</xref>):<disp-formula id="e36">
<mml:math id="m179">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(36)</label>
</disp-formula>producing the final adjusted posterior <inline-formula id="inf144">
<mml:math id="m180">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">p</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. This mechanism provides two major benefits: it grounds predictions in physically interpretable spectral priors, enhancing trustworthiness, and it reduces noise sensitivity by enforcing smoother behavior under high-uncertainty conditions. As a result, uncertainty-aware semantic gating enables STEN to maintain semantic precision even in challenging domains with varying lighting, material composition, or sensor conditions.</p>
<p>This fusion mechanism directly modifies the predicted probability distribution before final classification. The refined class probability <inline-formula id="inf145">
<mml:math id="m181">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> is computed as a convex combination of the original model output and the spectral prior (<xref ref-type="disp-formula" rid="e37">Equation 37</xref>):<disp-formula id="e37">
<mml:math id="m182">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(37)</label>
</disp-formula>where <inline-formula id="inf146">
<mml:math id="m183">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is a learned gating function dependent on model entropy <inline-formula id="inf147">
<mml:math id="m184">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">U</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and the spectral prior confidence <inline-formula id="inf148">
<mml:math id="m185">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Although no explicit regularization term is added to the loss function, the gating operation is fully differentiable and affects the backpropagation path during training. This integration allows spectral prior knowledge to modulate predictions and improves robustness under ambiguous or domain-shifted conditions.</p>
</sec>
<sec id="s3-4-3">
<title>3.4.3 Spectral curriculum scheduling</title>
<p>In hyperspectral and multispectral learning tasks, not all spectral bands contribute equally to class discrimination. Some channels provide strong class-separating cues, while others may be noisy or redundant due to atmospheric interference or sensor overlap. To leverage this inherent asymmetry in spectral utility, we propose a spectral curriculum scheduling strategy that progressively guides the model to attend to the most informative spectral bands first and gradually incorporate weaker ones as training matures. This idea is inspired by curriculum learning, where simpler (i.e., high-signal) inputs are emphasized earlier to stabilize optimization. Formally, we define the spectral importance of each band <inline-formula id="inf149">
<mml:math id="m186">
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> at training iteration <inline-formula id="inf150">
<mml:math id="m187">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> using Fisher Information (FI), which quantifies the local sensitivity of the model&#x2019;s output distribution <inline-formula id="inf151">
<mml:math id="m188">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> with respect to the input feature <inline-formula id="inf152">
<mml:math id="m189">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="disp-formula" rid="e38">Equation 38</xref>):<disp-formula id="e38">
<mml:math id="m190">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>FI</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:msubsup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(38)</label>
</disp-formula>where the derivative captures how much the prediction of class <inline-formula id="inf153">
<mml:math id="m191">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> changes with perturbations in band <inline-formula id="inf154">
<mml:math id="m192">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. A higher FI score indicates that small changes in <inline-formula id="inf155">
<mml:math id="m193">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> cause larger shifts in <inline-formula id="inf156">
<mml:math id="m194">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, suggesting that the band is actively used by the model for semantic decisions. Once the FI scores are computed for all <inline-formula id="inf157">
<mml:math id="m195">
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> bands, we normalize them to obtain curriculum weights (<xref ref-type="disp-formula" rid="e39">Equation 39</xref>):<disp-formula id="e39">
<mml:math id="m196">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>FI</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mtext>FI</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(39)</label>
</disp-formula>ensuring <inline-formula id="inf158">
<mml:math id="m197">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. This normalization serves two purposes: (1) it makes the reweighting operation scale-invariant across epochs, and (2) it allows for interpretable attribution of training focus per band. These weights are then used to modulate the spectral input at time <inline-formula id="inf159">
<mml:math id="m198">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="disp-formula" rid="e40">Equation 40</xref>):<disp-formula id="e40">
<mml:math id="m199">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b7;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2299;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:msubsup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:msubsup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(40)</label>
</disp-formula>where <inline-formula id="inf160">
<mml:math id="m200">
<mml:mrow>
<mml:mo>&#x2299;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes element-wise multiplication. Effectively, <inline-formula id="inf161">
<mml:math id="m201">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> biases the network to pay more attention to dominant bands early on, while gradually increasing the contribution of underutilized or noisy bands in later stages of training. To prevent oscillation or sharp band suppression, <inline-formula id="inf162">
<mml:math id="m202">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> can be further smoothed with momentum-based exponential moving averages or band-specific decay schedules. FI scores can be aggregated over batches or epochs to stabilize estimation. This curriculum not only improves convergence stability but also promotes robust feature learning by controlling the temporal order of attention allocation across the spectral domain. Furthermore, it implicitly serves as a regularization mechanism by dynamically constraining the input manifold, encouraging the model to first generalize from strong signals before fitting subtler patterns. Spectral curriculum scheduling offers a principled and interpretable approach to time-dependent spectral modulation, aligned with the biological and physical properties of remote sensing data acquisition.</p>
<p>Our proposed spectrum-driven curriculum scheduling is inspired by the general principles of curriculum learning (CL), but it operates at the granularity of spectral dimensions rather than full input samples. Unlike CBM <xref ref-type="bibr" rid="B22">Jarca et al. (2024)</xref>, which progressively reveals feature regions through masking, or SPCNet <xref ref-type="bibr" rid="B55">Zhao et al. (2025)</xref>, which incorporates inductive bias into self-paced learning, our method computes band-wise significance via Fisher Information and adaptively reweights spectral channels over training epochs. Furthermore, unlike multimodal curriculum methods such as CLIP-VG <xref ref-type="bibr" rid="B50">Xiao et al. (2023)</xref>, which define curriculum over multimodal alignment tasks, our focus lies in stabilizing spectral encoding for remote sensing tasks, which involve highly redundant and noisy bands. This band-centric pacing strategy is particularly suited for hyperspectral or multispectral scenarios, where many bands offer weak or noisy gradients in early training stages.</p>
<p>Compared to existing transformer-based methods in remote sensing, CoastVisionNet introduces a series of targeted innovations. For example, while TransUNet <xref ref-type="bibr" rid="B8">Chen J. et al. (2021)</xref> integrates transformer blocks into a UNet-style encoder-decoder architecture, it lacks an explicit mechanism for disentangling and selectively fusing spectral and spatial cues. In contrast, our STEN module is designed to separately model spectral gradients and topographic morphology, which are later aligned through a geometry-aware self-attention module. Similarly, SpectralFormer <xref ref-type="bibr" rid="B21">Hong et al. (2021)</xref> focuses on capturing spectral dependencies through self-attention, but it does not incorporate adaptive scheduling or uncertainty modeling. Our Spectrum-Guided Semantic Modulation (SGSM) introduces Fisher Information-guided curriculum scheduling and uncertainty-aware gating, which enhance the robustness and interpretability of spectral inference in domain-shifted coastal imagery. Together, these contributions form a cohesive and novel architecture that is specifically optimized for the unique challenges of coastal land cover classification.</p>
</sec>
</sec>
</sec>
<sec id="s4">
<title>4 Experimental setup</title>
<sec id="s4-1">
<title>4.1 Dataset</title>
<p>The BigEarthNet dataset <xref ref-type="bibr" rid="B40">Sumbul et al. (2021)</xref> is a large-scale benchmark consisting of over 590,000 Sentinel-2 image patches across 10 European countries, each annotated with one or more land-cover class labels based on the CORINE Land Cover (CLC) nomenclature. The dataset spans 43 semantic categories, including artificial surfaces, agricultural zones, forests, wetlands, and water bodies. Each image patch covers a 120<inline-formula id="inf163">
<mml:math id="m203">
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 120&#xa0;pixel area at 10&#xa0;m resolution (approximately 1.2&#xa0;km &#xd7; 1.2&#xa0;km on the ground) and contains 12 spectral bands, facilitating multi-label scene classification, land use monitoring, and deep representation learning under diverse seasonal and geographic conditions. The OSCD Dataset (Onera Satellite Change Detection Dataset) <xref ref-type="bibr" rid="B18">Fu et al. (2021)</xref> includes bi-temporal multispectral image pairs captured by the Sentinel-2 satellites across 24 urban and rural regions worldwide. Annotated with binary change masks, the dataset supports supervised change detection tasks and includes 13 spectral bands. OSCD enables robust evaluation under spatial misalignment, atmospheric variation, and domain shift, and is widely used in research involving urban dynamics, environmental monitoring, and post-disaster analysis. The LandCoverNet dataset <xref ref-type="bibr" rid="B1">Alemohammad and Booth (2020)</xref> is a globally distributed Sentinel-2-based land cover dataset curated by the Radiant Earth Foundation, comprising more than 200,000 scene-labeled image chips across five continents. It adheres to the Dynamic World schema with semantic classes including built-up areas, trees, crops, water, wetlands, and bare ground. Each chip is georeferenced and temporally aligned with expert-validated labels, supporting tasks such as global-scale semantic segmentation, domain generalization, and label robustness studies. The EuroSAT dataset <xref ref-type="bibr" rid="B5">Bhatt and Bhatt (2024)</xref> is a medium-scale classification benchmark derived from Sentinel-2 imagery, containing 27,000 labeled image patches across 10 land use and land cover types including residential, industrial, forest, river, pasture, and highway. Each patch is 64<inline-formula id="inf164">
<mml:math id="m204">
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 64 pixels and includes all 13 spectral bands, allowing both RGB-based and multispectral training. Its balanced class distribution and wide accessibility make EuroSAT a popular dataset for remote sensing classification, deep model prototyping, and educational use in satellite image analysis.</p>
</sec>
<sec id="s4-2">
<title>4.2 Experimental details</title>
<p>All experiments were conducted using the PyTorch framework on a workstation equipped with NVIDIA A100 GPUs. We adopted a mini-batch size of 64 for all datasets, and trained the models using the Adam optimizer with a weight decay of <inline-formula id="inf165">
<mml:math id="m205">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The initial learning rate was set to 0.001 and decayed using a cosine annealing schedule over the course of 100 epochs. For fair comparison, all models were trained under the same computational budget and data augmentation strategies. For image classification tasks, we applied random resized cropping to <inline-formula id="inf166">
<mml:math id="m206">
<mml:mrow>
<mml:mn>224</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>224</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> pixels, horizontal flipping with a probability of 0.5, and normalization using the BigEarthNet Dataset mean and standard deviation. No additional data augmentation tricks like mixup or CutMix were employed unless explicitly stated. For the backbone network, we utilized a standard ResNet-50 architecture pretrained on BigEarthNet Dataset, followed by a lightweight transformer-based feature aggregator tailored to improve discriminative representation learning. During training, the final fully connected layer was modified to match the number of classes for each respective dataset. For datasets with fine-grained categories like LandCoverNet Dataset, we added a channel attention module to the feature extractor to enhance the learning of subtle local patterns. For the EuroSAT Dataset, we employed group normalization over batch normalization to maintain stability across small batch sizes, given the variability of texture patterns. Each experiment was repeated three times with different random seeds, and the average results are reported to ensure robustness. For hyperparameter tuning, we performed grid search on the validation set using 10% of the training data. Cross-validation was used only for datasets with fewer samples, such as OSCD Dataset, where stratified k-fold (k &#x3d; 5) was employed to mitigate class imbalance. To ensure reproducibility, we fixed random seeds across numpy, PyTorch, and CUDA environments, and logged all experimental configurations using Weights &#x26; Biases. During testing, only center cropping was applied, and top-1 accuracy was used as the primary evaluation metric. For detailed analysis, confusion matrices and per-class accuracy were also computed. Our implementation also supports gradient checkpointing to save memory during training, which was particularly useful for high-resolution texture images in EuroSAT Dataset. 
To accelerate convergence, label smoothing with a factor of 0.1 was used, especially for datasets prone to overfitting due to small sample size. The experiments were benchmarked under consistent environmental conditions, and no hyperparameter tuning was performed on the test set. All scripts and configuration files will be made publicly available to facilitate reproducibility and further research.</p>
</sec>
<sec id="s4-3">
<title>4.3 Comparison with SOTA methods</title>
<p>To comprehensively evaluate the effectiveness of our proposed method, we conduct extensive comparisons against several state-of-the-art (SOTA) models, including ResNet50 <xref ref-type="bibr" rid="B44">Theckedath and Sedamkar (2020)</xref>, ViT <xref ref-type="bibr" rid="B47">Touvron et al. (2022)</xref>, EfficientNet <xref ref-type="bibr" rid="B25">Koonce (2021)</xref>, DenseNet <xref ref-type="bibr" rid="B13">Dalvi et al. (2023)</xref>, ConvNeXt <xref ref-type="bibr" rid="B16">Feng et al. (2022)</xref>, and DeiT <xref ref-type="bibr" rid="B47">Touvron et al. (2022)</xref>. As shown in <xref ref-type="table" rid="T2">Tables 2</xref>, <xref ref-type="table" rid="T3">3</xref>, our method consistently achieves superior performance across all four benchmark datasets. On the large-scale BigEarthNet Dataset, our method achieves an accuracy of 84.91%, surpassing the best baseline ConvNeXt by 2.88%. For OSCD Dataset, which features higher intra-class variance and fewer training samples per class, our model outperforms all SOTA methods by a notable margin of 1.75% in Accuracy and 2.18% in AUC. This performance boost is largely attributed to our architecture&#x2019;s ability to incorporate both global semantic context and local feature discrimination via the transformer-augmented aggregation module, which allows for dynamic feature reweighting that adapts to diverse visual patterns across varying datasets.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Benchmarking our method against SOTA approaches on the BigEarthNet and OSCD datasets (with 95% confidence intervals).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Model</th>
<th colspan="4" align="center">BigEarthNet dataset</th>
<th colspan="4" align="center">OSCD dataset</th>
</tr>
<tr>
<th align="center">Accuracy</th>
<th align="center">Recall</th>
<th align="center">F1 Score</th>
<th align="center">AUC</th>
<th align="center">Accuracy</th>
<th align="center">Recall</th>
<th align="center">F1 Score</th>
<th align="center">AUC</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">ResNet50 <xref ref-type="bibr" rid="B44">Theckedath and Sedamkar (2020)</xref>
</td>
<td align="center">78.24 <inline-formula id="inf167">
<mml:math id="m207">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.31</td>
<td align="center">76.11 <inline-formula id="inf168">
<mml:math id="m208">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.27</td>
<td align="center">75.98 <inline-formula id="inf169">
<mml:math id="m209">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
<td align="center">81.47 <inline-formula id="inf170">
<mml:math id="m210">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.29</td>
<td align="center">84.12 <inline-formula id="inf171">
<mml:math id="m211">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.22</td>
<td align="center">83.20 <inline-formula id="inf172">
<mml:math id="m212">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">82.65 <inline-formula id="inf173">
<mml:math id="m213">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24</td>
<td align="center">85.10 <inline-formula id="inf174">
<mml:math id="m214">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.27</td>
</tr>
<tr>
<td align="center">ViT <xref ref-type="bibr" rid="B47">Touvron et al. (2022)</xref>
</td>
<td align="center">81.39 <inline-formula id="inf175">
<mml:math id="m215">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
<td align="center">80.45 <inline-formula id="inf176">
<mml:math id="m216">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24</td>
<td align="center">79.87 <inline-formula id="inf177">
<mml:math id="m217">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.28</td>
<td align="center">84.01 <inline-formula id="inf178">
<mml:math id="m218">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.30</td>
<td align="center">85.87 <inline-formula id="inf179">
<mml:math id="m219">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">84.32 <inline-formula id="inf180">
<mml:math id="m220">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.29</td>
<td align="center">83.99 <inline-formula id="inf181">
<mml:math id="m221">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
<td align="center">86.72 <inline-formula id="inf182">
<mml:math id="m222">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
</tr>
<tr>
<td align="center">EfficientNet <xref ref-type="bibr" rid="B25">Koonce (2021)</xref>
</td>
<td align="center">79.52 <inline-formula id="inf183">
<mml:math id="m223">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.29</td>
<td align="center">78.13 <inline-formula id="inf184">
<mml:math id="m224">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.28</td>
<td align="center">77.40 <inline-formula id="inf185">
<mml:math id="m225">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.27</td>
<td align="center">82.76 <inline-formula id="inf186">
<mml:math id="m226">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.30</td>
<td align="center">86.41 <inline-formula id="inf187">
<mml:math id="m227">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.31</td>
<td align="center">84.79 <inline-formula id="inf188">
<mml:math id="m228">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
<td align="center">85.22 <inline-formula id="inf189">
<mml:math id="m229">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.28</td>
<td align="center">87.30 <inline-formula id="inf190">
<mml:math id="m230">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24</td>
</tr>
<tr>
<td align="center">DenseNet <xref ref-type="bibr" rid="B13">Dalvi et al. (2023)</xref>
</td>
<td align="center">77.88 <inline-formula id="inf191">
<mml:math id="m231">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">79.02 <inline-formula id="inf192">
<mml:math id="m232">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
<td align="center">78.40 <inline-formula id="inf193">
<mml:math id="m233">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.28</td>
<td align="center">80.59 <inline-formula id="inf194">
<mml:math id="m234">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.27</td>
<td align="center">83.75 <inline-formula id="inf195">
<mml:math id="m235">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.21</td>
<td align="center">82.61 <inline-formula id="inf196">
<mml:math id="m236">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.22</td>
<td align="center">82.18 <inline-formula id="inf197">
<mml:math id="m237">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
<td align="center">84.32 <inline-formula id="inf198">
<mml:math id="m238">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
</tr>
<tr>
<td align="center">ConvNeXt <xref ref-type="bibr" rid="B16">Feng et al. (2022)</xref>
</td>
<td align="center">82.03 <inline-formula id="inf199">
<mml:math id="m239">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.27</td>
<td align="center">81.58 <inline-formula id="inf200">
<mml:math id="m240">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.30</td>
<td align="center">80.90 <inline-formula id="inf201">
<mml:math id="m241">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.29</td>
<td align="center">85.44 <inline-formula id="inf202">
<mml:math id="m242">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.31</td>
<td align="center">87.66 <inline-formula id="inf203">
<mml:math id="m243">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.28</td>
<td align="center">86.73 <inline-formula id="inf204">
<mml:math id="m244">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">86.14 <inline-formula id="inf205">
<mml:math id="m245">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.27</td>
<td align="center">88.05 <inline-formula id="inf206">
<mml:math id="m246">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
</tr>
<tr>
<td align="center">DeiT <xref ref-type="bibr" rid="B47">Touvron et al. (2022)</xref>
</td>
<td align="center">80.67 <inline-formula id="inf207">
<mml:math id="m247">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
<td align="center">79.21 <inline-formula id="inf208">
<mml:math id="m248">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
<td align="center">78.99 <inline-formula id="inf209">
<mml:math id="m249">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">83.82 <inline-formula id="inf210">
<mml:math id="m250">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.28</td>
<td align="center">85.20 <inline-formula id="inf211">
<mml:math id="m251">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.27</td>
<td align="center">84.01 <inline-formula id="inf212">
<mml:math id="m252">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24</td>
<td align="center">83.50 <inline-formula id="inf213">
<mml:math id="m253">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
<td align="center">85.79 <inline-formula id="inf214">
<mml:math id="m254">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
</tr>
<tr>
<td align="center">Ours</td>
<td align="center">
<bold>84.91</bold> <inline-formula id="inf215">
<mml:math id="m255">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.22</bold>
</td>
<td align="center">
<bold>83.87</bold> <inline-formula id="inf216">
<mml:math id="m256">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.24</bold>
</td>
<td align="center">
<bold>83.35</bold> <inline-formula id="inf217">
<mml:math id="m257">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.23</bold>
</td>
<td align="center">
<bold>87.62</bold> <inline-formula id="inf218">
<mml:math id="m258">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.25</bold>
</td>
<td align="center">
<bold>89.41</bold> <inline-formula id="inf219">
<mml:math id="m259">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.21</bold>
</td>
<td align="center">
<bold>88.59</bold> <inline-formula id="inf220">
<mml:math id="m260">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.22</bold>
</td>
<td align="center">
<bold>87.93</bold> <inline-formula id="inf221">
<mml:math id="m261">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.24</bold>
</td>
<td align="center">
<bold>90.23</bold> <inline-formula id="inf222">
<mml:math id="m262">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.23</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate the best results.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Evaluation of our model in comparison with SOTA baselines on the LandCoverNet and EuroSAT datasets (with 95% confidence intervals).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Model</th>
<th colspan="4" align="center">LandCoverNet dataset</th>
<th colspan="4" align="center">EuroSAT dataset</th>
</tr>
<tr>
<th align="center">Accuracy</th>
<th align="center">Recall</th>
<th align="center">F1 Score</th>
<th align="center">AUC</th>
<th align="center">Accuracy</th>
<th align="center">Recall</th>
<th align="center">F1 Score</th>
<th align="center">AUC</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">ResNet50 <xref ref-type="bibr" rid="B44">Theckedath and Sedamkar (2020)</xref>
</td>
<td align="center">85.73 <inline-formula id="inf223">
<mml:math id="m263">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
<td align="center">84.60 <inline-formula id="inf224">
<mml:math id="m264">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24</td>
<td align="center">83.97 <inline-formula id="inf225">
<mml:math id="m265">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.27</td>
<td align="center">88.91 <inline-formula id="inf226">
<mml:math id="m266">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">70.88 <inline-formula id="inf227">
<mml:math id="m267">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.29</td>
<td align="center">71.45 <inline-formula id="inf228">
<mml:math id="m268">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.31</td>
<td align="center">69.70 <inline-formula id="inf229">
<mml:math id="m269">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
<td align="center">74.20 <inline-formula id="inf230">
<mml:math id="m270">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.30</td>
</tr>
<tr>
<td align="center">ViT <xref ref-type="bibr" rid="B47">Touvron et al. (2022)</xref>
</td>
<td align="center">88.12 <inline-formula id="inf231">
<mml:math id="m271">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.27</td>
<td align="center">86.79 <inline-formula id="inf232">
<mml:math id="m272">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.29</td>
<td align="center">87.30 <inline-formula id="inf233">
<mml:math id="m273">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">90.34 <inline-formula id="inf234">
<mml:math id="m274">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
<td align="center">73.04 <inline-formula id="inf235">
<mml:math id="m275">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">72.50 <inline-formula id="inf236">
<mml:math id="m276">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.28</td>
<td align="center">73.33 <inline-formula id="inf237">
<mml:math id="m277">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
<td align="center">75.91 <inline-formula id="inf238">
<mml:math id="m278">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.27</td>
</tr>
<tr>
<td align="center">EfficientNet <xref ref-type="bibr" rid="B25">Koonce (2021)</xref>
</td>
<td align="center">86.80 <inline-formula id="inf239">
<mml:math id="m279">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24</td>
<td align="center">85.33 <inline-formula id="inf240">
<mml:math id="m280">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.27</td>
<td align="center">84.71 <inline-formula id="inf241">
<mml:math id="m281">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
<td align="center">89.05 <inline-formula id="inf242">
<mml:math id="m282">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24</td>
<td align="center">74.42 <inline-formula id="inf243">
<mml:math id="m283">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.31</td>
<td align="center">73.80 <inline-formula id="inf244">
<mml:math id="m284">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.27</td>
<td align="center">74.17 <inline-formula id="inf245">
<mml:math id="m285">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.29</td>
<td align="center">77.46 <inline-formula id="inf246">
<mml:math id="m286">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
</tr>
<tr>
<td align="center">DenseNet <xref ref-type="bibr" rid="B13">Dalvi et al. (2023)</xref>
</td>
<td align="center">84.19 <inline-formula id="inf247">
<mml:math id="m287">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
<td align="center">85.71 <inline-formula id="inf248">
<mml:math id="m288">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">84.35 <inline-formula id="inf249">
<mml:math id="m289">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">87.82 <inline-formula id="inf250">
<mml:math id="m290">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
<td align="center">72.88 <inline-formula id="inf251">
<mml:math id="m291">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.27</td>
<td align="center">73.29 <inline-formula id="inf252">
<mml:math id="m292">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.29</td>
<td align="center">71.90 <inline-formula id="inf253">
<mml:math id="m293">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24</td>
<td align="center">74.69 <inline-formula id="inf254">
<mml:math id="m294">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.28</td>
</tr>
<tr>
<td align="center">ConvNeXt <xref ref-type="bibr" rid="B16">Feng et al. (2022)</xref>
</td>
<td align="center">89.01 <inline-formula id="inf255">
<mml:math id="m295">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.22</td>
<td align="center">88.33 <inline-formula id="inf256">
<mml:math id="m296">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.28</td>
<td align="center">87.85 <inline-formula id="inf257">
<mml:math id="m297">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24</td>
<td align="center">91.70 <inline-formula id="inf258">
<mml:math id="m298">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
<td align="center">76.12 <inline-formula id="inf259">
<mml:math id="m299">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">74.80 <inline-formula id="inf260">
<mml:math id="m300">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.27</td>
<td align="center">75.69 <inline-formula id="inf261">
<mml:math id="m301">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
<td align="center">78.03 <inline-formula id="inf262">
<mml:math id="m302">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24</td>
</tr>
<tr>
<td align="center">DeiT <xref ref-type="bibr" rid="B47">Touvron et al. (2022)</xref>
</td>
<td align="center">86.23 <inline-formula id="inf263">
<mml:math id="m303">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
<td align="center">84.77 <inline-formula id="inf264">
<mml:math id="m304">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.28</td>
<td align="center">85.40 <inline-formula id="inf265">
<mml:math id="m305">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">89.34 <inline-formula id="inf266">
<mml:math id="m306">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.27</td>
<td align="center">75.41 <inline-formula id="inf267">
<mml:math id="m307">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24</td>
<td align="center">73.64 <inline-formula id="inf268">
<mml:math id="m308">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.29</td>
<td align="center">74.12 <inline-formula id="inf269">
<mml:math id="m309">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
<td align="center">76.95 <inline-formula id="inf270">
<mml:math id="m310">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
</tr>
<tr>
<td align="center">Ours</td>
<td align="center">
<bold>91.35</bold> <inline-formula id="inf271">
<mml:math id="m311">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.21</bold>
</td>
<td align="center">
<bold>90.41</bold> <inline-formula id="inf272">
<mml:math id="m312">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.23</bold>
</td>
<td align="center">
<bold>90.08</bold> <inline-formula id="inf273">
<mml:math id="m313">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.22</bold>
</td>
<td align="center">
<bold>93.62</bold> <inline-formula id="inf274">
<mml:math id="m314">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.21</bold>
</td>
<td align="center">
<bold>79.29</bold> <inline-formula id="inf275">
<mml:math id="m315">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.23</bold>
</td>
<td align="center">
<bold>78.50</bold> <inline-formula id="inf276">
<mml:math id="m316">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.25</bold>
</td>
<td align="center">
<bold>78.81</bold> <inline-formula id="inf277">
<mml:math id="m317">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.24</bold>
</td>
<td align="center">
<bold>81.74</bold> <inline-formula id="inf278">
<mml:math id="m318">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.22</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate the best results.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>On fine-grained classification tasks, such as the LandCoverNet Dataset, the superiority of our method becomes even more pronounced. Our method reaches a top-1 Accuracy of 91.35%, which is 2.34% higher than ConvNeXt, and outperforms ViT and EfficientNet by larger margins. In fine-grained tasks, where categories differ by subtle texture, shape, and color variations, the strength of our architecture lies in its ability to preserve fine-level details while suppressing irrelevant background noise. This is further supported by the high F1 Score (90.08%) and AUC (93.62%) achieved, indicating stable generalization under intra-class ambiguity. The channel attention module and the localized token enhancement approach in our framework are particularly effective in detecting discriminative land cover features. On the EuroSAT dataset, our model again achieves the best performance with 79.29% Accuracy and 81.74% AUC, outperforming all competitors. Unlike object classification datasets, texture datasets require models to reason about style and abstract visual attributes. The effectiveness of our method on EuroSAT Dataset is a strong testament to the flexibility of our representation learning mechanism, which integrates hierarchical texture semantics through feature pyramids and context-aware refinement. The use of group normalization instead of batch normalization on EuroSAT Dataset effectively stabilizes training under smaller batch regimes, which is crucial for capturing nuanced texture patterns.</p>
<p>As shown in <xref ref-type="fig" rid="F6">Figures 6</xref>, <xref ref-type="fig" rid="F7">7</xref>, these consistent improvements can be largely attributed to several design components in our model, as described in the method section. The hybrid feature extractor ensures both hierarchical abstraction and spatial precision. The design of multi-resolution fusion in the transformer encoder contributes to the ability to model long-range dependencies, enhancing recognition in complex visual scenes. Our approach also benefits from a lightweight architecture that maintains computational efficiency while achieving top-tier performance. Unlike models such as ViT and ConvNeXt, which are computationally intensive, our model achieves higher accuracy without sacrificing training and inference speed. These results not only validate the effectiveness of our method across a diverse set of datasets but also highlight its generalizability and robustness, demonstrating strong potential for real-world deployment in both generic and fine-grained classification tasks.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Benchmarking our method against SOTA approaches on the BigEarthNet and OSCD datasets.</p>
</caption>
<graphic xlink:href="fenvs-13-1648562-g006.tif">
<alt-text content-type="machine-generated">Bar chart comparing different methods on the BigEarthNet and OSCD datasets. It shows scores for Accuracy, Recall, F1 Score, and AUC for methods including ResNet50, ViT, EfficientNet, DenseNet, ConvNeXt, DeiT, and Ours. Each method has colored bars for each metric, with &#x22;Ours&#x22; having the highest AUC score.</alt-text>
</graphic>
</fig>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Evaluation of our model in comparison with SOTA baselines on the LandCoverNet and EuroSAT datasets.</p>
</caption>
<graphic xlink:href="fenvs-13-1648562-g007.tif">
<alt-text content-type="machine-generated">Line chart comparing different methods on LandCoverNet and EuroSAT datasets. Methods include ResNet50, ViT, EfficientNet, DenseNet, ConvNeXt, DeiT, and Ours. Metrics shown are accuracy, recall, F1 score, and AUC, with Ours generally performing the best across all metrics.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4-4">
<title>4.4 Ablation study</title>
<p>To investigate the contribution of each key component in our proposed architecture, we conducted a comprehensive ablation study by systematically removing individual modules and evaluating the performance on all four benchmark datasets. The components under study include Residual Spectral Encoding, Differential Topographic Encoding, and Spectral Prior Encoding. The results are summarized in <xref ref-type="table" rid="T4">Tables 4</xref>, <xref ref-type="table" rid="T5">5</xref>. We denote the full model as Ours and use w/o Residual Spectral Encoding, w/o Differential Topographic Encoding, and w/o Spectral Prior Encoding to represent the variants with the respective component removed.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Impact of architectural components in our model evaluated via ablation on BigEarthNet and OSCD (with 95% confidence intervals).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Model</th>
<th colspan="4" align="center">BigEarthNet dataset</th>
<th colspan="4" align="center">OSCD dataset</th>
</tr>
<tr>
<th align="center">Accuracy</th>
<th align="center">Recall</th>
<th align="center">F1 Score</th>
<th align="center">AUC</th>
<th align="center">Accuracy</th>
<th align="center">Recall</th>
<th align="center">F1 Score</th>
<th align="center">AUC</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">w/o Residual Spectral Encoding</td>
<td align="center">82.35 <inline-formula id="inf279">
<mml:math id="m319">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
<td align="center">80.41 <inline-formula id="inf280">
<mml:math id="m320">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24</td>
<td align="center">80.97 <inline-formula id="inf281">
<mml:math id="m321">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.30</td>
<td align="center">85.14 <inline-formula id="inf282">
<mml:math id="m322">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
<td align="center">87.73 <inline-formula id="inf283">
<mml:math id="m323">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.22</td>
<td align="center">86.15 <inline-formula id="inf284">
<mml:math id="m324">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.21</td>
<td align="center">85.69 <inline-formula id="inf285">
<mml:math id="m325">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.29</td>
<td align="center">88.01 <inline-formula id="inf286">
<mml:math id="m326">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24</td>
</tr>
<tr>
<td align="center">w/o Differential Topographic Encoding</td>
<td align="center">83.27 <inline-formula id="inf287">
<mml:math id="m327">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.30</td>
<td align="center">82.62 <inline-formula id="inf288">
<mml:math id="m328">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.22</td>
<td align="center">81.23 <inline-formula id="inf289">
<mml:math id="m329">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">86.48 <inline-formula id="inf290">
<mml:math id="m330">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.29</td>
<td align="center">88.01 <inline-formula id="inf291">
<mml:math id="m331">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.29</td>
<td align="center">87.70 <inline-formula id="inf292">
<mml:math id="m332">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.20</td>
<td align="center">86.13 <inline-formula id="inf293">
<mml:math id="m333">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24</td>
<td align="center">89.17 <inline-formula id="inf294">
<mml:math id="m334">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
</tr>
<tr>
<td align="center">w/o Spectral Prior Encoding</td>
<td align="center">81.44 <inline-formula id="inf295">
<mml:math id="m335">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">81.18 <inline-formula id="inf296">
<mml:math id="m336">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.31</td>
<td align="center">79.80 <inline-formula id="inf297">
<mml:math id="m337">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
<td align="center">84.73 <inline-formula id="inf298">
<mml:math id="m338">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.26</td>
<td align="center">86.38 <inline-formula id="inf299">
<mml:math id="m339">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.27</td>
<td align="center">85.91 <inline-formula id="inf300">
<mml:math id="m340">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.22</td>
<td align="center">84.50 <inline-formula id="inf301">
<mml:math id="m341">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.30</td>
<td align="center">87.32 <inline-formula id="inf302">
<mml:math id="m342">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.29</td>
</tr>
<tr>
<td align="center">Ours</td>
<td align="center">
<bold>84.91</bold> <inline-formula id="inf303">
<mml:math id="m343">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.22</bold>
</td>
<td align="center">
<bold>83.87</bold> <inline-formula id="inf304">
<mml:math id="m344">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.30</bold>
</td>
<td align="center">
<bold>83.35</bold> <inline-formula id="inf305">
<mml:math id="m345">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.21</bold>
</td>
<td align="center">
<bold>87.62</bold> <inline-formula id="inf306">
<mml:math id="m346">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.23</bold>
</td>
<td align="center">
<bold>89.41</bold> <inline-formula id="inf307">
<mml:math id="m347">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.29</bold>
</td>
<td align="center">
<bold>88.59</bold> <inline-formula id="inf308">
<mml:math id="m348">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.20</bold>
</td>
<td align="center">
<bold>87.93</bold> <inline-formula id="inf309">
<mml:math id="m349">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.23</bold>
</td>
<td align="center">
<bold>90.23</bold> <inline-formula id="inf310">
<mml:math id="m350">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.22</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate the best results.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Results of ablation experiments on our model across the LandCoverNet and EuroSAT datasets (with 95% confidence intervals).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Model</th>
<th colspan="4" align="center">LandCoverNet dataset</th>
<th colspan="4" align="center">EuroSAT dataset</th>
</tr>
<tr>
<th align="center">Accuracy</th>
<th align="center">Recall</th>
<th align="center">F1 Score</th>
<th align="center">AUC</th>
<th align="center">Accuracy</th>
<th align="center">Recall</th>
<th align="center">F1 Score</th>
<th align="center">AUC</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">w/o Residual Spectral Encoding</td>
<td align="center">89.40 <inline-formula id="inf311">
<mml:math id="m351">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.25</td>
<td align="center">88.73 <inline-formula id="inf312">
<mml:math id="m352">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.30</td>
<td align="center">87.92 <inline-formula id="inf313">
<mml:math id="m353">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.22</td>
<td align="center">91.47 <inline-formula id="inf314">
<mml:math id="m354">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.21</td>
<td align="center">76.84 <inline-formula id="inf315">
<mml:math id="m355">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.30</td>
<td align="center">75.29 <inline-formula id="inf316">
<mml:math id="m356">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
<td align="center">76.17 <inline-formula id="inf317">
<mml:math id="m357">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.29</td>
<td align="center">79.43 <inline-formula id="inf318">
<mml:math id="m358">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24</td>
</tr>
<tr>
<td align="center">w/o Differential Topographic Encoding</td>
<td align="center">90.27 <inline-formula id="inf319">
<mml:math id="m359">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.31</td>
<td align="center">89.50 <inline-formula id="inf320">
<mml:math id="m360">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.21</td>
<td align="center">88.88 <inline-formula id="inf321">
<mml:math id="m361">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.30</td>
<td align="center">92.38 <inline-formula id="inf322">
<mml:math id="m362">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.22</td>
<td align="center">78.32 <inline-formula id="inf323">
<mml:math id="m363">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.22</td>
<td align="center">77.40 <inline-formula id="inf324">
<mml:math id="m364">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.31</td>
<td align="center">77.15 <inline-formula id="inf325">
<mml:math id="m365">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
<td align="center">80.51 <inline-formula id="inf326">
<mml:math id="m366">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.29</td>
</tr>
<tr>
<td align="center">w/o Spectral Prior Encoding</td>
<td align="center">88.95 <inline-formula id="inf327">
<mml:math id="m367">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
<td align="center">89.18 <inline-formula id="inf328">
<mml:math id="m368">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.22</td>
<td align="center">88.10 <inline-formula id="inf329">
<mml:math id="m369">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.22</td>
<td align="center">90.56 <inline-formula id="inf330">
<mml:math id="m370">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.30</td>
<td align="center">77.19 <inline-formula id="inf331">
<mml:math id="m371">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.28</td>
<td align="center">76.87 <inline-formula id="inf332">
<mml:math id="m372">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.21</td>
<td align="center">75.64 <inline-formula id="inf333">
<mml:math id="m373">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.22</td>
<td align="center">78.91 <inline-formula id="inf334">
<mml:math id="m374">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.23</td>
</tr>
<tr>
<td align="center">Ours</td>
<td align="center">
<bold>91.35</bold> <inline-formula id="inf335">
<mml:math id="m375">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.21</bold>
</td>
<td align="center">
<bold>90.41</bold> <inline-formula id="inf336">
<mml:math id="m376">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.22</bold>
</td>
<td align="center">
<bold>90.08</bold> <inline-formula id="inf337">
<mml:math id="m377">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.21</bold>
</td>
<td align="center">
<bold>93.62</bold> <inline-formula id="inf338">
<mml:math id="m378">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.21</bold>
</td>
<td align="center">
<bold>79.29</bold> <inline-formula id="inf339">
<mml:math id="m379">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.29</bold>
</td>
<td align="center">
<bold>78.50</bold> <inline-formula id="inf340">
<mml:math id="m380">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.24</bold>
</td>
<td align="center">
<bold>78.81</bold> <inline-formula id="inf341">
<mml:math id="m381">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.28</bold>
</td>
<td align="center">
<bold>81.74</bold> <inline-formula id="inf342">
<mml:math id="m382">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <bold>0.26</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate the best results.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>On the BigEarthNet Dataset and OSCD Dataset, the removal of Residual Spectral Encoding causes a noticeable performance drop, reducing accuracy by 2.56% and 1.68% respectively. This indicates that the transformer-based feature aggregator plays a critical role in capturing long-range dependencies and contextual relationships between image regions. Without this module, the model tends to rely heavily on local features, resulting in inferior generalization on diverse and large-scale datasets. The absence of Differential Topographic Encoding also leads to a performance decline, though to a lesser extent. Accuracy drops by 1.64% on BigEarthNet Dataset and 1.40% on OSCD Dataset, emphasizing the importance of channel-wise recalibration in enhancing the discriminative power of learned features. Interestingly, removing Spectral Prior Encoding impacts BigEarthNet Dataset more significantly than OSCD Dataset, suggesting that the multi-resolution fusion is particularly beneficial in scenarios with high visual complexity. These results reinforce our design intuition that fusing multi-scale features is essential for building hierarchical representations adaptable to variable object scales and contexts.</p>
<p>On the LandCoverNet Dataset and EuroSAT Dataset, we observe a similar trend. The elimination of Residual Spectral Encoding results in accuracy drops of 1.95% and 2.45% respectively, confirming its relevance even in fine-grained or texture-heavy classification tasks. The EuroSAT Dataset, in particular, benefits from global reasoning enabled by the transformer module, since textures often span irregular patterns beyond local receptive fields. The effect of removing Differential Topographic Encoding is again significant, reducing F1 Score by 1.20% on LandCoverNet and 1.66% on EuroSAT. As illustrated in <xref ref-type="fig" rid="F8">Figures 8</xref>, <xref ref-type="fig" rid="F9">9</xref>, this aligns with our hypothesis that channel-level reweighting is crucial for recognizing subtle attribute differences in fine-grained categories. Spectral Prior Encoding, although having slightly less influence on the LandCoverNet Dataset, still contributes meaningfully on the EuroSAT Dataset, where multi-scale features help capture both micro and macro texture patterns. The full model consistently outperforms all ablated versions across all datasets, validating the complementary nature of each proposed component. These findings highlight the necessity of an integrated design, where attention, fusion, and global context work in unison to boost both classification accuracy and generalization robustness.</p>
<p>In addition to widely-used CNN and ViT-based baselines, we further evaluate CoastVisionNet against several domain-specific transformer models tailored for remote sensing segmentation tasks. These include SpectralFormer, TransUNet, Swin-Unet, and U-Former&#x2014;each of which leverages multi-scale attention mechanisms and spectral modeling strategies suitable for RS data. The experiments were conducted on both the LandCoverNet and EuroSAT datasets under consistent training settings. As shown in <xref ref-type="table" rid="T6">Table 6</xref> and <xref ref-type="fig" rid="F8">Figure 8</xref>, our method outperforms all the compared RS-specific transformer models in terms of classification accuracy, F1 score, and AUC. CoastVisionNet achieves 91.35% accuracy and 90.08% F1 score on LandCoverNet, along with 79.29% accuracy on EuroSAT. These results demonstrate the superiority and scalability of our proposed framework in handling complex coastal and terrestrial environments.</p>
<table-wrap id="T6" position="float">
<label>TABLE 6</label>
<caption>
<p>Comparison with RS-specific transformer-based segmentation models on LandCoverNet and EuroSAT datasets.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Model</th>
<th colspan="3" align="center">LandCoverNet dataset</th>
<th colspan="3" align="center">EuroSAT dataset</th>
</tr>
<tr>
<th align="center">Accuracy (%)</th>
<th align="center">F1 Score (%)</th>
<th align="center">AUC (%)</th>
<th align="center">Accuracy (%)</th>
<th align="center">F1 Score (%)</th>
<th align="center">AUC (%)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">SpectralFormer</td>
<td align="center">88.73<inline-formula id="inf343">
<mml:math id="m383">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.02</td>
<td align="center">87.90<inline-formula id="inf344">
<mml:math id="m384">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.03</td>
<td align="center">91.88<inline-formula id="inf345">
<mml:math id="m385">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.02</td>
<td align="center">76.84<inline-formula id="inf346">
<mml:math id="m386">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.03</td>
<td align="center">75.90<inline-formula id="inf347">
<mml:math id="m387">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.02</td>
<td align="center">79.43<inline-formula id="inf348">
<mml:math id="m388">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.03</td>
</tr>
<tr>
<td align="center">TransUNet</td>
<td align="center">89.14<inline-formula id="inf349">
<mml:math id="m389">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.03</td>
<td align="center">88.08<inline-formula id="inf350">
<mml:math id="m390">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.02</td>
<td align="center">92.24<inline-formula id="inf351">
<mml:math id="m391">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.02</td>
<td align="center">77.52<inline-formula id="inf352">
<mml:math id="m392">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.02</td>
<td align="center">76.84<inline-formula id="inf353">
<mml:math id="m393">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.03</td>
<td align="center">80.22<inline-formula id="inf354">
<mml:math id="m394">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.02</td>
</tr>
<tr>
<td align="center">Swin-Unet</td>
<td align="center">90.02<inline-formula id="inf355">
<mml:math id="m395">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.03</td>
<td align="center">89.10<inline-formula id="inf356">
<mml:math id="m396">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.02</td>
<td align="center">93.01<inline-formula id="inf357">
<mml:math id="m397">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.02</td>
<td align="center">78.90<inline-formula id="inf358">
<mml:math id="m398">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.02</td>
<td align="center">78.18<inline-formula id="inf359">
<mml:math id="m399">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.02</td>
<td align="center">81.01<inline-formula id="inf360">
<mml:math id="m400">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.03</td>
</tr>
<tr>
<td align="center">U-Former</td>
<td align="center">89.58<inline-formula id="inf361">
<mml:math id="m401">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.02</td>
<td align="center">88.67<inline-formula id="inf362">
<mml:math id="m402">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.03</td>
<td align="center">92.65<inline-formula id="inf363">
<mml:math id="m403">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.03</td>
<td align="center">78.15<inline-formula id="inf364">
<mml:math id="m404">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.03</td>
<td align="center">77.45<inline-formula id="inf365">
<mml:math id="m405">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.02</td>
<td align="center">80.50<inline-formula id="inf366">
<mml:math id="m406">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.02</td>
</tr>
<tr>
<td align="center">CoastVisionNet (Ours)</td>
<td align="center">
<bold>91.35</bold>
<inline-formula id="inf367">
<mml:math id="m407">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
<bold>0.02</bold>
</td>
<td align="center">
<bold>90.08</bold>
<inline-formula id="inf368">
<mml:math id="m408">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
<bold>0.02</bold>
</td>
<td align="center">
<bold>93.62</bold>
<inline-formula id="inf369">
<mml:math id="m409">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
<bold>0.02</bold>
</td>
<td align="center">
<bold>79.29</bold>
<inline-formula id="inf370">
<mml:math id="m410">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
<bold>0.03</bold>
</td>
<td align="center">
<bold>78.81</bold>
<inline-formula id="inf371">
<mml:math id="m411">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
<bold>0.03</bold>
</td>
<td align="center">
<bold>81.74</bold>
<inline-formula id="inf372">
<mml:math id="m412">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
<bold>0.03</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate the best results.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Impact of architectural components in our model evaluated via ablation on BigEarthNet and OSCD.</p>
</caption>
<graphic xlink:href="fenvs-13-1648562-g008.tif">
<alt-text content-type="machine-generated">Scatter plot showing ablation study results on BigEarthNet and OSCD datasets for different models: without Residual, Topographic, Spectral, and Ours. Scores range from 80 to 90, with performance metrics including Accuracy, Recall, F1 Score, and AUC for both datasets. Each metric is represented by different colored markers.</alt-text>
</graphic>
</fig>
<p>To provide empirical support for the efficiency claims, we evaluated the number of parameters and average inference time of each compared model. All measurements were conducted using a single NVIDIA A100 GPU on 224 <inline-formula id="inf373">
<mml:math id="m413">
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 224 patches. As presented in <xref ref-type="table" rid="T7">Table 7</xref> and <xref ref-type="fig" rid="F9">Figure 9</xref>, CoastVisionNet contains only 47.2 million trainable parameters and achieves an average inference latency of 32.5&#xa0;ms per image, making it the most efficient model in the set. This confirms its potential for scalable deployment in operational remote sensing systems where resource constraints are critical.</p>
<table-wrap id="T7" position="float">
<label>TABLE 7</label>
<caption>
<p>Comparison of model parameters and inference latency on LandCoverNet and EuroSAT datasets.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Model</th>
<th colspan="2" align="center">LandCoverNet</th>
<th colspan="2" align="center">EuroSAT</th>
</tr>
<tr>
<th align="center">Params (M)</th>
<th align="center">Inference (ms)</th>
<th align="center">Params (M)</th>
<th align="center">Inference (ms)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">SpectralFormer</td>
<td align="center">52.6</td>
<td align="center">47.1</td>
<td align="center">52.6</td>
<td align="center">47.1</td>
</tr>
<tr>
<td align="center">TransUNet</td>
<td align="center">61.3</td>
<td align="center">51.6</td>
<td align="center">61.3</td>
<td align="center">51.6</td>
</tr>
<tr>
<td align="center">Swin-Unet</td>
<td align="center">54.9</td>
<td align="center">44.3</td>
<td align="center">54.9</td>
<td align="center">44.3</td>
</tr>
<tr>
<td align="center">U-Former</td>
<td align="center">49.7</td>
<td align="center">40.8</td>
<td align="center">49.7</td>
<td align="center">40.8</td>
</tr>
<tr>
<td align="center">CoastVisionNet (Ours)</td>
<td align="center">
<bold>47.2</bold>
</td>
<td align="center">
<bold>32.5</bold>
</td>
<td align="center">
<bold>47.2</bold>
</td>
<td align="center">
<bold>32.5</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate the best results.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Results of ablation experiments on our model across the LandCoverNet and EuroSAT datasets.</p>
</caption>
<graphic xlink:href="fenvs-13-1648562-g009.tif">
<alt-text content-type="machine-generated">Heatmap titled &#x22;Ablation Study Heatmap on LandCoverNet and EuroSAT Dataset&#x22; shows performance metrics of four models: without Residual, Topographic, Spectral, and Ours. Metrics include Accuracy, Recall, F1 Score, and AUC. Each model&#x27;s performance is color-coded from blue (high) to light green (low), indicating varying scores from 77.5 to 92.5.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4-5">
<title>4.5 Qualitative results</title>
<p>To further assess the model&#x2019;s ability to delineate complex coastal land cover types, we present qualitative comparisons in <xref ref-type="fig" rid="F10">Figure 10</xref>. The examples are taken from the LandCoverNet dataset and cover a wide range of coastal conditions including water-land boundaries, urban-sand transitions, and vegetated zones. Each row shows: (1) the original Sentinel-2 RGB image, (2) the corresponding ground truth label map, (3) the prediction from Swin-Unet as a strong baseline, and (4) the prediction from CoastVisionNet. It can be observed that our model yields cleaner segmentation boundaries, reduced fragmentation in small land patches, and more accurate identification of mixed land classes in coastal regions. This visual evidence complements the quantitative performance metrics and highlights the interpretability and precision of CoastVisionNet.</p>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Qualitative comparisons of segmentation results on coastal scenes from the LandCoverNet dataset. CoastVisionNet exhibits improved spatial consistency and class delineation compared to strong transformer-based baselines.</p>
</caption>
<graphic xlink:href="fenvs-13-1648562-g010.tif">
<alt-text content-type="machine-generated">Comparison of coastal imagery segmentation. Four columns labeled &#x22;Input,&#x22; &#x22;Ground truth,&#x22; &#x22;Output,&#x22; and &#x22;CoastVision&#x22; each display a top photo and a corresponding bottom segmented image. Categories include blue for water, green for land, orange for sand, and red for buildings.</alt-text>
</graphic>
</fig>
<sec id="s4-5-1">
<title>4.5.1 Failure case analysis</title>
<p>While CoastVisionNet achieves state-of-the-art accuracy across multiple benchmarks, we observe some failure cases primarily concentrated in two categories: (1) Mixed-pixel transition zones: These occur at natural boundaries such as shoreline edges, marshlands, or fragmented coastal vegetation. Due to overlapping land cover types within a single pixel, the spectral response becomes ambiguous, often leading to confused classification between adjacent classes. (2) Spectrally confusing materials: Surfaces such as concrete roofs and dry bare soil can exhibit near-identical spectral profiles, causing the model to mislabel urban vs. natural land cover. This is exacerbated when topographic descriptors do not provide strong discriminative cues.</p>
</sec>
</sec>
</sec>
<sec id="s5">
<title>5 Conclusion and future work</title>
<p>In this work, we aimed to address the complex problem of coastal land cover classification, a task made especially challenging by the dynamic and heterogeneous nature of coastal zones. Traditional CNNs and standard transformer models often fall short in capturing the intricate spectral and spatial characteristics needed for accurate classification in such environments. To tackle these issues, we introduced CoastVisionNet, a novel transformer-based architecture incorporating spatial-channel attention and designed explicitly for coastal remote sensing. The core of our method lies in three major innovations: the Spectral-Topographic Encoding Network (STEN), which separately models spectral gradients and terrain features; a geometry-aware self-attention mechanism that facilitates deep cross-modal fusion; and the Spectrum-Guided Semantic Modulation (SGSM), which adapts inference based on spectrum-conditioned priors and learning dynamics. Through comprehensive experiments across multiple coastal satellite datasets, CoastVisionNet consistently outperformed existing baselines in terms of classification accuracy, robustness to imaging conditions, and generalization to unseen regions. Notably, it also demonstrated strong sensor-agnostic transferability and temporal resilience.</p>
<p>Despite these promising results, two limitations of our current approach warrant further exploration. First, although STEN effectively captures topographic and spectral cues, its dual-path design increases computational overhead, which may hinder scalability in large-scale or real-time applications. Future work may explore more efficient encodings or hierarchical token pruning strategies to maintain performance with reduced cost. Second, while SGSM improves robustness, its reliance on hand-tuned spectral priors introduces sensitivity to domain-specific distributions. Moving forward, integrating meta-learning or self-supervised adaptation could mitigate this dependence and further boost model generalization. Overall, CoastVisionNet lays a solid foundation for semantic, adaptive, and physically consistent coastal monitoring systems in next-generation remote sensing platforms.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material; further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>LYa: Conceptualization, Methodology, Supervision, Project administration, Resources, Visualization, Software, Validation, Writing &#x2013; original draft, Writing &#x2013; review and editing. LYi: Formal analysis, Investigation, Data curation, Conceptualization, Funding acquisition, Software, Writing &#x2013; review and editing, Writing &#x2013; original draft. WD: Writing &#x2013; original draft, Writing &#x2013; review and editing, Visualization, Supervision, Funding acquisition.</p>
</sec>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The author(s) declare that no financial support was received for the research and/or publication of this article.</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s10">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alemohammad</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Booth</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Landcovernet: a global benchmark land cover classification training dataset</article-title>. <source>arXiv Prepr. arXiv:2012.03111</source>. <comment>
<ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="https://arxiv.org/abs/2012.03111">https://arxiv.org/abs/2012.03111</ext-link>
</comment>.</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ashtiani</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Geers</surname>
<given-names>A. J.</given-names>
</name>
<name>
<surname>Aflatouni</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>An on-chip photonic deep neural network for image classification</article-title>. <source>Nature</source> <volume>606</volume>, <fpage>501</fpage>&#x2013;<lpage>506</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-022-04714-0</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Azizi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Mustafa</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Ryan</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Beaver</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Freyberg</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Deaton</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Big self-supervised models advance medical image classification</article-title>. <source>IEEE Int. Conf. Comput. Vis.</source>, <fpage>3458</fpage>&#x2013;<lpage>3468</lpage>. <pub-id pub-id-type="doi">10.1109/iccv48922.2021.00346</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bazi</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Bashmal</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Rahhal</surname>
<given-names>M. M. A.</given-names>
</name>
<name>
<surname>Dayil</surname>
<given-names>R. A.</given-names>
</name>
<name>
<surname>Ajlan</surname>
<given-names>N. A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Vision transformers for remote sensing image classification</article-title>. <source>Remote Sens.</source> <volume>13</volume>, <fpage>516</fpage>. <pub-id pub-id-type="doi">10.3390/rs13030516</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bhatt</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bhatt</surname>
<given-names>V. T.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Dcrff-lhrf: an improvised methodology for efficient land-cover classification on eurosat dataset</article-title>. <source>Multimedia Tools Appl.</source> <volume>83</volume>, <fpage>54001</fpage>&#x2013;<lpage>54025</lpage>. <pub-id pub-id-type="doi">10.1007/s11042-023-17612-y</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bhojanapalli</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chakrabarti</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Glasner</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Unterthiner</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Veit</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Understanding robustness of transformers for image classification</article-title>. <source>IEEE Int. Conf. Comput. Vis.</source>, <fpage>10211</fpage>&#x2013;<lpage>10221</lpage>. <pub-id pub-id-type="doi">10.1109/iccv48922.2021.01007</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>C.-F.</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Panda</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2021a</year>). <article-title>Crossvit: cross-attention multi-scale vision transformer for image classification</article-title>. <source>IEEE Int. Conf. Comput. Vis.</source>, <fpage>347</fpage>&#x2013;<lpage>356</lpage>. <pub-id pub-id-type="doi">10.1109/iccv48922.2021.00041</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Adeli</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2021b</year>). &#x201c;<article-title>Transunet: transformers make strong encoders for medical image segmentation</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>12093</fpage>&#x2013;<lpage>12103</lpage>.</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Miao</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2021c</year>). <article-title>Review of image classification algorithms based on convolutional neural networks</article-title>. <source>Remote Sens.</source> <volume>13</volume>, <fpage>4712</fpage>. <pub-id pub-id-type="doi">10.3390/rs13224712</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Advanced domain adaptation technique for object detection leveraging semi-automated dataset construction and enhanced yolov8</article-title>. <source>Remote Sens. Lett.</source> <volume>15</volume>, <fpage>289</fpage>&#x2013;<lpage>301</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="https://ieeexplore.ieee.org/abstract/document/10753164/">https://ieeexplore.ieee.org/abstract/document/10753164/</ext-link>
</comment>.</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>3d reconstruction and landscape restoration of garden landscapes: an innovative approach combining deep features and graph structures</article-title>. <source>Front. Environ. Sci.</source> <volume>13</volume>, <fpage>1556042</fpage>. <pub-id pub-id-type="doi">10.3389/fenvs.2025.1556042</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dai</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Transmed: transformers advance multi-modal medical image classification</article-title>. <source>Diagnostics</source> <volume>11</volume>, <fpage>1384</fpage>. <pub-id pub-id-type="doi">10.3390/diagnostics11081384</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dalvi</surname>
<given-names>P. P.</given-names>
</name>
<name>
<surname>Edla</surname>
<given-names>D. R.</given-names>
</name>
<name>
<surname>Purushothama</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Diagnosis of coronavirus disease from chest x-ray images using densenet-169 architecture</article-title>. <source>SN Comput. Sci.</source> <volume>4</volume>, <fpage>214</fpage>. <pub-id pub-id-type="doi">10.1007/s42979-022-01627-7</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Deng</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>A dual-band wide axial-ratio beamwidth circularly-polarized antenna with v-shaped slot for l2/l5 gnss applications</article-title>. <source>IEEE Antennas Wirel. Propag. Lett.</source> <volume>23</volume>, <fpage>589</fpage>&#x2013;<lpage>593</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="https://ieeexplore.ieee.org/abstract/document/10753263/">https://ieeexplore.ieee.org/abstract/document/10753263/</ext-link>
</comment>.</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dong</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Exploring vision transformers for polarimetric sar image classification</article-title>. <source>IEEE Trans. Geoscience Remote Sens.</source> <volume>60</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1109/tgrs.2021.3137383</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Feng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Conv2next: reconsidering conv next network design for image recognition</article-title>,&#x201d; in <source>2022 international conference on computers and artificial intelligence technologies (CAIT)</source> (<publisher-name>IEEE</publisher-name>), <fpage>53</fpage>&#x2013;<lpage>60</lpage>.</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Frost</surname>
<given-names>G. V.</given-names>
</name>
<name>
<surname>Bhatt</surname>
<given-names>U. S.</given-names>
</name>
<name>
<surname>Macander</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Berner</surname>
<given-names>L. T.</given-names>
</name>
<name>
<surname>Walker</surname>
<given-names>D. A.</given-names>
</name>
<name>
<surname>Raynolds</surname>
<given-names>M. K.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>The changing face of the arctic: four decades of greening and implications for tundra ecosystems</article-title>. <source>Front. Environ. Sci.</source> <volume>13</volume>, <fpage>1525574</fpage>. <pub-id pub-id-type="doi">10.3389/fenvs.2025.1525574</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Oscd: a one-shot conditional object detection framework</article-title>. <source>Neurocomputing</source> <volume>425</volume>, <fpage>243</fpage>&#x2013;<lpage>255</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2020.04.092</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Novel deep learning domain adaptation approach for object detection using semi-self building dataset and modified yolov4</article-title>. <source>IEEE Access</source> <volume>12</volume>, <fpage>15478</fpage>&#x2013;<lpage>15491</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="https://www.mdpi.com/2032-6653/15/6/255">https://www.mdpi.com/2032-6653/15/6/255</ext-link>.</comment>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hong</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Plaza</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Chanussot</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Graph convolutional networks for hyperspectral image classification</article-title>. <source>IEEE Trans. Geoscience Remote Sens.</source> <volume>59</volume>, <fpage>5966</fpage>&#x2013;<lpage>5978</lpage>. <pub-id pub-id-type="doi">10.1109/tgrs.2020.3015157</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hong</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Yao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Chanussot</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Spectralformer: rethinking hyperspectral image classification with transformers</article-title>. <source>IEEE Trans. Geoscience Remote Sens.</source> <volume>60</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1109/tgrs.2021.3130716</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jarca</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Croitoru</surname>
<given-names>F.-A.</given-names>
</name>
<name>
<surname>Ionescu</surname>
<given-names>R. T.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Cbm: curriculum by masking</article-title>. <comment>
<italic>arXiv preprint arXiv:2407.05193</italic>
</comment>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Khan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Rahman</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Efficient vehicle detection and tracking strategy in aerial videos by employing morphological operations and feature points motion analysis</article-title>. <source>Multimedia Tools Appl.</source> <volume>79</volume>, <fpage>12215</fpage>&#x2013;<lpage>12232</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="https://ebooks.iospress.nl/pdf/doi/10.3233/FAIA240503">https://ebooks.iospress.nl/pdf/doi/10.3233/FAIA240503</ext-link>.</comment>
</citation>
</ref>
<ref id="B24">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>H. E.</given-names>
</name>
<name>
<surname>Cosa-Linan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Santhanam</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Jannesari</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Maros</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ganslandt</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). <source>Transfer learning for medical image classification: a literature review</source>. <publisher-name>BMC Medical Imaging</publisher-name>.</citation>
</ref>
<ref id="B25">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Koonce</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Efficientnet</article-title>,&#x201d; in <source>Convolutional neural networks with swift for Tensorflow: image recognition and dataset categorization</source> (<publisher-name>Springer</publisher-name>), <fpage>109</fpage>&#x2013;<lpage>123</lpage>.</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Eliceiri</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Dual-stream multiple instance learning network for whole slide image classification with self-supervised contrastive learning</article-title>. <source>Comput. Vis. Pattern Recognit.</source> <comment>Available online at: <ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="http://openaccess.thecvf.com/content/CVPR2021/html/Li_Dual-Stream_Multiple_Instance_Learning_Network_for_Whole_Slide_Image_Classification_CVPR_2021_paper.html">http://openaccess.thecvf.com/content/CVPR2021/html/Li_Dual-Stream_Multiple_Instance_Learning_Network_for_Whole_Slide_Image_Classification_CVPR_2021_paper.html</ext-link>
</comment>.</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Residual channel-attention (rca) network for remote sensing image scene classification</article-title>. <source>IEEE Trans. Geoscience Remote Sens.</source> <volume>63</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="https://link.springer.com/article/10.1007/s11042-024-20546-8">https://link.springer.com/article/10.1007/s11042-024-20546-8</ext-link>.</comment>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>H. K.</given-names>
</name>
<name>
<surname>Casazza</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2023a</year>). <article-title>Editorial: methods and applications in environmental informatics and remote sensing</article-title>. <source>Front. Environ. Sci.</source> <volume>11</volume>, <fpage>1255010</fpage>. <pub-id pub-id-type="doi">10.3389/fenvs.2023.1255010</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2023b</year>). <article-title>Unlocking the potential of explainable artificial intelligence in remote sensing big data</article-title>. <source>Remote Sens.</source> <volume>15</volume>, <fpage>5448</fpage>. <pub-id pub-id-type="doi">10.3390/rs15235448</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Semiblind compressed sensing: a bidirectional-driven method for spatiotemporal fusion of remote sensing images</article-title>. <source>IEEE J. Sel. Top. Appl. Earth Observations Remote Sens.</source> <volume>17</volume>, <fpage>19048</fpage>&#x2013;<lpage>19066</lpage>. <pub-id pub-id-type="doi">10.1109/jstars.2024.3463750</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lv</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ge</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Guan</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Research on coastline extraction and dynamic change from remote sensing images based on deep learning</article-title>. <source>Front. Environ. Sci.</source> <volume>12</volume>, <fpage>1443512</fpage>. <pub-id pub-id-type="doi">10.3389/fenvs.2024.1443512</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mai</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Jeong</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Quispe</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>H. J.</given-names>
</name>
<name>
<surname>Sanner</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Online continual learning in image classification: an empirical survey</article-title>. <source>Neurocomputing</source> <volume>469</volume>, <fpage>28</fpage>&#x2013;<lpage>51</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2021.10.021</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Masana</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Twardowski</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Menta</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Bagdanov</surname>
<given-names>A. D.</given-names>
</name>
<name>
<surname>van de Weijer</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Class-incremental learning: survey and performance evaluation on image classification</article-title>. <source>IEEE Trans. Pattern Analysis Mach. Intell.</source> <volume>45</volume>, <fpage>5513</fpage>&#x2013;<lpage>5533</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2022.3213473</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Mascarenhas</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Agarwal</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>A comparison between vgg16, vgg19 and resnet50 architecture frameworks for image classification</article-title>,&#x201d; in <source>2021 international conference on disruptive technologies for multi-disciplinary research and applications (CENTCON)</source>.</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Maur&#xed;cio</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Domingues</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Bernardino</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Comparing vision transformers and convolutional neural networks for image classification: a literature review</article-title>. <source>Appl. Sci.</source> <volume>13</volume>, <fpage>5521</fpage>. <pub-id pub-id-type="doi">10.3390/app13095521</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Peng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Ning</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Du</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Domain adaptation in remote sensing image classification: a survey</article-title>. <source>IEEE J. Sel. Top. Appl. Earth Observations Remote Sens.</source> <volume>15</volume>, <fpage>9842</fpage>&#x2013;<lpage>9859</lpage>. <pub-id pub-id-type="doi">10.1109/jstars.2022.3220875</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Global filter networks for image classification</article-title>. <source>Neural Inf. Process. Syst.</source> <pub-id pub-id-type="doi">10.48550/arXiv.2107.00645</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Roy</surname>
<given-names>S. K.</given-names>
</name>
<name>
<surname>Deria</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hong</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Rasti</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Plaza</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Chanussot</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Multimodal fusion transformer for remote sensing image classification</article-title>. <source>IEEE Trans. Geoscience Remote Sens.</source> <volume>61</volume>, <fpage>1</fpage>&#x2013;<lpage>20</lpage>. <pub-id pub-id-type="doi">10.1109/tgrs.2023.3286826</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sheykhmousa</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Mahdianpari</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ghanbari</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Mohammadimanesh</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Ghamisi</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Homayouni</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Support vector machine versus random forest for remote sensing image classification: a meta-analysis and systematic review</article-title>. <source>IEEE J. Sel. Top. Appl. Earth Observations Remote Sens.</source> <volume>13</volume>, <fpage>6308</fpage>&#x2013;<lpage>6325</lpage>. <pub-id pub-id-type="doi">10.1109/jstars.2020.3026724</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sumbul</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>De Wall</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kreuziger</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Marcelino</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Costa</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Benevides</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Bigearthnet-mm: a large-scale, multimodal, multilabel benchmark archive for remote sensing image classification and retrieval [software and data sets]</article-title>. <source>IEEE Geoscience Remote Sens. Mag.</source> <volume>9</volume>, <fpage>174</fpage>&#x2013;<lpage>180</lpage>. <pub-id pub-id-type="doi">10.1109/mgrs.2021.3089174</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Spectral&#x2013;spatial feature tokenization transformer for hyperspectral image classification</article-title>. <source>IEEE Trans. Geoscience Remote Sens.</source> <volume>60</volume>, <fpage>1</fpage>&#x2013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1109/tgrs.2022.3144158</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tanaka</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Yamamoto</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Okada</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Detection of earthquake-induced building damages using remote sensing data and deep learning: a case study of Mashiki town, Japan</article-title>. <source>ISPRS J. Photogrammetry Remote Sens.</source> <volume>197</volume>, <fpage>75</fpage>&#x2013;<lpage>89</lpage>. <ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="https://ieeexplore.ieee.org/abstract/document/10282550/">https://ieeexplore.ieee.org/abstract/document/10282550/</ext-link>.</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Taori</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Dave</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Shankar</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Carlini</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Recht</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Schmidt</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Measuring robustness to natural distribution shifts in image classification</article-title>. <source>Neural Inf. Process. Syst.</source> <comment>Available online at: <ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="http://openaccess.thecvf.com/content/CVPR2021/html/Li_Dual-Stream_Multiple_Instance_Learning_Network_for_Whole_Slide_Image_Classification_CVPR_2021_paper.html">http://openaccess.thecvf.com/content/CVPR2021/html/Li_Dual-Stream_Multiple_Instance_Learning_Network_for_Whole_Slide_Image_Classification_CVPR_2021_paper.html</ext-link>.</comment>
</citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Theckedath</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Sedamkar</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Detecting affect states using vgg16, resnet50 and se-resnet50 networks</article-title>. <source>SN Comput. Sci.</source> <volume>1</volume>, <fpage>79</fpage>. <pub-id pub-id-type="doi">10.1007/s42979-020-0114-9</pub-id>
</citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tian</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Krishnan</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Tenenbaum</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Isola</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Rethinking few-shot image classification: a good embedding is all you need?</article-title> <source>Eur. Conf. Comput. Vis.</source>, <fpage>266</fpage>&#x2013;<lpage>282</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-58568-6_16</pub-id>
</citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Touvron</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Bojanowski</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Caron</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Cord</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>El-Nouby</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Grave</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Resmlp: feedforward networks for image classification with data-efficient training</article-title>. <source>IEEE Trans. Pattern Analysis Mach. Intell.</source> <volume>45</volume>, <fpage>5314</fpage>&#x2013;<lpage>5321</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2022.3206148</pub-id>
</citation>
</ref>
<ref id="B47">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Touvron</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Cord</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>J&#xe9;gou</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Deit iii: revenge of the vit</article-title>,&#x201d; in <source>European conference on computer vision</source> (<publisher-name>Springer</publisher-name>), <fpage>516</fpage>&#x2013;<lpage>533</lpage>.</citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>W.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Transformer-based unsupervised contrastive learning for histopathological image classification</article-title>. <source>Med. Image Anal.</source> <volume>81</volume>, <fpage>102559</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2022.102559</pub-id>
</citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Multiband circularly-polarized stacked elliptical patch antenna with eye-shaped slot for gnss applications</article-title>. <source>Microw. Opt. Technol. Lett.</source> <volume>66</volume>, <fpage>312</fpage>&#x2013;<lpage>319</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="https://www.cambridge.org/core/journals/international-journal-of-microwave-and-wireless-technologies/article/multiband-circularlypolarized-stacked-elliptical-patch-antenna-with-eyeshaped-slot-for-gnss-applications/E6EA5D3F6151BA82193FFC638188AB7A">https://www.cambridge.org/core/journals/international-journal-of-microwave-and-wireless-technologies/article/multiband-circularlypolarized-stacked-elliptical-patch-antenna-with-eyeshaped-slot-for-gnss-applications/E6EA5D3F6151BA82193FFC638188AB7A</ext-link>.</comment>
</citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiao</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Peng</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Clip-vg: self-paced curriculum adapting of clip for visual grounding</article-title>. <source>IEEE Trans. Multimedia</source> <volume>26</volume>, <fpage>4334</fpage>&#x2013;<lpage>4347</lpage>. <pub-id pub-id-type="doi">10.1109/tmm.2023.3321501</pub-id>
</citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Ke</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Medmnist v2 - a large-scale lightweight benchmark for 2d and 3d biomedical image classification</article-title>. <source>Sci. Data</source> <volume>10</volume>, <fpage>41</fpage>. <pub-id pub-id-type="doi">10.1038/s41597-022-01721-8</pub-id>
</citation>
</ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Cai</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Deepemd: few-shot image classification with differentiable earth mover&#x2019;s distance and structured classifiers</article-title>. <source>Comput. Vis. Pattern Recognit.</source>, <fpage>12200</fpage>&#x2013;<lpage>12210</lpage>. <pub-id pub-id-type="doi">10.1109/cvpr42600.2020.01222</pub-id>
</citation>
</ref>
<ref id="B53">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Tao</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Du</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Single-source domain expansion network for cross-scene hyperspectral image classification</article-title>. <source>IEEE Trans. Image Process.</source> <volume>32</volume>, <fpage>1498</fpage>&#x2013;<lpage>1512</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2023.3243853</pub-id>
</citation>
</ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A wide axial-ratio beamwidth circularly-polarized oval patch antenna with sunlight-shaped slots for gnss and wimax applications</article-title>. <source>Int. J. RF Microw. Computer-Aided Eng.</source> <volume>32</volume>, <fpage>e23029</fpage>. <comment>Available online at: <ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="https://link.springer.com/article/10.1007/s11276-022-03093-8">https://link.springer.com/article/10.1007/s11276-022-03093-8</ext-link>.</comment>
</citation>
</ref>
<ref id="B55">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Gong</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Qin</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Spcnet: deep self-paced curriculum network incorporated with inductive bias</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source>, <fpage>1</fpage>&#x2013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1109/tnnls.2025.3544724</pub-id>
</citation>
</ref>
<ref id="B56">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zheng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Rotation-invariant attention network for hyperspectral image classification</article-title>. <source>IEEE Trans. Image Process.</source> <volume>31</volume>, <fpage>4251</fpage>&#x2013;<lpage>4265</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2022.3177322</pub-id>
</citation>
</ref>
<ref id="B57">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhuang</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ke</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bian</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Deep subdomain adaptation network for image classification</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>32</volume>, <fpage>1713</fpage>&#x2013;<lpage>1722</lpage>. <pub-id pub-id-type="doi">10.1109/tnnls.2020.2988928</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>