<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Remote Sens.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Remote Sensing</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Remote Sens.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2673-6187</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1730222</article-id>
<article-id pub-id-type="doi">10.3389/frsen.2026.1730222</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>A multi-feature fusion based remote sensing inversion method for farmland shelterbelts</article-title>
<alt-title alt-title-type="left-running-head">Zhang et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/frsen.2026.1730222">10.3389/frsen.2026.1730222</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Qi</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<uri xlink:href="https://loop.frontiersin.org/people/3223691"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/">Writing - review and editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Zhou</surname>
<given-names>Yuncheng</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/">Writing - review and editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhao</surname>
<given-names>Hongge</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/">Writing - review and editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wu</surname>
<given-names>Wenhao</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/">Writing - review and editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Huang</surname>
<given-names>Yuekun</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/">Writing - review and editing</role>
</contrib>
</contrib-group>
<aff id="aff1">
<institution>College of Information and Electrical Engineering, Shenyang Agricultural University</institution>, <city>Shenyang</city>, <country country="CN">China</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Yuncheng Zhou, <email xlink:href="mailto:zhouyc2002@syau.edu.cn">zhouyc2002@syau.edu.cn</email>
</corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-17">
<day>17</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>7</volume>
<elocation-id>1730222</elocation-id>
<history>
<date date-type="received">
<day>22</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>13</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>26</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Zhang, Zhou, Zhao, Wu and Huang.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Zhang, Zhou, Zhao, Wu and Huang</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-17">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Precise segmentation of farmland shelterbelts in high-resolution remote sensing imagery represents a crucial yet challenging task for establishing a quantifiable farmland quality evaluation system. The core difficulties arise from two principal issues: (1) effectively distinguishing cultivated land from shelterbelts with similar textural characteristics while suppressing interference from complex backgrounds such as roads and ditches; and (2) accurately segmenting narrow, elongated, and discontinuously distributed single-row shelterbelts with blurred boundaries. Conventional semantic segmentation methods, primarily designed for large-scale objects in natural scenes, generally underperform when confronted with the distinctive characteristics of remote sensing targets. To overcome these challenges, we propose a novel remote sensing inversion framework based on multi-feature fusion. For the first challenge, we designed a Multi-Feature Fusion Block (MFFB) that utilizes a Spatial Gated Fusion Mechanism (SGFM) to adaptively integrate global contextual features captured by Mamba-like linear attention, local details extracted through convolutional operators, and frequency-domain information obtained via Fast Fourier Transform (FFT), thereby significantly enhancing the model&#x2019;s capacity to represent and discriminate complex features. To address the second challenge, we introduced a super-resolution preprocessing strategy along with a Multi-Scale Contextual feature Extraction (MSCE) module within an encoder-decoder architecture. The former effectively increases the pixel width of narrow shelterbelts through enhanced image detail reconstruction, while the latter ensures segmentation continuity for elongated features by integrating multi-scale contextual information. Experimental results on our self-constructed farmland shelterbelt dataset demonstrate that our method achieves segmentation accuracies of 96.42% for cultivated land and 82.83% for shelterbelts, outperforming both mainstream general-purpose semantic segmentation models and specialized remote sensing methods, thus validating the effectiveness of the proposed framework for precise farmland shelterbelt extraction.</p>
</abstract>
<kwd-group>
<kwd>deep learning</kwd>
<kwd>farmland shelterbelts</kwd>
<kwd>multi-feature fusion</kwd>
<kwd>remote sensing</kwd>
<kwd>semantic segmentation</kwd>
</kwd-group>
<funding-group>
<award-group id="gs1">
<funding-source id="sp1">
<institution-wrap>
<institution>Ministry of Science and Technology of the People&#x2019;s Republic of China</institution>
<institution-id institution-id-type="doi" vocab="open-funder-registry" vocab-identifier="10.13039/open_funder_registry">10.13039/501100002855</institution-id>
</institution-wrap>
</funding-source>
<award-id rid="sp1">2023YFD1501303</award-id>
</award-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was supported by the Sub-project of the National Key R&#x26;D Plan (Grant Nos. 2023YFD1501303 and 2021YFD1500204).</funding-statement>
</funding-group>
<counts>
<fig-count count="8"/>
<table-count count="9"/>
<equation-count count="20"/>
<ref-count count="37"/>
<page-count count="17"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Agro-Environmental Remote Sensing</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Arable land is the fundamental basis for grain production (<xref ref-type="bibr" rid="B29">Wang W. et al., 2024</xref>) and underpins the material foundation of human survival. Assessing arable land quality is therefore critical for monitoring its current status and fostering the sustainable utilization of this vital resource (<xref ref-type="bibr" rid="B4">Duan and Luo, 2024</xref>). Shelterbelts surrounding farmland play a multifaceted role in agricultural ecosystems: they mitigate wind and sand erosion, reduce topsoil loss, prevent crop lodging, regulate farmland microclimates, and support biodiversity conservation. In China, the national standard &#x201c;Cultivated Land Quality Grade&#x201d; (GB/T 33469&#x2013;2016) formally recognizes the extent of farmland shelterbelt coverage as a key criterion in cultivated land quality evaluation. Conventional approaches to shelterbelt mapping, such as field surveys and visual interpretation of remote sensing imagery, remain labor-intensive, time-consuming, and prone to subjective bias. Automated methods that leverage remote sensing data and computational algorithms to accurately delineate the spatial extent and distribution of both cropland and adjacent shelterbelts would thus substantially reduce manual effort, improve operational efficiency, and enhance the objectivity and scientific rigor of farmland quality assessment.</p>
<p>Extensive research has been conducted globally on remote sensing surveys of cropland and forest land. <xref ref-type="bibr" rid="B36">Zhu et al. (2024)</xref> proposed an improved watershed segmentation algorithm for single-tree canopy extraction in complex forest environments based on high-resolution UAV imagery; by extracting multi-dimensional features including spectral, shape, and texture characteristics, they achieved 87% segmentation accuracy. <xref ref-type="bibr" rid="B14">Liu et al. (2024)</xref> integrated Sentinel-2 remote sensing imagery with topographic prior information, employing both random forest and support vector machine methods to classify ten major tree species in large forest areas, achieving a maximum classification accuracy of 87.45%. Unlike large contiguous forest areas, farmland shelterbelts are narrow and elongated, often consisting of single trees or single rows, and the relatively low spatial resolution of multispectral and hyperspectral satellite imagery makes their effective extraction difficult. Conventional remote sensing inversion for forest and cropland classification relies on manually engineered features fed to machine learning classifiers such as random forests (<xref ref-type="bibr" rid="B17">Ok et al., 2012</xref>), support vector machines (<xref ref-type="bibr" rid="B31">Wei and Hoai, 2016</xref>), and decision trees (<xref ref-type="bibr" rid="B2">Chanmee and Kesorn, 2023</xref>) to extract regional distributions. The accuracy of these inversions depends not only on the classifiers themselves but also on the classification features, whose extraction and selection rely heavily on the researcher&#x2019;s experience.</p>
<p>With advances in artificial intelligence, research on segmenting and extracting land feature boundaries using high-resolution remote sensing imagery and data-driven deep learning models has gained prominence. <xref ref-type="bibr" rid="B20">Rakhlin et al. (2018)</xref> employed the UNet model (<xref ref-type="bibr" rid="B21">Ronneberger et al., 2015</xref>) to segment land use types based on Google Earth satellite data, achieving a mean intersection-over-union of 0.649. <xref ref-type="bibr" rid="B34">Xu et al. (2020)</xref> improved UNet&#x2019;s skip connections and loss function to explore the feasibility of cultivable land parcel extraction on Landsat-5 satellite data, achieving 91.74% recognition accuracy. These studies demonstrate the technical viability of semantic segmentation models for land feature classification and recognition in remote sensing imagery. <xref ref-type="bibr" rid="B33">Xiao et al. (2025)</xref> employed a model integrating multi-scale convolutions with Mamba (<xref ref-type="bibr" rid="B6">Gu and Dao, 2023</xref>) at the pixel level in remote sensing imagery; by constructing channel-spatial attention and dense multi-scale fusion modules, they enhanced the model&#x2019;s feature expression capability, validating its effectiveness on the WHDLD dataset. However, because field boundaries such as ditches and roads appear as fine features in remote sensing imagery, such methods may produce discontinuous boundary predictions and struggle to recognize single-tree or single-row shelterbelts. <xref ref-type="bibr" rid="B11">Kwenda et al. (2024)</xref> applied deep neural networks to extract features from satellite and aerial imagery, further employing classifiers like random forests to segment forest areas. <xref ref-type="bibr" rid="B22">Ru et al. (2023)</xref> integrated spatial pyramid pooling into the UNet model, automatically identifying forest boundaries from Landsat-8 satellite imagery to track annual forest changes. The DeepGDLE model (<xref ref-type="bibr" rid="B30">Wang G. et al., 2024</xref>) replaces the feature extraction network in the DeepLabV3&#x2b; (<xref ref-type="bibr" rid="B3">Chen et al., 2018</xref>) encoder with a GhostNet architecture and substitutes dilated convolutions with separable convolutions, reducing model parameters while significantly improving computational efficiency and achieving an overall accuracy of 72.85% in semantic segmentation experiments on remote sensing datasets. Existing research thus relies primarily on satellite imagery and deep learning models to segment large contiguous cultivated fields and forest areas, with few studies focusing on extracting farmland shelterbelts. Shelterbelts are narrow, making them difficult to identify accurately in medium-to-low resolution imagery, and during crop growing seasons trees, crops, and low shrubs can be spectrally and texturally confused. Furthermore, sample databases for remote sensing inversion of farmland shelterbelts remain scarce. How to leverage deep learning&#x2019;s robust feature extraction capabilities to develop remote sensing inversion methods that effectively identify both cultivated fields and shelterbelts therefore warrants further exploration.</p>
<p>To address the practical requirements of farmland forest network surveys for soil quality assessment, as well as challenges in remote sensing inversion such as narrow and elongated shelterbelt widths, difficult boundary extraction and segmentation, and scarce sample databases, this paper proposes a remote sensing inversion method for farmland shelterbelts based on semantic segmentation models. A farmland forest network dataset was constructed using Jilin-1 satellite imagery. The contributions of this work can be summarized as follows: 1) We designed the Multi-Feature Fusion Block (MFFB) to achieve adaptive fusion of multi-source features. Specifically, it combines the global modeling capability of Mamba-like Linear Attention (MLLA), the local feature extraction advantage of convolution operators, and the high-frequency detail capture capability of the Fast Fourier Transform (FFT) in the frequency domain, fusing them adaptively through the Spatial Gated Fusion Mechanism (SGFM) and significantly enhancing the model&#x2019;s representation capability for complex objects. 2) Based on the MFFB module, we constructed a remote sensing image super-resolution model and a semantic segmentation model. The super-resolution model employs a phased upsampling strategy to effectively enhance image reconstruction quality; the semantic segmentation model fuses multi-scale contextual information through an encoder-decoder architecture, achieving high-precision segmentation of cultivated land and shelterbelts. 3) We have constructed and publicly released the AWSD2025 remote sensing image dataset of shelterbelts, providing a crucial foundation for subsequent research.</p>
<p>This study aims to provide solutions for the dynamic monitoring and scientific evaluation of farmland shelterbelt systems. We propose a remote sensing inversion method for farmland shelterbelts based on multi-feature fusion. First, we constructed a sample database of farmland shelterbelts using high-resolution imagery from the Jilin-1 satellite. Subsequently, we designed a Multi-Feature Fusion Block (MFFB) integrating Mamba-like linear attention, convolutional operators, and Fourier transforms to collaboratively extract global context, local details, and frequency-domain features from imagery. Based on this module, we developed both a super-resolution model and a semantic segmentation model, in which the Spatial Gated Fusion Mechanism (SGFM) adaptively integrates multi-source features and enhances the models&#x2019; segmentation capability for narrow forest strips. Finally, segmentation results were vectorized to generate spatial data suitable for Geographic Information System (GIS) analysis. Experimental results demonstrate that the proposed method achieves high accuracy in farmland forest network segmentation, providing reliable technical support for calculating forest network indicators in arable land quality assessment and contributing to agricultural remote sensing automation and the sustainable management of arable land resources.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2-1">
<label>2.1</label>
<title>Dataset</title>
<p>The study area was selected from the central-southern region of the Songnen Plain (45&#xb0;37&#x2032;20.57&#x2033;N&#x2013;46&#xb0;20&#x2032;53.26&#x2033;N, 125&#xb0;26&#x2032;46.64&#x2033;E&#x2013;126&#xb0;07&#x2032;29.10&#x2033;E) (<xref ref-type="fig" rid="F1">Figure 1</xref>). This area lies within Heilongjiang Province&#x2019;s key monitoring zone for farmland shelterbelts in China. It features a temperate semi-humid continental monsoon climate with a frost-free period of 130&#x2013;150 days. Primary crops include corn, rice, and soybeans under a single-crop-per-year system. Prevailing winds during the growing season are southerly. The shelterbelt stands are predominantly composed of poplar, willow, and pine species.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Study area.</p>
</caption>
<graphic xlink:href="frsen-07-1730222-g001.tif">
<alt-text content-type="machine-generated">Map graphic showing the Songnen Plain with a highlighted study area in the central region. Enlarged satellite imagery details the study area, with two smaller panels below labeled &#x201C;Sample Imagery Data&#x201D; showing close-up views of land and vegetation.</alt-text>
</graphic>
</fig>
<p>This study selected remote sensing imagery from the Jilin-1 High-Resolution 03 Satellite (JL1GF03) as the primary data source. The multispectral data encompass four bands: blue (450&#x2013;520&#xa0;nm), green (520&#x2013;600&#xa0;nm), red (630&#x2013;690&#xa0;nm), and near-infrared (760&#x2013;900&#xa0;nm), with a spatial resolution of 3&#xa0;m; the panchromatic band has a spatial resolution of 0.75&#xa0;m. Fifteen images with cloud cover below 3%, acquired during August and September from 2020 to 2022, were selected as research data and preprocessed with radiometric calibration, atmospheric correction, and geometric precision correction. The blue (B), green (G), and red (R) bands from the multispectral data were then fused with the panchromatic imagery through pan-sharpening to generate an RGB remote sensing image with a spatial resolution of 0.75&#xa0;m, which was stored as a GeoTIFF file for subsequent use.</p>
<p>A core component of this study&#x2019;s remote sensing inversion method for farmland shelterbelt networks involves applying a data-driven semantic segmentation model to extract boundaries of cultivated land and farmland shelterbelts (including forested areas). To meet the training and testing requirements of the semantic segmentation model, the LabelImg annotation tool was used to manually delineate the study area imagery, creating masks for cultivated land parcels and farmland shelterbelts, while other areas were labeled as background; the pixel proportions of the three categories were 65.52%, 5.25%, and 29.93%, respectively. Field boundaries were delineated not by property ownership but by the intersections of cultivated land with other features such as shelterbelts, roads, rivers, and residential areas. For shelterbelts, boundaries were defined by the projected outlines of tree canopies in the imagery. The study area imagery and manually annotated results collectively form the farmland-forest network dataset used in this research.</p>
</sec>
<sec id="s2-2">
<label>2.2</label>
<title>Overall framework</title>
<p>High-resolution remote sensing imagery, combined with well-designed semantic segmentation models, enables the delineation of cropland parcels and farmland shelterbelts to generate a semantic mask M, in which each pixel is assigned a land-cover class. The area of cropland and shelterbelts can then be directly estimated from the pixel counts of different categories. Given that shelterbelts exert protective effects over an effective distance and that prevailing wind directions vary seasonally, farmland shelterbelt monitoring requires analysis of both spatial distribution and distance to cropland parcels. Direct use of M for such analysis is limited; therefore, this study further vectorizes M, where binary masks of cropland and shelterbelts are first generated as formulated in <xref ref-type="disp-formula" rid="e1">Equation 1</xref>.<disp-formula id="e1">
<mml:math id="m1">
<mml:mrow>
<mml:msub>
<mml:mi>B</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="" separators="|">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
<p>Here, <inline-formula id="inf1">
<mml:math id="m2">
<mml:mrow>
<mml:msub>
<mml:mi>B</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the binary map of class <italic>c</italic> (cropland or shelterbelt). A morphological closing followed by an opening is applied to fill small holes and gaps within continuous regions caused by misclassification in the semantic segmentation model, while also removing small noise and smoothing region boundaries. Subsequently, a boundary-tracing algorithm (<xref ref-type="bibr" rid="B23">Suzuki and Abe, 1985</xref>) is employed to extract the pixel coordinates of region boundaries. Since these boundaries are composed of pointwise-adjacent pixel coordinates and contain substantial redundancy, the Douglas&#x2013;Peucker algorithm (<xref ref-type="bibr" rid="B26">Visvalingam and Whyatt, 1990</xref>) is further used to fit the region boundaries into closed curves with fewer redundant coordinates. Based on the mapping between geographic and pixel coordinates in GeoTIFF imagery, the boundary pixel coordinates are then converted into geographic coordinates, and polygonal geographic features are generated and stored as Shapefiles using the Geospatial Data Abstraction Library (GDAL). This vectorized geographic information further enables advanced farmland shelterbelt analysis through the spatial analysis capabilities of GIS.</p>
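<p>To make the pipeline concrete, the following is a minimal Python sketch of the vectorization stage, assuming OpenCV for the morphology, boundary tracing, and curve fitting (its findContours and approxPolyDP routines implement the Suzuki&#x2013;Abe and Douglas&#x2013;Peucker algorithms, respectively) and the GDAL/OGR bindings for the geographic export; the kernel size and simplification tolerance are illustrative choices, not values from our experiments.</p>
<code language="python">import cv2
import numpy as np
from osgeo import gdal, ogr, osr

def vectorize_class(mask, class_id, geotiff_path, shp_path, eps=2.0):
    """Binarize one class (Equation 1), clean it morphologically, trace and
    simplify region boundaries, and export geo-referenced polygons."""
    binary = np.uint8(mask == class_id)                         # Eq. 1: B_c
    kernel = np.ones((5, 5), np.uint8)                          # assumed size
    binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)  # fill holes/gaps
    binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)   # remove noise

    # Boundary tracing (Suzuki-Abe) and polygon simplification (Douglas-Peucker)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_SIMPLE)

    ds = gdal.Open(geotiff_path)
    gt = ds.GetGeoTransform()                 # pixel-to-geographic mapping
    srs = osr.SpatialReference()
    srs.ImportFromWkt(ds.GetProjection())

    drv = ogr.GetDriverByName("ESRI Shapefile")
    out = drv.CreateDataSource(shp_path)
    layer = out.CreateLayer("regions", srs, ogr.wkbPolygon)

    for cnt in contours:
        approx = cv2.approxPolyDP(cnt, eps, True)   # closed fitted curve
        if len(approx) >= 3:
            ring = ogr.Geometry(ogr.wkbLinearRing)
            for pt in approx.reshape(-1, 2):
                col, row = float(pt[0]), float(pt[1])
                x = gt[0] + col * gt[1] + row * gt[2]   # to geographic coords
                y = gt[3] + col * gt[4] + row * gt[5]
                ring.AddPoint(x, y)
            ring.CloseRings()
            poly = ogr.Geometry(ogr.wkbPolygon)
            poly.AddGeometry(ring)
            feat = ogr.Feature(layer.GetLayerDefn())
            feat.SetGeometry(poly)
            layer.CreateFeature(feat)
    out = None                                   # flush the Shapefile to disk
</code>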
<p>Farmland shelterbelts appear as narrow elongated strips in remote sensing imagery, and accurate segmentation of such slender objects has long been a challenge for semantic segmentation models. Enhancing the spatial resolution of imagery can effectively increase the apparent width of these strips, while higher-resolution imagery also sharpens cropland and shelterbelt boundaries, facilitating more precise boundary delineation by segmentation models. To this end, a super-resolution model is employed in this study to preprocess remote sensing imagery, thereby improving spatial resolution. The super-resolved imagery is then used as the input to the semantic segmentation model with the aim of enhancing segmentation accuracy. The overall technical framework for farmland shelterbelt inversion and vectorized extraction is illustrated in <xref ref-type="fig" rid="F2">Figure 2</xref>.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Framework for inverting and vectorizing farmland shelterbelts from remote sensing imagery.</p>
</caption>
<graphic xlink:href="frsen-07-1730222-g002.tif">
<alt-text content-type="machine-generated">Diagram showing an automated workflow for converting satellite images to vector maps. The process includes an image super-resolution model from low to high resolution, semantic segmentation to produce a mask, binarization, and multiple vectorization processing steps: image enhancement, boundary extraction, curve fitting, and geo-vector export.</alt-text>
</graphic>
</fig>
<p>The technical framework, as illustrated in <xref ref-type="fig" rid="F2">Figure 2</xref>, operates as a logically progressive pipeline of &#x201c;preprocessing enhancement,&#x201d; &#x201c;multi-domain feature fusion,&#x201d; and &#x201c;refined structural extraction.&#x201d; This integrated design addresses the inherent challenges of identifying shelterbelts within complex agricultural landscapes, particularly their narrow morphology and their prevalent spectral confusion with surrounding crops. Based on the specific characteristics of the high-resolution research data, an optimized architectural design for both the super-resolution and semantic segmentation models is implemented. The primary innovation of this work is the Multi-Feature Fusion Block (MFFB). Departing from conventional single-domain extraction paradigms, the MFFB introduces a novel Spatial Gated Fusion Mechanism (SGFM) to establish dynamic weight allocation across spatial, frequency, and Mamba-like linear attention (MLLA) features. This mechanism addresses the classification bottleneck caused by spectral similarity and represents the most significant original contribution of this study. We further make two key optimizations to existing architectural components. First, to address the narrow and elongated geometry of shelterbelts, we developed the Multi-Scale Contextual feature Extraction (MSCE) module, which integrates Pixel Unshuffle (PU) with a multi-branch design to optimize receptive field coverage for fine-grained targets. Second, we adopted a staged super-resolution preprocessing strategy that uses incremental upsampling instead of one-step interpolation, preserving edge fidelity and reconstructing crucial structural details before segmentation.</p>
</sec>
<sec id="s2-3">
<label>2.3</label>
<title>Feature extraction module</title>
<p>The feature extraction module is the core component of both the remote sensing super-resolution and semantic segmentation models, and its capability directly determines model performance. In imagery, cropland parcels are large-scale targets while shelterbelts are elongated and narrow, requiring the module to capture global context for spatial continuity and to avoid fragmented segmentation. Because crops, trees, and grasslands have similar spectral signatures, effective extraction of high-frequency texture details is essential for discrimination. In addition, the super-resolution model relies on high-frequency details to reconstruct fine boundaries and on low-frequency information to restore overall structure and color. Thus, the module must integrate local features with frequency-domain and global contextual information.</p>
<sec id="s2-3-1">
<label>2.3.1</label>
<title>Feature extraction operators</title>
<p>In the task of semantic segmentation for remote sensing imagery, global contextual modeling is of paramount importance. Traditional Convolutional Neural Networks (CNNs) are intrinsically limited by their local receptive fields; while proficient in extracting fine textures, they struggle to capture the long-range, continuous linear structures characteristic of farmland shelterbelts, often leading to fragmented segmentation results. Conversely, although Transformer-based architectures can establish global dependencies via self-attention mechanisms, they suffer from quadratic computational complexity (O(<italic>N</italic><sup>2</sup>)) relative to sequence length. This poses a severe memory bottleneck and computational challenge when processing high-resolution remote sensing data. To address these limitations, we incorporate the Mamba-like Linear Attention (MLLA) operator (<xref ref-type="bibr" rid="B37">Han et al., 2024</xref>), which derives from Mamba (<xref ref-type="bibr" rid="B6">Gu and Dao, 2023</xref>), within the proposed Multi-Feature Fusion Block (MFFB). By leveraging Selective State Space Models (SSSM), Mamba achieves quasi-linear computational complexity (O(<italic>N</italic>)), enabling a global receptive field comparable to Transformers but with significantly reduced resource overhead. The integration of MLLA allows our model to perceive the extended trajectories of shelterbelts at a macro scale, ensuring geometric continuity of linear structures while balancing global modeling capacity and inference efficiency.</p>
<p>While SSSM achieves efficient training through parallel scanning, its recursive computation is less amenable to GPU optimization than matrix multiplication, resulting in performance bottlenecks. <xref ref-type="bibr" rid="B37">Han et al. (2024)</xref> demonstrated that SSSM is essentially a special variant of linear attention (LA) (<xref ref-type="bibr" rid="B9">Katharopoulos et al., 2020</xref>), and can be formulated as LA augmented with input and forget gates. On this basis, they developed the Mamba-like Linear Attention (MLLA) operator (<xref ref-type="fig" rid="F3">Figure 3a</xref>). MLLA preserves global attention while reducing computational complexity to approximately <italic>O(N)</italic>. In this study, we replace the Transformer&#x2019;s multi-head self-attention (MHSA) with MLLA and construct MLLAFormer (<xref ref-type="fig" rid="F3">Figure 3b</xref>) to extract and fuse global contextual features from remote sensing imagery.</p>
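<p>For readers unfamiliar with linear attention, the core O(<italic>N</italic>) computation that MLLA builds on (<xref ref-type="bibr" rid="B9">Katharopoulos et al., 2020</xref>) can be sketched in a few lines of PyTorch; the input/forget gates and convolutional augmentation that distinguish MLLA (<xref ref-type="fig" rid="F3">Figure 3a</xref>) are omitted here for brevity.</p>
<code language="python">import torch
import torch.nn.functional as F

def linear_attention(q, k, v, eps=1e-6):
    """O(N) attention core: kernelized queries/keys let a fixed-size key-value
    summary be computed once, instead of forming an N x N attention matrix."""
    # q, k, v: (batch, seq_len, dim)
    q = F.elu(q) + 1.0                       # positive kernel feature map
    k = F.elu(k) + 1.0
    kv = torch.einsum("bnd,bne->bde", k, v)  # O(N) key-value summary
    z = torch.einsum("bnd,bd->bn", q, k.sum(dim=1)) + eps  # normalizer
    return torch.einsum("bnd,bde->bne", q, kv) / z.unsqueeze(-1)
</code>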
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Feature extraction operator; <bold>(a)</bold> Mamba-like linear attention (MLLA); <bold>(b)</bold> MLLAFormer module; <bold>(c)</bold> Conv block; <bold>(d)</bold> ConvFormer module; <bold>(e)</bold> FFTFormer module; Linear refers to a linear layer; &#x2299; and &#x2295; represent element-wise multiplication and addition, respectively; <inline-formula id="inf2">
<mml:math id="m3">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is Sigmoid activation; The same below.</p>
</caption>
<graphic xlink:href="frsen-07-1730222-g003.tif">
<alt-text content-type="machine-generated">Diagram presents five neural network architecture block diagrams labeled a through e. Diagram (a) shows MLLA with branches of linear, convolutional, and linear attention operations. Diagram (b) displays a sequence of LayerNorm, MLLA, and MLP. Diagram (c) details Linear, GAP, and PConv operations stacked vertically. Diagram (d) includes LayerNorm, Conv block, and MLP. Diagram (e) depicts LayerNorm, FFT block, and output split into iFFT, MLP, and FFT. All diagrams feature arrows indicating data flow and integration points.</alt-text>
</graphic>
</fig>
<p>Convolutional modules are advantageous for local feature extraction, and their effectiveness has been validated in various computer vision tasks. To reduce computational complexity, this study adopts a combination of pointwise convolution (PConv) and depthwise convolution (DConv) as the local feature extraction operator. Furthermore, to enhance local feature representation and reduce potential redundancy, a channel attention operator is applied after convolution, with its computation defined as <xref ref-type="disp-formula" rid="e2">Equation 2</xref>.<disp-formula id="e2">
<mml:math id="m4">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a6;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>l</mml:mi>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>&#x3c9;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>In <xref ref-type="disp-formula" rid="e2">Equation 2</xref>, <italic>X</italic> denotes the input feature map, <inline-formula id="inf3">
<mml:math id="m5">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> represents global average pooling (GAP), <inline-formula id="inf4">
<mml:math id="m6">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>l</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the weight parameter of a linear transformation, and <inline-formula id="inf5">
<mml:math id="m7">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes the Sigmoid activation function. The convolutional block built on this operator is illustrated in <xref ref-type="fig" rid="F3">Figure 3c</xref>. <xref ref-type="bibr" rid="B13">Lin et al. (2023)</xref> showed that the architectural design of Transformers is one of the key factors in their success. To leverage this design while maintaining structural consistency among feature extraction operators, this study replaces the Transformer&#x2019;s MHSA with the proposed convolutional block to construct ConvFormer (<xref ref-type="fig" rid="F3">Figure 3d</xref>), which is employed to extract local features from remote sensing imagery.</p>
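<p>A minimal PyTorch sketch of this convolutional block follows; the ordering of PConv and DConv and the depthwise kernel size are assumptions on our part, while the channel attention follows <xref ref-type="disp-formula" rid="e2">Equation 2</xref> directly.</p>
<code language="python">import torch
import torch.nn as nn

class ConvBlock(nn.Module):
    """Local feature operator: pointwise + depthwise convolution followed by
    the channel attention of Equation 2 (GAP -> linear -> sigmoid)."""
    def __init__(self, dim):
        super().__init__()
        self.pconv = nn.Conv2d(dim, dim, kernel_size=1)        # PConv
        self.dconv = nn.Conv2d(dim, dim, kernel_size=3,
                               padding=1, groups=dim)          # DConv
        self.fc = nn.Linear(dim, dim)                          # W_l

    def forward(self, x):
        y = self.dconv(self.pconv(x))
        w = y.mean(dim=(2, 3))                # omega: global average pooling
        w = torch.sigmoid(self.fc(w))         # Phi(X) = sigma(W_l omega(X))
        return y * w[:, :, None, None]        # channel re-weighting
</code>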
<p>The multi-layer perceptron (MLP) in Transformers tends to focus on low-frequency features in the spatial domain (<xref ref-type="bibr" rid="B19">Rahaman et al., 2019</xref>), limiting its ability to capture fine textures and sharp edges. This constraint is unfavorable for reconstructing high-frequency details in the super-resolution process and for preserving fine object boundaries in semantic segmentation results. Fourier feature transformation, by contrast, is effective in capturing high-frequency details (<xref ref-type="bibr" rid="B5">Feng and Liu, 2025</xref>). Therefore, in this study, we employ the fast Fourier transform (FFT) to convert inputs from the spatial domain to the frequency domain, where an MLP directly extracts image features. On this basis, we construct FFTFormer (<xref ref-type="fig" rid="F3">Figure 3e</xref>) to capture frequency-domain features from remote sensing imagery.</p>
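<p>One plausible reading of the FFT block (<xref ref-type="fig" rid="F3">Figure 3e</xref>) is sketched below in PyTorch: the input is transformed to the frequency domain, an MLP mixes the channels, and an inverse FFT returns to the spatial domain. The concatenation of real and imaginary parts and the hidden width are our assumptions.</p>
<code language="python">import torch
import torch.nn as nn

class FFTBlock(nn.Module):
    """Frequency-domain feature extraction: FFT -> channel MLP -> inverse FFT.
    Mixing real/imaginary parts through a shared MLP is an assumed design."""
    def __init__(self, dim, hidden=None):
        super().__init__()
        hidden = hidden or dim * 2
        self.mlp = nn.Sequential(nn.Linear(dim * 2, hidden),
                                 nn.GELU(),
                                 nn.Linear(hidden, dim * 2))

    def forward(self, x):                        # x: (B, C, H, W)
        f = torch.fft.rfft2(x, norm="ortho")     # spatial -> frequency domain
        z = torch.cat([f.real, f.imag], dim=1)   # (B, 2C, H, W//2+1)
        z = self.mlp(z.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
        real, imag = z.chunk(2, dim=1)
        f = torch.complex(real, imag)
        return torch.fft.irfft2(f, s=x.shape[-2:], norm="ortho")
</code>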
</sec>
<sec id="s2-3-2">
<label>2.3.2</label>
<title>Multi-feature fusion block</title>
<p>MLLAFormer, ConvFormer, and FFTFormer extract heterogeneous features from remote sensing imagery in the global, local, and frequency domains, respectively. Because the relative importance of these three operators varies across the layers of the semantic segmentation and super-resolution networks, naive feature concatenation or summation is insufficient. To adaptively balance their contributions, we propose a Spatial Gated Fusion Mechanism (SGFM), which computes fusion weights in a spatially aware manner. The computation of SGFM is defined in <xref ref-type="disp-formula" rid="e3">Equations 3</xref>, <xref ref-type="disp-formula" rid="e4">4</xref>.<disp-formula id="e3">
<mml:math id="m8">
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>&#x3c8;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
<disp-formula id="e4">
<mml:math id="m9">
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>G</mml:mi>
<mml:mo>&#x2299;</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>G</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2299;</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>In <xref ref-type="disp-formula" rid="e3">Equations 3</xref>, <xref ref-type="disp-formula" rid="e4">4</xref>, <inline-formula id="inf6">
<mml:math id="m10">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denote the feature maps extracted by different operators; <inline-formula id="inf7">
<mml:math id="m11">
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> indicates channel-wise concatenation; <inline-formula id="inf8">
<mml:math id="m12">
<mml:mrow>
<mml:mo>&#x2299;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes element-wise (Hadamard) multiplication; <inline-formula id="inf9">
<mml:math id="m13">
<mml:mrow>
<mml:msup>
<mml:mi>&#x3c8;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is a pointwise convolution producing a single-channel output; <inline-formula id="inf10">
<mml:math id="m14">
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is the spatial gating map; <inline-formula id="inf11">
<mml:math id="m15">
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the fusion result of <inline-formula id="inf12">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf13">
<mml:math id="m17">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in the spatial domain, where <italic>c, h</italic> and <italic>w</italic> represent the number of channels, height, and width of the feature map, respectively. The structure of the Spatial Gated Fusion Mechanism (SGFM) is illustrated in <xref ref-type="fig" rid="F4">Figure 4a</xref>. The gating map <italic>G</italic> is generated jointly from <inline-formula id="inf14">
<mml:math id="m18">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf15">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and the two feature maps are then fused through <italic>G</italic> to produce <italic>Y</italic>. During backpropagation, gradients propagate via <italic>G</italic> into the feature extraction operators of both <inline-formula id="inf16">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf17">
<mml:math id="m21">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. As a result, each operator not only reduces its own error but also contributes to minimizing the errors of the others, making SGFM function as a lightweight cross-attention mechanism.</p>
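<p>Because SGFM reduces to two short equations, it can be sketched almost verbatim in PyTorch; the module and argument names below are illustrative.</p>
<code language="python">import torch
import torch.nn as nn

class SGFM(nn.Module):
    """Spatial Gated Fusion Mechanism (Equations 3, 4): a single-channel
    gate G decides, per spatial location, how much of X1 versus X2 to keep."""
    def __init__(self, dim):
        super().__init__()
        self.gate = nn.Conv2d(dim * 2, 1, kernel_size=1)  # psi^(1)

    def forward(self, x1, x2):
        g = torch.sigmoid(self.gate(torch.cat([x1, x2], dim=1)))  # Equation 3
        return g * x1 + (1.0 - g) * x2                             # Equation 4
</code>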
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Multi-feature fusion block (MFFB); <bold>(a)</bold> Spatial gated fusion mechanism (SGFM); <bold>(b)</bold> Multi-feature fusion block (MFFB); <italic>X</italic>
<sub>
<italic>1</italic>
</sub> and <italic>X</italic>
<sub>
<italic>2</italic>
</sub> are the input feature maps; <italic>G</italic> denotes the spatially gated mask; <italic>Y</italic> is the feature fusion result; <italic>&#xd7; &#x3b3;</italic>
<sub>
<italic>1</italic>
</sub>, etc., denote layer scaling operations based on the learnable parameter <italic>&#x3b3;</italic><sub><italic>1</italic></sub>. The same below.</p>
</caption>
<graphic xlink:href="frsen-07-1730222-g004.tif">
<alt-text content-type="machine-generated">Diagram illustrating two model architectures: (a) a flowchart showing inputs X1 and X2, gated interactions through components labeled PConv and operations G and 1-G, combining outputs to form Y; (b) two sublayer modules, local and global, each containing layers labeled ConvFormer or MLLAFormer, SGFM, and FFTFormer, with flow connections and summation nodes.</alt-text>
</graphic>
</fig>
<p>Different types of features are extracted using MLLAFormer, ConvFormer, and FFTFormer, and subsequently fused through the Spatial Gated Fusion Mechanism (SGFM) to construct the Multi-Feature Fusion Block (MFFB), whose structure is illustrated in <xref ref-type="fig" rid="F4">Figure 4b</xref>. The MFFB consists of local and global sub-layers. In the local sub-layer, the input feature map <inline-formula id="inf18">
<mml:math id="m22">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is first processed by two parallel branches, ConvFormer and FFTFormer, to extract local and frequency-domain features. These features are then fused by SGFM to generate the intermediate result <italic>Y</italic>. To reuse the original information, a third parallel branch is introduced in the local sub-layer, which scales the input feature map with a learnable parameter <inline-formula id="inf19">
<mml:math id="m23">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mi>c</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and further fuses it with <italic>Y</italic>, as formulated in <xref ref-type="disp-formula" rid="e5">Equation 5</xref>.<disp-formula id="e5">
<mml:math id="m24">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>Y</mml:mi>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<p>In <xref ref-type="disp-formula" rid="e5">Equation 5</xref>, denotes the output of the local sub-layer. In the global sub-layer of MFFB, MLLAFormer replaces ConvFormer to perform global feature extraction. FFTFormer is applied in both the local and global sub-layers and, together with SGFM, guides the extraction of frequency-aware local and global features. Consequently, MFFB captures local and global information in both the spatial and frequency domains, while the gated fusion mechanism balances the relative importance of heterogeneous features. Building on the proposed MFFB, we construct remote sensing super-resolution and semantic segmentation models.</p>
</sec>
</sec>
<sec id="s2-4">
<label>2.4</label>
<title>Remote sensing super-resolution model</title>
<sec id="s2-4-1">
<label>2.4.1</label>
<title>Model architecture</title>
<p>We construct the remote sensing super-resolution network using the proposed MFFB as the core feature extraction unit. Following the progressive upscaling paradigm commonly adopted in prior work (<xref ref-type="bibr" rid="B18">Qiao et al., 2024</xref>) (<xref ref-type="fig" rid="F5">Figure 5a</xref>), a convolutional stem first encodes basic features (<inline-formula id="inf20">
<mml:math id="m25">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>) with <italic>d</italic> channels from the low-resolution (LR) image; then <italic>L</italic> feature extraction layers (FELs) are stacked, each implemented by repeatedly applying the MFFB <italic>D</italic> times to deepen representation capacity while keeping both the input and output channel dimensions at <italic>d</italic>.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Remote sensing image super-resolution network model; <bold>(a)</bold> Super-resolution network architecture; <bold>(b)</bold> Upsample; <italic>D</italic> and <italic>L</italic> are hyperparameters of the network. The same below.</p>
</caption>
<graphic xlink:href="frsen-07-1730222-g005.tif">
<alt-text content-type="machine-generated">Two schematic diagrams compare neural network architectures for image processing. Diagram (a) details a feature extraction layer with repeated blocks and convolution operations, leading to an upsampled output and a processed image of a field. Diagram (b) shows an upsampling module with components like convolution, SGFM, PixelShuffle, and TransConv, connected by arrows indicating workflow direction.</alt-text>
</graphic>
</fig>
<p>After the <italic>L</italic>th FEL, a convolutional layer adjusts the channel dimension of the output feature map to match that of the basic feature <inline-formula id="inf21">
<mml:math id="m26">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, yielding the high-level feature <inline-formula id="inf22">
<mml:math id="m27">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>h</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. A long skip connection then fuses <inline-formula id="inf23">
<mml:math id="m28">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf24">
<mml:math id="m29">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>h</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> via element-wise addition, and an upsampler reconstructs the high-resolution (HR) image from the fused feature. Following this architecture, we instantiate each FEL with the proposed MFFB, repeating the MFFB <italic>D</italic> times within each FEL to deepen representation capacity; a subsequent convolutional layer and layer normalization are appended to stabilize training, and a local skip connection element-wise fuses the FEL input and output for feature reuse. The input and output channel width of every FEL is fixed at <italic>d</italic>. Consequently, (<italic>d</italic>, <italic>L</italic>, <italic>D</italic>) serve as hyperparameters controlling the complexity of the super-resolution model.</p>
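<p>The resulting architecture can be summarized in the following PyTorch sketch; the MFFB factory, the normalization choice, and the default hyperparameter values are assumptions for illustration, and the upsampler corresponds to the module described in the next subsection.</p>
<code language="python">import torch.nn as nn

class FEL(nn.Module):
    """One feature extraction layer: D stacked MFFBs, a convolution and a
    normalization layer, then a local skip connection (placement assumed)."""
    def __init__(self, mffb, dim, depth):
        super().__init__()
        self.blocks = nn.Sequential(*[mffb(dim) for _ in range(depth)])
        self.conv = nn.Conv2d(dim, dim, kernel_size=3, padding=1)
        self.norm = nn.GroupNorm(1, dim)    # channel-wise layer-norm equivalent

    def forward(self, x):
        return x + self.norm(self.conv(self.blocks(x)))   # local skip

class SRNet(nn.Module):
    """Figure 5a skeleton: stem -> L FELs -> conv -> long skip -> upsampler."""
    def __init__(self, mffb, upsampler, dim=64, num_fel=4, depth=4):
        super().__init__()
        self.stem = nn.Conv2d(3, dim, kernel_size=3, padding=1)
        self.body = nn.Sequential(*[FEL(mffb, dim, depth)
                                    for _ in range(num_fel)])
        self.fuse = nn.Conv2d(dim, dim, kernel_size=3, padding=1)
        self.up = upsampler                 # e.g., the Figure 5b module below

    def forward(self, lr):
        fb = self.stem(lr)                  # basic feature F_b
        fh = self.fuse(self.body(fb))       # high-level feature F_h
        return self.up(fb + fh)             # long skip, then HR reconstruction
</code>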
</sec>
<sec id="s2-4-2">
<label>2.4.2</label>
<title>Upsample</title>
<p>Interpolation, pixel shuffle, and transposed convolution are commonly used upsamplers in super-resolution networks. Interpolation and pixel shuffle are static upsampling methods without learnable parameters (<xref ref-type="bibr" rid="B7">Guo et al., 2024</xref>), and relying on a single strategy limits the ability to capture diverse features. In this study, we design an upsampler that combines pixel shuffle and transposed convolution, as illustrated in <xref ref-type="fig" rid="F5">Figure 5b</xref>. The upsampler contains two parallel branches: one branch applies a transposed convolution with stride <italic>R</italic> and kernel size <italic>R &#xd7; R</italic> to upsample by a factor of <italic>R</italic>, while the other branch performs pixel shuffle upsampling. To endow the pixel shuffle branch with learning capability, a convolutional layer is first introduced to expand the input feature channels to <italic>dR</italic><sup>2</sup>, which are then rearranged spatially by pixel shuffle into blocks of size <italic>d &#xd7; R &#xd7; R</italic>, thus achieving <italic>R</italic>-fold upsampling. The outputs of the two branches are adaptively fused by SGFM, and a pointwise convolution with three output channels is applied to reconstruct the HR image. Using this upsampler, the final super-resolution model is constructed, enabling LR remote sensing images to be super-resolved into HR images <italic>R</italic> times larger.</p>
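<p>A sketch of this dual-branch upsampler, again reusing the SGFM module sketched in Section 2.3.2, is shown below; the layer names are illustrative.</p>
<code language="python">import torch.nn as nn

class DualBranchUpsampler(nn.Module):
    """Figure 5b upsampler sketch: a transposed-convolution branch and a
    learnable pixel-shuffle branch, fused by SGFM; a 3-channel pointwise
    convolution then reconstructs the HR image."""
    def __init__(self, dim, r):
        super().__init__()
        self.trans = nn.ConvTranspose2d(dim, dim, kernel_size=r, stride=r)
        self.expand = nn.Conv2d(dim, dim * r * r, kernel_size=1)  # d -> d*R^2
        self.shuffle = nn.PixelShuffle(r)       # rearrange channels spatially
        self.sgfm = SGFM(dim)                   # adaptive fusion of branches
        self.to_rgb = nn.Conv2d(dim, 3, kernel_size=1)

    def forward(self, x):
        a = self.trans(x)                       # learned R-fold upsampling
        b = self.shuffle(self.expand(x))        # pixel-shuffle upsampling
        return self.to_rgb(self.sgfm(a, b))
</code>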
</sec>
</sec>
<sec id="s2-5">
<label>2.5</label>
<title>Cropland and shelterbelt semantic segmentation model</title>
<p>We construct a semantic segmentation network for croplands and shelterbelts using an encoder&#x2013;decoder architecture, where the encoder extracts image features and the decoder performs multi-scale contextual feature fusion and pixel-wise classification.</p>
<sec id="s2-5-1">
<label>2.5.1</label>
<title>Encoder</title>
<p>Building on the multi-domain feature extraction capability of MFFB in the global, local, and frequency domains, we construct the semantic segmentation model (<xref ref-type="fig" rid="F6">Figure 6a</xref>). The encoder consists of four stages. In each stage, a convolutional module (<xref ref-type="fig" rid="F6">Figure 6b</xref>) is first applied for basic feature extraction, adjusting the channel dimension of the feature map to <inline-formula id="inf26">
<mml:math id="m31">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf27">
<mml:math id="m32">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>3</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes the stage index, while strided convolution is used for spatial downsampling. This is followed by stacking <inline-formula id="inf28">
<mml:math id="m33">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> MFFB modules. Here, <inline-formula id="inf29">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf30">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are treated as encoder hyperparameters to control the network depth (number of layers) and width (output channel dimension), respectively. Each stage generates feature maps <inline-formula id="inf31">
<mml:math id="m36">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mi>&#x3f5;</mml:mi>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>/</mml:mo>
<mml:msup>
<mml:mn>2</mml:mn>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>/</mml:mo>
<mml:msup>
<mml:mn>2</mml:mn>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> at different spatial scales, where <italic>H</italic> and <italic>W</italic> denote the height and width of the input image. These multi-scale feature maps <inline-formula id="inf32">
<mml:math id="m37">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> form a feature pyramid, which serves as the input to the decoder.</p>
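<p>A structural sketch of the encoder in PyTorch follows. Realizing the H/4 resolution of <italic>F</italic><sub><italic>1</italic></sub> with two stride-2 convolutions in the stem is an assumption consistent with <xref ref-type="fig" rid="F6">Figure 6b</xref>; MFFB is assumed available, and the default widths and depths follow the hyperparameters reported later in Section 3.2.1.</p>
<preformat>
import torch.nn as nn

def down(c_in, c_out):
    # 3 x 3 convolution with stride 2: halves the spatial size
    return nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)

class Encoder(nn.Module):
    def __init__(self, mffb_factory, widths=(64, 128, 256, 512), depths=(2, 4, 8, 4)):
        super().__init__()
        stages, c_prev = [], 3
        for i, (d_i, l_i) in enumerate(zip(widths, depths)):
            convs = [down(c_prev, d_i)]
            if i == 0:
                convs.append(down(d_i, d_i))   # stem reaches H/4 (assumption)
            stages.append(nn.Sequential(*convs,
                                        *[mffb_factory(d_i) for _ in range(l_i)]))
            c_prev = d_i
        self.stages = nn.ModuleList(stages)

    def forward(self, x):
        pyramid = []                           # [F_1, ..., F_4], F_i at H/2^(i+1)
        for stage in self.stages:
            x = stage(x)
            pyramid.append(x)
        return pyramid
</preformat>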
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Remote sensing image semantic segmentation model; <bold>(a)</bold> Semantic segmentation network; <bold>(b)</bold> Module structure; <bold>(c)</bold> Multi-scale context extraction block (MSCE); <italic>I</italic> and <italic>M</italic> denote the input image and segmentation mask, respectively; /4 indicates that the spatial resolution of the feature map is 1/4 of <italic>I</italic>; <italic>L</italic>
<sub>
<italic>i</italic>
</sub>
<italic>, D</italic>
<sub>
<italic>i</italic>
</sub>
<italic>, C</italic> and <italic>N</italic> are network hyperparameters; <italic>F</italic>
<sub>
<italic>4</italic>
</sub>
<italic>, F</italic>
<sub>
<italic>up4</italic>
</sub>
<italic>, F, F&#x2032;, F</italic>
<sub>
<italic>s1</italic>
</sub>, <italic>etc.</italic>, represent feature maps at different scales; Up represents bilinear interpolation upsampling, and &#x2191;4 indicates a spatial upsampling factor of 4; 3 &#xd7; 3 Conv-<italic>D</italic>
<sub>1</sub>, <italic>s2</italic> denotes a convolution with a kernel size of 3 &#xd7; 3, <italic>D</italic>
<sub>1</sub> output channels, and a stride of 2.</p>
</caption>
<graphic xlink:href="frsen-07-1730222-g006.tif">
<alt-text content-type="machine-generated">Diagram showing three subfigures: (a) a deep learning network architecture with encoder and decoder blocks, labeled modules, and data flow arrows, (b) magnified view of the Stem and Downsample blocks with convolution layers, and (c) a detailed pathway for feature extraction and fusion using shuffling, pooling, and MSCE modules with a former unit.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2-5-2">
<label>2.5.2</label>
<title>Multi-scale contextual feature extraction module</title>
<p>The decoder first aggregates multi-level encoder features (<inline-formula id="inf33">
<mml:math id="m38">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf34">
<mml:math id="m39">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf35">
<mml:math id="m40">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mn>4</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>). Three parallel branches upsample these features to a common spatial resolution. In each branch, a pointwise convolution adjusts the channel dimension of <inline-formula id="inf36">
<mml:math id="m41">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> from <inline-formula id="inf37">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to <italic>C</italic>, followed by bilinear interpolation to obtain <inline-formula id="inf38">
<mml:math id="m43">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>8</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The parameter <italic>C</italic> serves as a decoder hyperparameter to balance modeling capacity and computational cost. The three upsampled features are then concatenated along the channel dimension and fused via a pointwise convolution to perform feature aggregation and channel alignment, as formulated in <xref ref-type="disp-formula" rid="e6">Equation 6</xref>.<disp-formula id="e6">
<mml:math id="m44">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mi>&#x3c8;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
<p>In <xref ref-type="disp-formula" rid="e6">Equation 6</xref>, <inline-formula id="inf39">
<mml:math id="m45">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>8</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>8</mml:mn>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the aggregated feature map and <italic>N</italic> represents the number of channels. <italic>N</italic> is treated as another decoder hyperparameter, serving the same role as <italic>C</italic> in balancing modeling capacity and computational complexity.</p>
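<p>The three projection branches and <xref ref-type="disp-formula" rid="e6">Equation 6</xref> admit the following sketch, with the channel dimensions and the decoder hyperparameters <italic>C</italic> and <italic>N</italic> taken from the setting reported later in Section 3.2.1; the alias <italic>Fn</italic> avoids clashing with the feature symbol <italic>F</italic>.</p>
<preformat>
import torch
import torch.nn as nn
import torch.nn.functional as Fn

class Aggregate(nn.Module):
    def __init__(self, dims=(128, 256, 512), C=320, N=96):
        super().__init__()
        self.proj = nn.ModuleList([nn.Conv2d(d_i, C, kernel_size=1) for d_i in dims])
        self.fuse = nn.Conv2d(3 * C, N, kernel_size=1)   # psi^(N) in Equation 6

    def forward(self, f2, f3, f4):
        size = f2.shape[-2:]                             # common H/8 x W/8 grid
        ups = [Fn.interpolate(p(f), size=size, mode="bilinear", align_corners=False)
               for p, f in zip(self.proj, (f2, f3, f4))]
        return self.fuse(torch.cat(ups, dim=1))          # F in Equation 6
</preformat>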
<p>Since the three parallel branches only involve pointwise convolution and upsampling, they lack the ability to model multi-scale contextual environments. Incorporating multi-scale context modeling of the input imagery can enhance cross-scale semantic representation, mitigate intra-class scale variation, and improve inter-class discrimination. To this end, we design a Multi-Scale Context Extraction (MSCE) module (<xref ref-type="fig" rid="F6">Figure 6c</xref>) to strengthen the decoder&#x2019;s multi-scale contextual modeling capacity. The input to MSCE is the aggregated feature map <italic>F</italic>. The module consists of three branches: in Branch 1, pixel unshuffle (PU) rearranges non-overlapping 4 &#xd7; 4 spatial blocks of <italic>F</italic> into the channel dimension to produce <inline-formula id="inf40">
<mml:math id="m46">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mn>16</mml:mn>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#x2044;</mml:mo>
<mml:mn>32</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#x2044;</mml:mo>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, effectively encoding a receptive field of 4 &#xd7; 4 without information loss. In Branch 2, a 3 &#xd7; 3 convolution with stride 2 is applied to <italic>F</italic>, followed by PU that rearranges non-overlapping 2 &#xd7; 2 blocks into the channel dimension, generating <inline-formula id="inf41">
<mml:math id="m47">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mn>8</mml:mn>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>32</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> with an effective receptive field of 5 &#xd7; 5. In Branch 3, a 5 &#xd7; 5 convolution with stride 4 is used to capture larger-scale contextual features, producing <inline-formula id="inf42">
<mml:math id="m48">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mn>4</mml:mn>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#x2044;</mml:mo>
<mml:mn>32</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#x2044;</mml:mo>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. These branches provide receptive fields of different scales and thus capture contextual information at multiple levels. To balance contributions across scales while reducing computational cost, pointwise convolution is first applied to adjust the channel dimension of <inline-formula id="inf43">
<mml:math id="m49">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf44">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf45">
<mml:math id="m51">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to <italic>N</italic>, after which the three are fused. Although MSCE extracts multi-scale contextual features, global context is still absent. Therefore, we further apply MLLAFormer to the concatenated multi-scale features to integrate global context, with the overall fusion process defined in <xref ref-type="disp-formula" rid="e7">Equation 7</xref>.<disp-formula id="e7">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>MLLAFormer</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>&#x3c8;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>&#x3c8;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>&#x3c8;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
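<p>The three MSCE branches and <xref ref-type="disp-formula" rid="e7">Equations 7</xref>&#x2013;<xref ref-type="disp-formula" rid="e9">9</xref> can be sketched as follows. MLLAFormer is assumed available as a module, and the intermediate channel widths of the strided convolutions are inferred from the feature shapes given above.</p>
<preformat>
import torch
import torch.nn as nn
import torch.nn.functional as Fn

class MSCE(nn.Module):
    def __init__(self, N, mllaformer_factory):
        super().__init__()
        self.pu4 = nn.PixelUnshuffle(4)                            # F_s1: 16N, H/32, W/32
        self.conv2 = nn.Conv2d(N, 2 * N, 3, stride=2, padding=1)   # 2N channels (inferred)
        self.pu2 = nn.PixelUnshuffle(2)                            # F_s2: 8N, H/32, W/32
        self.conv4 = nn.Conv2d(N, 4 * N, 5, stride=4, padding=2)   # F_s4: 4N, H/32, W/32
        # pointwise convolutions balance the branches at N channels each
        self.p1 = nn.Conv2d(16 * N, N, 1)
        self.p2 = nn.Conv2d(8 * N, N, 1)
        self.p4 = nn.Conv2d(4 * N, N, 1)
        self.former = mllaformer_factory(3 * N)                    # global context
        self.out = nn.Conv2d(3 * N, N, 1)                          # psi^(N) in Equation 8

    def forward(self, f):                                          # f: (B, N, H/8, W/8)
        s1 = self.p1(self.pu4(f))
        s2 = self.p2(self.pu2(self.conv2(f)))
        s4 = self.p4(self.conv4(f))
        fs = self.former(torch.cat([s1, s2, s4], dim=1))           # Equation 7
        fs = Fn.interpolate(fs, size=f.shape[-2:], mode="bilinear",
                            align_corners=False)                   # U in Equation 8
        return self.out(fs)                                        # F' (Equations 8, 9)
</preformat>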
<p>In <xref ref-type="disp-formula" rid="e7">Equation 7</xref>, denotes the multi-scale contextual features enhanced with global information. These features are further upsampled to match the spatial resolution of <italic>F</italic>, and a pointwise convolution is applied to adjust the channel dimension, as formulated in <xref ref-type="disp-formula" rid="e8">Equation 8</xref>.<disp-formula id="e8">
<mml:math id="m53">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="italic">F</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mi mathvariant="italic">&#x3c8;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="italic">N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="italic">U</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="italic">F</mml:mi>
<mml:mi mathvariant="italic">s</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>
</p>
<p>In <xref ref-type="disp-formula" rid="e8">Equation 8</xref>, <inline-formula id="inf46">
<mml:math id="m54">
<mml:mrow>
<mml:msup>
<mml:mi>F</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#x2044;</mml:mo>
<mml:mn>8</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#x2044;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the multi-scale contextual features generated by MSCE, and <italic>U</italic> represents the upsampling operation. The overall computation of MSCE can be concisely expressed as <xref ref-type="disp-formula" rid="e9">Equation 9</xref>.<disp-formula id="e9">
<mml:math id="m55">
<mml:mrow>
<mml:msup>
<mml:mi>F</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>MSCE</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
</p>
</sec>
<sec id="s2-5-3">
<label>2.5.3</label>
<title>Decoder architecture</title>
<p>Compared with <italic>F</italic>, <inline-formula id="inf47">
<mml:math id="m56">
<mml:mrow>
<mml:msup>
<mml:mi>F</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> incorporates multi-scale and global contextual information, which provides crucial auxiliary cues for foreground classification (<xref ref-type="bibr" rid="B25">Terven et al., 2023</xref>). Since <inline-formula id="inf48">
<mml:math id="m57">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf49">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf50">
<mml:math id="m59">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are suited to segment foreground targets at different scales, we fuse <inline-formula id="inf51">
<mml:math id="m60">
<mml:mrow>
<mml:msup>
<mml:mi>F</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> into each of these feature maps. To retain fusion flexibility, we employ SGFM; taking the fusion with <inline-formula id="inf52">
<mml:math id="m61">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as an example, the process is formulated in <xref ref-type="disp-formula" rid="e10">Equation 10</xref>.<disp-formula id="e10">
<mml:math id="m62">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>SGFM</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>F</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>
</p>
<p>In <xref ref-type="disp-formula" rid="e10">Equation 10</xref>, <inline-formula id="inf53">
<mml:math id="m63">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the fused feature. By the same procedure, <inline-formula id="inf54">
<mml:math id="m64">
<mml:mrow>
<mml:msup>
<mml:mi>F</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is fused with <inline-formula id="inf55">
<mml:math id="m65">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf56">
<mml:math id="m66">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to obtain <inline-formula id="inf57">
<mml:math id="m67">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>3</mml:mn>
</mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf58">
<mml:math id="m68">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, respectively.</p>
<p>MSCE takes <italic>F</italic> as input and produces <inline-formula id="inf59">
<mml:math id="m69">
<mml:mrow>
<mml:msup>
<mml:mi>F</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, a multi-scale contextual feature with global information. To further enhance the diversity of contextual features, following the approach of DeepLab v3&#x2b;, global average pooling (GAP) is employed to extract the global feature <inline-formula id="inf60">
<mml:math id="m70">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>g</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>GAP</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, which is then utilized to assist semantic segmentation. Simultaneously, to exploit the potential of <italic>F</italic> itself for semantic segmentation, it is transformed via pointwise convolution, yielding <inline-formula id="inf61">
<mml:math id="m71">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mi>&#x3c8;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, which is then applied for semantic segmentation. These feature types are concatenated along the channel dimension, and an MLP composed of two pointwise convolutions predicts the semantic segmentation mask <italic>M</italic> for the input image. This process is expressed as <xref ref-type="disp-formula" rid="e11">Equation 11</xref>:<disp-formula id="e11">
<mml:math id="m72">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>U</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mtext>MLP</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>3</mml:mn>
</mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>F</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>g</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>
</p>
<p>Equation 11 employs an upsampling function <italic>U</italic> to upscale the segmentation mask predicted by the MLP to the same size as the input image <italic>I</italic>. The structure of the decoder and the entire remote sensing image semantic segmentation network is shown in <xref ref-type="fig" rid="F6">Figure 6a</xref>.</p>
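<p><xref ref-type="disp-formula" rid="e11">Equation 11</xref> can be read as the following sketch. It assumes that all six concatenated maps carry <italic>N</italic> channels (i.e., that SGFM outputs are taken at <italic>N</italic> channels), broadcasts the GAP vector <italic>F</italic><sub><italic>g</italic></sub> back over the spatial grid so it can be concatenated, and places a GELU between the two pointwise convolutions of the MLP; these choices are illustrative assumptions.</p>
<preformat>
import torch
import torch.nn as nn
import torch.nn.functional as Fn

class SegHead(nn.Module):
    def __init__(self, N, num_classes=3):
        super().__init__()
        self.proj_c = nn.Conv2d(N, N, kernel_size=1)      # F_c = psi^(N)(F)
        self.mlp = nn.Sequential(nn.Conv2d(6 * N, N, kernel_size=1),
                                 nn.GELU(),
                                 nn.Conv2d(N, num_classes, kernel_size=1))

    def forward(self, fup2, fup3, fup4, f_prime, f, out_size):
        # fup2..fup4 are the SGFM-fused maps F'_up2, F'_up3, F'_up4
        f_g = Fn.adaptive_avg_pool2d(f, 1).expand_as(f)   # broadcast GAP feature
        f_c = self.proj_c(f)
        x = torch.cat([fup2, fup3, fup4, f_prime, f_g, f_c], dim=1)
        return Fn.interpolate(self.mlp(x), size=out_size,
                              mode="bilinear", align_corners=False)  # mask M
</preformat>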
</sec>
</sec>
<sec id="s2-6">
<label>2.6</label>
<title>Model training and evaluation</title>
<sec id="s2-6-1">
<label>2.6.1</label>
<title>Implementation detail</title>
<p>Training and testing experiments for super-resolution models and semantic segmentation models were conducted on multiple computers equipped with Intel Xeon Silver 4314 processors, 128&#xa0;GB of memory, Nvidia L20 compute cards, Windows Server 2022 operating systems, and CUDA 11.8 acceleration libraries. All models, together with the computational programs for the technical framework depicted in <xref ref-type="fig" rid="F2">Figure 2</xref>, were implemented in Python on PyTorch 2.7. The farmland forest network dataset, comprising research area imagery and annotation results, was divided into five equal parts. Four parts were randomly selected as training data, with the remaining part reserved as test data.</p>
<p>The super-resolution model was trained using the AdamW optimizer with a learning rate of 2 &#xd7; 10<sup>&#x2212;4</sup>. Training samples were generated dynamically. Taking an upsampling factor <italic>R &#x3d; 2</italic> as an example, image patches with width and height ranging from 128 to 512 pixels are first randomly cropped from the training images and resized to 128 &#xd7; 128 pixels to serve as high-resolution (HR) images; each HR image is then downsampled by bilinear interpolation to a low-resolution (LR) image of 64 &#xd7; 64 pixels, and the resulting LR-HR pairs are used to train the super-resolution model. Existing super-resolution models (<xref ref-type="bibr" rid="B1">Ariav and Cohen, 2022</xref>) commonly employ the L<sub>1</sub> loss function. To enhance the structural accuracy of local super-resolution reconstruction, we further incorporate the SSIM loss <italic>L</italic>
<sub>
<italic>SSIM</italic>
</sub> (<xref ref-type="bibr" rid="B27">Wang et al., 2004</xref>).The loss function Lsup for our super-resolution model is defined as shown in <xref ref-type="disp-formula" rid="e12">Equation 12</xref>:<disp-formula id="e12">
<mml:math id="m73">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi mathvariant="italic">sup</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mfenced open="|" close="|" separators="|">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>R</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>R</mml:mi>
<mml:mo>,</mml:mo>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>
</p>
<p>In the equation, <inline-formula id="inf62">
<mml:math id="m74">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf63">
<mml:math id="m75">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the proportional adjustment coefficients for the L<sub>1</sub> and L<sub>SSIM</sub> losses, respectively, and <inline-formula id="inf64">
<mml:math id="m76">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> is the super-resolution reconstructed image.</p>
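<p>The dynamic sample generation and the loss of <xref ref-type="disp-formula" rid="e12">Equation 12</xref> can be sketched as follows. The uniform-window SSIM, the placeholder values of the coefficients (their values are not reported here), the definition of the SSIM loss as 1 &#x2212; SSIM, and the assumption that source images are at least 512 pixels on each side are illustrative simplifications.</p>
<preformat>
import random
import torch
import torch.nn.functional as Fn

def make_pair(img):                        # img: (3, H, W) tensor in [0, 1], H, W at least 512
    h = random.randint(128, 512)           # random crop size
    w = random.randint(128, 512)
    top = random.randint(0, img.shape[1] - h)
    left = random.randint(0, img.shape[2] - w)
    patch = img[:, top:top + h, left:left + w].unsqueeze(0)
    hr = Fn.interpolate(patch, size=(128, 128), mode="bilinear", align_corners=False)
    lr = Fn.interpolate(hr, size=(64, 64), mode="bilinear", align_corners=False)
    return lr, hr                          # LR-HR training pair for R = 2

def ssim(x, y, c1=0.01 ** 2, c2=0.03 ** 2, win=7):
    # simplified SSIM with a uniform window (Gaussian windows are also common)
    mu_x = Fn.avg_pool2d(x, win, 1, win // 2)
    mu_y = Fn.avg_pool2d(y, win, 1, win // 2)
    var_x = Fn.avg_pool2d(x * x, win, 1, win // 2) - mu_x ** 2
    var_y = Fn.avg_pool2d(y * y, win, 1, win // 2) - mu_y ** 2
    cov = Fn.avg_pool2d(x * y, win, 1, win // 2) - mu_x * mu_y
    s = ((2 * mu_x * mu_y + c1) * (2 * cov + c2)) / \
        ((mu_x ** 2 + mu_y ** 2 + c1) * (var_x + var_y + c2))
    return s.mean()

def l_sup(hr, hr_pred, lam1=1.0, lam2=1.0):   # Equation 12; lambda values are placeholders
    return lam1 * Fn.l1_loss(hr_pred, hr) + lam2 * (1.0 - ssim(hr_pred, hr))
</preformat>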
<p>The semantic segmentation model was trained using the AdamW optimizer with a learning rate of 10<sup>&#x2212;5</sup> and input images at a resolution of 512 &#xd7; 512 pixels. Randomly cropped image patches and their corresponding semantic annotations from the training dataset were used as training samples, with data augmentation applied through random flipping and color jittering. To address the pixel imbalance across three land cover classes in the study area imagery, the loss function <italic>L</italic> for the semantic segmentation model was defined using both the commonly applied cross-entropy loss (<italic>L</italic>
<sub>
<italic>CE</italic>
</sub>) and the Dice loss (<italic>L</italic>
<sub>
<italic>Dice</italic>
</sub>), which is more robust to sample imbalance, as shown in <xref ref-type="disp-formula" rid="e13">Equations 13</xref>&#x2013;<xref ref-type="disp-formula" rid="e15">15</xref>.<disp-formula id="e13">
<mml:math id="m77">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>E</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>
<disp-formula id="e14">
<mml:math id="m78">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>E</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2061;</mml:mo>
<mml:mi mathvariant="italic">log</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>
<disp-formula id="e15">
<mml:math id="m79">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:msub>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mi>M</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(15)</label>
</disp-formula>
</p>
<p>In the equations, <italic>M</italic>
<sub>
<italic>gt</italic>
</sub> represents the ground truth semantic labels corresponding to the input image, with <italic>&#x3b1; &#x3d; 0.4</italic> and <italic>&#x3b2; &#x3d; 0.6</italic> as proportional adjustment coefficients. Using arg min&#xa0;<italic>L</italic> as the objective, the semantic segmentation model is trained and optimized until the loss converges to a stable value.</p>
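<p>A sketch of the combined objective with <italic>&#x3b1; &#x3d; 0.4</italic> and <italic>&#x3b2; &#x3d; 0.6</italic> follows; computing the Dice term per class on softmax probabilities against one-hot labels is one common reading of <xref ref-type="disp-formula" rid="e15">Equation 15</xref>.</p>
<preformat>
import torch
import torch.nn.functional as Fn

def seg_loss(logits, target, alpha=0.4, beta=0.6, eps=1e-6):
    # logits: (B, 3, H, W); target: (B, H, W) with class indices 0..2
    l_ce = Fn.cross_entropy(logits, target)                     # Equation 14
    prob = logits.softmax(dim=1)
    onehot = Fn.one_hot(target, num_classes=3).permute(0, 3, 1, 2).float()
    inter = (prob * onehot).sum(dim=(0, 2, 3))
    denom = (prob ** 2).sum(dim=(0, 2, 3)) + (onehot ** 2).sum(dim=(0, 2, 3))
    l_dice = (1.0 - (2.0 * inter + eps) / (denom + eps)).mean() # Equation 15
    return alpha * l_ce + beta * l_dice                         # Equation 13
</preformat>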
</sec>
<sec id="s2-6-2">
<label>2.6.2</label>
<title>Evaluation metric</title>
<p>The performance of semantic segmentation models is evaluated using precision, recall, intersection over union (IoU), pixel accuracy (PA), and mean intersection over union (mIoU) (<xref ref-type="bibr" rid="B8">He et al., 2024</xref>). The definitions of each metric are given in <xref ref-type="disp-formula" rid="e16">Equations 16</xref>&#x2013;<xref ref-type="disp-formula" rid="e20">20</xref>:<disp-formula id="e16">
<mml:math id="m80">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(16)</label>
</disp-formula>
<disp-formula id="e17">
<mml:math id="m81">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(17)</label>
</disp-formula>
<disp-formula id="e18">
<mml:math id="m82">
<mml:mrow>
<mml:msub>
<mml:mtext>IoU</mml:mtext>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(18)</label>
</disp-formula>
<disp-formula id="e19">
<mml:math id="m83">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>A</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>3</mml:mn>
</mml:msubsup>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(19)</label>
</disp-formula>
<disp-formula id="e20">
<mml:math id="m84">
<mml:mrow>
<mml:mtext>mIoU</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>3</mml:mn>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mtext>IoU</mml:mtext>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(20)</label>
</disp-formula>
</p>
<p>In the equations, <inline-formula id="inf65">
<mml:math id="m85">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the number of pixels correctly predicted as category <italic>i</italic>, <inline-formula id="inf66">
<mml:math id="m86">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the number of pixels incorrectly predicted as category <italic>i</italic>, <inline-formula id="inf67">
<mml:math id="m87">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the number of pixels in category <italic>i</italic> incorrectly predicted as other categories, and <inline-formula id="inf68">
<mml:math id="m88">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the total number of pixels in the test set samples. The performance of super-resolution models is evaluated using peak signal-to-noise ratio (PSNR) (<xref ref-type="bibr" rid="B24">Tanchenko, 2014</xref>) and structural similarity index (SSIM).</p>
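<p>For reference, the five metrics of <xref ref-type="disp-formula" rid="e16">Equations 16</xref>&#x2013;<xref ref-type="disp-formula" rid="e20">20</xref> can be computed jointly from a single 3 &#xd7; 3 confusion matrix; the row-true/column-predicted convention below is an assumption.</p>
<preformat>
import numpy as np

def metrics(cm):
    # cm[i, j]: number of pixels of true class i predicted as class j
    tp = np.diag(cm).astype(float)
    fp = cm.sum(axis=0) - tp            # predicted as class i but belonging elsewhere
    fn = cm.sum(axis=1) - tp            # class i pixels predicted as other classes
    precision = tp / (tp + fp)          # Equation 16
    recall = tp / (tp + fn)             # Equation 17
    iou = tp / (tp + fp + fn)           # Equation 18
    pa = tp.sum() / cm.sum()            # Equation 19 (N_t = total pixel count)
    miou = iou.mean()                   # Equation 20
    return precision, recall, iou, pa, miou
</preformat>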
</sec>
<sec id="s2-6-3">
<label>2.6.3</label>
<title>Evaluation metric for geometric fidelity</title>
<p>To rigorously quantify the geometric fidelity and boundary integrity of the vectorized shelterbelts, we introduced the &#x201c;Shelterbelt Closure Index&#x201d; as a key evaluation metric. According to the Chinese National Standard Design Specifications for Farmland Shelterbelt Engineering (GB/T 50817-2013), the closure index of shelterbelts surrounding farmland parcels in severely wind-sand affected areas should be &#x2265;0.75 to ensure effective protection. The calculation methodology is implemented as follows. First, a buffer zone with a radius of 150&#xa0;m is generated around each individual field polygon to identify the corresponding protective shelterbelts. Second, the &#x201c;protection angle&#x201d; is defined as the maximum geometric angle formed by connecting the geometric centre of the field polygon to the boundary extremities of a single shelterbelt (as illustrated in <xref ref-type="fig" rid="F7">Figure 7</xref>). Finally, the closure index is calculated as the ratio of the cumulative sum of all unique protection angles within the buffer zone to 360&#xb0;, with a value &#x2265;0.75 serving as the threshold for meeting the geometric standard.</p>
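<p>The closure index computation can be sketched with shapely geometries as below. Coordinates are assumed to lie in a projected metre-based CRS, the arc covered by a belt is estimated from its vertex angles, and the union of protection angles is approximated with one-degree bins so overlapping arcs are counted only once; binning granularity and the vertex-based arc estimate are simplifications for illustration.</p>
<preformat>
import math
from shapely.geometry import Polygon

def protection_bins(center, belt, bins):
    # angles of the belt's boundary vertices as seen from the field centre
    angles = [math.degrees(math.atan2(y - center.y, x - center.x)) % 360.0
              for x, y in belt.exterior.coords]
    a = sorted(angles)
    # the belt covers the smallest arc containing all vertex angles:
    gaps = [(a[(k + 1) % len(a)] - a[k]) % 360.0 for k in range(len(a))]
    k = gaps.index(max(gaps))
    start, span = a[(k + 1) % len(a)], 360.0 - gaps[k]
    for d in range(int(span) + 1):
        bins[int(start + d) % 360] = True

def closure_index(field: Polygon, belts):
    center = field.centroid
    zone = field.buffer(150.0)            # 150 m buffer per GB/T 50817-2013
    bins = [False] * 360                  # one-degree angular coverage bins
    for belt in belts:
        if belt.intersects(zone):
            protection_bins(center, belt, bins)
    return sum(bins) / 360.0              # meets the standard when at least 0.75
</preformat>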
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Example of a simple canopy closure calculation for shelterbelts.</p>
</caption>
<graphic xlink:href="frsen-07-1730222-g007.tif">
<alt-text content-type="machine-generated">Abstract graphic featuring a tilted light green rectangle with the words &#x22;palatino linotype&#x22; in black serif font. Several dashed red and black lines radiate from a central point with three red arcs, and bold green angled shapes border the rectangle.</alt-text>
</graphic>
</fig>
</sec>
</sec>
</sec>
<sec sec-type="results" id="s3">
<label>3</label>
<title>Results</title>
<sec id="s3-1">
<label>3.1</label>
<title>Effectiveness analysis of super-resolution models</title>
<sec id="s3-1-1">
<label>3.1.1</label>
<title>Comparison of super-resolution methods</title>
<p>To validate the effectiveness of the super-resolution model proposed in this study, we compared it with several state-of-the-art super-resolution methods under a 4&#xd7; upsampling task. These included the traditional bicubic interpolation baseline method, the Transformer-based super-resolution model TransENet (<xref ref-type="bibr" rid="B12">Lei et al., 2022</xref>), and the Mamba-based models MambaIR (<xref ref-type="bibr" rid="B7">Guo et al., 2024</xref>) and FreMamba (<xref ref-type="bibr" rid="B33">Xiao et al., 2024</xref>). The hyperparameters for our super-resolution model were set to <italic>d &#x3d; 96, D &#x3d; 6</italic>, and <italic>L &#x3d; 6</italic>. All methods were trained and tested on the farmland forest network dataset. The comparison models employed the same training methodology as described in their original publications. Results are presented in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Performance comparison of super-resolution methods.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">Parameters/M</th>
<th align="center">PSNR/dB</th>
<th align="center">SSIM/%</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Proposed method</td>
<td align="center">13.69</td>
<td align="center">31.75</td>
<td align="center">84.49</td>
</tr>
<tr>
<td align="center">TransENet</td>
<td align="center">37.74</td>
<td align="center">31.31</td>
<td align="center">82.91</td>
</tr>
<tr>
<td align="center">MambaIR</td>
<td align="center">11.76</td>
<td align="center">31.28</td>
<td align="center">82.73</td>
</tr>
<tr>
<td align="center">FreMamba</td>
<td align="center">16.72</td>
<td align="center">30.12</td>
<td align="center">81.15</td>
</tr>
<tr>
<td align="center">Bicubic</td>
<td align="center">&#x2014;</td>
<td align="center">25.65</td>
<td align="center">77.25</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As shown in <xref ref-type="table" rid="T1">Table 1</xref>, the super-resolution model developed in this study demonstrates the highest performance, outperforming the comparison methods in both PSNR and SSIM metrics. This approach is effective for the upsampling task on remote sensing images of farmland shelterbelts. Compared to TransENet, our model achieves a 0.44&#xa0;dB higher PSNR and a 1.58 percentage point improvement in SSIM, while requiring only 36% of TransENet&#x2019;s parameters, indicating superior parameter efficiency. Our Multi-Feature Fusion Block (MFFB) simultaneously extracts features from both spatial and frequency domains, offering more diverse feature extraction capabilities than the Transformer architecture. Compared to FreMamba and MambaIR, our model also demonstrates significant advantages. In terms of PSNR, it outperforms the former by 0.47&#xa0;dB and the latter by 1.63&#xa0;dB. For SSIM, it achieves improvements of 1.76 and 3.34 percentage points, respectively. Despite having 18% more parameters than FreMamba, our model achieves better performance gains, reflecting a favorable performance-efficiency balance. All deep learning-based models substantially outperformed the traditional bicubic interpolation benchmark. Compared to this baseline, the proposed model achieved a 6.10&#xa0;dB improvement in PSNR and a 7.24 percentage point increase in SSIM, confirming the effectiveness of feature-reconstruction-based upscaling methods. Overall, the proposed super-resolution model demonstrates applicability for upscaling remote sensing imagery of farmland shelterbelts.</p>
</sec>
<sec id="s3-1-2">
<label>3.1.2</label>
<title>Ablation study</title>
<p>To analyze the impact of the MFFB design, the upsampler, and the introduction of the <italic>L</italic><sub><italic>SSIM</italic></sub> loss during training on the super-resolution model, further ablation experiments were conducted. Model 1 replaces MFFB&#x2019;s MLLAFormer and FFTFormer with ConvFormer, replaces SGFM with element-wise addition, and replaces the upsampler with PixelShuffle. Training uses only the L<sub>1</sub> loss, meaning that MLLAFormer, FFTFormer, SGFM, the Upsampler, and <italic>L</italic>
<sub>
<italic>SSIM</italic>
</sub> are disabled in Model 1. Building upon Model 1, subsequent processing steps were progressively enabled to construct Models 2 through 6. All models performed 4&#xd7; upsampling and underwent identical training and testing on the farmland forest network dataset, with results presented in <xref ref-type="table" rid="T2">Table 2</xref>.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Comparison of super-resolution results across model variants.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Model no.</th>
<th colspan="5" align="center">Treatments</th>
<th rowspan="2" align="center">PSNR/dB</th>
<th rowspan="2" align="center">SSIM/%</th>
</tr>
<tr>
<th align="center">MLLAFormer</th>
<th align="center">FFTFormer</th>
<th align="center">SGFM</th>
<th align="center">Upsampler</th>
<th align="center">LSSIM</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">1</td>
<td align="left">&#x200b;</td>
<td align="left">&#x200b;</td>
<td align="left">&#x200b;</td>
<td align="left">&#x200b;</td>
<td align="left">&#x200b;</td>
<td align="center">31.34</td>
<td align="center">81.50</td>
</tr>
<tr>
<td align="center">2</td>
<td align="center">&#x2713;</td>
<td align="left">&#x200b;</td>
<td align="left">&#x200b;</td>
<td align="left">&#x200b;</td>
<td align="left">&#x200b;</td>
<td align="center">31.44</td>
<td align="center">83.03</td>
</tr>
<tr>
<td align="center">3</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="left">&#x200b;</td>
<td align="left">&#x200b;</td>
<td align="left">&#x200b;</td>
<td align="center">31.59</td>
<td align="center">83.60</td>
</tr>
<tr>
<td align="center">4</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="left">&#x200b;</td>
<td align="left">&#x200b;</td>
<td align="center">31.64</td>
<td align="center">83.73</td>
</tr>
<tr>
<td align="center">5</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="left">&#x200b;</td>
<td align="center">31.70</td>
<td align="center">83.79</td>
</tr>
<tr>
<td align="center">6</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">31.75</td>
<td align="center">84.49</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As shown in <xref ref-type="table" rid="T2">Table 2</xref>, PSNR and SSIM gradually increase with the introduction of different modules and processing steps, indicating that MFFB, Upsampler, and <italic>L</italic>
<sub>
<italic>SSIM</italic>
</sub> all enhance the super-resolution performance of remote sensing images of farmland shelterbelts. Model 1 (baseline model) achieves a PSNR of 31.34&#xa0;dB with none of the proposed components enabled, comparable to TransENet in <xref ref-type="table" rid="T1">Table 1</xref>. At this stage, MFFB consists solely of the ConvFormer base module, demonstrating the effectiveness of the ConvFormer architecture design. Compared to Model 1, Model 2 achieved a 0.1&#xa0;dB increase in PSNR and a 1.53 percentage point improvement in SSIM, demonstrating MLLAFormer&#x2019;s ability to enhance super-resolution performance. Unlike ConvFormer, MLLAFormer replaces convolutions with MLLA, whose global modeling capability strengthens the spatial dependencies of features and improves the spatial continuity reconstruction in super-resolution. Compared to Model 2, Model 3 achieves a PSNR improvement of 0.15&#xa0;dB and an SSIM increase of 0.57 percentage points. FFTFormer extracts features in the frequency domain, strengthening image texture detail and edge recovery capabilities. Model 4 enables SGFM on top of Model 3, resulting in marginal PSNR and SSIM gains of 0.05&#xa0;dB and 0.13 percentage points, respectively. Model 5 achieves a 0.06&#xa0;dB PSNR improvement after enabling the Upsampler. By combining transposed convolution with pixel rearrangement, the Upsampler enhances the model&#x2019;s upscaling capability. Model 6 introduces <italic>L</italic>
<sub>
<italic>SSIM</italic>
</sub> on top of Model 5, achieving 0.05&#xa0;dB and 0.7 percentage point improvements in PSNR and SSIM, respectively. <italic>L</italic>
<sub>
<italic>SSIM</italic>
</sub> better constrains the model&#x2019;s reconstruction quality of local image structures, thereby improving overall image reconstruction. Compared to Model 1, Model 6 achieved 0.41&#xa0;dB and 2.99 percentage point improvements in PSNR and SSIM respectively, indicating the effectiveness of all processing steps. The modular design in this study enhances the super-resolution performance of remote sensing imagery of farmland shelterbelts. Model 6 is adopted as the super-resolution model for subsequent experimental analysis.</p>
</sec>
<sec id="s3-1-3">
<label>3.1.3</label>
<title>The impact of upsampling strategies on super-resolution performance</title>
<p>To investigate the impact of different upscaling strategies on super-resolution model performance, we retrained Model 6 from <xref ref-type="table" rid="T2">Table 2</xref> with an upscaling factor of 2. Based on this, we compared three upscaling approaches: 2&#xd7; upscaling using the newly trained model, 4&#xd7; upscaling achieved through a two-step staged super-resolution process with the newly trained model, and direct 4&#xd7; upscaling using the original Model 6. The results are presented in <xref ref-type="table" rid="T3">Table 3</xref>.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Impact of upsampling strategies on super-resolution performance.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Upsampling strategy</th>
<th align="center">PSNR/dB</th>
<th align="center">SSIM/%</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">1&#x2192;2</td>
<td align="center">40.76</td>
<td align="center">97.53</td>
</tr>
<tr>
<td align="center">1&#x2192;4</td>
<td align="center">31.75</td>
<td align="center">84.49</td>
</tr>
<tr>
<td align="center">1&#x2192;2&#x2192;4</td>
<td align="center">34.60</td>
<td align="center">85.97</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>
<xref ref-type="table" rid="T3">Table 3</xref> demonstrates that the upsampling strategy significantly impacts the super-resolution reconstruction performance of remote sensing images of farmland shelterbelts. Compared to 2&#xd7; upsampling, 4&#xd7; upsampling yields poorer reconstruction results for super-resolution images. 2&#xd7; upsampling only requires expanding a single LR pixel into a 2 &#xd7; 2 HR pixel block, making it easier for the model to learn local high-frequency details and low-frequency structures. Compared to 2&#xd7; upsampling, direct 4x upsampling resulted in PSNR and SSIM decreases of 9.01&#xa0;dB and 13.04 percentage points, respectively. 4&#xd7; upsampling requires predicting a 4 &#xd7; 4 HR image block from a single LR pixel and its surrounding blurred structures, making it challenging for the model to accurately infer fine local details with limited contextual features. The staged upsampling process outperforms direct 4x upsampling, achieving a 2.85&#xa0;dB improvement in PSNR and a 1.48 percentage point increase in SSIM. First, 2&#xd7; upsampling generates high-quality intermediate-resolution images, providing reliable input features for the second-stage MFFB. This enables the model to enhance details and sharpen images based on partially optimized data. This strategy mitigates the challenges of high-magnification upsampling while leveraging the progressive learning advantages of the MFFB feature extraction module. In the super-resolution task for remote sensing images of farmland shelterbelts, the upscaling strategy significantly impacts image reconstruction quality. Under identical upscaling factors, staged upscaling outperforms single-stage approaches. All subsequent experiments involving 4&#xd7; upscaling employ the two-stage upscaling process.</p>
</sec>
</sec>
<sec id="s3-2">
<label>3.2</label>
<title>Effectiveness analysis of semantic segmentation models</title>
<sec id="s3-2-1">
<label>3.2.1</label>
<title>Comparison of semantic segmentation models</title>
<p>To systematically evaluate the effectiveness of the semantic segmentation model developed in this study for farmland forest network segmentation tasks, six representative semantic segmentation models were selected for comparative analysis. These include RS<sup>3</sup> Mamba (<xref ref-type="bibr" rid="B15">Ma et al., 2024a</xref>) based on Mamba, DC-Swin (<xref ref-type="bibr" rid="B28">Wang et al., 2022</xref>) and GASOT-Net (<xref ref-type="bibr" rid="B35">Zhang et al., 2024</xref>) based on Swin Transformer, SSRS (<xref ref-type="bibr" rid="B16">Ma et al., 2024b</xref>) fine-tuned on SAM (<xref ref-type="bibr" rid="B10">Kirillov et al., 2023</xref>), and the convolution-based DeepLabV3&#x2b; and UNet. The hyperparameters for the model under study were set as follows: <italic>L1 &#x3d; 2, L2 &#x3d; 4, L3 &#x3d; 8, L4 &#x3d; 4, D1 &#x3d; 64, D2 &#x3d; 128, D3 &#x3d; 256, D4 &#x3d; 512, C &#x3d; 320, N &#x3d; 96</italic>. All models were trained and tested using the farmland forest network dataset, with inputs being two-stage quadruple-upsampled images. Results are presented in <xref ref-type="table" rid="T4">Table 4</xref>.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Performance comparison of semantic segmentation models.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Methods</th>
<th colspan="3" align="center">Cropland</th>
<th colspan="3" align="center">Shelterbelt</th>
<th rowspan="2" align="center">mIoU</th>
<th rowspan="2" align="center">PA/%</th>
</tr>
<tr>
<th align="center">Precision/%</th>
<th align="center">Recall/%</th>
<th align="center">IoU</th>
<th align="center">Precision/%</th>
<th align="center">Recall/%</th>
<th align="center">IoU</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Proposed method</td>
<td align="center">96.42</td>
<td align="center">98.40</td>
<td align="center">0.9513</td>
<td align="center">82.83</td>
<td align="center">87.99</td>
<td align="center">0.7403</td>
<td align="center">0.8345</td>
<td align="center">94.89</td>
</tr>
<tr>
<td align="center">RS<sup>3</sup> Mamba</td>
<td align="center">93.09</td>
<td align="center">95.81</td>
<td align="center">0.9208</td>
<td align="center">82.53</td>
<td align="center">85.05</td>
<td align="center">0.7399</td>
<td align="center">0.8169</td>
<td align="center">93.20</td>
</tr>
<tr>
<td align="center">DC-Swin</td>
<td align="center">94.10</td>
<td align="center">96.41</td>
<td align="center">0.9307</td>
<td align="center">82.55</td>
<td align="center">85.06</td>
<td align="center">0.7401</td>
<td align="center">0.8236</td>
<td align="center">94.55</td>
</tr>
<tr>
<td align="center">GASOT-Net</td>
<td align="center">94.69</td>
<td align="center">97.57</td>
<td align="center">0.9411</td>
<td align="center">82.11</td>
<td align="center">85.51</td>
<td align="center">0.7386</td>
<td align="center">0.8337</td>
<td align="center">94.73</td>
</tr>
<tr>
<td align="center">SSRS</td>
<td align="center">90.21</td>
<td align="center">95.69</td>
<td align="center">0.9174</td>
<td align="center">79.91</td>
<td align="center">82.82</td>
<td align="center">0.7068</td>
<td align="center">0.8014</td>
<td align="center">91.01</td>
</tr>
<tr>
<td align="center">DeepLabV3&#x2b;</td>
<td align="center">94.53</td>
<td align="center">96.85</td>
<td align="center">0.9390</td>
<td align="center">80.47</td>
<td align="center">84.80</td>
<td align="center">0.7362</td>
<td align="center">0.8251</td>
<td align="center">93.57</td>
</tr>
<tr>
<td align="center">UNet</td>
<td align="center">65.12</td>
<td align="center">80.76</td>
<td align="center">0.6773</td>
<td align="center">54.66</td>
<td align="center">72.65</td>
<td align="center">0.5705</td>
<td align="center">0.6159</td>
<td align="center">78.37</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As shown in <xref ref-type="table" rid="T4">Table 4</xref>, the semantic segmentation model developed in this study outperforms the comparison models across all metrics. The segmentation accuracy for cultivated land reached 96.42% with an IoU of 0.9513, while the accuracy for shelterbelt segmentation was 82.83% with an IoU of 0.7403. The model achieved a pixel accuracy (PA) of 94.89% and a mean IoU (mIoU) of 0.8345. Compared with RS<sup>3</sup> Mamba, our model improved the IoU for cultivated land segmentation by 1.31%, increased the pixel recall for shelterbelts by 3.19 percentage points, enhanced PA by 1.32 percentage points, and boosted mIoU by 1.14%. The MFFB module in this model employs FFTFormer to capture high-frequency textures like canopy edges, mitigating confusion between forest strips and cultivated crops at marginal locations. MLLAFormer establishes long-range spatial dependencies, reducing fragmentation in elongated forest strips. Compared to the Transformer-based DC-Swin, our method achieves advantages of 1.09% in mIoU and 0.34% in PA, indicating that the multi-feature fusion mechanism outperforms the single attention paradigm. In comparison with GASOT-Net, which is also based on Swin Transformer, our method achieves a marginal improvement in mIoU 0.08%, while significantly increasing the segmentation accuracy of shelterbelts by 0.72 percentage points, further demonstrating the effectiveness of multi-domain feature fusion for slender objects. Compared to SSRS, our model achieves a 1.32% improvement in mIoU. Agairbelts respectively, along with a 4.13% increase in mIoU and a 3.79 percentage point rise in PA. When benchmarked against UNet, our model achieves a 26.20% improvement in mIoU. The MFFB feature fusion in this model, achieved through a triadic approach combining global, local, and frequency-domain features, addresses the challenge of capturing slender forest strips and objects with similar textures&#x2014;a limitatinst DeepLabV3&#x2b;, our model demonstrates 3.70% and 3.65% higher IoU for cropland and shelteon when relying solely on spatial domain convolutions. All methods demonstrated higher segmentation accuracy for farmland than for shelterbelts, indicating that precise segmentation of elongated objects remains a challenge. Overall, the proposed semantic segmentation model is suitable for farmland forest network segmentation.</p>
<p>To further verify the generalization capability of the proposed method, this study conducted additional comparisons with various baseline methods on the LOVEDA dataset. The experiment quantified the segmentation accuracy of five foreground categories, with the results of each category presented in terms of the Intersection over Union (IoU), as detailed in <xref ref-type="table" rid="T5">Table 5</xref>.</p>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Performance comparison of different semantic segmentation models on the LOVEDA dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">Building</th>
<th align="center">Road</th>
<th align="center">Water</th>
<th align="center">Barren</th>
<th align="center">Forest</th>
<th align="center">Agriculture</th>
<th align="center">mIoU</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Proposed method</td>
<td align="center">0.6017</td>
<td align="center">0.5929</td>
<td align="center">0.6843</td>
<td align="center">0.3851</td>
<td align="center">0.4115</td>
<td align="center">0.2738</td>
<td align="center">0.5054</td>
</tr>
<tr>
<td align="center">RS3 Mamba</td>
<td align="center">0.5875</td>
<td align="center">0.5792</td>
<td align="center">0.6100</td>
<td align="center">0.3724</td>
<td align="center">0.3967</td>
<td align="center">0.2564</td>
<td align="center">0.4693</td>
</tr>
<tr>
<td align="center">DC-Swin</td>
<td align="center">0.5891</td>
<td align="center">0.5757</td>
<td align="center">0.6769</td>
<td align="center">0.2593</td>
<td align="center">0.3710</td>
<td align="center">0.2085</td>
<td align="center">0.4922</td>
</tr>
<tr>
<td align="center">GASOT-Net</td>
<td align="center">0.5896</td>
<td align="center">0.5050</td>
<td align="center">0.6691</td>
<td align="center">0.3572</td>
<td align="center">0.3741</td>
<td align="center">0.2565</td>
<td align="center">0.4957</td>
</tr>
<tr>
<td align="center">SSRS</td>
<td align="center">0.5783</td>
<td align="center">0.5618</td>
<td align="center">0.6637</td>
<td align="center">0.2639</td>
<td align="center">0.3894</td>
<td align="center">0.2352</td>
<td align="center">0.4799</td>
</tr>
<tr>
<td align="center">DeepLabV3&#x2b;</td>
<td align="center">0.5278</td>
<td align="center">0.5189</td>
<td align="center">0.6347</td>
<td align="center">0.2984</td>
<td align="center">0.3423</td>
<td align="center">0.1844</td>
<td align="center">0.4412</td>
</tr>
<tr>
<td align="center">UNet</td>
<td align="center">0.5235</td>
<td align="center">0.4817</td>
<td align="center">0.5426</td>
<td align="center">0.2019</td>
<td align="center">0.3247</td>
<td align="center">0.1632</td>
<td align="center">0.4261</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The results in <xref ref-type="table" rid="T5">Table 5</xref> demonstrate that the proposed method achieves the highest mIoU on the public LOVEDA dataset, confirming its strong generalization capability. The model performs particularly well on structurally distinct categories such as &#x201c;building,&#x201d; &#x201c;road,&#x201d; and &#x201c;river,&#x201d; indicating that its integrated frequency-domain and global features offer significant advantages in extracting generalizable edge and structural information. Compared to other models, the proposed method maintains a leading performance on datasets with more complex scenes and greater object diversity, proving that its design is not only tailored to the specific task of farmland shelterbelt extraction but also exhibits broad applicability.</p>
<p>Building upon the accuracy evaluation, and to analyze the feasibility and efficiency of the proposed method in practical applications, we further compared the computational complexity and inference speed of the semantic segmentation models. <xref ref-type="table" rid="T6">Table 6</xref> reports the parameter count (Params), computational cost (GMACs), and average inference time per image (Latency) for the proposed method and the comparison models. All tests were conducted under identical hardware conditions, with an input resolution of 512 &#xd7; 512 pixels and a batch size of 1, reflecting the operational efficiency of the models in real deployment scenarios.</p>
<table-wrap id="T6" position="float">
<label>TABLE 6</label>
<caption>
<p>Comparison of computational efficiency and deployment metrics.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">Params(M)</th>
<th align="center">GMACs</th>
<th align="center">Latency (ms/img)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Proposed method</td>
<td align="center">94.1</td>
<td align="center">126.5</td>
<td align="center">29.2</td>
</tr>
<tr>
<td align="center">RS<sup>3</sup> Mamba</td>
<td align="center">45.7</td>
<td align="center">78.4</td>
<td align="center">22.1</td>
</tr>
<tr>
<td align="center">DC-Swin</td>
<td align="center">96.2</td>
<td align="center">161.3</td>
<td align="center">47.5</td>
</tr>
<tr>
<td align="center">GASOT-Net</td>
<td align="center">176.4</td>
<td align="center">277.6</td>
<td align="center">103.8</td>
</tr>
<tr>
<td align="center">SSRS</td>
<td align="center">48.7</td>
<td align="center">85.3</td>
<td align="center">28.3</td>
</tr>
<tr>
<td align="center">DeepLabV3&#x2b;</td>
<td align="center">43.3</td>
<td align="center">82.1</td>
<td align="center">60.8</td>
</tr>
<tr>
<td align="center">UNet</td>
<td align="center">18.9</td>
<td align="center">219.2</td>
<td align="center">49.7</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In terms of computational efficiency and deployment performance, the proposed method achieves a favorable balance between model complexity and inference speed. Although its parameter count is higher than that of lightweight models such as RS<sup>3</sup> Mamba, it is significantly lower than that of the computationally intensive GASOT-Net. Its computational cost is only 45.5% of GASOT-Net&#x2019;s GMACs and lower than UNet&#x2019;s, indicating that the multi-feature fusion mechanism enhances representational capacity without introducing excessive computational overhead. Moreover, the inference latency of 29.2&#xa0;ms shows that the MFFB module strengthens feature representation without a prohibitive runtime penalty. Overall, while maintaining high segmentation accuracy, the proposed method exhibits relatively superior computational efficiency and real-time processing capability, making it suitable for remote sensing extraction of farmland shelterbelts, a task that requires both efficiency and precision.</p>
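<p>For reproducibility, the following is a minimal sketch of the measurement protocol behind the Params and Latency columns (512 &#xd7; 512 input, batch size 1); the variable model stands in for any of the compared networks, and GMACs would be obtained from a separate profiling tool.</p>
<preformat>
import time
import torch

def params_millions(model):
    """Trainable parameters, in millions (the Params column)."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6

@torch.no_grad()
def latency_ms(model, runs=100, warmup=10):
    """Average forward time per 512x512 image at batch size 1 (the Latency column)."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device).eval()
    x = torch.randn(1, 3, 512, 512, device=device)
    for _ in range(warmup):                # warm up kernels and caches
        model(x)
    if device == "cuda":
        torch.cuda.synchronize()           # time actual GPU work, not queuing
    start = time.perf_counter()
    for _ in range(runs):
        model(x)
    if device == "cuda":
        torch.cuda.synchronize()
    return (time.perf_counter() - start) / runs * 1e3
</preformat>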
</sec>
<sec id="s3-2-2">
<label>3.2.2</label>
<title>Feature extraction module for semantic segmentation effectiveness analysis</title>
<p>To analyse the contributions of MFFB&#x2019;s core submodules (MLLAFormer, FFTFormer, and SGFM) to semantic segmentation, four variant models were constructed. Baseline Model A disables all three submodules: within MFFB, MLLAFormer and FFTFormer are replaced by ConvFormer blocks, and SGFM is replaced by element-wise addition. Model B enables MLLAFormer on top of Model A, Model C enables FFTFormer on top of Model B, and Model D further enables SGFM; the treatments are summarized in the sketch below. Each variant was trained and tested on the farmland forest network dataset using identical procedures, with results shown in <xref ref-type="table" rid="T7">Table 7</xref>.</p>
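<p>The cumulative enabling of submodules can be written as a simple configuration table; the Python sketch below (with hypothetical field names) mirrors the treatments in <xref ref-type="table" rid="T7">Table 7</xref>.</p>
<preformat>
from dataclasses import dataclass

@dataclass
class MFFBVariant:
    """Ablation switches for the MFFB submodules (field names are illustrative)."""
    mllaformer: bool   # global-context branch; ConvFormer substitute when False
    fftformer: bool    # frequency-domain branch; ConvFormer substitute when False
    sgfm: bool         # gated fusion; element-wise addition when False

VARIANTS = {
    "A": MFFBVariant(False, False, False),   # baseline
    "B": MFFBVariant(True,  False, False),
    "C": MFFBVariant(True,  True,  False),
    "D": MFFBVariant(True,  True,  True),    # full model adopted in this study
}
</preformat>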
<table-wrap id="T7" position="float">
<label>TABLE 7</label>
<caption>
<p>Comparison of semantic segmentation results across model variants.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Model no.</th>
<th colspan="3" align="center">Treatments</th>
<th colspan="3" align="center">Cropland</th>
<th colspan="3" align="center">Shelterbelt</th>
<th rowspan="2" align="center">mIoU</th>
<th rowspan="2" align="center">PA/%</th>
</tr>
<tr>
<th align="center">MLLAFormer</th>
<th align="center">FFTFormer</th>
<th align="center">SGFM</th>
<th align="center">Precision/%</th>
<th align="center">Recall/%</th>
<th align="center">IoU</th>
<th align="center">Precision/%</th>
<th align="center">Recall/%</th>
<th align="center">IoU</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">A</td>
<td align="left">&#x200b;</td>
<td align="left">&#x200b;</td>
<td align="left">&#x200b;</td>
<td align="center">94.86</td>
<td align="center">98.01</td>
<td align="center">0.9338</td>
<td align="center">79.19</td>
<td align="center">86.76</td>
<td align="center">0.7113</td>
<td align="center">0.7937</td>
<td align="center">93.28</td>
</tr>
<tr>
<td align="center">B</td>
<td align="center">&#x2713;</td>
<td align="left">&#x200b;</td>
<td align="left">&#x200b;</td>
<td align="center">95.76</td>
<td align="center">98.12</td>
<td align="center">0.9403</td>
<td align="center">79.97</td>
<td align="center">86.93</td>
<td align="center">0.7187</td>
<td align="center">0.8063</td>
<td align="center">93.86</td>
</tr>
<tr>
<td align="center">C</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="left">&#x200b;</td>
<td align="center">96.29</td>
<td align="center">98.38</td>
<td align="center">0.9445</td>
<td align="center">81.58</td>
<td align="center">87.48</td>
<td align="center">0.7388</td>
<td align="center">0.8205</td>
<td align="center">94.05</td>
</tr>
<tr>
<td align="center">D</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">96.42</td>
<td align="center">98.40</td>
<td align="center">0.9513</td>
<td align="center">82.83</td>
<td align="center">87.99</td>
<td align="center">0.7403</td>
<td align="center">0.8345</td>
<td align="center">94.89</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As shown in <xref ref-type="table" rid="T7">Table 7</xref>, the semantic segmentation performance of the model gradually improves with the progressive introduction of submodules in MFFB. Compared to Model A, Model B achieves a 0.70% increase in cultivated land segmentation IoU, a 0.90 percentage point improvement in precision, and a 1.04% rise in Farmland Shelterbelts segmentation IoU. This demonstrates that MLLAFormer&#x2019;s global context modeling capability enhances segmentation for both large-scale and elongated objects. Compared to Model B, Model C achieves a 0.53 percentage point increase in cultivated land segmentation accuracy, a 1.61 percentage point increase in protective forest belt segmentation accuracy, a 0.45% improvement in IoU for cultivated land, a 2.80% improvement in IoU for protective forest belts, and a 1.76% increase in mIoU. This demonstrates that FFTFormer&#x2019;s frequency domain feature extraction plays a crucial role in segmenting elongated objects. Frequency-domain feature extraction enhances the clarity of elongated object boundary features, improving the distinguishability between different object boundaries. Compared to Model C, Model D achieves a 1.25 percentage point increase in Farmland Shelterbelts segmentation accuracy and a 1.71% improvement in mIoU, indicating that SGFM contributes to enhancing semantic segmentation accuracy. SGFM&#x2019;s spatial gated weighting coordinates the synergistic optimization of local, global, and frequency-domain spatial features, positively enhancing the model&#x2019;s semantic segmentation performance. Compared to Model A, Model D achieved a 1.56 percentage point increase in cultivated land segmentation accuracy and a 1.87 percentage point increase in IoU, while Farmland Shelterbelts segmentation accuracy and IoU improved by 3.64 and 4.08 percentage points, respectively. while mIoU increased by 5.14%, validating the effectiveness of the MFFB module design. Overall, the coupled design of multi-feature fusion effectively improved the semantic segmentation accuracy of farmland shelterbelts. Model D is adopted as the final semantic segmentation model for this study.</p>
</sec>
<sec id="s3-2-3">
<label>3.2.3</label>
<title>Effectiveness analysis of super-resolution for semantic segmentation</title>
<p>We further quantitatively evaluated the effectiveness of super-resolution preprocessing for the semantic segmentation of farmland shelterbelts. Using the same semantic segmentation model, three comparative experiments were conducted: Experiment I served as the control group, directly inputting images at native resolution; Experiment II used images upsampled via 2&#xd7; super-resolution; and Experiment III employed images upsampled via a two-stage 4&#xd7; super-resolution process. The semantic segmentation accuracy was calculated for each group, with results presented in <xref ref-type="table" rid="T8">Table 8</xref>.</p>
<table-wrap id="T8" position="float">
<label>TABLE 8</label>
<caption>
<p>Impact of super-resolution on semantic segmentation performance.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Experiments</th>
<th rowspan="2" align="center">Upsampling factor</th>
<th colspan="3" align="center">Cropland</th>
<th colspan="3" align="center">Shelterbelt</th>
<th rowspan="2" align="center">mIoU</th>
<th rowspan="2" align="center">PA/%</th>
</tr>
<tr>
<th align="center">Precision/%</th>
<th align="center">Recall/%</th>
<th align="center">IoU</th>
<th align="center">Precision/%</th>
<th align="center">Recall/%</th>
<th align="center">IoU</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">I</td>
<td align="center">&#x2014;&#x2014;</td>
<td align="center">95.51</td>
<td align="center">97.70</td>
<td align="center">0.9340</td>
<td align="center">78.51</td>
<td align="center">85.75</td>
<td align="center">0.6944</td>
<td align="center">0.8141</td>
<td align="center">93.95</td>
</tr>
<tr>
<td align="center">II</td>
<td align="center">2&#xd7;</td>
<td align="center">96.36</td>
<td align="center">98.17</td>
<td align="center">0.9487</td>
<td align="center">79.70</td>
<td align="center">87.65</td>
<td align="center">0.7188</td>
<td align="center">0.8269</td>
<td align="center">94.20</td>
</tr>
<tr>
<td align="center">III</td>
<td align="center">4&#xd7;</td>
<td align="center">96.42</td>
<td align="center">98.40</td>
<td align="center">0.9513</td>
<td align="center">82.83</td>
<td align="center">87.99</td>
<td align="center">0.7403</td>
<td align="center">0.8345</td>
<td align="center">94.89</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As shown in <xref ref-type="table" rid="T8">Table 8</xref>, super-resolution preprocessing significantly enhances semantic segmentation performance. Compared to the control group, 4&#xd7; upsampling improved segmentation accuracy for farmland and shelterbelts by 0.91 and 4.32 percentage points respectively, increased IoU by 1.85% and 6.61%, and boosted mIoU for semantic segmentation by 2.47%. Super-resolution preprocessing enhances segmentation for both farmland and shelterbelts, particularly improving the segmentation accuracy of shelterbelts. Super-resolution reconstruction amplifies high-frequency components in images, thereby improving the texture discrimination capability of semantic segmentation models. Simultaneously, super-resolution preprocessing enhances the width and edge detail information of narrow shelterbelts, effectively alleviating the challenge of segmenting elongated objects faced by semantic segmentation models. This study also conducted segmentation experiments with 8&#xd7; upsampling, but encountered rapid performance degradation. This was attributed to artifacts appearing in the significantly upsampled images, whose style differed markedly from the training samples, causing a substantial decline in semantic segmentation model performance. Therefore, these results are not reported. Overall, super-resolution preprocessing at certain magnifications significantly improves semantic segmentation accuracy, particularly for challenging narrow Farmland Shelterbeltss.</p>
</sec>
</sec>
<sec id="s3-3">
<label>3.3</label>
<title>Segmentation result vectorization and application examples</title>
<p>Selected test images from the study area underwent the two-stage 4&#xd7; upsampling process using the super-resolution model developed in this study. The semantic segmentation model was then applied to delineate cultivated land and farmland shelterbelts. Following the technical framework illustrated in <xref ref-type="fig" rid="F2">Figure 2</xref>, the segmentation masks were vectorized and converted into shapefile-format vector files; a minimal sketch of this conversion is given below. Selected sample images and vectorization results are shown in <xref ref-type="fig" rid="F8">Figure 8</xref>.</p>
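<p>The sketch below uses rasterio and shapely as stand-ins for the implementation: polygons are traced from the georeferenced mask, simplified with Douglas-Peucker, and written to a shapefile. Paths, the class identifier, and the simplification tolerance are illustrative assumptions.</p>
<preformat>
import geopandas as gpd
import rasterio
from rasterio.features import shapes
from shapely.geometry import shape

def mask_to_shapefile(mask_path, class_id, out_path, tolerance=1.0):
    """Vectorize one class of a segmentation mask into a shapefile (sketch)."""
    with rasterio.open(mask_path) as src:
        mask = src.read(1)
        polygons = [
            # Douglas-Peucker simplification of each traced region boundary.
            shape(geom).simplify(tolerance, preserve_topology=False)
            for geom, value in shapes(mask, mask=(mask == class_id),
                                      transform=src.transform)
            if value == class_id
        ]
        gdf = gpd.GeoDataFrame({"class": class_id, "geometry": polygons},
                               crs=src.crs)
        gdf.to_file(out_path)   # e.g. "shelterbelt.shp"
</preformat>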
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Example of image vectorization.</p>
</caption>
<graphic xlink:href="frsen-07-1730222-g008.tif">
<alt-text content-type="machine-generated">Grid of fifteen panels showing examples of a remote sensing workflow for agricultural field analysis. Top row displays original aerial imagery, middle row shows corresponding yellow and black segmentation masks with clear parcel boundaries, and bottom row presents orange and green vectorized maps. The final column illustrates local details with close-up views of masks and vector lines.</alt-text>
</graphic>
</fig>
<p>As shown in <xref ref-type="fig" rid="F8">Figure 8</xref>, the semantic segmentation model developed in this study effectively delineates cultivated land parcels and farmland shelterbelts, with segmentation masks successfully converted into vector maps. Estimated areas of cultivated land and shelterbelts were derived from these vector maps. Furthermore, based on the geographic boundaries corresponding to the sample images, data from the Third National Land Survey of China (released in 2021) was obtained for the respective regions. This dataset includes cultivated land areas derived from actual field measurements, which are considered the actual values for cultivated land. Simultaneously, the manually annotated results from the sample images were vectorized to obtain the corresponding protective forest belt areas, which serve as the actual values for forest belt areas. The overall performance of the technical framework was evaluated using absolute error and relative error as assessment metrics, with results presented in <xref ref-type="table" rid="T9">Table 9</xref>.</p>
<table-wrap id="T9" position="float">
<label>TABLE 9</label>
<caption>
<p>Evaluation of vectorization accuracy from remote sensing imagery.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Samples</th>
<th colspan="4" align="center">Cropland</th>
<th colspan="4" align="center">Shelterbelt</th>
</tr>
<tr>
<th align="center">Actual area/hm<sup>2</sup>
</th>
<th align="center">Estimated area/hm<sup>2</sup>
</th>
<th align="center">Absolute error/hm<sup>2</sup>
</th>
<th align="center">Relative error/%</th>
<th align="center">Actual area/hm<sup>2</sup>
</th>
<th align="center">Estimated area/hm<sup>2</sup>
</th>
<th align="center">Absolute error/hm<sup>2</sup>
</th>
<th align="center">Relative error/%</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">1</td>
<td align="center">1134.64</td>
<td align="center">1062.85</td>
<td align="center">71.79</td>
<td align="center">6.33</td>
<td align="center">82.43</td>
<td align="center">86.85</td>
<td align="center">4.42</td>
<td align="center">5.36</td>
</tr>
<tr>
<td align="center">2</td>
<td align="center">1233.59</td>
<td align="center">1108.49</td>
<td align="center">125.10</td>
<td align="center">10.14</td>
<td align="center">81.26</td>
<td align="center">86.19</td>
<td align="center">4.93</td>
<td align="center">6.07</td>
</tr>
<tr>
<td align="center">3</td>
<td align="center">1211.37</td>
<td align="center">1141.47</td>
<td align="center">69.90</td>
<td align="center">5.77</td>
<td align="center">60.97</td>
<td align="center">67.09</td>
<td align="center">6.12</td>
<td align="center">10.04</td>
</tr>
<tr>
<td align="center">4</td>
<td align="center">1202.05</td>
<td align="center">1108.66</td>
<td align="center">93.39</td>
<td align="center">7.77</td>
<td align="center">89.77</td>
<td align="center">94.77</td>
<td align="center">5.00</td>
<td align="center">5.57</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As shown in <xref ref-type="table" rid="T9">Table 9</xref>, the proposed method demonstrates high accuracy in both farmland and shelterbelt area inversion. The relative error range for farmland area inversion is 5.77%&#x2013;10.14%, with an average relative error of approximately 7.50%. The relative error range for shelterbelt area inversion was 5.36%&#x2013;10.04%, with an average relative error of approximately 6.76%. Despite the small size and elongated, fragmented morphology of shelterbelts, their area inversion accuracy was comparable to that of farmland, with a lower average error. This demonstrates the effectiveness of the remote sensing inversion and vector extraction method for farmland shelterbelts. The technical framework developed in this study enables automated, high-precision extraction of farmland forest network geographic information, demonstrating significant engineering application value.</p>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<label>4</label>
<title>Discussion</title>
<p>While this study primarily focuses on farmland shelterbelt extraction, the core architectural innovations have broader applicability. To evaluate the robustness and generalizability of the proposed framework, supplementary experiments were conducted on the publicly available LoveDA dataset. The results demonstrate that our method performs best on categories with distinct structural features, such as &#x201c;building,&#x201d; &#x201c;road,&#x201d; and &#x201c;water,&#x201d; and reaches the highest mIoU (0.5054) among the compared methods. This performance gain provides empirical evidence that integrating frequency-domain features and global context via the MFFB module captures transferable structural information. Consequently, the framework is not merely specialized for shelterbelt detection but generalizes to diverse objects within complex urban and rural landscapes.</p>
<p>Building upon these findings, future research will further explore the adaptability of the framework across multi-source and multi-resolution sensor data. Specifically, we plan to validate its performance on Sentinel-2 medium-resolution multispectral imagery and the Chinese Gaofen (GF-2, GF-6) series. A key objective will be investigating how the frequency-domain components can compensate for the loss of spatial detail inherent in lower-resolution data. Furthermore, considering the significant bio-geographical variations in shelterbelts&#x2014;such as differences in tree species (e.g., poplar vs. pine) and canopy density across arid and humid regions&#x2014;we intend to implement transfer learning and domain adaptation strategies. By leveraging the multi-domain feature representations learned from the AWSD2025 and LoveDA datasets, we aim to develop a cross-regional, sensor-agnostic monitoring system, providing a robust technical foundation for large-scale agricultural remote sensing assessments.</p>
</sec>
<sec sec-type="conclusion" id="s5">
<label>5</label>
<title>Conclusion</title>
<p>This study addresses two critical challenges in extracting farmland shelterbelt networks from high-resolution remote sensing imagery: (1) inadequate feature extraction and boundary ambiguity caused by the elongated morphological characteristics of shelterbelts, and (2) inter-class confusion due to spectral similarity between woodland and crops. To overcome these challenges, we propose a novel remote sensing inversion framework based on multi-feature fusion.</p>
<p>The core contribution of this framework lies in the construction of a unified multi-feature fusion module that synergistically integrates Mamba-inspired linear attention, convolutional operators, and Fast Fourier Transform to collaboratively extract global contextual, local detailed, and frequency-domain features. A specially designed spatially gated fusion mechanism enables the model to adaptively integrate these complementary features. Building upon this module, we developed both a super-resolution model and a semantic segmentation model, forming a comprehensive processing pipeline. To address the morphological characteristics of shelterbelts, we adopted a multi-stage upsampling strategy for super-resolution preprocessing, effectively enhancing the morphological features and reconstructing crucial details.</p>
<p>Experiments conducted on our self-constructed farmland shelterbelt dataset demonstrate the effectiveness of the proposed method. Ablation studies confirm the importance of each component in the multi-feature fusion module and validate the effectiveness of their collaborative design. Compared with existing state-of-the-art methods, our framework shows superior performance across multiple evaluation metrics, particularly in enhancing the segmentation accuracy of elongated features such as shelterbelts. The research also verifies the promoting effect of the super-resolution preprocessing strategy on semantic segmentation performance, while the final vectorization results demonstrate high accuracy in area inversion, highlighting the practical application value of our technical framework.</p>
<p>Future work will focus on exploring the applicability of this framework to other agricultural geographic element extraction tasks and investigating its potential in more challenging scenarios such as on-orbit satellite data processing, thereby providing new technical support for advancing the automation and intelligence of agricultural remote sensing technology.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found in the article/supplementary material.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>QZ: Data curation, Formal Analysis, Methodology, Software, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review and editing. YZ: Funding acquisition, Resources, Supervision, Writing &#x2013; review and editing. HZ: Data curation, Formal Analysis, Writing &#x2013; review and editing. WW: Data curation, Writing &#x2013; review and editing. YH: Data curation, Writing &#x2013; review and editing.</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s10">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ariav</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Cohen</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Depth map super-resolution <italic>via</italic> cascaded transformers guidance</article-title>. <source>Front. Signal Process.</source> <volume>2</volume>, <fpage>847890</fpage>. <pub-id pub-id-type="doi">10.3389/frsip.2022.847890</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chanmee</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kesorn</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Semantic decision trees: a new learning system for the ID3-Based algorithm using a knowledge base</article-title>. <source>Adv. Eng. Inf.</source> <volume>58</volume>, <fpage>102156</fpage>. <pub-id pub-id-type="doi">10.1016/j.aei.2023.102156</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>L. C.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Papandreou</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Schroff</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Adam</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Encoder-decoder with atrous separable convolution for semantic image segmentation</article-title>,&#x201d; in <source>Computer vision &#x2013; eccv 2018</source>. <source>Lecture notes in computer science</source> (<publisher-loc>Munich, Germany</publisher-loc>: <publisher-name>Springer</publisher-name>), <volume>11211</volume>, <fpage>801</fpage>&#x2013;<lpage>818</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-01234-2_49</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Duan</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Effects of participation in cooperatives on the cultivated land quality protection behavior of grain family farms: evidence from China</article-title>. <source>Front. Sustain. Food Syst.</source> <volume>8</volume>, <fpage>1378847</fpage>. <pub-id pub-id-type="doi">10.3389/fsufs.2024.1378847</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Feng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Frequency-quantized variational autoencoder based on 2D-FFT for enhanced image reconstruction and generation</article-title>. <source>Comput. Mater. Contin.</source> <volume>83</volume>, <fpage>1547</fpage>&#x2013;<lpage>1562</lpage>. <pub-id pub-id-type="doi">10.32604/cmc.2025.056248</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gu</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Dao</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Mamba: linear-time sequence modeling with selective state spaces</article-title>. <source>arXiv Prepr</source>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2312.00752">https://arxiv.org/abs/2312.00752</ext-link> (Accessed December 15, 2024).</comment>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Guo</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>MambaIR: a simple baseline for image restoration with state-space model</article-title>,&#x201d; in <source>Proceedings of the European conference on computer vision (ECCV 2024)</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer Nature Switzerland</publisher-name>), <fpage>222</fpage>&#x2013;<lpage>241</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-031-72307-9_12</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Han</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Flatten transformer: vision transformer with focused linear attention</article-title>. <source>IEEE Transactions on Pattern Analysis and Machine Intelligence</source> <volume>46</volume>(<issue>7</issue>), <fpage>4861</fpage>&#x2013;<lpage>4875</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2024.3368040</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>A new framework for improving semantic segmentation in aerial imagery</article-title>. <source>Front. Remote Sens.</source> <volume>5</volume>, <fpage>1370697</fpage>. <pub-id pub-id-type="doi">10.3389/frsen.2024.1370697</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Katharopoulos</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Vyas</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Pappas</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Fleuret</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Transformers are rnns: fast autoregressive transformers with linear attention</article-title>,&#x201d; in <source>Proceedings of the 37th international conference on machine learning</source> (<publisher-loc>Vienna, Austria</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>5156</fpage>&#x2013;<lpage>5165</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v119/katharopoulos20a.html">https://proceedings.mlr.press/v119/katharopoulos20a.html</ext-link> (Accessed March 10, 2023).</comment>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Kirillov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Mintun</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Ravi</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Mao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Rolland</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gustafson</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). &#x201c;<article-title>Segment anything</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF international conference on computer vision</source> (<publisher-loc>Vancouver, Canada</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4015</fpage>&#x2013;<lpage>4026</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV51070.2023.02985</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kwenda</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gwetu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Fonou-Dombeu</surname>
<given-names>J. V.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Hybridizing deep neural networks and machine learning models for aerial satellite forest image segmentation</article-title>. <source>J. Imaging</source> <volume>10</volume>, <fpage>132</fpage>. <pub-id pub-id-type="doi">10.3390/jimaging10060132</pub-id>
<pub-id pub-id-type="pmid">38921609</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lei</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Mo</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Transformer-based multistage enhancement for remote sensing image super-resolution</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>60</volume>, <fpage>5612101</fpage>&#x2013;<lpage>5612111</lpage>. <pub-id pub-id-type="doi">10.1109/TGRS.2021.3068327</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>ConvFormer: plug-And-play CNN-style transformers for improving medical image segmentation</article-title>,&#x201d; in <source>Medical image computing and computer assisted intervention &#x2013; miccai 2023</source>. <source>Lecture notes in computer science</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Greenspan</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Madabhushi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Mousavi</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Salcudean</surname>
<given-names>S.</given-names>
</name>
</person-group> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <volume>14223</volume>, <fpage>642</fpage>&#x2013;<lpage>651</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-031-43901-8_61</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Evaluating the potential of Sentinel-2 time series imagery and machine learning for tree species classification in a mountainous forest</article-title>. <source>Remote Sens.</source> <volume>16</volume>, <fpage>293</fpage>. <pub-id pub-id-type="doi">10.3390/rs16020293</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Pun</surname>
<given-names>M. O.</given-names>
</name>
</person-group> (<year>2024a</year>). <article-title>RS3Mamba: visual state space model for remote sensing image semantic segmentation</article-title>. <source>IEEE Geosci. Remote Sens. Lett.</source> <volume>21</volume>, <fpage>55</fpage>&#x2013;<lpage>58</lpage>. <pub-id pub-id-type="doi">10.1109/LGRS.2024.3405678</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2024b</year>). <article-title>SAM-Assisted remote sensing imagery semantic segmentation with object and boundary constraints</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>62</volume>, <fpage>54</fpage>&#x2013;<lpage>56</lpage>. <pub-id pub-id-type="doi">10.1109/TGRS.2024.3405679</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ok</surname>
<given-names>A. O.</given-names>
</name>
<name>
<surname>Akar</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Gungor</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Evaluation of random forest method for agricultural crop classification</article-title>. <source>Eur. J. Remote Sens.</source> <volume>45</volume>, <fpage>421</fpage>&#x2013;<lpage>432</lpage>. <pub-id pub-id-type="doi">10.5721/EuJRS20124535</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qiao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Hi-mamba: hierarchical mamba for efficient image super-resolution</article-title>. <source>arXiv Prepr</source>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2410.10140">https://arxiv.org/abs/2410.10140</ext-link> (Accessed January 26, 2025).</comment>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Rahaman</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Baratin</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Arpit</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Draxler</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hamprecht</surname>
<given-names>F.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>On the spectral bias of neural networks</article-title>,&#x201d; in <source>Proceedings of the 36th international conference on machine learning</source> <publisher-loc>Long Beach, CA, USA</publisher-loc>: <publisher-name>PMLR</publisher-name>, <fpage>5301</fpage>&#x2013;<lpage>5310</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v97/rahaman19a.html">https://proceedings.mlr.press/v97/rahaman19a.html</ext-link> (Accessed July 5, 2023).</comment>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Rakhlin</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Davydow</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Nikolenko</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Land cover classification from satellite imagery with u-net and lov&#xe1;sz-softmax loss</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition workshops</source> (<publisher-loc>Salt Lake City, UT, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2304</fpage>&#x2013;<lpage>2312</lpage>. <pub-id pub-id-type="doi">10.1109/CVPRW.2018.00297</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ronneberger</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Fischer</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Brox</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>U-Net: Convolutional networks for biomedical image segmentation</article-title>,&#x201d; in <source>Medical image computing and computer-assisted intervention &#x2013; miccai 2015</source>. <source>Lecture notes in computer science</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Navab</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Hornegger</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wells</surname>
<given-names>W. M.</given-names>
</name>
<name>
<surname>Frangi</surname>
<given-names>A. F.</given-names>
</name>
</person-group> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <volume>9351</volume>, <fpage>234</fpage>&#x2013;<lpage>241</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-24574-4_28</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ru</surname>
<given-names>F. X.</given-names>
</name>
<name>
<surname>Zulkifley</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Abdani</surname>
<given-names>S. R.</given-names>
</name>
<name>
<surname>Spraggon</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Forest segmentation with spatial pyramid pooling modules: a surveillance system based on satellite images</article-title>. <source>Forests</source> <volume>14</volume>, <fpage>405</fpage>. <pub-id pub-id-type="doi">10.3390/f14020405</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Suzuki</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Abe</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>1985</year>). <article-title>Topological structural analysis of digitized binary images by border following</article-title>. <source>Comput. Vis. Graph. Image Process.</source> <volume>30</volume>, <fpage>32</fpage>&#x2013;<lpage>46</lpage>. <pub-id pub-id-type="doi">10.1016/0734-189X(85)90016-7</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tanchenko</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Visual-PSNR measure of image quality</article-title>. <source>J. Vis. Commun. Image Represent.</source> <volume>25</volume>, <fpage>874</fpage>&#x2013;<lpage>878</lpage>. <pub-id pub-id-type="doi">10.1016/j.jvcir.2014.01.008</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Terven</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>C&#xf3;rdova-Esparza</surname>
<given-names>D. M.</given-names>
</name>
<name>
<surname>Romero-Gonz&#xe1;lez</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Ramirez-Pedraza</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>A comprehensive review of YOLO architectures in computer vision: from YOLOv1 to YOLOv8 and YOLO-NAS</article-title>. <source>Mach. Learn. Knowl. Extr.</source> <volume>5</volume>, <fpage>1680</fpage>&#x2013;<lpage>1716</lpage>. <pub-id pub-id-type="doi">10.3390/make5040081</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Visvalingam</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Whyatt</surname>
<given-names>J. D.</given-names>
</name>
</person-group> (<year>1990</year>). <article-title>The douglas-peucker algorithm for line simplification: Re-evaluation through visualization</article-title>. <source>Comput. Graph. Forum</source> <volume>9</volume>, <fpage>213</fpage>&#x2013;<lpage>225</lpage>. <pub-id pub-id-type="doi">10.1111/j.1467-8659.1990.tb00398.x</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Bovik</surname>
<given-names>A. C.</given-names>
</name>
<name>
<surname>Sheikh</surname>
<given-names>H. R.</given-names>
</name>
<name>
<surname>Simoncelli</surname>
<given-names>E. P.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>Image quality assessment: from error visibility to structural similarity</article-title>. <source>IEEE Trans. Image Process.</source> <volume>13</volume>, <fpage>600</fpage>&#x2013;<lpage>612</lpage>. <pub-id pub-id-type="doi">10.1109/TIP.2003.819861</pub-id>
<pub-id pub-id-type="pmid">15376593</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Duan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A novel transformer based semantic segmentation scheme for fine-resolution remote sensing images</article-title>. <source>IEEE Geosci. Remote Sens. Lett.</source> <volume>19</volume>, <fpage>80</fpage>&#x2013;<lpage>85</lpage>. <pub-id pub-id-type="doi">10.1109/LGRS.2022.3158934</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Dang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Wen</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2024a</year>). <article-title>Optimization and classification control of permanent basic farmland based on quality classification</article-title>. <source>Front. Environ. Sci.</source> <volume>12</volume>, <fpage>1331534</fpage>. <pub-id pub-id-type="doi">10.3389/fenvs.2024.1331534</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2024b</year>). <article-title>Lightweight land cover classification <italic>via</italic> semantic segmentation of remote sensing imagery and analysis of influencing factors</article-title>. <source>Front. Environ. Sci.</source> <volume>12</volume>, <fpage>1329517</fpage>. <pub-id pub-id-type="doi">10.3389/fenvs.2024.1329517</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Wei</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Hoai</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Region ranking SVM for image classification</article-title>,&#x201d; in <source>2016 IEEE conference on computer vision and pattern recognition (CVPR), Las Vegas, NV, USA</source> (<publisher-name>IEEE</publisher-name>), <fpage>2987</fpage>&#x2013;<lpage>2996</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2016.325</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Frequency-assisted mamba for remote sensing image super-resolution</article-title>. <source>IEEE Trans. Multimed.</source> <volume>26</volume>, <fpage>8123</fpage>&#x2013;<lpage>8136</lpage>. <pub-id pub-id-type="doi">10.1109/TMM.2024.3407563</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiao</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>MF-Mamba: Multiscale convolution and mamba fusion model for semantic segmentation of remote sensing imagery</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>63</volume>, <fpage>1</fpage>&#x2013;<lpage>16</lpage>. <pub-id pub-id-type="doi">10.1109/TGRS.2025.3593410</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>High-resolution U-Net: preserving image details for cultivated land extraction</article-title>. <source>Sensors</source> <volume>20</volume>, <fpage>4064</fpage>. <pub-id pub-id-type="doi">10.3390/s20154064</pub-id>
<pub-id pub-id-type="pmid">32707825</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Tashi</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Global adaptive second-order transformer for remote sensing image semantic segmentation</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>62</volume>, <fpage>1</fpage>&#x2013;<lpage>17</lpage>. <pub-id pub-id-type="doi">10.1109/TGRS.2024.3400262</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Chai</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Individual tree crown segmentation in subtropical broadleaf forests using UAV-Based ultrahigh-resolution RGB data</article-title>,&#x201d; in <source>2024 IEEE international Geoscience and remote sensing symposium (IGARSS), Athens, Greece</source> (<publisher-name>IEEE</publisher-name>), <fpage>3097</fpage>&#x2013;<lpage>3099</lpage>. <pub-id pub-id-type="doi">10.1109/IGARSS53475.2024.10642737</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/597227/overview">Salvatore Manfreda</ext-link>, University of Naples Federico II, Italy</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2412764/overview">Ehsan Khoramshahi</ext-link>, University of Eastern Finland, Finland</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2958341/overview">Yijie Zhang</ext-link>, University of Electronic Science and Technology of China, China</p>
</fn>
</fn-group>
</back>
</article>