<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2025.1733727</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Multiscale CNN-state space model with feature fusion for crop disease detection from UAV imagery</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Zhang</surname><given-names>Ting</given-names></name>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3257086/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname><given-names>Dengwu</given-names></name>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Chen</surname><given-names>Wen</given-names></name>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
</contrib-group>
<aff id="aff1"><institution>School of Computer Science, Xijing University</institution>, <city>Xi&#x2019;an</city>,&#xa0;<country country="CN">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Ting Zhang, <email xlink:href="mailto:ztpaper25@163.com">ztpaper25@163.com</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2025-12-17">
<day>17</day>
<month>12</month>
<year>2025</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>16</volume>
<elocation-id>1733727</elocation-id>
<history>
<date date-type="received">
<day>28</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>25</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>22</day>
<month>11</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Zhang, Wang and Chen.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Zhang, Wang and Chen</copyright-holder>
<license>
<ali:license_ref start_date="2025-12-17">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Accurate detection of crop diseases from unmanned aerial vehicle (UAV) imagery is critical for precision agriculture. This task remains challenging due to the complex backgrounds, variable scales of lesions, and the need to model both fine-grained spot details and long-range spatial dependencies within large field scenes. To address these issues, this paper proposes a novel Multiscale CNN-State Space Model with Feature Fusion (MSCNN-VSS). The model is specifically designed to hierarchically extract and integrate multi-level features for UAV-based analysis: a dilated multi-scale Inception module is introduced to capture diverse local lesion patterns across different scales without sacrificing spatial detail; a Visual State Space (VSS) block serves as the core component to efficiently model global contextual relationships across the canopy with linear computational complexity, effectively overcoming the limitations of Transformers on high-resolution UAV images; and a hybrid attention module is subsequently applied to refine the fused features and accentuate subtle diseased regions. Extensive experiments on a UAV-based crop disease dataset demonstrate that MSCNN-VSS achieves state-of-the-art performance, with a Pixel Accuracy (PA) of 0.9421 and a mean Intersection over Union (mIoU) of 0.9152, significantly outperforming existing CNN and Transformer-based benchmarks. This work provides a balanced and effective solution for automated crop disease detection in practical agricultural scenarios.</p>
</abstract>
<kwd-group>
<kwd>crop disease detection</kwd>
<kwd>superpixel segmentation</kwd>
<kwd>unmanned aerial vehicle (UAV)</kwd>
<kwd>Visual State Space (VSS)</kwd>
<kwd>multiscale CNN-VSS with feature fusion (MSCNN-VSS)</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared financial support was received for this work and/or its publication. This research was funded by the Key Research and Development Program of Shaanxi Provincial Department of Science and Technology (2025NC-YBXM-216, 2025GH-YBXM-077) and the Development Support Project of Shaanxi Provincial Department of Education (24JR158, 24JR159).</funding-statement>
</funding-group>
<counts>
<fig-count count="6"/>
<table-count count="5"/>
<equation-count count="7"/>
<ref-count count="24"/>
<page-count count="11"/>
<word-count count="4827"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Sustainable and Intelligent Phytoprotection</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Global crop production is seriously threatened by various diseases such as aphids, powdery mildew and yellow rust, causing huge economic losses (<xref ref-type="bibr" rid="B1">Abbas et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B22">Zhang et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B7">Jia et&#xa0;al., 2025</xref>). Image segmentation of diseased leaves of crops is the key to disease detection and prevention (<xref ref-type="bibr" rid="B21">Xu et&#xa0;al., 2022a</xref>, <xref ref-type="bibr" rid="B20">2022</xref>; <xref ref-type="bibr" rid="B23">Zhang et al., 2023</xref>). Unmanned aerial vehicle (UAV) remote sensing has become an important technical means for the detection and identification of large-scale crop diseases (<xref ref-type="bibr" rid="B3">Bouguettaya et al., 2023</xref>; <xref ref-type="bibr" rid="B10">Li et al., 2024</xref>; <xref ref-type="bibr" rid="B15">Shahi et al., 2023</xref>). It possesses obvious advantages, including high spatial resolution, operational flexibility, efficiency, and the ability to conduct rapid and low-cost monitoring of large areas under the conditions of high reliability and high data resolution (<xref ref-type="bibr" rid="B11">Maes and Steppe, 2019</xref>). The accurate detection of diseases from UAV imagery relies on advanced analytical methods. The field has evolved from traditional machine learning towards deep learning. Convolutional Neural Networks (CNNs) and U-Net architectures have demonstrated remarkable performance by automating feature learning (<xref ref-type="bibr" rid="B13">Qin et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B8">Kerkech et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B24">Zhu et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B23">Zhang and Zhang, 2023</xref>). 
However, these models are inherently constrained by their limited receptive field, making it difficult to capture long-range dependencies and global contexts in crop disease images (<xref ref-type="bibr" rid="B9">Lei et&#xa0;al., 2021</xref>). This limits their ability to model correlations between distant lesions.</p>
<p>To overcome the receptive field limitation, Transformer-based models were introduced. They demonstrate the capability to handle inputs of varying dimensions and dynamically extract critical information through self-attention (<xref ref-type="bibr" rid="B4">De and Brown, 2023</xref>). Transformers can capture complex spatial dependency relationships between leaf lesions and healthy tissues, effectively identifying subtle spectral features of early diseases even under complex backgrounds (<xref ref-type="bibr" rid="B17">Singh et&#xa0;al., 2024</xref>). Frameworks like PD-TR (<xref ref-type="bibr" rid="B19">Wang J, et al., 2024</xref>) show significant advantages in cross-regional lesion correlation modeling. However, due to their quadratic complexity, Transformers impose a significant computational cost when processing high-resolution and high-dimensional UAV images.</p>
<p>Recently, Visual State Space (VSS) models have aroused great interest as an efficient alternative (<xref ref-type="bibr" rid="B2">Alonso et&#xa0;al., 2024</xref>). SSM-based Mamba models have shown great potential for long-range dependency modeling with linear complexity (<xref ref-type="bibr" rid="B6">Hu et&#xa0;al., 2024</xref>). By employing a state space mechanism, VSS models offer a more balanced approach for modeling both global contexts and local features in UAV crop disease detection. Architectures like VM-UNet (<xref ref-type="bibr" rid="B14">Ruan et&#xa0;al., 2024</xref>), Swin-UMamba (<xref ref-type="bibr" rid="B16">Shi et&#xa0;al., 2024</xref>), and Multiscale Vision Mamba-UNet (MSVM-UNet) (<xref ref-type="bibr" rid="B6">Hu et al., 2024</xref>) have demonstrated the potential of VSS blocks in segmentation tasks. Despite these promising developments, the potential of SSM and VSS architectures has rarely been fully exploited in UAV crop disease detection (<xref ref-type="bibr" rid="B3">Bouguettaya et al., 2023</xref>; <xref ref-type="bibr" rid="B12">Narmilan et al., 2022</xref>).</p>
<p>To bridge this gap, this paper constructs a multiscale CNN-VSS with feature fusion (MSCNN-VSS) for crop disease detection. The main contributions are summarized as follows:</p>
<list list-type="order">
<list-item>
<p>A hybrid CNN-VSS architecture is constructed, providing a balanced solution for the segmentation of complex unmanned aerial vehicle (UAV) crop disease images.</p></list-item>
<list-item>
<p>Collaboratively integrate multi-scale convolutional, VSS and hybrid attention modules, to enhance local feature diversity, capture global context dependencies, and optimize feature representation.</p></list-item>
<list-item>
<p>Extensive experiments were conducted on the dataset of crop disease images based on unmanned aerial vehicles.</p></list-item>
</list>
<p>The rest of this paper is arranged as follows. Section 2 introduces the proposed MSCNN-VSS, focusing on its main module design. Section 3 presents the experimental setup, benchmark results, and a comprehensive analysis including ablation studies and visualizations for crop disease segmentation. Finally, Section 4 concludes this paper.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>The MSCNN-VSS model</title>
<p>The architecture of MSCNN-VSS is shown in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>, consisting of Superpixel block, encoder, decoder, feature fusion module, and hybrid attention, where encoder and decoder are combined by skip connection to achieve better segmentation results with few annotation images. The main structures are shown in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>, where VSS block derived from VMamba is the backbone of encoder and decoder, the dimension of the input image is <italic>W&#xd7;H</italic>&#xd7;3, and the output channel of encoder is 2<italic><sup>i</sup></italic>&#xd7;<italic>C</italic>, <italic>C</italic> is often set to 96, and the output channel of decoder is the opposite of that of encoder, gradually decreasing.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Architecture of MSCNN-VSS.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1733727-g001.tif">
<alt-text content-type="machine-generated">Diagram illustrating a plant disease detection model. The process begins with an input image of a leaf, segmented using SLIC. The SLIC module divides the image into patches, which undergo multiscale inception, patch embedding, and a series of VSS and patch merging operations in the encoder. The decoder reverses these steps with patch expansion. The features are reshaped and fused before applying hybrid attention through average and max pooling, followed by a softmax layer, resulting in detected spots.</alt-text>
</graphic></fig>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>The structures of three main modules, where &#x2295; and &#x2297; indicate add and Hadamard product operations. <bold>(A)</bold> Multiscale Inception and its 3 dilated kernels (r = 1,2,3). <bold>(B)</bold> VSS. <bold>(C)</bold> Feature fusion.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1733727-g002.tif">
<alt-text content-type="machine-generated">Diagram illustrating a neural network architecture with three sections labeled A, B, and C. Section A depicts various convolutional layers and pooling with different rates and sizes. Section B shows an SS2D scanning mode with different processing units like Linear, DW Conv, and SiLU. Section C displays a feature fusion process with multiple layers and softmax function, illustrating interactions between features F1, F2, F3, and F4. Arrows indicate data flow and transformation.</alt-text>
</graphic></fig>
<p>The notation and description of key mathematical symbols and operations are shown in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Notation and description of key mathematical symbols and operations.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Symbol/Operation</th>
<th valign="middle" align="center">Description</th>
<th valign="middle" align="center">Symbol/Operation</th>
<th valign="middle" align="center">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center"><italic>X</italic></td>
<td valign="middle" align="center">Input feature map to a block or operation.</td>
<td valign="middle" align="center">Atten(&#xb7;)</td>
<td valign="middle" align="center">Spatial attention mechanism block.</td>
</tr>
<tr>
<td valign="middle" align="center"><italic>Z</italic></td>
<td valign="middle" align="center">Output feature map of a block or operation.</td>
<td valign="middle" align="center"><italic>Ai</italic>2&#x200b;</td>
<td valign="middle" align="center">Attention maps adaptively learned by the scale-aware block for feature <italic>F</italic><sub><italic>i</italic></sub>.</td>
</tr>
<tr>
<td valign="middle" align="center"><italic>Y</italic></td>
<td valign="middle" align="center">Final predicted segmentation map.</td>
<td valign="middle" align="center"><italic>H</italic><sub>attention</sub></td>
<td valign="middle" align="center">Output feature map from the hybrid attention module.</td>
</tr>
<tr>
<td valign="middle" align="center"><italic>Y&#x302;</italic></td>
<td valign="middle" align="center">Ground truth annotation (label).</td>
<td valign="middle" align="center">AP(&#xb7;)</td>
<td valign="middle" align="center">Global Average Pooling operation.</td>
</tr>
<tr>
<td valign="middle" align="center">VSS Block</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">MP(&#xb7;)</td>
<td valign="middle" align="center">Global Max Pooling operation.</td>
</tr>
<tr>
<td valign="middle" align="center"><italic>X</italic>&#x2032;</td>
<td valign="middle" align="center">Intermediate feature map after the first linear projection and layer normalization.</td>
<td valign="middle" align="center"><italic>&#x3c3;</italic>(&#xb7;)</td>
<td valign="middle" align="center">Sigmoid activation function.</td>
</tr>
<tr>
<td valign="middle" align="center"><italic>X</italic>&#x2032;&#x2032;</td>
<td valign="middle" align="center">Intermediate feature map after depthwise convolution and SiLU activation.</td>
<td valign="middle" align="center">L</td>
<td valign="middle" align="center">Total loss (Binary Cross-Entropy).</td>
</tr>
<tr>
<td valign="middle" align="center"><italic>X&#x302;</italic></td>
<td valign="middle" align="center">Intermediate feature map prepared for the SS2D operation.</td>
<td valign="middle" align="center"><italic>C</italic></td>
<td valign="middle" align="center">Number of classes (<italic>C</italic>&#xa0;=&#xa0;2 for binary segmentation: disease vs. background).</td>
</tr>
<tr>
<td valign="middle" align="center">LN(&#xb7;)</td>
<td valign="middle" align="center">Layer Normalization.</td>
<td valign="middle" align="center"><italic>y</italic><sub><italic>i</italic></sub></td>
<td valign="middle" align="center">True label of the <italic>i</italic>-th pixel.</td>
</tr>
<tr>
<td valign="middle" align="center">Lin(&#xb7;)</td>
<td valign="middle" align="center">Linear projection layer (implemented as a 1&#xd7;1 convolution).</td>
<td valign="middle" align="center"><italic>y&#x302;</italic><sub><italic>i</italic></sub></td>
<td valign="middle" align="center">Predicted probability of the <italic>i</italic>-th pixel belonging to the disease class.</td>
</tr>
<tr>
<td valign="middle" align="center">SiLU(&#xb7;)</td>
<td valign="middle" align="center">Sigmoid Linear Unit activation function.</td>
<td valign="middle" align="center"><italic>y</italic><sub><italic>i</italic></sub>=<italic>c</italic></td>
<td valign="middle" align="center">Indicator function that is 1 if the true label <italic>y</italic><sub><italic>i</italic></sub> equals class <italic>c</italic>, else 0.</td>
</tr>
<tr>
<td valign="middle" align="center">DWConv(&#xb7;)</td>
<td valign="middle" align="center">Depthwise Convolution.</td>
<td valign="middle" align="center">&#x2299;</td>
<td valign="middle" align="center">Element-wise multiplication (Hadamard product).</td>
</tr>
<tr>
<td valign="middle" align="center">SS2D(&#xb7;)</td>
<td valign="middle" align="center">2D Selective Scan operation, the core mechanism for long-range dependency modeling.</td>
<td valign="middle" align="center">&#x2295;</td>
<td valign="middle" align="center">Element-wise addition.</td>
</tr>
<tr>
<td valign="middle" align="center"><italic>F</italic><sub><italic>i</italic></sub></td>
<td valign="middle" align="center">The <italic>i</italic>-th feature map from the decoder, where <italic>i</italic>&#x2208;{1,2,3,4}.</td>
<td valign="middle" align="center">&#x2225;</td>
<td valign="middle" align="center">Concatenation operation along the channel dimension.</td>
</tr>
<tr>
<td valign="middle" align="center"><italic>V</italic><sub>12</sub>, <italic>V</italic><sub>123</sub>, <italic>V</italic><sub>1234</sub></td>
<td valign="middle" align="center">Fused feature maps at different hierarchical levels.</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
</tr>
</tbody>
</table>
</table-wrap>
<p>The main processes of MSCNN-VSS are introduced in detail as follows.</p>
<p>1. Superpixel segmentation is performed using the Simple Linear Iterative Clustering (SLIC) algorithm from the OpenCV-Python library. We test different numbers of superpixels on images of diseased leaves (both close-up and distant views).</p>
<p>2. Segmenting diseased leaves in UAV imagery is challenging due to the high variability in lesion appearance. The pooling layers in standard U-Net lose spatial detail, harming lesion localization. Therefore, a multi-scale dilated Inception module is introduced. Its structure is shown in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2A</bold></xref>, which is a series of parallel 3&#xd7;3 convolutions with dilation rates of {1,2,3}, exponentially increasing the receptive field (from 3&#xd7;3 to 7&#xd7;7) without increasing the weight parameters. After each dilated convolution, batch normalization (BN) and ReLU, the branched image feature details are concatenated and aggregated through 1&#xd7;1 convolution to speed up the network training and convergence.</p>
<p>3. Encoder. Following patch embedding, the transformed features from the dilated Inception module are input into the encoder. The encoder is composed of VSS&#xd7;2 blocks, whose fundamental operation is the 2D-selective-scan (SS2D). VSS is designed to overcome the limitations of standard models in capturing long-range dependencies in 2D imagery. Its architecture is depicted in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2B</bold></xref> and is detailed as shown in <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>:</p>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>V</mml:mi><mml:mi>S</mml:mi><mml:mi>S</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>L</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mn>3</mml:mn><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>&#x2297;</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mi>V</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mi>L</mml:mi><mml:mi>N</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>S</mml:mi><mml:mi>S</mml:mi><mml:mn>2</mml:mn><mml:mi>D</mml:mi><mml:mi>M</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>S</mml:mi><mml:mi>i</mml:mi><mml:mi>L</mml:mi><mml:mi>U</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>D</mml:mi><mml:mi>W</mml:mi><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>L</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mn>1</mml:mn><mml:mo stretchy="false">(</mml:mo><mml:mi>L</mml:mi><mml:mi>N</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mi>V</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>i</mml:mi><mml:mi>L</mml:mi><mml:mi>U</mml:mi><mml:mo 
stretchy="false">(</mml:mo><mml:mi>L</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mn>2</mml:mn><mml:mo stretchy="false">(</mml:mo><mml:mi>L</mml:mi><mml:mi>N</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im1"><mml:mrow><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> are the input and output feature maps of the VSS block, <italic>LN</italic>(&#xb7;), <italic>Lin</italic>(&#xb7;), <italic>SiLU</italic>(&#xb7;), <italic>SS2D</italic>(&#xb7;) and <italic>DWConv</italic>(&#xb7;) are layer normalization, linear projecting, SiLU activation, 2D selective scan and depthwise convolution operations, respectively.</p>
<p>SSM adopts the multiscan strategy to model long-range feature dependencies, which significantly increases the feature redundancy. Patch merging for 2&#xd7; down-sampling in the encoder captures long-range dependencies while gradually reducing the spatial dimension, effectively compressing the input into multiscale representations.</p>
<p>4. Decoder. Like the encoder, the decoder consists of VSS and patch expanding blocks, where patch expanding is a 2&#xd7; up-sampling operation.</p>
<p>5. Feature fusion. Feature fusion is leveraged to integrate multi-scale features. It is commonly used in various deep learning models. Its structure is shown in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2C</bold></xref>. To manage the computational complexity, 1&#xd7;1 convolution reshapes the feature mapping and standardizes the number of channels for decoder features to 64 at all scales. The module hierarchically integrates these multi-scale features using a spatial attention mechanism between adjacent scales. The integrated features from one level interact iteratively with those of the next, enabling adaptive multi-scale fusion. This process is described as shown in <xref ref-type="disp-formula" rid="eq2">Equation 2</xref>:</p>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mn>12</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mtext>&#x2004;</mml:mtext></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mn>123</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mn>12</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mtext>&#x2004;</mml:mtext></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mn>1234</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mn>123</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mn>4</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math>
</disp-formula>
<p>where <italic>Atten</italic>(.) denotes attention mechanism block, <italic>F<sub>i</sub></italic>, <italic>i</italic>&#x2208;{1, 2, 3, 4} are the <italic>i</italic>th features generated by the decoder and have been upsampled, with the same resolution, but different numbers of channels.</p>
<p>Taking <italic>V</italic><sub>12</sub> as an example, as shown in <xref ref-type="disp-formula" rid="eq3">Equation 3</xref>:</p>
<disp-formula id="eq3"><label>(3)</label>
<mml:math display="block" id="M3"><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mn>12</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>&#x2004;</mml:mo><mml:mo>&#x2004;</mml:mo><mml:mo>&#x2004;</mml:mo><mml:mo>&#x2004;</mml:mo><mml:mo>&#x2004;</mml:mo><mml:mo>=</mml:mo><mml:msub><mml:mi>M</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>&#x2297;</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>M</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2297;</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im2"><mml:mrow><mml:msub><mml:mi>M</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im3"><mml:mrow><mml:msub><mml:mi>M</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula> are the attention maps adaptively learned by the scale-aware block, <inline-formula>
<mml:math display="inline" id="im4"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula> are concatenated and input into convolution and Softmax layers, the output is split along the channel dimension to obtain <inline-formula>
<mml:math display="inline" id="im5"><mml:mrow><mml:msub><mml:mi>M</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im6"><mml:mrow><mml:msub><mml:mi>M</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula>.</p>
<p>(6) Hybrid attention. The integrated fusion feature <italic>V</italic><sub>1234</sub> is input into hybrid attention along the spatial dimension to aggregate the spatial information, generating two 1D average pooling and maxpooling maps, which are concatenated. Hybrid attention can enhance feature representation of the model, which is simply described as follows,</p>
<disp-formula id="eq4"><label>(4)</label>
<mml:math display="block" id="M4"><mml:mrow><mml:mi>H</mml:mi><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mi>A</mml:mi><mml:mi>v</mml:mi><mml:mi>g</mml:mi><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>g</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mn>1234</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2295;</mml:mo><mml:mi>M</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>g</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mn>1234</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im7"><mml:mrow><mml:mi>A</mml:mi><mml:mi>v</mml:mi><mml:mi>g</mml:mi><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>g</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mo>.</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im8"><mml:mrow><mml:mi>M</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>g</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mo>.</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> are the average pooling and maxpooling operations, respectively.</p>
<p>(7) Training model. A Softmax classifier is used to detect crop diseases from <italic>HAtten</italic> in <xref ref-type="disp-formula" rid="eq4">Equation 4</xref>. The binary cross entropy objective function is employed to measure the loss between the actual and the predicted detection distributions, defined as shown in <xref ref-type="disp-formula" rid="eq5">Equation 5</xref>:</p>
<disp-formula id="eq5"><label>(5)</label>
<mml:math display="block" id="M5"><mml:mrow><mml:mi>J</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>W</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>N</mml:mi></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:munderover><mml:mrow><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>C</mml:mi></mml:munderover><mml:mrow><mml:mi>&#x2113;</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>n</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mo>=</mml:mo><mml:mi>c</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mi>log</mml:mi></mml:mrow></mml:mstyle><mml:mo stretchy="false">(</mml:mo><mml:mi>exp</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>W</mml:mi><mml:mi>c</mml:mi><mml:mi>T</mml:mi></mml:msubsup><mml:msub><mml:mi>X</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">/</mml:mo><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>p</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>C</mml:mi></mml:munderover><mml:mrow><mml:mi>exp</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>W</mml:mi><mml:mi>p</mml:mi><mml:mi>T</mml:mi></mml:msubsup><mml:msub><mml:mi>X</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im9"><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>(<italic>i</italic>&#xa0;=&#xa0;1,2,&#x2026;,<italic>N</italic>) is a training sample, <inline-formula>
<mml:math display="inline" id="im10"><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is the <italic>i</italic>th pixel, <inline-formula>
<mml:math display="inline" id="im11"><mml:mrow><mml:msubsup><mml:mi>W</mml:mi><mml:mi>c</mml:mi><mml:mi>T</mml:mi></mml:msubsup><mml:msub><mml:mi>X</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is the feature representation of <inline-formula>
<mml:math display="inline" id="im12"><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im13"><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is its corresponding label, <italic>N</italic> and <italic>C</italic> are the numbers of the pixel and corresponding class in the image, respectively, <inline-formula>
<mml:math display="inline" id="im14"><mml:mrow><mml:mi mathvariant="script">l</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mo>*</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> is the indicator function, and <italic>C</italic>&#xa0;=&#xa0;2 indicates that the detected result is a binary image containing defect pixels and background pixels.</p>
<p>From the above analysis, the problem of using MSCNN-VSS and UAV images for crop disease detection is described as shown in <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>, including four stages: collecting images, image preprocessing, constructing model, and evaluating model.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>The general process of MSCNN-VSS based crop disease detection using UAV imagery. <bold>(A)</bold> Stages. <bold>(B)</bold> Steps.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1733727-g003.tif">
<alt-text content-type="machine-generated">Diagram (A) illustrates a process flow for disease detection using images, detailing steps from image collecting with UAV sensors to model evaluation. Diagram (B) shows a neural network architecture for image processing, beginning with input image normalization and including operations like SiLU activation, depthwise convolution, and multiplication, ending with an output spot image.</alt-text>
</graphic></fig>
</sec>
<sec id="s3">
<label>3</label>
<title>Experiment and analysis</title>
<p>The proposed MSCNN-VSS is extensively evaluated against six state-of-the-art models: Spatial-Context-Attention Network (SCANet) (<xref ref-type="bibr" rid="B13">Qin et&#xa0;al., 2021</xref>), an improved U-Net segmentation model with image processing (IUNet-IP) (<xref ref-type="bibr" rid="B24">Zhu et&#xa0;al., 2024</xref>), PD-TR (<xref ref-type="bibr" rid="B18">Wang H, et al., 2024</xref>), CMTNet (<xref ref-type="bibr" rid="B5">Guo et&#xa0;al., 2025</xref>), VM-UNet (<xref ref-type="bibr" rid="B14">Ruan et&#xa0;al., 2024</xref>), and Multiscale Vision Mamba-UNet (MSVM-UNet) (<xref ref-type="bibr" rid="B6">Hu et al., 2024</xref>). Brief descriptions of these comparative models are provided as follows.</p>
<p>SCANet is a spatial-context-attention network to identify disease based on UAV multi-spectral RSIs.</p>
<p>IUNet-IP is a hybrid architecture to identify leaf diseases and detect crops using deep learning and sophisticated image-processing techniques.</p>
<p>PD-TR is an end-to-end plant diseases detection using a Transformer.</p>
<p>CMTNet is a hybrid CNN-transformer network for UAV-based crop classification in precision agriculture.</p>
<p>VM-UNet is a Vision Mamba UNet for image segmentation.</p>
<p>MSVM-UNet is a multiscale VM-UNet for image segmentation.</p>
<sec id="s3_1">
<label>3.1</label>
<title>UAV dataset preparation</title>
<p>Sensors and cameras installed on UAVs were used to capture images of crop diseased leaves. The experimental dataset was constructed using a commercial Spreading Wing S1000+ UAV equipped with an RGB sensor, which captured images of rust-diseased leaves across multiple crop species. Data acquisition was conducted from July to September 2022 in the Yangling Agricultural Demonstration Zone, Xi&#x2019;an, China. The UAV was operated at a constant altitude of 25 meters, yielding imagery with a ground resolution of approximately 1 cm/pixel. Each image has a resolution of 4000&#xd7;3000 pixels at 72 dpi. The dataset comprises 500 high-resolution images collected under varied scenarios, lighting conditions, shooting angles, and backgrounds, as illustrated in <xref ref-type="fig" rid="f4"><bold>Figure 4A</bold></xref>. To ensure robust evaluation, the dataset was constructed with a balanced distribution across five major crop species and three disease severity levels. The severity was labeled by agronomy experts based on the visible percentage of leaf area affected: &#x2018;Small&#x2019; (1-10%), &#x2018;Medium&#x2019; (11-25%), and &#x2018;Large&#x2019; (&gt;25%). The detailed distribution of samples is shown in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>The detailed distribution of samples.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Crop species</th>
<th valign="middle" align="center">Small</th>
<th valign="middle" align="center">Medium</th>
<th valign="middle" align="center">Large</th>
<th valign="middle" align="center">Total</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">Maize</td>
<td valign="middle" align="center">30</td>
<td valign="middle" align="center">45</td>
<td valign="middle" align="center">28</td>
<td valign="middle" align="center">103</td>
</tr>
<tr>
<td valign="middle" align="center">Wheat</td>
<td valign="middle" align="center">29</td>
<td valign="middle" align="center">42</td>
<td valign="middle" align="center">25</td>
<td valign="middle" align="center">96</td>
</tr>
<tr>
<td valign="middle" align="center">Soybean</td>
<td valign="middle" align="center">34</td>
<td valign="middle" align="center">48</td>
<td valign="middle" align="center">30</td>
<td valign="middle" align="center">112</td>
</tr>
<tr>
<td valign="middle" align="center">Rapeseed</td>
<td valign="middle" align="center">27</td>
<td valign="middle" align="center">39</td>
<td valign="middle" align="center">24</td>
<td valign="middle" align="center">90</td>
</tr>
<tr>
<td valign="middle" align="center">Rice</td>
<td valign="middle" align="center">31</td>
<td valign="middle" align="center">41</td>
<td valign="middle" align="center">27</td>
<td valign="middle" align="center">99</td>
</tr>
<tr>
<td valign="middle" align="center">Total</td>
<td valign="middle" align="center">151</td>
<td valign="middle" align="center">215</td>
<td valign="middle" align="center">134</td>
<td valign="middle" align="center">500</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>These samples cover five major crop species (maize, wheat, soybean, rapeseed, and rice), with each crop category further divided into three severity levels (small, medium, and large). The detailed distribution is shown in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>.</p>
<p>To ensure effective model training and evaluation, a five-fold cross-validation (FFCV) scheme was employed. The dataset was partitioned at the image level into five folds, ensuring that each fold maintained a nearly identical distribution of crop species and disease severity levels as the overall dataset (as shown in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>). This stratified splitting principle prevents bias and guarantees that each fold is representative of the entire data distribution. In each fold, 400 images (80%) were used for training, and the remaining 100 images (20%) served as an independent test set. The acquired images underwent preprocessing to eliminate background interference and noise. The Simple Linear Iterative Clustering (SLIC) method from the OpenCV-Python library was used for superpixel segmentation. We evaluated superpixel numbers of 100, 300, and 500. A setting of 300 superpixels was selected because it preserved critical disease features without noticeable loss of spots while drastically reducing the computational load for subsequent analysis, as shown in <xref ref-type="fig" rid="f4"><bold>Figure 4B</bold></xref>. This setting was used for all experiments.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>The rust diseased leaf image samples and superpixel images. <bold>(A)</bold> Diseased leaf image samples. <bold>(B)</bold> Superpixel images via three superpixel number.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1733727-g004.tif">
<alt-text content-type="machine-generated">(A) Collection of various plant images showing leaves with different textures, patterns, and possible signs of disease or damage. (B) Series demonstrating superpixel segmentation on two plant images. Image (a) is the original; (b), (c), and (d) show segmentation at increasing K values: 50, 150, 300, with progressively finer segmentation.</alt-text>
</graphic></fig>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Implementation details</title>
<p>To ensure a fair and reproducible comparison among all models, we established a unified training benchmark with fixed hyperparameters, data augmentation, and input resolution across all compared methods.</p>
<p>Input Preprocessing: All UAV images were first processed by the SLIC superpixel algorithm (with 300 superpixels) and then centrally cropped to a uniform resolution of 512&#xd7;512 pixels to ensure consistent input dimensions across the network.</p>
<p>Data Augmentation: To enhance model robustness and prevent overfitting, a standard set of augmentation strategies was applied on-the-fly during training to all models. This included: random horizontal and vertical flipping (probability=0.5), random rotation (&#xb1;&#xa0;30 degrees), and random adjustments to brightness and contrast (variation factor=0.2).</p>
<p>Training Configuration: All models were trained from scratch under the same conditions:</p>
<p>Epochs 300, Batch Size 15, Optimizer Adam, Initial Learning Rate 0.001, Learning Rate Schedule Reduced by 50% every 500 iterations, Loss Function Weighted Cross-Entropy.</p>
<p>The network parameters were initialized using the Kaiming method. All experiments were conducted in the hardware and software environments shown in <xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref>.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Experimental configuration of hardware and software.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Category</th>
<th valign="middle" align="center">Specification</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">CPU</td>
<td valign="middle" align="left">Intel Xeon E5-2667v @ 3.20 GHz</td>
</tr>
<tr>
<td valign="middle" align="left">GPU</td>
<td valign="middle" align="left">NVidia Quadro M4000</td>
</tr>
<tr>
<td valign="middle" align="left">Operating System</td>
<td valign="middle" align="left">Windows 10</td>
</tr>
<tr>
<td valign="middle" align="left">Programming Language</td>
<td valign="middle" align="left">Python 3.6</td>
</tr>
<tr>
<td valign="middle" align="left">Deep Learning Framework</td>
<td valign="middle" align="left">Keras, TensorFlow-GPU 1.8.0</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In addition to PA and mIoU, the following evaluation metrics are used to assess the model, where FLOPs quantifies the computational cost in the inference process, and the model size indicates the storage requirements.</p>
<p>Evaluation Metrics: Pixel Accuracy (PA), mean Intersection over Union (mIoU), parameters (Params), computational complexity (FLOPs), and training time. PA and mIoU are calculated as shown in <xref ref-type="disp-formula" rid="eq6">Equations 6</xref> and <xref ref-type="disp-formula" rid="eq7">7</xref>:</p>
<disp-formula id="eq6"><label>(6)</label>
<mml:math display="block" id="M6"><mml:mrow><mml:mi>P</mml:mi><mml:mi>A</mml:mi><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:msubsup><mml:mrow><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mstyle><mml:mo stretchy="false">/</mml:mo><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:msubsup><mml:mrow><mml:msub><mml:mi>T</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:mstyle></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq7"><label>(7)</label>
<mml:math display="block" id="M7"><mml:mrow><mml:mi>m</mml:mi><mml:mi>I</mml:mi><mml:mi>o</mml:mi><mml:mi>U</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>N</mml:mi></mml:mfrac><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:msubsup><mml:mrow><mml:mfrac><mml:mrow><mml:msub><mml:mi>q</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>T</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>q</mml:mi><mml:mrow><mml:mi>j</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>q</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>N</italic> is the number of categories of image pixels, <inline-formula>
<mml:math display="inline" id="im15"><mml:mrow><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is the predicted and actual pixel of type <italic>i</italic>, <inline-formula>
<mml:math display="inline" id="im16"><mml:mrow><mml:msub><mml:mi>T</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is the total number of class <italic>i</italic> pixels, <inline-formula>
<mml:math display="inline" id="im17"><mml:mrow><mml:msub><mml:mi>q</mml:mi><mml:mrow><mml:mi>j</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is the total number of pixels of actual type <italic>j</italic> and predicted type <italic>i</italic>, <italic>mIoU</italic> is the mean Intersection over Union. The higher <italic>mIoU</italic> indicates the better match between the detected disease area and the actual disease area.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Experiment results</title>
<p><xref ref-type="fig" rid="f5"><bold>Figures&#xa0;5A and B</bold></xref> show two UAV-captured close-range images of rust-infected leaves and their corresponding annotated ground truths. <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5C</bold></xref> displays the superpixel segmented results by SLIC, which divide the spot area into structurally meaningful components. Comparative segmentation results in <xref ref-type="fig" rid="f5"><bold>Figures&#xa0;5D-J</bold></xref> demonstrate the performance of SCANet, IUNet-IP, PD-TR, CMTNet, VM-UNet, MSVM-UNet, and the proposed MSCNN-VSS, respectively.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>The detected spots of 7 models. <bold>(A)</bold> Original rice disease images. <bold>(B)</bold> Ground truth annotations. <bold>(C)</bold> Superpixel images. <bold>(D)</bold> SCANet. <bold>(E)</bold> IUNet-IP. <bold>(F)</bold> PD-TR. <bold>(G)</bold> CMTNet. <bold>(H)</bold> VM-UNet. <bold>(I)</bold> MSV-UNet. <bold>(J)</bold> MSCNN-VSS.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1733727-g005.tif">
<alt-text content-type="machine-generated">Composite image showing different views and analyses of grass. Panel (A) displays two photos of grass. Panel (B) shows two binary masks highlighting certain features. Panel (C) illustrates processed images with outlined structures. Panels (D) to (J) present various black backgrounds with white highlighted patterns, indicating feature detection or analysis.</alt-text>
</graphic></fig>
<p><xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref> illustrates the segmentation performance of the compared models on two close-range UAV images of rust-infected leaves (<xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5A</bold></xref>) and their SLIC superpixel results (<xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5C</bold></xref>). While all models can detect disease spots under challenging conditions, MSCNN-VSS (<xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5J</bold></xref>) achieves the best results, identifying minute spots with minimal false positives due to its VSS block and feature fusion module, which enhance multi-scale feature representation. SCANet and IUNet-IP (<xref ref-type="fig" rid="f5"><bold>Figures&#xa0;5D, E</bold></xref>) show significant false positives, while VM-UNet and MSVM-UNet (<xref ref-type="fig" rid="f5"><bold>Figures&#xa0;5H, I</bold></xref>) capture broader lesions but lack boundary precision.</p>
<p>To further evaluate the performance of MSCNN-VSS, experiments are conducted on the diseased leaf images captured by UAVs at long distances. The comparative detection results of the 7 models are presented in <xref ref-type="fig" rid="f6"><bold>Figure&#xa0;6</bold></xref>.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>The detected spots of 7 models. <bold>(A)</bold> Original disease images. <bold>(B)</bold> Ground truth annotations. <bold>(C)</bold> Superpixel images. <bold>(D)</bold> SCANet. <bold>(E)</bold> IUNet-IP. <bold>(F)</bold> PD-TR. <bold>(G)</bold> CMTNet. <bold>(H)</bold> VM-UNet. <bold>(I)</bold> MSV-UNet. <bold>(J)</bold> MSCNN-VSS.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1733727-g006.tif">
<alt-text content-type="machine-generated">A series of panels depict different image processing results related to forest canopy analysis. Panel A shows aerial images of the forest. Panel B features segmented images with cyan boundaries. Panels C through J display black backgrounds with white speckles representing different stages or methods of data extraction, possibly highlighting various features within the forest canopy. Each panel is labeled with letters A to J for identification.</alt-text>
</graphic></fig>
<p>From <xref ref-type="fig" rid="f6"><bold>Figure&#xa0;6</bold></xref>, it is found that on long-distance UAV images with complex backgrounds and weaker lesion characteristics, the performance of the 7 models shows significant differentiation: SCANet and IUNet-IP generate a large number of background false detections and lose a lot of spots, while PD-TR and CMTNet seriously miss detections of small lesions. Although VM-UNet and MSVM-UNet can maintain the basic structure, the boundary positioning is ambiguous. In contrast, the proposed MSCNN-VSS achieves the most complete capture and boundary characterization of fine lesions while maintaining the lowest false detection rate. Its multi-level feature fusion mechanism and hybrid attention design effectively overcome the problem of feature attenuation in long-distance imaging.</p>
<p>To further validate the disease detection performance of the proposed model, the seven comparative models are evaluated using PA, mIoU, Parameters, FLOPs, Model size and average model-training time as objective metrics. Comparative experimental results (mean &#xb1; standard deviation) are shown in <xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref>, where 95% Confidence Interval (95% CI) for PA and mIoU is based on the five-fold cross-validation.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>The different experimental set and results.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Results Model</th>
<th valign="middle" align="center">PA</th>
<th valign="middle" align="center">mIoU</th>
<th valign="middle" align="center">PA 95% CI</th>
<th valign="middle" align="center">mIoU 95% CI</th>
<th valign="middle" align="center">Params (M)</th>
<th valign="middle" align="center">FLOPs (G)</th>
<th valign="middle" align="center">Size (MB)</th>
<th valign="middle" align="center">Training Time (min)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">SCANet</td>
<td valign="middle" align="center">0.7198 &#xb1; 0.0045</td>
<td valign="middle" align="center">0.6812 &#xb1; 0.0036</td>
<td valign="middle" align="center">0.7198 &#xb1; 0.0056</td>
<td valign="middle" align="center">0.6812 &#xb1; 0.0045</td>
<td valign="middle" align="center">45.2</td>
<td valign="middle" align="center">12.5</td>
<td valign="middle" align="center">180.5</td>
<td valign="middle" align="center">163.4</td>
</tr>
<tr>
<td valign="middle" align="center">IUNet-IP</td>
<td valign="middle" align="center">0.8186 &#xb1; 0.0057</td>
<td valign="middle" align="center">0.7055 &#xb1; 0.0045</td>
<td valign="middle" align="center">0.8186 &#xb1; 0.0071</td>
<td valign="middle" align="center">0.7055 &#xb1; 0.0056</td>
<td valign="middle" align="center">31.8</td>
<td valign="middle" align="center">9.8</td>
<td valign="middle" align="center">127.3</td>
<td valign="middle" align="center">177.6</td>
</tr>
<tr>
<td valign="middle" align="center">PD-TR</td>
<td valign="middle" align="center">0.8409 &#xb1; 0.0014</td>
<td valign="middle" align="center">0.7764 &#xb1; 0.0018</td>
<td valign="middle" align="center">0.8409 &#xb1; 0.0018</td>
<td valign="middle" align="center">0.7764 &#xb1; 0.0024</td>
<td valign="middle" align="center">128.5</td>
<td valign="middle" align="center">35.2</td>
<td valign="middle" align="center">513.8</td>
<td valign="middle" align="center">226.9</td>
</tr>
<tr>
<td valign="middle" align="center">CMTNet</td>
<td valign="middle" align="center">0.8763 &#xb1; 0.0040</td>
<td valign="middle" align="center">0.8130 &#xb1; 0.0032</td>
<td valign="middle" align="center">0.8763 &#xb1; 0.0050</td>
<td valign="middle" align="center">0.8130 &#xb1; 0.0040</td>
<td valign="middle" align="center">95.7</td>
<td valign="middle" align="center">28.9</td>
<td valign="middle" align="center">382.9</td>
<td valign="middle" align="center">254.6</td>
</tr>
<tr>
<td valign="middle" align="center">VM-UNet</td>
<td valign="middle" align="center">0.8912 &#xb1; 0.0037</td>
<td valign="middle" align="center">0.8423 &#xb1; 0.0030</td>
<td valign="middle" align="center">0.8912 &#xb1; 0.0046</td>
<td valign="middle" align="center">0.8423 &#xb1; 0.0037</td>
<td valign="middle" align="center">25.1</td>
<td valign="middle" align="center">7.5</td>
<td valign="middle" align="center">100.5</td>
<td valign="middle" align="center">113.3</td>
</tr>
<tr>
<td valign="middle" align="center">MSVM-UNet</td>
<td valign="middle" align="center">0.9152 &#xb1; 0.0036</td>
<td valign="middle" align="center">0.8575 &#xb1; 0.0038</td>
<td valign="middle" align="center">0.9152 &#xb1; 0.0045</td>
<td valign="middle" align="center">0.8575 &#xb1; 0.0047</td>
<td valign="middle" align="center">38.6</td>
<td valign="middle" align="center">10.2</td>
<td valign="middle" align="center">154.6</td>
<td valign="middle" align="center">124.6</td>
</tr>
<tr>
<td valign="middle" align="center">MSCNN-VSS</td>
<td valign="middle" align="center"><bold>0.9421 &#xb1; 0.0031</bold></td>
<td valign="middle" align="center"><bold>0.9152 &#xb1; 0.0036</bold></td>
<td valign="middle" align="center"><bold>0.9421 &#xb1; 0.0039</bold></td>
<td valign="middle" align="center"><bold>0.9152 &#xb1; 0.0045</bold></td>
<td valign="middle" align="center"><bold>29.5</bold></td>
<td valign="middle" align="center"><bold>8.1</bold></td>
<td valign="middle" align="center"><bold>118.2</bold></td>
<td valign="middle" align="center"><bold>121.7</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The bold values provided  in the <xref ref-type="table" rid="T4"><bold>Table 4</bold></xref> are the results of the method MSCNN-VSS proposed in the paper.</p>
</table-wrap-foot>
</table-wrap>
<p>As shown in <xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref>, the model MSCNN-VSS demonstrates an excellent balance between performance and efficiency. This model achieved the optimal results in segmentation accuracy (PA: 0.9421, mIoU: 0.9152); its number of parameters (29.5M) and computational complexity (8.1G FLOPs) are significantly lower than those of Transformer-based models (such as PD-TR and CMTNet), and its model size (118.2MB) is also much smaller than theirs, which demonstrates the advantages of the model. Compared with the benchmark model VM-UNet, which is also part of the VSS series, this model achieved a significant improvement of over 5% in mIoU with only a 17.5% increase in parameters and an 8% increase in computational load. This excellent balance between accuracy and efficiency makes MSCNN-VSS particularly suitable for deployment and application in practical agricultural scenarios with limited computing resources.</p>
<p>Based on the VM-UNet baseline, we progressively integrate four key components: Superpixel segmentation, dilated multi-scale Inception modules, advanced feature fusion mechanisms, and attention mechanisms, to quantitatively evaluate their individual and collective contributions to the final detection performance. This structured ablation study meticulously analyzes improvements in PA, mIoU, and training/inference efficiency, demonstrating that our architectural enhancements achieve significant performance gains while maintaining computational feasibility. The results are shown in <xref ref-type="table" rid="T5"><bold>Table&#xa0;5</bold></xref>.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>The different experimental settings and results.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model variant</th>
<th valign="middle" align="center">PA</th>
<th valign="middle" align="center">mIoU</th>
<th valign="middle" align="center">Training time (min)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">VM-UNet (Baseline)</td>
<td valign="middle" align="center">0.8912</td>
<td valign="middle" align="center">0.8423</td>
<td valign="middle" align="center">113.3</td>
</tr>
<tr>
<td valign="middle" align="left">+ Superpixel</td>
<td valign="middle" align="center">0.8901</td>
<td valign="middle" align="center">0.8412</td>
<td valign="middle" align="center">104.6</td>
</tr>
<tr>
<td valign="middle" align="left">+Dilated Multi-scale Inception</td>
<td valign="middle" align="center">0.9234</td>
<td valign="middle" align="center">0.8765</td>
<td valign="middle" align="center">119.7</td>
</tr>
<tr>
<td valign="middle" align="left">+ Feature Fusion</td>
<td valign="middle" align="center">0.9357</td>
<td valign="middle" align="center">0.9016</td>
<td valign="middle" align="center">117.4</td>
</tr>
<tr>
<td valign="middle" align="left">+ Attention Mechanism</td>
<td valign="middle" align="center">0.9368</td>
<td valign="middle" align="center">0.9087</td>
<td valign="middle" align="center">118.9</td>
</tr>
<tr>
<td valign="middle" align="left">MSCNN-VSS (Full)</td>
<td valign="middle" align="center">0.9421</td>
<td valign="middle" align="center">0.9152</td>
<td valign="middle" align="center">121.7</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The ablation results in <xref ref-type="table" rid="T5"><bold>Table&#xa0;5</bold></xref> illustrate the progressive performance improvements contributed by each component of the MSCNN-VSS architecture. While the superpixel module slightly degrades detection accuracy, it brings notable computational efficiency. The dilated multi-scale Inception module delivers the most substantial gain, significantly increasing both PA and mIoU. Combining feature fusion and attention mechanisms further enhances detection accuracy, with the attention mechanism particularly improving fine-grained detection capability as reflected in the remarkable mIoU improvement. By synergistically combining all components, the complete model achieves the optimal balance between accuracy and efficiency, attaining the highest scores (0.9421 PA, 0.9152 mIoU) with only a 7.4% increase in training time compared to the baseline, thereby validating the effectiveness of the MSCNN-VSS architectural design.</p>
</sec>
<sec id="s3_4" sec-type="results">
<label>3.4</label>
<title>Results analysis</title>
<p>The above experimental results ultimately verified that each component plays a role in improving the model&#x2019;s accurate disease detection ability. The complete MSCNN-VSS structure achieves the best balance between accuracy and efficiency. As shown in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>, this model has achieved the most advanced results, with a pixel accuracy of 0.9421 and a mIoU of 0.9152. It is far superior to all comparison methods such as CMTNet (0.8763 PA, 0.8130 mIoU) and the original VM-UNet (0.8912 PA, 0.8423 mIoU). This significant performance improvement is achieved while maintaining outstanding training efficiency, requiring 121.7 minutes of training time, which is nearly 50% less than that of CMTNet (254.6 minutes) and only 7.4% more than the VM-UNet baseline (113.3 minutes). The ablation experiment results further verify the effectiveness of each architectural component, particularly highlighting the crucial role of the dilated multi-scale Inception module and attention mechanism in capturing complex pathological features while maintaining computational efficiency. These findings jointly verify that MSCNN-VSS is effective and feasible for crop disease detection through an optimal balance of accuracy and practicality.</p>
</sec>
</sec>
<sec id="s4" sec-type="conclusions">
<label>4</label>
<title>Conclusion and future work</title>
<p>This paper proposed a novel MSCNN-VSS model to address the challenges of crop disease detection in UAV imagery. The hybrid architecture effectively combines multi-scale CNN, VSS, and attention mechanisms to achieve a superior balance between segmentation accuracy and computational efficiency. Extensive experiments confirmed that our model outperforms several state-of-the-art benchmarks. The model is effective for enabling practical, large-scale crop disease monitoring in precision agriculture, facilitating timely and targeted pest management. However, it has limitations: the dataset is constrained to five crops and only rust diseases, performance under extreme environmental conditions (e.g., heavy rain, dense cloud cover) remains untested, and residual complexity hinders direct deployment on low-power devices. Future work will therefore focus on extending the model to multi-disease classification and on developing lightweight versions of the architecture for edge and mobile deployment to handle high-resolution UAV image data, as outlined in the Discussion section.</p>
</sec>
</body>
<back>
<sec id="s5" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material. Further inquiries can be directed to the corresponding author.</p></sec>
<sec id="s6" sec-type="author-contributions">
<title>Author contributions</title>
<p>TZ: Funding acquisition, Writing &#x2013; review &amp; editing, Methodology, Writing &#x2013; original draft. DW: Data curation, Validation, Investigation, Writing &#x2013; review &amp; editing, Visualization, Formal analysis. WC: Validation, Conceptualization, Writing &#x2013; review &amp; editing, Formal analysis, Investigation, Software.</p></sec>
<sec id="s8" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s9" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s10" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Abbas</surname> <given-names>A.</given-names></name>
<name><surname>Zhang</surname> <given-names>Z.</given-names></name>
<name><surname>Zheng</surname> <given-names>H.</given-names></name>
<name><surname>Alami</surname> <given-names>M.</given-names></name>
<name><surname>Alrefaei</surname> <given-names>A.</given-names></name>
<name><surname>Qamar Abbas</surname> <given-names>Q.</given-names></name>
<etal/>
</person-group>. (<year>2023</year>). 
<article-title>UAVs in plant disease assessment, efficient monitoring, and detection: A way forward to smart agriculture</article-title>. <source>Agronomy</source> <volume>13</volume>, <elocation-id>1524</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy13061524</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Alonso</surname> <given-names>C.</given-names></name>
<name><surname>Sieber</surname> <given-names>J.</given-names></name>
<name><surname>Zeilinger</surname> <given-names>M.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>State space models as foundation models: A control theoretic overview</article-title>. <source>Am. Control Conf.</source>, <fpage>146</fpage>&#x2013;<lpage>153</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.23919/ACC63710.2025.11107969</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Bouguettaya</surname> <given-names>A.</given-names></name>
<name><surname>Zarzour</surname> <given-names>H.</given-names></name>
<name><surname>Kechida</surname> <given-names>A.</given-names></name>
<name><surname>Taberkit</surname> <given-names>A.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>A survey on deep learning-based identification of plant and crop diseases from UAV-based aerial images</article-title>. <source>Cluster Computing</source> <volume>26</volume>, <fpage>1297</fpage>&#x2013;<lpage>1317</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10586-022-03627-x</pub-id>, PMID: <pub-id pub-id-type="pmid">35968221</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>De</surname> <given-names>S.</given-names></name>
<name><surname>Brown</surname> <given-names>D.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Multispectral plant disease detection with vision transformer&#x2013;convolutional neural network hybrid approaches</article-title>. <source>Sensors</source> <volume>23</volume>, <elocation-id>8531</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s23208531</pub-id>, PMID: <pub-id pub-id-type="pmid">37896623</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Guo</surname> <given-names>X.</given-names></name>
<name><surname>Feng</surname> <given-names>Q.</given-names></name>
<name><surname>Guo</surname> <given-names>F.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>CMTNet: a hybrid CNN-transformer network for UAV-based hyperspectral crop classification in precision agriculture</article-title>. <source>Sci. Rep.</source> <volume>15</volume>, <fpage>12383</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-025-97052-w</pub-id>, PMID: <pub-id pub-id-type="pmid">40216979</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hu</surname> <given-names>C.</given-names></name>
<name><surname>Cao</surname> <given-names>N.</given-names></name>
<name><surname>Zhou</surname> <given-names>H.</given-names></name>
<name><surname>Guo</surname> <given-names>B.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Medical image classification with a hybrid SSM model based on CNN and transformer</article-title>. <source>Electronics.</source> <volume>13</volume>, <elocation-id>3094</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/electronics13153094</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Jia</surname> <given-names>Y.</given-names></name>
<name><surname>Li</surname> <given-names>Y.</given-names></name>
<name><surname>He</surname> <given-names>J.</given-names></name>
<name><surname>Biswas</surname> <given-names>A.</given-names></name>
<name><surname>Siddique</surname> <given-names>K.</given-names></name>
<name><surname>Hou</surname> <given-names>Z.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>Enhancing precision nitrogen management for cotton cultivation in arid environments using remote sensing techniques</article-title>. <source>Field Crops Res.</source> <volume>321</volume>, <fpage>109689</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.fcr.2024.109689</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Kerkech</surname> <given-names>M.</given-names></name>
<name><surname>Hafiane</surname> <given-names>A.</given-names></name>
<name><surname>Canals</surname> <given-names>R.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Vine disease detection in UAV multispectral images using optimized image registration and deep learning segmentation approach</article-title>. <source>Comput. Electron. Agric.</source> <volume>174</volume>, <fpage>105446</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2020.105446</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Lei</surname> <given-names>S.</given-names></name>
<name><surname>Luo</surname> <given-names>J.</given-names></name>
<name><surname>Tao</surname> <given-names>X.</given-names></name>
<name><surname>Qiu</surname> <given-names>Z.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Remote sensing detecting of yellow leaf disease of Arecanut based on UAV multisource sensors</article-title>. <source>Remote Sens</source> <volume>13</volume>, <elocation-id>4562</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs13224562</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>D.</given-names></name>
<name><surname>Yang</surname> <given-names>S.</given-names></name>
<name><surname>Du</surname> <given-names>Z.</given-names></name>
<name><surname>Xu</surname> <given-names>X.</given-names></name>
<name><surname>Zhang</surname> <given-names>P.</given-names></name>
<name><surname>Yu</surname> <given-names>K.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>Application of unmanned aerial vehicle optical remote sensing in crop nitrogen diagnosis: A systematic literature review</article-title>. <source>Comput. Electron. Agric.</source> <volume>227</volume>, <fpage>109565</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.109565</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Maes</surname> <given-names>W.</given-names></name>
<name><surname>Steppe</surname> <given-names>K.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>Perspectives for remote sensing with unmanned aerial vehicles in precision agriculture</article-title>. <source>Trends Plant Sci.</source> <volume>24</volume>, <fpage>152</fpage>&#x2013;<lpage>164</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.tplants.2018.11.007</pub-id>, PMID: <pub-id pub-id-type="pmid">30558964</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Narmilan</surname> <given-names>A.</given-names></name>
<name><surname>Gonzalez</surname> <given-names>F.</given-names></name>
<name><surname>Salgadoe</surname> <given-names>A.</given-names></name>
<name><surname>Powell</surname> <given-names>K.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Detection of white leaf disease in sugarcane using machine learning techniques over UAV multispectral images</article-title>. <source>Drones</source> <volume>6</volume>, <elocation-id>230</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/drones6090230</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Qin</surname> <given-names>J.</given-names></name>
<name><surname>Wang</surname> <given-names>B.</given-names></name>
<name><surname>Wu</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Identifying pine wood nematode disease using UAV images and deep learning algorithms</article-title>. <source>Remote Sens.</source> <volume>13</volume>, <elocation-id>162</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs13020162</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ruan</surname> <given-names>J.</given-names></name>
<name><surname>Li</surname> <given-names>J.</given-names></name>
<name><surname>Xiang</surname> <given-names>S.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>VM-UNet: vision mamba UNet for medical image segmentation</article-title>. <source>arXiv:2402.02491</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2402.02491</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Shahi</surname> <given-names>T.</given-names></name>
<name><surname>Xu</surname> <given-names>C.</given-names></name>
<name><surname>Neupane</surname> <given-names>A.</given-names></name>
<name><surname>Guo</surname> <given-names>W.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Recent advances in crop disease detection using UAV and deep learning techniques</article-title>. <source>Remote Sens.</source> <volume>15</volume>, <elocation-id>2450</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs15092450</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Shi</surname> <given-names>Y.</given-names></name>
<name><surname>Dong</surname> <given-names>M.</given-names></name>
<name><surname>Xu</surname> <given-names>C.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Multiscale VMamba: hierarchy in hierarchy visual state space model</article-title>. <source>arXiv:2405.14174</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2405.14174</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Singh</surname> <given-names>A.</given-names></name>
<name><surname>Rao</surname> <given-names>A.</given-names></name>
<name><surname>Chattopadhyay</surname> <given-names>P.</given-names></name>
<name><surname>Maurya</surname> <given-names>R.</given-names></name>
<name><surname>Singh</surname> <given-names>L.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Effective plant disease diagnosis using Vision Transformer trained with leafy-generative adversarial network-generated images</article-title>. <source>Expert Syst. Appl.</source> <volume>254</volume>, <fpage>124387</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.eswa.2024.124387</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>H.</given-names></name>
<name><surname>Nguyen</surname> <given-names>T.</given-names></name>
<name><surname>Nguyen</surname> <given-names>T.</given-names></name>
<name><surname>Dang</surname> <given-names>M.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>PD-TR: End-to-end plant diseases detection using a Transformer</article-title>. <source>Comput. Electron. Agric.</source> <volume>224</volume>, <fpage>109123</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.109123</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>J.</given-names></name>
<name><surname>Zhang</surname> <given-names>S.</given-names></name>
<name><surname>Lizaga</surname> <given-names>I.</given-names></name>
<name><surname>Zhang</surname> <given-names>Y.</given-names></name>
<name><surname>Ge</surname> <given-names>X.</given-names></name>
<name><surname>Zhang</surname> <given-names>Z.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>UAS-based remote sensing for agricultural Monitoring: Current status and perspectives</article-title>. <source>Comput. Electron. Agric.</source> <volume>227</volume>, <elocation-id>109501</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.109501</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Xu</surname> <given-names>C.</given-names></name>
<name><surname>Wang</surname> <given-names>X.</given-names></name>
<name><surname>Zhang</surname> <given-names>S.</given-names></name>
</person-group> (<year>2022</year>b). 
<article-title>Dilated convolution capsule network for apple leaf disease identification</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.1002312</pub-id>, PMID: <pub-id pub-id-type="pmid">36388492</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Xu</surname> <given-names>C.</given-names></name>
<name><surname>Yu</surname> <given-names>C.</given-names></name>
<name><surname>Zhang</surname> <given-names>S.</given-names></name>
</person-group> (<year>2022</year>a). 
<article-title>Lightweight multi-scale dilated U-net for crop disease leaf image segmentation</article-title>. <source>Electronics</source> <volume>11</volume>, <elocation-id>3947</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/electronics11233947</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>S.</given-names></name>
<name><surname>Wang</surname> <given-names>D.</given-names></name>
<name><surname>Yu</surname> <given-names>C.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Apple leaf disease recognition method based on Siamese dilated Inception network with less training samples</article-title>. <source>Comput. Electron. Agric.</source> <volume>213</volume>, <elocation-id>108188</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.108188</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>S.</given-names></name>
<name><surname>Zhang</surname> <given-names>C.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Modified U-Net for plant diseased leaf image segmentation</article-title>. <source>Comput. Electron. Agric.</source> <volume>204</volume>, <elocation-id>107511</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2022.107511</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhu</surname> <given-names>H.</given-names></name>
<name><surname>Lin</surname> <given-names>C.</given-names></name>
<name><surname>Liu</surname> <given-names>G.</given-names></name>
<name><surname>Wang</surname> <given-names>D.</given-names></name>
<name><surname>Qin</surname> <given-names>S.</given-names></name>
<name><surname>Li</surname> <given-names>A.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>Intelligent agriculture: deep learning in UAV-based remote sensing imagery for crop diseases and pest detection</article-title>. <source>Front. Plant Sci.</source> <volume>15</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2024.1435016</pub-id>, PMID: <pub-id pub-id-type="pmid">39512475</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3123390">Zhiqiang Huo</ext-link>, Queen Mary University of London, United Kingdom</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/694621">Jun Liu</ext-link>, Weifang University of Science and Technology, China</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3269409">Xingzao Ma</ext-link>, Lingnan Normal University, China</p></fn>
</fn-group>
</back>
</article>