<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2025.1734292</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>RFDAF-Net: a novel region-specific feature decoupling and adaptive fusion network for field soybean disease identification in precision agriculture</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Pan</surname><given-names>Renyong</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3259674/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Yang</surname><given-names>Qihang</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Chen</surname><given-names>Yang</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3258247/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Cao</surname><given-names>Jian</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>College of Microelectronics and Artificial Intelligence, Kaili University</institution>, <city>Kaili</city>,&#xa0;<country country="CN">China</country></aff>
<aff id="aff2"><label>2</label><institution>School of Computer and Information, Qiannan Normal University for Nationalities</institution>, <city>Duyun</city>,&#xa0;<country country="CN">China</country></aff>
<aff id="aff3"><label>3</label><institution>College of Big Data and Information Engineering, Guizhou University</institution>, <city>Guiyang</city>,&#xa0;<country country="CN">China</country></aff>
<aff id="aff4"><label>4</label><institution>Micronano and Intelligent Manufacturing Engineering Research Centre of Ministry of Education</institution>, <city>Kaili</city>,&#xa0;<country country="CN">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Yang Chen, <email xlink:href="mailto:cy52cv@sgmtu.edu.cn">cy52cv@sgmtu.edu.cn</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-01-15">
<day>15</day>
<month>01</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>16</volume>
<elocation-id>1734292</elocation-id>
<history>
<date date-type="received">
<day>28</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>09</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>08</day>
<month>12</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Pan, Yang, Chen and Cao.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Pan, Yang, Chen and Cao</copyright-holder>
<license>
<ali:license_ref start_date="2026-01-15">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Soybean diseases pose a significant threat to global crop yield and food security, necessitating rapid and accurate identification for effective management. While deep learning offers promising solutions for plant disease recognition, existing models often struggle with the complexities of in-field soybean disease identification, particularly due to high intra-class variations and subtle inter-class differences. </p>
</sec>
<sec>
<title>Methods</title>
<p>To address these challenges, we propose a novel region-specific feature decoupling and adaptive fusion network (RFDAF-Net) designed for robust and precise soybean disease recognition under real-world field conditions. The core of RFDAF-Net consists of two key components: a region-specific feature decoupling (RFD) module that enhances discriminative patterns and suppresses redundant information through a dual-pathway design, explicitly separating shallow, intermediate, and deep features; and a region-specific feature adaptive fusion (RFAF) module that dynamically integrates these multi-scale features via learned spatial attention. This hierarchical feature decomposition effectively isolates discriminative disease signatures while suppressing irrelevant variations. The architecture is flexible, enabling seamless integration with various backbone networks including both convolutional neural networks and Transformers. </p>
</sec>
<sec>
<title>Results</title>
<p>We evaluate RFDAF-Net extensively on a comprehensive soybean disease dataset containing images captured in diverse field environments. Experimental results show that our method significantly outperforms current state-of-the-art models across multiple architectures, achieving a top accuracy of 99.43% when implemented with a Swin-B backbone. </p>
</sec>
<sec>
<title>Discussion</title>
<p>The proposed framework offers an interpretable and field-ready solution for precision crop protection, demonstrating strong generalization ability and practical utility for real-world agricultural applications.</p>
</sec>
</abstract>
<kwd-group>
<kwd>adaptive fusion</kwd>
<kwd>deep learning</kwd>
<kwd>precision agriculture</kwd>
<kwd>region-specific feature decoupling</kwd>
<kwd>soybean disease identification</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for the work and/or its publication. This study was supported by the Foundation Research Project of Kaili University (grant No. 2025YB004).</funding-statement>
</funding-group>
<counts>
<fig-count count="9"/>
<table-count count="7"/>
<equation-count count="12"/>
<ref-count count="40"/>
<page-count count="15"/>
<word-count count="6944"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Sustainable and Intelligent Phytoprotection</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Soybean cultivation underpins global food systems as a primary source of plant-based proteins and edible oils, with production stability directly impacting agricultural economies worldwide (<xref ref-type="bibr" rid="B7">Dilawari et&#xa0;al., 2022</xref>). Climate-induced environmental volatility, particularly erratic precipitation and rising temperatures, has accelerated the spread of destructive diseases including soybean rust, frogeye leaf spot, and sudden death syndrome (<xref ref-type="bibr" rid="B12">Hossain et&#xa0;al., 2024</xref>). Field diagnosis faces persistent challenges due to overlapping visual symptoms during early infection stages, limited access to expert pathology services in rural areas, and phenotypic plasticity under varying field conditions (<xref ref-type="bibr" rid="B17">Lin et&#xa0;al., 2025</xref>; <xref ref-type="bibr" rid="B33">Sharma et&#xa0;al., 2025</xref>). These constraints often trigger indiscriminate fungicide applications, escalating production costs while generating ecological risks through chemical runoff and non-target organism harm. Consequently, automated disease recognition systems have become critical for enabling precision interventions that reduce pesticide dependency while supporting sustainable soybean production (<xref ref-type="bibr" rid="B28">Pranta et&#xa0;al., 2025</xref>).</p>
<p>To address these urgent needs, computer vision-based approaches have become the cornerstone of modern plant phenotyping and disease diagnosis (<xref ref-type="bibr" rid="B38">Upadhyay et&#xa0;al., 2025</xref>). The field has undergone a significant paradigm shift, moving from traditional machine learning methods that relied on hand-crafted feature extraction (e.g., color, texture, and shape histograms) to the end-to-end feature learning capabilities of deep learning (DL) (<xref ref-type="bibr" rid="B30">Rumpf et&#xa0;al., 2010</xref>; <xref ref-type="bibr" rid="B9">Hari and Singh, 2025</xref>). This transition has established DL as the predominant methodology for in-field applications, owing to its superior ability to automatically learn discriminative and hierarchical features from raw image data. Among various DL architectures, convolutional neural networks (CNNs) have been extensively adopted as the backbone for plant disease recognition (<xref ref-type="bibr" rid="B26">Parez et&#xa0;al., 2025</xref>; <xref ref-type="bibr" rid="B5">Chakrabarty et&#xa0;al., 2024</xref>). Pioneering and widely used architectures such as ResNet (<xref ref-type="bibr" rid="B10">He et&#xa0;al., 2016</xref>), Inception (<xref ref-type="bibr" rid="B35">Szegedy et&#xa0;al., 2015</xref>), and EfficientNet (<xref ref-type="bibr" rid="B36">Tan and Le, 2019</xref>) have demonstrated remarkable accuracy in classifying crop diseases, effectively learning to distinguish subtle patterns indicative of pathological stress. More recently, Visual Transformers (VTs) have emerged as a powerful alternative, leveraging self-attention mechanisms to capture global contextual relationships within an image, often achieving state-of-the-art performance. 
However, despite their demonstrated prowess (<xref ref-type="bibr" rid="B8">Dosovitskiy, 2020</xref>; <xref ref-type="bibr" rid="B20">Liu et&#xa0;al., 2021</xref>), both CNN and VT architectures often exhibit suboptimal performance when deployed against the inherent complexities of real-world field-based plant disease diagnosis (<xref ref-type="bibr" rid="B15">Kumar et&#xa0;al., 2023</xref>). A significant limiting factor is that these naive networks struggle to capture fine-grained pathological features with high fidelity. The common strategy of merely leveraging deeper hierarchical representations proves insufficient for encapsulating the complete semantic information of a disease, as critical early-stage symptomatic details can be lost or diluted through successive pooling and non-linear transformations (<xref ref-type="bibr" rid="B17">Lin et&#xa0;al., 2025</xref>). While multi-scale feature fusion frameworks, such as feature pyramid networks (FPN) (<xref ref-type="bibr" rid="B18">Lin et&#xa0;al., 2017</xref>), have been proposed to mitigate this issue by combining features from different depths, they often inadvertently learn redundant feature patterns across scales. This redundancy limits their effectiveness, as simply aggregating features without explicit guidance does not necessarily ensure that each scale contributes distinct and complementary information. The fundamental challenge, therefore, lies not only in extracting multi-scale features but in explicitly enabling each level to learn discriminative and non-repetitive feature patterns. An ideal solution would orchestrate the feature extraction process such that shallower layers, with their higher spatial resolution, are sharpened to focus on low-level textural patterns, color variances, and localized edge information indicative of early infection. Concurrently, deeper layers should be refined to excel at integrating these details into a robust high-level semantic understanding of the disease phenotype. 
How to disentangle and maximize the utility of these heterogeneous representations across scales remains a pivotal and unresolved research question.</p>
<p>To address the challenges of feature redundancy and insufficient feature granularity in multi-scale learning, we propose a novel region-specific feature decoupling and adaptive fusion network (RFDAF-Net) for robust soybean disease recognition in complex field environments. Distinct from previous approaches, our model introduces two mechanism-specific innovations. First, unlike traditional attention mechanisms (<xref ref-type="bibr" rid="B13">Hu et&#xa0;al., 2018</xref>) that solely focus on highlighting salient regions, we design a region-specific feature decoupling (RFD) module equipped with a dual-branch strategy of simultaneous enhancement and suppression. While standard attention tends to converge on the most obvious discriminative parts, the RFD module explicitly suppresses these activated regions in a parallel branch. This mechanism forces the network to decouple features and mine complementary visual cues from non-dominant regions, thereby enriching the feature diversity across varying depths. Second, to overcome the limitations of naive multi-scale fusion methods, we introduce a region-specific feature adaptive fusion (RFAF) module. Instead of treating features from shallow, intermediate, and deep layers equally (<xref ref-type="bibr" rid="B18">Lin et&#xa0;al., 2017</xref>), the RFAF module employs a content-aware gated integration mechanism. It dynamically computes spatial weight maps for each scale, allowing the network to adaptively filter out noise from shallow layers while selectively retaining high-level semantic concepts from deep layers based on the specific input context.</p>
<p>The main contributions of this work are summarized as follows:</p>
<list list-type="order">
<list-item>
<p>We propose a novel region-specific feature decoupling and adaptive fusion network (RFDAF-Net) for identification of soybean diseases. RFDAF-Net explicitly decouples and adaptively reintegrates multi-scale features to enhance the extraction of discriminative patterns while suppressing redundancy, effectively improving feature representativeness.</p></list-item>
<list-item>
<p>The proposed architecture demonstrates broad compatibility with both CNN and Vision Transformer backbones, consistently enhancing soybean disease recognition performance across diverse network frameworks.</p></list-item>
<list-item>
<p>We establish a state-of-the-art performance for in-field soybean disease recognition on a challenging dataset, demonstrating substantial improvements in accuracy over strong baseline models.</p></list-item>
</list>
</sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<p>The application of computer vision and deep learning for plant disease recognition has evolved significantly, progressing from laboratory settings towards in-field settings (<xref ref-type="bibr" rid="B2">Antwi et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B37">Tian et&#xa0;al., 2024</xref>). This section reviews the relevant literature in three key areas: traditional image processing-based methods, the rise of deep learning-based approaches, and recent advancements in addressing the challenges of agricultural environments.</p>
<p>Traditional Image Processing-Based Methods: <xref ref-type="bibr" rid="B30">Rumpf et&#xa0;al. (2010)</xref> proposed an early detection method for sugar beet diseases using support vector machine (SVM) with spectral vegetation indices, achieving up to 97% accuracy in distinguishing diseased leaves and demonstrating potential for presymptomatic disease identification. <xref ref-type="bibr" rid="B29">Prasad et&#xa0;al. (2012)</xref> developed a plant biometric system using Gabor wavelet transform (GWT) and SVM for crop disease detection, achieving robust accuracy of approximately 89% across various conditions. The method aids in guiding farmers on disease control to improve production. <xref ref-type="bibr" rid="B39">Wang et&#xa0;al. (2012)</xref> developed a plant disease recognition system using K-means segmentation and BP neural networks with multiple visual features, achieving 100% accuracy in identifying grape and wheat diseases. However, these traditional approaches often depend on manually designed features and shallow classifiers, which struggle to capture complex and hierarchical disease characteristics, leading to limited adaptability and generalization across diverse disease appearances.</p>
<p>Deep Learning for Plant Disease Recognition: <xref ref-type="bibr" rid="B3">Atila et&#xa0;al. (2021)</xref> employed EfficientNet architectures (B4 and B5) with transfer learning for plant disease classification on the PlantVillage dataset, achieving superior accuracy up to 99.97% and precision up to 99.39%, significantly outperforming other deep learning models. <xref ref-type="bibr" rid="B24">Macdonald et&#xa0;al. (2024)</xref> proposed a novel lightweight CNN with dense residual connections for plant disease classification without pre-training, achieving high performance (96.75% accuracy, 97.62% precision) with only 228K parameters, demonstrating computational efficiency comparable to larger models on the PlantVillage dataset. <xref ref-type="bibr" rid="B1">Albahli (2025)</xref> proposed a lightweight EfficientNetV2-based model integrating RGB, multispectral drone imagery, and IoT sensor data, achieving 94.3% accuracy with 28.5 ms inference time and a 30% parameter reduction, demonstrating strong efficiency and robustness for edge-deployable crop disease diagnosis. While existing methods achieve high accuracy in controlled settings, their performance often degrades in real field conditions due to complex backgrounds, variable lighting, and occlusions. This highlights the critical need for models specifically designed to address the challenges of in-field plant disease recognition.</p>
<p>Challenges in Field-Based Diagnosis: <xref ref-type="bibr" rid="B19">Liu et&#xa0;al. (2024)</xref> proposed a multi-scale deformable convolution network for apple leaf disease detection, utilizing dual-branch convolution and constrained offset intervals to handle varying lesion scales and deformable geometries, achieving 66.8% accuracy in complex natural environments. <xref ref-type="bibr" rid="B16">Li et&#xa0;al. (2024)</xref> applied transfer learning with a frozen Xception backbone to identify 10 tea pests and diseases in complex plantation environments, achieving 98.58% test accuracy without attention mechanisms, demonstrating high practical utility for Yunnan tea cultivation. <xref ref-type="bibr" rid="B14">Huang et&#xa0;al. (2025)</xref> proposed a hybrid ConvNeXt-ViT model with ECA and DropKey for robust apple disease recognition, achieving 99.2% accuracy in lab conditions and 79.3% in natural environments, outperforming ViT/ConvNeXt/ResNet50 by 18.6&#x2013;37.8% in field generalization. While existing deep learning models have laid a solid foundation for automated field plant disease diagnosis, their direct application to complex in-field soybean disease recognition remains constrained. Current architectures still struggle to fully address challenges such as fine-grained feature loss, redundant multi-scale representations, and sensitivity to field noise. In particular, the limitations of existing feature fusion strategies motivate the need for a more sophisticated approach to explicitly disentangle and adaptively integrate feature representations.</p>
</sec>
<sec id="s3" sec-type="materials|methods">
<label>3</label>
<title>Materials and methods</title>
<sec id="s3_1">
<label>3.1</label>
<title>Data collection and pre-processing</title>
<p>All experimental data employed in this research were obtained from the publicly accessible Auburn soybean leaf disease image dataset (ASDID) (<xref ref-type="bibr" rid="B4">Bevers et&#xa0;al., 2022</xref>). This dataset comprises soybean leaf images captured under real field conditions across multiple growing seasons (2020&#x2013;2021) in the United States, using both smartphone cameras and digital single-lens reflex (DSLR) cameras to ensure diversity in imaging quality and perspective. It encompasses eight categories of soybean leaf conditions: Bacterial blight, Cercospora leaf blight, Downy mildew, Frogeye leaf spot, Healthy, Potassium deficiency, Soybean rust, and Target spot. The images exhibit significant variation in background contexts, including complex field environments, uniform white backgrounds, and grassy settings, which enhances the robustness and generalizability of models trained on this data. For the purposes of this study, a total of 9,648 images were selected from ASDID after initial screening for quality and label consistency. Example images from each category are provided in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Representative samples of each category.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1734292-g001.tif">
<alt-text content-type="machine-generated">A series of eight images showing different leaf conditions. Bacterial blight has dark spots, Cercospora leaf blight shows yellowing, Downy mildew has slight discoloration, Frogeye leaf spot has circular lesions. Healthy leaves appear vibrant green. Potassium deficiency shows yellowing between veins. Rust is visible as small, brown patches. Target spot has circular brown areas on the leaves.</alt-text>
</graphic></fig>
<p>Considering the varied and often high resolutions of images within the dataset, as well as the imbalanced distribution across disease categories, preprocessing is essential prior to model training to improve computational efficiency and stabilize learning. To address this, all soybean leaf images were resized to a uniform dimension of 224&#xd7;224 pixels. The dataset was then partitioned into training, validation, and test sets with a ratio of 7:1:2 per category, ensuring representative sampling across all classes. The training set is used to optimize model parameters through backward propagation, while the validation set enables hyperparameter tuning and early stopping to prevent overfitting. The test set, kept entirely separate and unused in any training phase, provides an unbiased evaluation of the final model&#x2019;s generalization performance on unseen data. In addition, deep learning models typically require large volumes of training data to achieve robust generalization and mitigate overfitting. To augment the dataset, we applied a series of geometric and pixel-level transformations. These included random rotations, horizontal and vertical flipping, color jittering, brightness adjustment, additive Gaussian noise, and contrast limited adaptive histogram equalization (CLAHE). The data distributions before and after augmentation are shown in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>. Such augmentations simulate imaging variations under real-world conditions and enhance the model&#x2019;s ability to recognize disease patterns under diverse environments.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Data distributions before and after augmentation.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="left">Classes</th>
<th valign="middle" colspan="4" align="center">Before augmentation</th>
<th valign="middle" colspan="3" align="center">After augmentation</th>
</tr>
<tr>
<th valign="middle" align="center">Train</th>
<th valign="middle" align="center">Val</th>
<th valign="middle" align="center">Test</th>
<th valign="middle" align="center">Total</th>
<th valign="middle" align="center">Train</th>
<th valign="middle" align="center">Val</th>
<th valign="middle" align="center">Test</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Bacterial blight</td>
<td valign="middle" align="center">339</td>
<td valign="middle" align="center">48</td>
<td valign="middle" align="center">97</td>
<td valign="middle" align="center">484</td>
<td valign="middle" align="center">2373</td>
<td valign="middle" align="center">48</td>
<td valign="middle" align="center">97</td>
</tr>
<tr>
<td valign="middle" align="left">Cercospora leaf blight</td>
<td valign="middle" align="center">1119</td>
<td valign="middle" align="center">159</td>
<td valign="middle" align="center">320</td>
<td valign="middle" align="center">1598</td>
<td valign="middle" align="center">2238</td>
<td valign="middle" align="center">159</td>
<td valign="middle" align="center">320</td>
</tr>
<tr>
<td valign="middle" align="left">Downy mildew</td>
<td valign="middle" align="center">457</td>
<td valign="middle" align="center">65</td>
<td valign="middle" align="center">130</td>
<td valign="middle" align="center">652</td>
<td valign="middle" align="center">2285</td>
<td valign="middle" align="center">65</td>
<td valign="middle" align="center">130</td>
</tr>
<tr>
<td valign="middle" align="left">Frogeye leaf spot</td>
<td valign="middle" align="center">1078</td>
<td valign="middle" align="center">154</td>
<td valign="middle" align="center">308</td>
<td valign="middle" align="center">1540</td>
<td valign="middle" align="center">2156</td>
<td valign="middle" align="center">154</td>
<td valign="middle" align="center">308</td>
</tr>
<tr>
<td valign="middle" align="left">Healthy</td>
<td valign="middle" align="center">1143</td>
<td valign="middle" align="center">163</td>
<td valign="middle" align="center">326</td>
<td valign="middle" align="center">1632</td>
<td valign="middle" align="center">2286</td>
<td valign="middle" align="center">163</td>
<td valign="middle" align="center">326</td>
</tr>
<tr>
<td valign="middle" align="left">Potassium deficiency</td>
<td valign="middle" align="center">724</td>
<td valign="middle" align="center">103</td>
<td valign="middle" align="center">207</td>
<td valign="middle" align="center">1034</td>
<td valign="middle" align="center">2172</td>
<td valign="middle" align="center">103</td>
<td valign="middle" align="center">207</td>
</tr>
<tr>
<td valign="middle" align="left">Soybean rust</td>
<td valign="middle" align="center">1139</td>
<td valign="middle" align="center">162</td>
<td valign="middle" align="center">326</td>
<td valign="middle" align="center">1627</td>
<td valign="middle" align="center">2278</td>
<td valign="middle" align="center">162</td>
<td valign="middle" align="center">326</td>
</tr>
<tr>
<td valign="middle" align="left">Target spot</td>
<td valign="middle" align="center">757</td>
<td valign="middle" align="center">108</td>
<td valign="middle" align="center">216</td>
<td valign="middle" align="center">1081</td>
<td valign="middle" align="center">2271</td>
<td valign="middle" align="center">108</td>
<td valign="middle" align="center">216</td>
</tr>
<tr>
<td valign="middle" align="left">Total</td>
<td valign="middle" align="center">6756</td>
<td valign="middle" align="center">962</td>
<td valign="middle" align="center">1930</td>
<td valign="middle" align="center">9648</td>
<td valign="middle" align="center">18059</td>
<td valign="middle" align="center">962</td>
<td valign="middle" align="center">1930</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>The proposed RFDAF-Net</title>
<p>The overall architecture of the proposed RFDAF-Net is illustrated in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>. The model consists of two core components: a region-specific feature decoupling (RFD) module and a region-specific feature adaptive fusion (RFAF) module, which can be flexibly integrated into various backbone networks such as ResNet or Swin Transformer. The RFD module is designed to enhance discriminative features and suppress redundant information through a dual-pathway structure, enabling explicit separation of fine-grained details in shallow layers and high-level semantics in deeper layers. The RFAF module dynamically fuses these multi-scale disentangled features using a content-aware spatial weighting mechanism to achieve adaptive feature integration. Specifically, an input image is first processed by the backbone network to generate multi-level feature maps. These features are then fed into the RFD module, where low-level features are refined to retain detailed textural and lesion information, while high-level features are purified to emphasize semantic concepts. The disentangled features from different levels are subsequently merged by the RFAF module, which assigns spatially varying weights to emphasize informative regions and suppress noise. The fused feature representation is finally passed to a classifier to produce the disease probability distribution.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>The overall framework of the proposed RFDAF-Net. It consists of the backbone network, the RFD modules, and the RFAF modules.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1734292-g002.tif">
<alt-text content-type="machine-generated">Diagram illustrating a neural network architecture for leaf disease detection. The process begins with an input image of a leaf, then progresses through a series of blue modules labeled as the backbone network. These modules include three RFD (region-specific feature decoupling) modules highlighting areas of interest. Three RFAF (region-specific feature adaptive fusion) modules connect to these points, leading to pooling and classification processes, indicating a flow from image input to classification output. The legend identifies components: gray for RFD, peach for RFAF, and green for pooling and classification.</alt-text>
</graphic></fig>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Region-specific feature decoupling module</title>
<p>Current deep learning models applied to soybean disease recognition under field conditions predominantly rely on features from a single network depth for classification. While these features provide a broad receptive field, they often fail to capture sufficiently discriminative details under conditions of high visual similarity, such as those commonly encountered in fine-grained soybean disease identification. These challenges are compounded by low inter-class variance among different diseases and high intra-class variance due to varying symptom manifestations across growth stages and environmental conditions. To mitigate these issues, we introduce a region-specific feature decoupling (RFD) module. This module is designed to explicitly extract hierarchical and complementary information by emphasizing salient regions while suppressing less informative areas. The structure of the RFD module is shown in <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>. It employs a dual-branch mechanism comprising a feature enhancement branch and a feature suppression branch. The enhancement branch amplifies semantically important regions in the current feature map, whereas the suppression branch attenuates the influence of those same regions in subsequent layers. This process encourages the network to identify new distinctive features in later stages. For example, regions showing high activation in shallow feature maps, which often correspond to early textural symptoms, are reinforced by the enhancement branch. At the same time, the suppression branch reduces the emphasis on these regions in deeper feature maps, directing the network&#x2019;s attention to other discriminative parts of the image. This cross-scale interaction enables the model to learn diverse and complementary visual cues across different depths, significantly improving the discriminative power and robustness of the feature representations for accurate in-field soybean disease recognition.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>The RFD module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1734292-g003.tif">
<alt-text content-type="machine-generated">Diagram illustrating a neural network process. It begins with an image input split into grids, followed by nonlinear transformation and pooling. Outputs undergo element-wise multiplication, with a Softmax function applied to intermediate steps. Schematic includes box labels: Split (S), Nonlinear transformation (N), Pooling (P), and Element-wise multiplication (&#xd7;). Final outputs are shown as processed images with highlighted areas.</alt-text>
</graphic></fig>
<p>The region-specific feature decoupling (RFD) module processes an input tensor <inline-formula>
<mml:math display="inline" id="im1"><mml:mrow><mml:mi>X</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>, where <italic>C</italic> indicates the number of channels, <italic>H</italic> and <italic>W</italic> represent the height and width, respectively. The input is first partitioned into 7 segments along both the horizontal and vertical axes. Each horizontal segment is denoted as <inline-formula>
<mml:math display="inline" id="im2"><mml:mrow><mml:msubsup><mml:mi>X</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>w</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>H</mml:mi><mml:mo stretchy="false">/</mml:mo><mml:mn>7</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> for <italic>i</italic>&#xa0;=&#xa0;1, 2, &#x2026;, 7, and each vertical segment as <inline-formula>
<mml:math display="inline" id="im3"><mml:mrow><mml:msubsup><mml:mi>X</mml:mi><mml:mi>j</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>W</mml:mi><mml:mo stretchy="false">/</mml:mo><mml:mn>7</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> for <italic>j</italic>&#xa0;=&#xa0;1, 2, &#x2026;, 7.</p>
<p>Then, these two outputs undergo nonlinear transformations to yield <inline-formula>
<mml:math display="inline" id="im4"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>w</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im5"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>j</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>, respectively. Note that both <inline-formula>
<mml:math display="inline" id="im6"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>w</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im7"><mml:mrow><mml:msubsup><mml:mi>F</mml:mi><mml:mi>j</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> have a channel number of 1. Next, global average pooling (GAP), followed by softmax activation and broadcasting, is applied to these two tensors, thereby generating enhanced weights for each direction (<xref ref-type="disp-formula" rid="eq1">Equations 1</xref>, <xref ref-type="disp-formula" rid="eq2">2</xref>):</p>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mrow><mml:msub><mml:mi>E</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>B</mml:mi><mml:mtext>&#xa0;</mml:mtext><mml:mo>[</mml:mo><mml:mtext>Softmax</mml:mtext><mml:mo>&#xa0;</mml:mo><mml:mo>(</mml:mo><mml:mtext>GAP</mml:mtext><mml:mo>(</mml:mo><mml:msubsup><mml:mi>F</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>w</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>)</mml:mo><mml:mo>)</mml:mo><mml:mo>]</mml:mo></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mrow><mml:msub><mml:mi>E</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>B</mml:mi><mml:mtext>&#xa0;</mml:mtext><mml:mo>[</mml:mo><mml:mtext>Softmax</mml:mtext><mml:mo>&#xa0;</mml:mo><mml:mo>(</mml:mo><mml:mtext>GAP</mml:mtext><mml:mo>(</mml:mo><mml:msubsup><mml:mi>F</mml:mi><mml:mi>j</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>)</mml:mo><mml:mo>)</mml:mo><mml:mo>]</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>Where <italic>B</italic>[&#xb7;] represents the broadcasting. Here, <inline-formula>
<mml:math display="inline" id="im8"><mml:mrow><mml:msub><mml:mi>E</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:msubsup><mml:mi>e</mml:mi><mml:mn>1</mml:mn><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>w</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>e</mml:mi><mml:mn>2</mml:mn><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>w</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mi>e</mml:mi><mml:mn>7</mml:mn><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>w</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im9"><mml:mrow><mml:msub><mml:mi>E</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:msubsup><mml:mi>e</mml:mi><mml:mn>1</mml:mn><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>e</mml:mi><mml:mn>2</mml:mn><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mi>e</mml:mi><mml:mn>7</mml:mn><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:math></inline-formula> contain the enhancement coefficients for the horizontal and vertical directions, respectively. The enhanced feature map <italic>Y<sub>E</sub></italic> is subsequently computed as (<xref ref-type="disp-formula" rid="eq3">Equation 3</xref>):</p>
<disp-formula id="eq3"><label>(3)</label>
<mml:math display="block" id="M3"><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mi>E</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>X</mml:mi><mml:mo>+</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo>&#x2297;</mml:mo><mml:msub><mml:mi>E</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mo>&#x2297;</mml:mo><mml:msub><mml:mi>E</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>The suppression feature map <italic>Y<sub>S</sub></italic> is defined as (<xref ref-type="disp-formula" rid="eq4">Equation 4</xref>):</p>
<disp-formula id="eq4"><label>(4)</label>
<mml:math display="block" id="M4"><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mi>S</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>X</mml:mi><mml:mo>&#x2297;</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mo>&#x2297;</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mi>h</mml:mi></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>The suppression weights <inline-formula>
<mml:math display="inline" id="im10"><mml:mrow><mml:msubsup><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>w</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im11"><mml:mrow><mml:msubsup><mml:mi>s</mml:mi><mml:mi>j</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> are determined by (<xref ref-type="disp-formula" rid="eq5">Equation 5</xref>):</p>
<p><inline-formula>
<mml:math display="inline" id="im12"><mml:mrow><mml:msubsup><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>w</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mo>{</mml:mo><mml:mtable columnalign="left" equalrows="true" equalcolumns="true"><mml:mtr columnalign="left"><mml:mtd columnalign="left"><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x3b1;</mml:mi><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd columnalign="left"><mml:mrow><mml:mtext>if</mml:mtext><mml:mo>&#x2004;</mml:mo><mml:msubsup><mml:mi>e</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>w</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>max</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>E</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign="left"><mml:mtd columnalign="left"><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd columnalign="left"><mml:mrow><mml:mtext>otherwise</mml:mtext></mml:mrow></mml:mtd></mml:mtr></mml:mtable><mml:mo>,</mml:mo><mml:mtext>&#xa0;&#xa0;&#xa0;</mml:mtext></mml:mrow></mml:math></inline-formula><inline-formula id="eq5">
<mml:math display="inline" id="im12a"><mml:mrow><mml:msubsup><mml:mi>s</mml:mi><mml:mi>j</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mo>{</mml:mo><mml:mtable columnalign="left" equalrows="true" equalcolumns="true"><mml:mtr columnalign="left"><mml:mtd columnalign="left"><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x3b1;</mml:mi><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd columnalign="left"><mml:mrow><mml:mtext>if</mml:mtext><mml:mo>&#x2004;</mml:mo><mml:msubsup><mml:mi>e</mml:mi><mml:mi>j</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mtext>max</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>E</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign="left"><mml:mtd columnalign="left"><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd columnalign="left"><mml:mrow><mml:mtext>otherwise</mml:mtext></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math></inline-formula></p>
<p>Where <inline-formula>
<mml:math display="inline" id="im13"><mml:mi>&#x3b1;</mml:mi></mml:math></inline-formula> is a hyperparameter and <inline-formula>
<mml:math display="inline" id="im14"><mml:mrow><mml:msub><mml:mi>S</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:msubsup><mml:mi>s</mml:mi><mml:mn>1</mml:mn><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>w</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mi>s</mml:mi><mml:mn>7</mml:mn><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>w</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im15"><mml:mrow><mml:msub><mml:mi>S</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:msubsup><mml:mi>s</mml:mi><mml:mn>1</mml:mn><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mi>s</mml:mi><mml:mn>7</mml:mn><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:math></inline-formula>.</p>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Region-specific feature adaptive fusion module</title>
<p>While the RFD module effectively extracts multi-scale discriminative features, integrating these hierarchical representations remains challenging. Most existing fusion methods simply concatenate or sum multi-level features, ignoring the distinct contributions of shallow, intermediate, and deep feature maps. This often leads to feature redundancy and limits the model&#x2019;s ability to adaptively emphasize the most relevant information across spatial and semantic levels. To address this, we propose a Region-specific Feature Adaptive Fusion (RFAF) module, which dynamically combines features from multiple network depths through a learned weighting mechanism. The RFAF module consists of three dedicated branches processing shallow, intermediate, and deep feature maps, respectively. The shallow branch preserves fine-grained details such as texture and local lesions, the intermediate branch captures transitional patterns, and the deep branch encapsulates high-level semantic information.</p>
<p>The core of the RFAF module lies in its adaptive fusion strategy. Feature maps from each branch are first aligned to a common spatial scale using up-sampling or down-sampling operations. These aligned features are then concatenated and processed through a convolutional layer to generate a compact representation. A sigmoid activation is applied to produce three adaptive weight maps corresponding to each branch. The final output is formed by a weighted summation of the original features using these learned weights, enabling the fusion process to emphasize the most informative features from each level in a context-aware manner. This design allows the RFAF module to effectively integrate complementary information while suppressing less relevant features. By dynamically adjusting the importance of features at different depths, the module significantly enhances the representational capacity of the network for improved soybean disease recognition under challenging field conditions. The structure of the RFAF module is illustrated in <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>The RFAF module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1734292-g004.tif">
<alt-text content-type="machine-generated">Diagram showing a deep learning network architecture with blocks illustrating feature maps. The top section, labeled RFAF, shows a sequence of operations: element-wise multiplication and addition, using colored matrices. The bottom section depicts a network with convolution and upsampling processes from input images. The feature maps are merged, processed by a convolution layer, then activated by a sigmoid function. Operations include element-wise multiplication, addition, and concatenation, indicated by respective symbols.</alt-text>
</graphic></fig>
<p>The RFAF module synthesizes multi-scale feature representations through a gated integration mechanism. Let <inline-formula>
<mml:math display="inline" id="im16"><mml:mrow><mml:msup><mml:mi mathvariant="script">F</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im17"><mml:mrow><mml:msup><mml:mi mathvariant="script">F</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>2</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>, and <inline-formula>
<mml:math display="inline" id="im18"><mml:mrow><mml:msup><mml:mi mathvariant="script">F</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>3</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mn>3</mml:mn></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> denote the input feature tensors from the shallow, intermediate, and deep branches, respectively, each capturing distinct levels of semantic abstraction.</p>
<p>To facilitate cross-branch integration, we first apply feature transformation and spatial alignment. Features from branches 2 and 3 are processed through pointwise convolutional layers to project them into a common embedding space with reduced channel dimensionality (<xref ref-type="disp-formula" rid="eq6">Equation 6</xref>):</p>
<disp-formula id="eq6"><label>(6)</label>
<mml:math display="block" id="M6"><mml:mrow><mml:msup><mml:mi mathvariant="script">G</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>&#x3d5;</mml:mi><mml:mo>(</mml:mo><mml:msub><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>W</mml:mi></mml:mstyle><mml:mi>k</mml:mi></mml:msub><mml:mo>*</mml:mo><mml:msup><mml:mi mathvariant="script">F</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>+</mml:mo><mml:msub><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>b</mml:mi></mml:mstyle><mml:mi>k</mml:mi></mml:msub><mml:mo>)</mml:mo><mml:mo>,</mml:mo><mml:mtext>&#x2003;</mml:mtext><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mo>&#xa0;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im19"><mml:mrow><mml:msub><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>W</mml:mi></mml:mstyle><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im20"><mml:mrow><mml:msub><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>b</mml:mi></mml:mstyle><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> are learnable parameters of the <inline-formula>
<mml:math display="inline" id="im21"><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:math></inline-formula> convolutions, and <inline-formula>
<mml:math display="inline" id="im22"><mml:mi>&#x3d5;</mml:mi></mml:math></inline-formula> denotes the ReLU activation function. The transformed features <inline-formula>
<mml:math display="inline" id="im23"><mml:mrow><mml:msup><mml:mi mathvariant="script">G</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>2</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im24"><mml:mrow><mml:msup><mml:mi mathvariant="script">G</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>3</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> are then resized to the spatial dimensions of <inline-formula>
<mml:math display="inline" id="im25"><mml:mrow><mml:msup><mml:mi mathvariant="script">F</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> via bilinear interpolation, denoted as <inline-formula>
<mml:math display="inline" id="im26"><mml:mrow><mml:msup><mml:mover accent="true"><mml:mi mathvariant="script">G</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>2</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im27"><mml:mrow><mml:msup><mml:mover accent="true"><mml:mi mathvariant="script">G</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>3</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>. The aligned features are concatenated along the channel dimension to form a composite representation (<xref ref-type="disp-formula" rid="eq7">Equation 7</xref>):</p>
<disp-formula id="eq7"><label>(7)</label>
<mml:math display="block" id="M7"><mml:mrow><mml:mi mathvariant="script">H</mml:mi><mml:mo>=</mml:mo><mml:mtext>Concat&#xa0;</mml:mtext><mml:mo>(</mml:mo><mml:msup><mml:mi mathvariant="script">F</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mover accent="true"><mml:mi mathvariant="script">G</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>2</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mover accent="true"><mml:mi mathvariant="script">G</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>3</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>We then compute a set of spatially adaptive weight matrices through a fusion gate implemented as a 1&#xd7;1 convolution followed by sigmoid activation (<xref ref-type="disp-formula" rid="eq8">Equation 8</xref>):</p>
<disp-formula id="eq8"><label>(8)</label>
<mml:math display="block" id="M8"><mml:mrow><mml:mi mathvariant="script">A</mml:mi><mml:mo>=</mml:mo><mml:mi>&#x3c3;</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>W</mml:mi></mml:mstyle><mml:mi>a</mml:mi></mml:msub><mml:mtext>&#xa0;</mml:mtext><mml:mo>*</mml:mo><mml:mtext>&#xa0;</mml:mtext><mml:mi mathvariant="script">H</mml:mi><mml:mo>+</mml:mo><mml:msub><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>b</mml:mi></mml:mstyle><mml:mi>a</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im28"><mml:mrow><mml:mi mathvariant="script">A</mml:mi><mml:mo>=</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:msub><mml:mi mathvariant="script">A</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi mathvariant="script">A</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi mathvariant="script">A</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo stretchy="false">]</mml:mo><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> represents the attention weights for each branch. The final output is obtained via a weighted fusion (<xref ref-type="disp-formula" rid="eq9">Equation 9</xref>):</p>
<disp-formula id="eq9"><label>(9)</label>
<mml:math display="block" id="M9"><mml:mrow><mml:msub><mml:mi mathvariant="script">F</mml:mi><mml:mrow><mml:mtext>out</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi mathvariant="script">A</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>&#x2297;</mml:mo><mml:msup><mml:mi mathvariant="script">F</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>+</mml:mo><mml:msub><mml:mi mathvariant="script">A</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2297;</mml:mo><mml:msup><mml:mover accent="true"><mml:mi mathvariant="script">G</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>2</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup><mml:mo>+</mml:mo><mml:msub><mml:mi mathvariant="script">A</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo>&#x2297;</mml:mo><mml:msup><mml:mover accent="true"><mml:mi mathvariant="script">G</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>3</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:math>
</disp-formula>
<p>This formulation enables context-aware recombination of multi-scale features, enhancing discriminability while preserving structural details essential for accurate fine-grained recognition under challenging field conditions.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments</title>
<sec id="s4_1">
<label>4.1</label>
<title>Experimental setup and evaluation metrics</title>
<p>To ensure the reproducibility of our results, we detail the implementation and data settings as follows. The experiments were conducted on a workstation equipped with NVIDIA GeForce RTX 5070 Ti GPU and an Intel Core i5-12600KF CPU. The software environment was configured with Python 3.8 and PyTorch 1.13 on Windows 11. Regarding data settings and preprocessing, all input images were resized to a uniform dimension of 224 &#xd7; 224 pixels using bilinear interpolation. We applied standard Z-score normalization using the ImageNet mean (<italic>&#x3bc;</italic> = [0.485, 0.456, 0.406]) and standard deviation (<italic>&#x3c3;</italic> = [0.229, 0.224, 0.225]). Data augmentation techniques, including random rotation, random flipping, and color jittering, were applied exclusively to the training set to enhance generalization, while the validation and test sets remained unaugmented to ensure unbiased evaluation. For the optimization procedure, we utilized the Stochastic Gradient Descent (SGD) optimizer with a momentum of 0.9 and a weight decay of 5&#xd7;10<sup>&#x2212;4</sup>. The batch size was set to 32. The model was trained for a total of 100 epochs to guarantee convergence. We employed a differential learning rate strategy: the backbone network, initialized with ImageNet-1K pre-trained weights, was fine-tuned with a lower learning rate of 2 &#xd7; 10<sup>&#x2212;4</sup>, while the newly added RFD and RFAF modules were initialized randomly and trained with a higher learning rate of 2 &#xd7; 10<sup>&#x2212;3</sup>. The suppression threshold <italic>&#x3b1;</italic> was set to 0.5.&#xa0;A summary of the key experimental configurations is provided in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>. To ensure the reliability of the experimental results, each model configuration was executed five times with different random seeds. 
The results reported in this paper correspond to the model checkpoint that achieved the highest accuracy on the validation set.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Experimental environment and key training configuration details.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Category</th>
<th valign="middle" align="left">Configuration</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">GPU</td>
<td valign="middle" align="left">NVIDIA GeForce RTX 5070 Ti</td>
</tr>
<tr>
<td valign="middle" align="left">CPU</td>
<td valign="middle" align="left">Intel i5-12600KF</td>
</tr>
<tr>
<td valign="middle" align="left">OS</td>
<td valign="middle" align="left">Windows 11</td>
</tr>
<tr>
<td valign="middle" align="left">Python Version</td>
<td valign="middle" align="left">3.8.18</td>
</tr>
<tr>
<td valign="middle" align="left">PyTorch Version</td>
<td valign="middle" align="left">1.13.1</td>
</tr>
<tr>
<td valign="middle" align="left">Backbone LR</td>
<td valign="middle" align="left">2 &#xd7; 10<sup>&#x2212;4</sup></td>
</tr>
<tr>
<td valign="middle" align="left">New Modules LR</td>
<td valign="middle" align="left">2 &#xd7; 10<sup>&#x2212;3</sup></td>
</tr>
<tr>
<td valign="middle" align="left">Optimizer</td>
<td valign="middle" align="left">SGD (Momentum: 0.9)</td>
</tr>
<tr>
<td valign="middle" align="left">Weight Decay</td>
<td valign="middle" align="left">5 &#xd7; 10<sup>&#x2212;4</sup></td>
</tr>
<tr>
<td valign="middle" align="left">Batch Size</td>
<td valign="middle" align="left">32</td>
</tr>
<tr>
<td valign="middle" align="left">Total Epochs</td>
<td valign="middle" align="left">100</td>
</tr>
<tr>
<td valign="middle" align="left">Threshold (<italic>&#x3b1;</italic>)</td>
<td valign="middle" align="left">0.5</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The performance of the proposed model was quantitatively evaluated using four standard classification metrics: Accuracy (Acc), Precision (Pre), Recall (Rec), and F1-score (F1). These metrics were derived from the confusion matrix, which records true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN). Their mathematical definitions are as follows (<xref ref-type="disp-formula" rid="eq10">Equations 10</xref>&#x2013;<xref ref-type="disp-formula" rid="eq13">13</xref>):</p>
<disp-formula id="eq10"><label>(10)</label>
<mml:math display="block" id="M10"><mml:mrow><mml:mtext>Accuracy</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>T</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>T</mml:mi><mml:mi>N</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq11"><label>(11)</label>
<mml:math display="block" id="M11"><mml:mrow><mml:mtext>Precision</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq12"><label>(12)</label>
<mml:math display="block" id="M12"><mml:mrow><mml:mtext>Recall</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq13"><label>(13)</label>
<mml:math display="block" id="M13"><mml:mrow><mml:mtext>F</mml:mtext><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mtext>score</mml:mtext><mml:mo>=</mml:mo><mml:mn>2</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mfrac><mml:mrow><mml:mtext>Precision</mml:mtext><mml:mo>&#xd7;</mml:mo><mml:mtext>Recall</mml:mtext></mml:mrow><mml:mrow><mml:mtext>Precision</mml:mtext><mml:mo>+</mml:mo><mml:mtext>Recall</mml:mtext></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<p>In addition to these quantitative metrics, interpretability tools such as Grad-CAM and t-SNE were employed to visualize decision regions and feature distributions, further validating the model&#x2019;s reliability and explanatory capacity.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Comparison with state-of-the-art models</title>
<p>To evaluate the effectiveness of the proposed RFDAF-Net, we compared its performance against multiple state-of-the-art classification models on the soybean disease recognition task. As summarized in <xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref>, RFDAF-Net achieves the highest accuracy of 99.43%, outperforming all competing approaches. Among the CNN-based models, VGG16 attained the lowest accuracy at 94.25%, while more modern architectures such as ResNet50 and EfficientNet-B0 reached around 97%. Transformer-based backbones generally delivered stronger performance, with Swin-B achieving 98.24%. Recent specialized architectures including TFANet and DIEC-ViT further improved performance, reaching 98.18% and 99.02%, respectively. Our RFDAF-Net, built upon a Swin-B backbone and enhanced with region-specific feature disentanglement and adaptive fusion mechanisms, attained a top accuracy of 99.43%. This result demonstrates the efficacy of the proposed modules in capturing discriminative multi-scale features and effectively integrating hierarchical information under challenging field conditions. The consistent improvement over strong baselines confirms that RFDAF-Net offers a robust solution for fine-grained agricultural image recognition.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Comparison results with state-of-the-art models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Model</th>
<th valign="middle" align="left">Acc (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">VGG16 (<xref ref-type="bibr" rid="B34">Simonyan and Zisserman, 2014</xref>)</td>
<td valign="middle" align="left">94.25</td>
</tr>
<tr>
<td valign="middle" align="left">ResNet50 (<xref ref-type="bibr" rid="B10">He et&#xa0;al., 2016</xref>)</td>
<td valign="middle" align="left">97.26</td>
</tr>
<tr>
<td valign="middle" align="left">MobileNetV2 (<xref ref-type="bibr" rid="B31">Sandler et&#xa0;al., 2018</xref>)</td>
<td valign="middle" align="left">97.15</td>
</tr>
<tr>
<td valign="middle" align="left">ShuffleNetV2 (<xref ref-type="bibr" rid="B22">Ma et&#xa0;al., 2018</xref>)</td>
<td valign="middle" align="left">96.12</td>
</tr>
<tr>
<td valign="middle" align="left">EfficientNet-B0 (<xref ref-type="bibr" rid="B36">Tan and Le, 2019</xref>)</td>
<td valign="middle" align="left">97.10</td>
</tr>
<tr>
<td valign="middle" align="left">ConvNeXt-B (<xref ref-type="bibr" rid="B21">Liu et&#xa0;al., 2022</xref>)</td>
<td valign="middle" align="left">97.98</td>
</tr>
<tr>
<td valign="middle" align="left">PiT-B (<xref ref-type="bibr" rid="B11">Heo et&#xa0;al., 2021</xref>)</td>
<td valign="middle" align="left">97.36</td>
</tr>
<tr>
<td valign="middle" align="left">PVT-B (<xref ref-type="bibr" rid="B40">Wang et&#xa0;al., 2021</xref>)</td>
<td valign="middle" align="left">97.05</td>
</tr>
<tr>
<td valign="middle" align="left">ViT-B (<xref ref-type="bibr" rid="B8">Dosovitskiy, 2020</xref>)</td>
<td valign="middle" align="left">97.41</td>
</tr>
<tr>
<td valign="middle" align="left">Swin-B (<xref ref-type="bibr" rid="B20">Liu et&#xa0;al., 2021</xref>)</td>
<td valign="middle" align="left">98.24</td>
</tr>
<tr>
<td valign="middle" align="left">TFANet (<xref ref-type="bibr" rid="B25">Pan et&#xa0;al., 2023</xref>)</td>
<td valign="middle" align="left">98.18</td>
</tr>
<tr>
<td valign="middle" align="left">DIEC-ViT (<xref ref-type="bibr" rid="B17">Lin et&#xa0;al., 2025</xref>)</td>
<td valign="middle" align="left">99.02</td>
</tr>
<tr>
<td valign="middle" align="left">RFDAF-Net (Ours)</td>
<td valign="middle" align="left">99.43</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Impact of the backbone networks</title>
<p>To evaluate the generalization ability of the proposed RFDAF-Net, we conducted extensive experiments using four different backbone networks: MobileNetV2, ResNet50, ConvNeXt-B, and Swin-B. The performance comparisons between the baseline backbones and their RFDAF-Net-enhanced variants on the validation set are illustrated in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref>. The results demonstrate that RFDAF-Net consistently improves classification accuracy across all backbone architectures. Specifically, when integrated with MobileNetV2, which served as a relatively weaker baseline, RFDAF-Net achieved the most significant performance gain. This suggests that the proposed feature disentanglement and adaptive fusion mechanisms effectively compensate for the limited representational capacity of lightweight backbones. With stronger backbones such as ResNet50 and ConvNeXt-B, RFDAF-Net still provided clear improvements, underscoring its ability to enhance even well-performing models. When combined with Swin-B, which already delivered high baseline accuracy, RFDAF-Net attained near-perfect performance. The marginal but consistent improvement in this case can be attributed to the fact that the classification task is approaching its performance ceiling, leaving limited room for further gains. These observations confirm that RFDAF-Net is robust and architecture-agnostic, providing measurable benefits across a diverse range of backbone networks. Its ability to achieve the largest improvements on weaker backbones is especially promising for practical applications where computational efficiency is critical. To further evaluate the robustness and generalization capability of RFDAF-Net, we compared its performance against baseline backbones on the test set. Quantitative results are presented in <xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref>, which includes Accuracy, Precision, Recall, and F1-score for each model configuration. 
The results clearly demonstrate that RFDAF-Net consistently enhances performance across all backbone networks. Notably, the absolute performance of RFDAF-Net improves with the capacity of the backbone network&#x2014;RFDAF-Net (Swin-B) achieves the highest results with 99.43% accuracy and 99.50% F1-score, while RFDAF-Net (MobileNetV2) also shows substantial gains, reaching 98.76% accuracy. This trend confirms that the effectiveness of our method is amplified when combined with more powerful feature extractors, yet it still brings significant improvements even on lighter backbones. Moreover, the conclusions drawn from the test set are fully consistent with those from the validation set: RFDAF-Net provides noticeable improvements across all architectures, with the most pronounced gains occurring on weaker backbones. The stable superiority under both validation and test environments strongly attests to the general applicability and robustness of the proposed method.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Impact of the different backbone networks on the validation set.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1734292-g005.tif">
<alt-text content-type="machine-generated">Four line graphs show model accuracy over 100 epochs. Top left: MobileNetV2, RFDAF-Net outperforms baseline. Top right: ResNet50, RFDAF-Net leads baseline. Bottom left: ConvNeXt-B, RFDAF-Net surpasses baseline. Bottom right: Swin-B, RFDAF-Net achieves higher accuracy than baseline.</alt-text>
</graphic></fig>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Impact of the different backbone networks on the test set.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Model</th>
<th valign="middle" align="left">Acc (%)</th>
<th valign="middle" align="left">Pre (%)</th>
<th valign="middle" align="left">Rec (%)</th>
<th valign="middle" align="left">F1 (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">MobileNetV2</td>
<td valign="middle" align="center">97.15</td>
<td valign="middle" align="center">97.47</td>
<td valign="middle" align="center">97.20</td>
<td valign="middle" align="center">97.33</td>
</tr>
<tr>
<td valign="middle" align="left">ResNet50</td>
<td valign="middle" align="center">97.26</td>
<td valign="middle" align="center">97.41</td>
<td valign="middle" align="center">97.32</td>
<td valign="middle" align="center">97.36</td>
</tr>
<tr>
<td valign="middle" align="left">ConvNeXt-B</td>
<td valign="middle" align="center">97.98</td>
<td valign="middle" align="center">98.04</td>
<td valign="middle" align="center">97.93</td>
<td valign="middle" align="center">97.96</td>
</tr>
<tr>
<td valign="middle" align="left">Swin-B</td>
<td valign="middle" align="center">98.24</td>
<td valign="middle" align="center">98.60</td>
<td valign="middle" align="center">97.65</td>
<td valign="middle" align="center">98.09</td>
</tr>
<tr>
<td valign="middle" align="left">RFDAF-Net (MobileNetV2)</td>
<td valign="middle" align="center">98.76</td>
<td valign="middle" align="center">98.87</td>
<td valign="middle" align="center">98.98</td>
<td valign="middle" align="center">98.92</td>
</tr>
<tr>
<td valign="middle" align="left">RFDAF-Net (ResNet50)</td>
<td valign="middle" align="center">98.96</td>
<td valign="middle" align="center">98.94</td>
<td valign="middle" align="center">99.02</td>
<td valign="middle" align="center">98.98</td>
</tr>
<tr>
<td valign="middle" align="left">RFDAF-Net (ConvNeXt-B)</td>
<td valign="middle" align="center">99.28</td>
<td valign="middle" align="center">99.38</td>
<td valign="middle" align="center">99.28</td>
<td valign="middle" align="center">99.32</td>
</tr>
<tr>
<td valign="middle" align="left">RFDAF-Net (Swin-B)</td>
<td valign="middle" align="center">99.43</td>
<td valign="middle" align="center">99.53</td>
<td valign="middle" align="center">99.48</td>
<td valign="middle" align="center">99.50</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Ablation analysis</title>
<p>To evaluate the individual contributions of the proposed components, we conducted ablation experiments using MobileNetV2 as the baseline backbone. The quantitative results are summarized in <xref ref-type="table" rid="T5"><bold>Table&#xa0;5</bold></xref>. The baseline MobileNetV2 achieves an accuracy of 97.15%. However, without explicit feature guidance, standard backbones often struggle to balance fine-grained textures with high-level semantics. Introducing the RFAF module alone improves accuracy to 97.92%. This improvement is not merely due to feature aggregation but stems from the RFAF&#x2019;s gated attention mechanism. Unlike simple summation which propagates noise, RFAF dynamically assigns lower weights to irrelevant background clutter in shallow layers while amplifying semantic cues in deep layers, effectively &#x2018;cleaning&#x2019; the representation before classification. Incorporating the RFD module yields a more substantial gain, reaching 98.18% accuracy. This performance leap validates the effectiveness of our region-specific decoupling strategy. By explicitly suppressing the most salient discriminative regions in the parallel branch, the RFD module forces the network to shift its attention to complementary visual evidence, such as subtle lesion margins or early-stage chlorosis patterns. This prevents the model from over-relying on a single dominant feature and enhances robustness against intra-class variations. Finally, combining both modules (RFDAF-Net) achieves the highest performance of 98.76%. This demonstrates a synergistic effect: the RFD module enriches the diversity of extracted features by mining non-salient patterns, while the RFAF module optimally integrates these diverse features by selectively emphasizing the most informative scales. Together, they form a closed-loop system that maximizes feature representativeness and discriminability.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Ablation experiments on the MobileNetV2 backbone.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Backbone</th>
<th valign="middle" align="left">RFD</th>
<th valign="middle" align="left">RFAF</th>
<th valign="middle" align="left">Acc (%)</th>
<th valign="middle" align="left">Pre (%)</th>
<th valign="middle" align="left">Rec (%)</th>
<th valign="middle" align="left">F1 (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">MobileNetV2</td>
<td valign="middle" align="left"/>
<td valign="middle" align="left"/>
<td valign="middle" align="left">97.15</td>
<td valign="middle" align="center">97.47</td>
<td valign="middle" align="center">97.20</td>
<td valign="middle" align="center">97.33</td>
</tr>
<tr>
<td valign="middle" align="left">MobileNetV2</td>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left"/>
<td valign="middle" align="left">98.18</td>
<td valign="middle" align="center">98.24</td>
<td valign="middle" align="center">98.55</td>
<td valign="middle" align="center">98.38</td>
</tr>
<tr>
<td valign="middle" align="left">MobileNetV2</td>
<td valign="middle" align="left"/>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left">97.92</td>
<td valign="middle" align="center">97.77</td>
<td valign="middle" align="center">97.83</td>
<td valign="middle" align="center">97.79</td>
</tr>
<tr>
<td valign="middle" align="left">MobileNetV2</td>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left">98.76</td>
<td valign="middle" align="center">98.87</td>
<td valign="middle" align="center">98.98</td>
<td valign="middle" align="center">98.92</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_5">
<label>4.5</label>
<title>Visualization analysis</title>
<p>The Grad-CAM (<xref ref-type="bibr" rid="B32">Selvaraju et&#xa0;al., 2017</xref>) visualizations in <xref ref-type="fig" rid="f6"><bold>Figure&#xa0;6</bold></xref> provide qualitative evidence of the effectiveness of the proposed RFDAF-Net module compared to a standard ResNet50 baseline. The visualizations reveal that RFDAF-Net produces more discriminative and semantically meaningful activation patterns across all feature levels. In the shallow branch, RFDAF-Net focuses sharply on fine-grained details such as lesion boundaries and textural variations, while the baseline model exhibits scattered and less interpretable activations. In the middle and deep branches, RFDAF-Net continues to maintain precise spatial localization of pathological regions, effectively capturing higher-level semantic features without losing resolution or contextual coherence. In contrast, the baseline model relies predominantly on its deep features for localization, with shallow and middle branches providing limited and often noisy contributions. These results demonstrate that RFDAF-Net enables each branch to specialize in capturing features at its respective scale, leading to more hierarchical and interpretable feature learning. The module&#x2019;s ability to retain discriminative information across scales contributes significantly to its improved accuracy and robustness in complex field conditions.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Grad-CAM visualization.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1734292-g006.tif">
<alt-text content-type="machine-generated">Comparison of leaf images with heatmap overlays. Two original leaf images are shown on the left. To the right, each image is analyzed under three conditions: &#x201c;Baseline&#x201d; and &#x201c;RFDAF-Net,&#x201d; further divided into &#x201c;Shallow branch,&#x201d; &#x201c;Middle branch,&#x201d; and &#x201c;Deep branch.&#x201d; The heatmaps use color variations to highlight differences in each condition.</alt-text>
</graphic></fig>
<p><xref ref-type="fig" rid="f7"><bold>Figure&#xa0;7</bold></xref> presents a comparative visualization of intermediate feature maps from both the baseline MobileNetV2 and the proposed RFDAF-Net across shallow, middle, and deep branches. The feature maps generated by RFDAF-Net exhibit stronger structural awareness and semantic coherence compared to those of the baseline. In the shallow layers, RFDAF-Net produces feature maps that clearly highlight fine-grained details such as edges, textures, and early symptomatic patterns. The middle-layer features show increased semantic abstraction while retaining spatial precision, effectively capturing transitional patterns indicative of disease progression. The deep features focus on high-level semantic concepts, such as the overall shape and extent of diseased regions, with minimal noise or irrelevant activations. In contrast, the baseline MobileNetV2 fails to maintain such hierarchical discriminability. Its shallow and middle features often appear noisy or semantically ambiguous, while the deep features, though somewhat consolidated, lack the spatial precision and interpretability of those produced by RFDAF-Net. These visual comparisons reinforce the quantitative results, confirming that RFDAF-Net enhances feature learning across all network levels, leading to more informative and task-relevant representations essential for accurate soybean disease recognition.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Feature map visualization.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1734292-g007.tif">
<alt-text content-type="machine-generated">Original image of a leaf with spots. Below are three rows labeled Shallow branch, Middle branch, and Deep branch, showing two columns of visual representations. The first column is labeled RFDAF-Net, and the second column is labeled Baseline. Each panel contains colorful grid patterns indicating different data analyses.</alt-text>
</graphic></fig>
<p>The t-SNE (<xref ref-type="bibr" rid="B23">Maaten and Hinton, 2008</xref>) visualization in <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8</bold></xref> illustrates the feature distribution of the baseline Swin-B model and the proposed RFDAF-Net in a two-dimensional embedded space. The feature representations generated by RFDAF-Net exhibit significantly improved class separability compared to those of the baseline. RFDAF-Net produces more compact and distinct clustering for each category, with larger inter-class margins and smaller intra-class variances. This indicates that the model learns highly discriminative representations that effectively separate different disease categories while maintaining consistency within each class. In contrast, the baseline Swin-B model shows overlapping clusters and more dispersed feature distributions, particularly for visually similar categories, reflecting its limited ability to capture fine-grained discriminative features.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>t-SNE visualization. <bold>(a)</bold> represents RFDAF-Net. <bold>(b)</bold> represents the Swin-B baseline.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1734292-g008.tif">
<alt-text content-type="machine-generated">Two scatterplots compare RFDAF-Net and Baseline classifications of plant health and diseases: bacterial blight, cercospora leaf blight, downy mildew, frogeye, potassium deficiency, soybean rust, and target spot. Each condition is represented by colored clusters. RFDAF-Net shows clearer separation of clusters compared to the Baseline.</alt-text>
</graphic></fig>
</sec>
<sec id="s4_6">
<label>4.6</label>
<title>Analysis of the hyperparameter <italic>&#x3b1;</italic></title>
<p>The hyperparameter <italic>&#x3b1;</italic> controls the degree of cross-branch feature suppression in the RFD module, specifically regulating how strongly salient regions detected in one branch are suppressed in the next. This encourages subsequent branches to focus on complementary regions and learn diverse features. As shown in <xref ref-type="fig" rid="f9"><bold>Figure&#xa0;9</bold></xref>, model performance varies significantly with different values of <italic>&#x3b1;</italic>. Accuracy improves as <italic>&#x3b1;</italic> increases from 0.1, peaking at <italic>&#x3b1;</italic> =&#xa0;0.5. Within this range, appropriate suppression effectively prevents feature redundancy across branches. Beyond <italic>&#x3b1;</italic>&#xa0;=&#xa0;0.5, further increasing its value causes excessive suppression, which may remove useful information and reduce accuracy. These results confirm that controlled inter-branch suppression is essential for learning hierarchical and complementary features. The optimal value of <italic>&#x3b1;</italic> =&#xa0;0.5 provides the best trade-off for maintaining feature diversity while minimizing redundancy.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Influence of the hyperparameter <italic>&#x3b1;</italic>.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1734292-g009.tif">
<alt-text content-type="machine-generated">Line graph depicting accuracy in percentage on the vertical axis, ranging from 98.8 to 99.2, against values from 0.1 to 1.0 on the horizontal axis. The line shows fluctuations, peaking around 0.5 before gradually declining. Triangular markers highlight data points along the line.</alt-text>
</graphic></fig>
</sec>
<sec id="s4_7">
<label>4.7</label>
<title>Generalization analysis on other crops</title>
<p>To assess the robustness and transferability of RFDAF-Net beyond soybean crops, we extended our evaluation to the Paddy Disease Dataset (<xref ref-type="bibr" rid="B27">Petchiammal et&#xa0;al., 2023</xref>). This dataset comprises 10,407 high-resolution images collected from real-world paddy fields, categorized into one healthy class and nine disease classes (e.g., Blast, Dead Heart). Following standard protocols, the data was partitioned into a training set (7,808 samples, 75%) and a test set (2,599 samples, 25%). As shown in <xref ref-type="table" rid="T6"><bold>Table&#xa0;6</bold></xref>, RFDAF-Net achieved a superior accuracy of 99.15% on this dataset. It significantly outperforms the strong baseline Swin-B (97.45%) by 1.70% and surpasses the recent state-of-the-art model DIEC-ViT (98.94%). These results indicate that the proposed RFD and RFAF modules are not overfitted to specific soybean features. Instead, they demonstrate excellent generalization capabilities, effectively capturing discriminative pathological patterns across different plant species and complex agricultural environments.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Comparison results with state-of-the-art models on the paddy disease dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Model</th>
<th valign="middle" align="left">Acc (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">PiT-B (<xref ref-type="bibr" rid="B11">Heo et&#xa0;al., 2021</xref>)</td>
<td valign="middle" align="left">97.26</td>
</tr>
<tr>
<td valign="middle" align="left">PVT-B (<xref ref-type="bibr" rid="B40">Wang et&#xa0;al., 2021</xref>)</td>
<td valign="middle" align="left">97.40</td>
</tr>
<tr>
<td valign="middle" align="left">ViT-B (<xref ref-type="bibr" rid="B8">Dosovitskiy, 2020</xref>)</td>
<td valign="middle" align="left">97.30</td>
</tr>
<tr>
<td valign="middle" align="left">Swin-B (<xref ref-type="bibr" rid="B20">Liu et&#xa0;al., 2021</xref>)</td>
<td valign="middle" align="left">97.45</td>
</tr>
<tr>
<td valign="middle" align="left">ViT-B + EFG (<xref ref-type="bibr" rid="B6">Chang et&#xa0;al., 2024</xref>)</td>
<td valign="middle" align="left">95.00</td>
</tr>
<tr>
<td valign="middle" align="left">DIEC-ViT (<xref ref-type="bibr" rid="B17">Lin et&#xa0;al., 2025</xref>)</td>
<td valign="middle" align="left">98.94</td>
</tr>
<tr>
<td valign="middle" align="left">RFDAF-Net (Ours)</td>
<td valign="middle" align="left">99.15</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Limitations</title>
<p>While RFDAF-Net demonstrates state-of-the-art performance, it is essential to address several practical challenges regarding its deployment in real-world agricultural environments, particularly concerning computational feasibility. To provide a transparent analysis of the model&#x2019;s complexity, we compared the parameter count of our proposed method against the baseline. As shown in <xref ref-type="table" rid="T7"><bold>Table&#xa0;7</bold></xref>, introducing the RFD and RFAF modules increases the total number of parameters from 86.74 M (Baseline Swin-B) to 105.01 M. While this additional complexity contributes to the performance gains, it inevitably raises computational costs. In practical agricultural environments, disease diagnosis often relies on resource-constrained edge devices, such as drones or handheld smartphones, which typically have limited memory and processing power. The current model size implies higher inference latency and energy consumption, potentially restricting its feasibility for real-time, large-scale field surveying tasks without hardware acceleration.</p>
<table-wrap id="T7" position="float">
<label>Table&#xa0;7</label>
<caption>
<p>Comparison of the number of parameters.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Model</th>
<th valign="middle" align="left">Param (M)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Baseline (Swin-B)</td>
<td valign="middle" align="left">86.74</td>
</tr>
<tr>
<td valign="middle" align="left">RFDAF-Net (Swin-B)</td>
<td valign="middle" align="left">105.01</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In light of these findings, our future research will prioritize bridging the gap between high-performance algorithms and practical utility. Specifically, we aim to investigate model compression techniques, such as knowledge distillation and network pruning, to develop lightweight variants of RFDAF-Net suitable for edge deployment on drones or mobile devices. Furthermore, to mitigate the reliance on extensive expert annotations, we plan to explore semi-supervised or few-shot learning strategies, thereby enhancing the model&#x2019;s adaptability to diverse and evolving agricultural scenarios with minimal data requirements.</p>
</sec>
<sec id="s6" sec-type="conclusions">
<label>6</label>
<title>Conclusions</title>
<p>In this study, we proposed the RFDAF-Net to address the critical challenges of feature redundancy and insufficient feature granularity in multi-scale learning for soybean disease recognition under complex field conditions. The introduced RFD module effectively disentangles multi-scale features by enhancing discriminative patterns and suppressing redundant information through a dual-pathway mechanism, enabling explicit capture of fine-grained details in shallow layers and high-level semantics in deeper layers. Furthermore, the RFAF module dynamically integrates these decoupled features using content-aware spatial weighting, achieving adaptive multi-scale fusion that significantly enhances representational capacity. Extensive experiments demonstrated that RFDAF-Net consistently outperforms state-of-the-art models across multiple backbone architectures and evaluation metrics. Ablation studies confirmed the individual contributions of both proposed modules, while visualization results using Grad-CAM, feature maps, and t-SNE provided interpretable evidence of the model&#x2019;s ability to learn hierarchical and discriminative features. The analysis of key hyperparameters further validated the robustness and generalizability of the proposed approach.</p>
</sec>
</body>
<back>
<sec id="s7" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material. Further inquiries can be directed to the corresponding author.</p></sec>
<sec id="s8" sec-type="author-contributions">
<title>Author contributions</title>
<p>RP: Writing &#x2013; original draft, Data curation, Methodology, Software, Conceptualization, Writing &#x2013; review &amp; editing. QY: Writing &#x2013; original draft, Writing &#x2013; review &amp; editing, Methodology, Formal analysis. JC: Data curation, Writing &#x2013; original draft, Investigation. YC: Funding acquisition, Conceptualization, Writing &#x2013; review &amp; editing, Writing &#x2013; original draft, Supervision, Methodology.</p></sec>
<sec id="s10" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s11" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s12" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Albahli</surname> <given-names>S.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Agrifusionnet: A lightweight deep learning model for multisource plant disease diagnosis</article-title>. <source>Agriculture</source> <volume>15</volume>, <fpage>1523</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agriculture15141523</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Antwi</surname> <given-names>K.</given-names></name>
<name><surname>Bennin</surname> <given-names>K. E.</given-names></name>
<name><surname>Asiedu</surname> <given-names>D. K. P.</given-names></name>
<name><surname>Tekinerdogan</surname> <given-names>B.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>On the application of image augmentation for plant disease detection: A systematic literature review</article-title>. <source>Smart Agric. Technol.</source> <volume>9</volume>, <fpage>100590</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.atech.2024.100590</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Atila</surname> <given-names>&#xdc;.</given-names></name>
<name><surname>U&#xe7;ar</surname> <given-names>M.</given-names></name>
<name><surname>Akyol</surname> <given-names>K.</given-names></name>
<name><surname>U&#xe7;ar</surname> <given-names>E.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Plant leaf disease classification using efficientnet deep learning model</article-title>. <source>Ecol. Inf.</source> <volume>61</volume>, <fpage>101182</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ecoinf.2020.101182</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Bevers</surname> <given-names>N.</given-names></name>
<name><surname>Sikora</surname> <given-names>E. J.</given-names></name>
<name><surname>Hardy</surname> <given-names>N. B.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Pictures of diseased soybean leaves by category captured in field and with controlled backgrounds: Auburn soybean disease image dataset (asdid)</article-title>. <source>Dryad</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.5061/dryad.41ns1rnj3</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Chakrabarty</surname> <given-names>A.</given-names></name>
<name><surname>Ahmed</surname> <given-names>S. T.</given-names></name>
<name><surname>Islam</surname> <given-names>M. F. U.</given-names></name>
<name><surname>Aziz</surname> <given-names>S. M.</given-names></name>
<name><surname>Maidin</surname> <given-names>S. S.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>An interpretable fusion model integrating lightweight cnn and transformer architectures for rice leaf disease identification</article-title>. <source>Ecol. Inf.</source> <volume>82</volume>, <fpage>102718</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ecoinf.2024.102718</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Chang</surname> <given-names>B.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Zhao</surname> <given-names>X.</given-names></name>
<name><surname>Li</surname> <given-names>G.</given-names></name>
<name><surname>Yuan</surname> <given-names>P.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>A general-purpose edge-feature guidance module to enhance vision transformers for plant disease identification</article-title>. <source>Expert Syst. Appl.</source> <volume>237</volume>, <fpage>121638</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.eswa.2023.121638</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Dilawari</surname> <given-names>R.</given-names></name>
<name><surname>Kaur</surname> <given-names>N.</given-names></name>
<name><surname>Priyadarshi</surname> <given-names>N.</given-names></name>
<name><surname>Prakash</surname> <given-names>I.</given-names></name>
<name><surname>Patra</surname> <given-names>A.</given-names></name>
<name><surname>Mehta</surname> <given-names>S.</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). &#x201c;
<article-title>Soybean: A key player for global food security</article-title>,&#x201d; in <source>Soybean improvement: physiological, molecular and genetic perspectives</source> (
<publisher-name>Springer</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>46</lpage>.
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Dosovitskiy</surname> <given-names>A.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>An image is worth 16x16 words: Transformers for image recognition at scale</article-title>. <source>arXiv preprint arXiv:2010.11929</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2010.11929</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hari</surname> <given-names>P.</given-names></name>
<name><surname>Singh</surname> <given-names>M. P.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Adaptive knowledge transfer using federated deep learning for plant disease detection</article-title>. <source>Comput. Electron. Agric.</source> <volume>229</volume>, <fpage>109720</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.109720</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>He</surname> <given-names>K.</given-names></name>
<name><surname>Zhang</surname> <given-names>X.</given-names></name>
<name><surname>Ren</surname> <given-names>S.</given-names></name>
<name><surname>Sun</surname> <given-names>J.</given-names></name>
</person-group> (<year>2016</year>). &#x201c;
<article-title>Deep residual learning for image recognition</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. <fpage>770</fpage>&#x2013;<lpage>778</lpage>.
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Heo</surname> <given-names>B.</given-names></name>
<name><surname>Yun</surname> <given-names>S.</given-names></name>
<name><surname>Han</surname> <given-names>D.</given-names></name>
<name><surname>Chun</surname> <given-names>S.</given-names></name>
<name><surname>Choe</surname> <given-names>J.</given-names></name>
<name><surname>Oh</surname> <given-names>S. J.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>Rethinking spatial dimensions of vision transformers</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>. <fpage>11936</fpage>&#x2013;<lpage>11945</lpage>.
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hossain</surname> <given-names>M. M.</given-names></name>
<name><surname>Sultana</surname> <given-names>F.</given-names></name>
<name><surname>Mostafa</surname> <given-names>M.</given-names></name>
<name><surname>Ferdus</surname> <given-names>H.</given-names></name>
<name><surname>Rahman</surname> <given-names>M.</given-names></name>
<name><surname>Rana</surname> <given-names>J. A.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>Plant disease dynamics in a changing climate: impacts, molecular mechanisms, and climate-informed strategies for sustainable management</article-title>. <source>Discover Agric.</source> <volume>2</volume>, <fpage>132</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s44279-024-00144-w</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Hu</surname> <given-names>J.</given-names></name>
<name><surname>Shen</surname> <given-names>L.</given-names></name>
<name><surname>Sun</surname> <given-names>G.</given-names></name>
</person-group> (<year>2018</year>). &#x201c;
<article-title>Squeeze-and-excitation networks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. <fpage>7132</fpage>&#x2013;<lpage>7141</lpage>.
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Huang</surname> <given-names>X.</given-names></name>
<name><surname>Xu</surname> <given-names>D.</given-names></name>
<name><surname>Chen</surname> <given-names>Y.</given-names></name>
<name><surname>Zhang</surname> <given-names>Q.</given-names></name>
<name><surname>Feng</surname> <given-names>P.</given-names></name>
<name><surname>Ma</surname> <given-names>Y.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>Econv-vit: A strongly generalized apple leaf disease classification model based on the fusion of convnext and transformer</article-title>. <source>Inf. Process. Agric</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.inpa.2025.03.001</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Kumar</surname> <given-names>Y.</given-names></name>
<name><surname>Singh</surname> <given-names>R.</given-names></name>
<name><surname>Moudgil</surname> <given-names>M. R.</given-names></name>
<name><surname>Kamini</surname></name>
</person-group> (<year>2023</year>). 
<article-title>A systematic review of different categories of plant disease detection using deep learning-based approaches</article-title>. <source>Arch. Comput. Methods Eng.</source> <volume>30</volume>, <fpage>4757</fpage>&#x2013;<lpage>4779</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11831-023-09958-1</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>Z.</given-names></name>
<name><surname>Sun</surname> <given-names>J.</given-names></name>
<name><surname>Shen</surname> <given-names>Y.</given-names></name>
<name><surname>Yang</surname> <given-names>Y.</given-names></name>
<name><surname>Wang</surname> <given-names>X.</given-names></name>
<name><surname>Wang</surname> <given-names>X.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>Deep migration learning-based recognition of diseases and insect pests in yunnan tea under complex environments</article-title>. <source>Plant Methods</source> <volume>20</volume>, <fpage>101</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s13007-024-01219-x</pub-id>, PMID: <pub-id pub-id-type="pmid">38970029</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Lin</surname> <given-names>J.</given-names></name>
<name><surname>Chen</surname> <given-names>X.</given-names></name>
<name><surname>Lou</surname> <given-names>L.</given-names></name>
<name><surname>You</surname> <given-names>L.</given-names></name>
<name><surname>Cernava</surname> <given-names>T.</given-names></name>
<name><surname>Huang</surname> <given-names>D.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>Diec-vit: Discriminative information enhanced contrastive vision transformer for the identification of plant diseases in complex environments</article-title>. <source>Expert Syst. Appl.</source> <volume>281</volume>, <fpage>127730</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.eswa.2025.127730</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Lin</surname> <given-names>T.-Y.</given-names></name>
<name><surname>Doll&#xe1;r</surname> <given-names>P.</given-names></name>
<name><surname>Girshick</surname> <given-names>R.</given-names></name>
<name><surname>He</surname> <given-names>K.</given-names></name>
<name><surname>Hariharan</surname> <given-names>B.</given-names></name>
<name><surname>Belongie</surname> <given-names>S.</given-names></name>
</person-group> (<year>2017</year>). &#x201c;
<article-title>Feature pyramid networks for object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. <fpage>2117</fpage>&#x2013;<lpage>2125</lpage>.
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>B.</given-names></name>
<name><surname>Huang</surname> <given-names>X.</given-names></name>
<name><surname>Sun</surname> <given-names>L.</given-names></name>
<name><surname>Wei</surname> <given-names>X.</given-names></name>
<name><surname>Ji</surname> <given-names>Z.</given-names></name>
<name><surname>Zhang</surname> <given-names>H.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Mcdcnet: Multi-scale constrained deformable convolution network for apple leaf disease detection</article-title>. <source>Comput. Electron. Agric.</source> <volume>222</volume>, <fpage>109028</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.109028</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>Z.</given-names></name>
<name><surname>Lin</surname> <given-names>Y.</given-names></name>
<name><surname>Cao</surname> <given-names>Y.</given-names></name>
<name><surname>Hu</surname> <given-names>H.</given-names></name>
<name><surname>Wei</surname> <given-names>Y.</given-names></name>
<name><surname>Zhang</surname> <given-names>Z.</given-names></name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;
<article-title>Swin transformer: Hierarchical vision transformer using shifted windows</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>. <fpage>10012</fpage>&#x2013;<lpage>10022</lpage>.
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>Z.</given-names></name>
<name><surname>Mao</surname> <given-names>H.</given-names></name>
<name><surname>Wu</surname> <given-names>C.-Y.</given-names></name>
<name><surname>Feichtenhofer</surname> <given-names>C.</given-names></name>
<name><surname>Darrell</surname> <given-names>T.</given-names></name>
<name><surname>Xie</surname> <given-names>S.</given-names></name>
</person-group> (<year>2022</year>). &#x201c;
<article-title>A convnet for the 2020s</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>. <fpage>11976</fpage>&#x2013;<lpage>11986</lpage>.
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Ma</surname> <given-names>N.</given-names></name>
<name><surname>Zhang</surname> <given-names>X.</given-names></name>
<name><surname>Zheng</surname> <given-names>H.-T.</given-names></name>
<name><surname>Sun</surname> <given-names>J.</given-names></name>
</person-group> (<year>2018</year>). &#x201c;
<article-title>Shufflenet v2: Practical guidelines for efficient cnn architecture design</article-title>,&#x201d; in <conf-name>Proceedings of the European conference on computer vision (ECCV)</conf-name>. <fpage>116</fpage>&#x2013;<lpage>131</lpage>.
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>van der Maaten</surname> <given-names>L.</given-names></name>
<name><surname>Hinton</surname> <given-names>G.</given-names></name>
</person-group> (<year>2008</year>). 
<article-title>Visualizing data using t-sne</article-title>. <source>J. Mach. Learn. Res.</source> <volume>9</volume>, <fpage>2579</fpage>&#x2013;<lpage>2605</lpage>. Available online at: <uri xlink:href="http://jmlr.org/papers/v9/vandermaaten08a.html">http://jmlr.org/papers/v9/vandermaaten08a.html</uri>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Macdonald</surname> <given-names>W.</given-names></name>
<name><surname>Sari</surname> <given-names>Y. A.</given-names></name>
<name><surname>Pahlevani</surname> <given-names>M.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Grow-light smart monitoring system leveraging lightweight deep learning for plant disease classification</article-title>. <source>Artif. Intell. Agric.</source> <volume>12</volume>, <fpage>44</fpage>&#x2013;<lpage>56</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.aiia.2024.03.003</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Pan</surname> <given-names>R.</given-names></name>
<name><surname>Lin</surname> <given-names>J.</given-names></name>
<name><surname>Cai</surname> <given-names>J.</given-names></name>
<name><surname>Zhang</surname> <given-names>L.</given-names></name>
<name><surname>Liu</surname> <given-names>J.</given-names></name>
<name><surname>Wen</surname> <given-names>X.</given-names></name>
<etal/>
</person-group>. (<year>2023</year>). 
<article-title>A two-stage feature aggregation network for multi-category soybean leaf disease identification</article-title>. <source>J. King Saud University-Computer Inf. Sci.</source> <volume>35</volume>, <fpage>101669</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jksuci.2023.101669</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Parez</surname> <given-names>S.</given-names></name>
<name><surname>Dilshad</surname> <given-names>N.</given-names></name>
<name><surname>Lee</surname> <given-names>J. W.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>A channel attention-driven optimized cnn for efficient early detection of plant diseases in resource constrained environment</article-title>. <source>Agriculture</source> <volume>15</volume>, <fpage>127</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agriculture15020127</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Petchiammal</surname> <given-names>K.</given-names></name>
<name><surname>Murugan</surname> <given-names>B.</given-names></name>
<name><surname>Arjunan</surname> <given-names>P.</given-names></name>
</person-group> (<year>2023</year>). &#x201c;
<article-title>Paddy doctor: A visual image dataset for automated paddy disease classification and benchmarking</article-title>,&#x201d; in <conf-name>Proceedings of the 6th Joint International Conference on Data Science &amp; Management of Data (10th ACM IKDD CODS and 28th COMAD)</conf-name>. <fpage>203</fpage>&#x2013;<lpage>207</lpage>.
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Pranta</surname> <given-names>A. S. U. K.</given-names></name>
<name><surname>Fardin</surname> <given-names>H.</given-names></name>
<name><surname>Debnath</surname> <given-names>J.</given-names></name>
<name><surname>Hossain</surname> <given-names>A.</given-names></name>
<name><surname>Sakib</surname> <given-names>A. H.</given-names></name>
<name><surname>Ahmed</surname> <given-names>M. R.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>A novel maxvit model for accelerated and precise soybean leaf and seed disease identification</article-title>. <source>Computers</source> <volume>14</volume>, <fpage>197</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/computers14050197</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Prasad</surname> <given-names>S.</given-names></name>
<name><surname>Kumar</surname> <given-names>P.</given-names></name>
<name><surname>Hazra</surname> <given-names>R.</given-names></name>
<name><surname>Kumar</surname> <given-names>A.</given-names></name>
</person-group> (<year>2012</year>). &#x201c;
<article-title>Plant leaf disease detection using gabor wavelet transform</article-title>,&#x201d; in <conf-name>International Conference on Swarm, Evolutionary, and Memetic Computing</conf-name>. <fpage>372</fpage>&#x2013;<lpage>379</lpage>.
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Rumpf</surname> <given-names>T.</given-names></name>
<name><surname>Mahlein</surname> <given-names>A.-K.</given-names></name>
<name><surname>Steiner</surname> <given-names>U.</given-names></name>
<name><surname>Oerke</surname> <given-names>E.-C.</given-names></name>
<name><surname>Dehne</surname> <given-names>H.-W.</given-names></name>
<name><surname>Pl&#xfc;mer</surname> <given-names>L.</given-names></name>
</person-group> (<year>2010</year>). 
<article-title>Early detection and classification of plant diseases with support vector machines based on hyperspectral reflectance</article-title>. <source>Comput. Electron. Agric.</source> <volume>74</volume>, <fpage>91</fpage>&#x2013;<lpage>99</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2010.06.009</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Sandler</surname> <given-names>M.</given-names></name>
<name><surname>Howard</surname> <given-names>A.</given-names></name>
<name><surname>Zhu</surname> <given-names>M.</given-names></name>
<name><surname>Zhmoginov</surname> <given-names>A.</given-names></name>
<name><surname>Chen</surname> <given-names>L.-C.</given-names></name>
</person-group> (<year>2018</year>). &#x201c;
<article-title>Mobilenetv2: Inverted residuals and linear bottlenecks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. <fpage>4510</fpage>&#x2013;<lpage>4520</lpage>.
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Selvaraju</surname> <given-names>R. R.</given-names></name>
<name><surname>Cogswell</surname> <given-names>M.</given-names></name>
<name><surname>Das</surname> <given-names>A.</given-names></name>
<name><surname>Vedantam</surname> <given-names>R.</given-names></name>
<name><surname>Parikh</surname> <given-names>D.</given-names></name>
<name><surname>Batra</surname> <given-names>D.</given-names></name>
</person-group> (<year>2017</year>). &#x201c;
<article-title>Grad-cam: Visual explanations from deep networks via gradient-based localization</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE international conference on computer vision</conf-name>. <fpage>618</fpage>&#x2013;<lpage>626</lpage>.
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Sharma</surname> <given-names>V.</given-names></name>
<name><surname>Tripathi</surname> <given-names>A. K.</given-names></name>
<name><surname>Mittal</surname> <given-names>H.</given-names></name>
<name><surname>Nkenyereye</surname> <given-names>L.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Soyatrans: A novel transformer model for fine-grained visual classification of soybean leaf disease diagnosis</article-title>. <source>Expert Syst. Appl.</source> <volume>260</volume>, <fpage>125385</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.eswa.2024.125385</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Simonyan</surname> <given-names>K.</given-names></name>
<name><surname>Zisserman</surname> <given-names>A.</given-names></name>
</person-group> (<year>2014</year>). 
<article-title>Very deep convolutional networks for large-scale image recognition</article-title>. <source>arXiv preprint arXiv:1409.1556</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1409.1556</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Szegedy</surname> <given-names>C.</given-names></name>
<name><surname>Liu</surname> <given-names>W.</given-names></name>
<name><surname>Jia</surname> <given-names>Y.</given-names></name>
<name><surname>Sermanet</surname> <given-names>P.</given-names></name>
<name><surname>Reed</surname> <given-names>S.</given-names></name>
<name><surname>Anguelov</surname> <given-names>D.</given-names></name>
<etal/>
</person-group>. (<year>2015</year>). &#x201c;
<article-title>Going deeper with convolutions</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. <fpage>1</fpage>&#x2013;<lpage>9</lpage>.
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Tan</surname> <given-names>M.</given-names></name>
<name><surname>Le</surname> <given-names>Q.</given-names></name>
</person-group> (<year>2019</year>). &#x201c;
<article-title>Efficientnet: Rethinking model scaling for convolutional neural networks</article-title>,&#x201d; in <conf-name>International conference on machine learning (PMLR)</conf-name>. <fpage>6105</fpage>&#x2013;<lpage>6114</lpage>.
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Tian</surname> <given-names>Q.</given-names></name>
<name><surname>Zhao</surname> <given-names>G.</given-names></name>
<name><surname>Yan</surname> <given-names>C.</given-names></name>
<name><surname>Yao</surname> <given-names>L.</given-names></name>
<name><surname>Qu</surname> <given-names>J.</given-names></name>
<name><surname>Yin</surname> <given-names>L.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>Enhancing practicality of deep learning for crop disease identification under field conditions: insights from model evaluation and crop-specific approaches</article-title>. <source>Pest Manage. Sci.</source> <volume>80</volume>, <fpage>5864</fpage>&#x2013;<lpage>5875</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/ps.8317</pub-id>, PMID: <pub-id pub-id-type="pmid">39030887</pub-id>
</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Upadhyay</surname> <given-names>A.</given-names></name>
<name><surname>Chandel</surname> <given-names>N. S.</given-names></name>
<name><surname>Singh</surname> <given-names>K. P.</given-names></name>
<name><surname>Chakraborty</surname> <given-names>S. K.</given-names></name>
<name><surname>Nandede</surname> <given-names>B. M.</given-names></name>
<name><surname>Kumar</surname> <given-names>M.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>Deep learning and computer vision in plant disease detection: a comprehensive review of techniques, models, and trends in precision agriculture</article-title>. <source>Artif. Intell. Rev.</source> <volume>58</volume>, <fpage>92</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10462-024-11100-x</pub-id>
</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>H.</given-names></name>
<name><surname>Li</surname> <given-names>G.</given-names></name>
<name><surname>Ma</surname> <given-names>Z.</given-names></name>
<name><surname>Li</surname> <given-names>X.</given-names></name>
</person-group> (<year>2012</year>). &#x201c;
<article-title>Image recognition of plant diseases based on backpropagation networks</article-title>,&#x201d; in <conf-name>2012 5th International Congress on Image and Signal Processing (IEEE)</conf-name>. <fpage>894</fpage>&#x2013;<lpage>900</lpage>.
</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>W.</given-names></name>
<name><surname>Xie</surname> <given-names>E.</given-names></name>
<name><surname>Li</surname> <given-names>X.</given-names></name>
<name><surname>Fan</surname> <given-names>D.-P.</given-names></name>
<name><surname>Song</surname> <given-names>K.</given-names></name>
<name><surname>Liang</surname> <given-names>D.</given-names></name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;
<article-title>Pyramid vision transformer: A versatile backbone for dense prediction without convolutions</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>. <fpage>568</fpage>&#x2013;<lpage>578</lpage>.
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1937850">Parvathaneni Naga Srinivasu</ext-link>, Amrita Vishwa Vidyapeetham University, India</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3176502">Elham Mohammed Thabit A. Alsaadi</ext-link>, University of Karbala, Iraq</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3287834">Sandhya N.</ext-link>, Vallurupalli Nageswara Rao Vignana Jyothi Institute of Engineering &amp; Technology (VNRVJIET), India</p></fn>
</fn-group>
</back>
</article>