<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2026.1763650</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>HDA-YOLO: a hierarchical and densely-fused attention network for rice pest detection in complex agricultural environments</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Yuan</surname><given-names>Shuo</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="fn003"><sup>&#x2020;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Duan</surname><given-names>Ying</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="fn003"><sup>&#x2020;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Su</surname><given-names>Hongting</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Zhou</surname><given-names>Xinhui</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3124262/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Hao</surname><given-names>Yinfeng</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>School of Computer and Information Engineering, Henan University</institution>, <city>Kaifeng</city>, <state>Henan</state>,&#xa0;<country country="CN">China</country></aff>
<aff id="aff2"><label>2</label><institution>Henan Key Laboratory of Big Data Analysis and Processing, Henan University</institution>, <city>Kaifeng</city>, <state>Henan</state>,&#xa0;<country country="CN">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Xinhui Zhou, <email xlink:href="mailto:zxhui@henu.edu.cn">zxhui@henu.edu.cn</email>; Yinfeng Hao, <email xlink:href="mailto:haoyinfeng@henu.edu.cn">haoyinfeng@henu.edu.cn</email></corresp>
<fn fn-type="equal" id="fn003">
<label>&#x2020;</label>
<p>These authors have contributed equally to this work and share first authorship</p></fn>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-03-03">
<day>03</day>
<month>03</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>17</volume>
<elocation-id>1763650</elocation-id>
<history>
<date date-type="received">
<day>09</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>30</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>29</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Yuan, Duan, Su, Zhou and Hao.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Yuan, Duan, Su, Zhou and Hao</copyright-holder>
<license>
<ali:license_ref start_date="2026-03-03">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Rapid and intelligent identification of rice pests serves as the core sensing technology for precision plant protection and smart rice farming systems, providing critical support for intelligent cultivation decisions. To address the challenges of insufficient robustness and low precision of existing lightweight detection models in complex agricultural environments, this study proposes HDA-YOLO, an improved lightweight YOLOv8 model based on a hierarchical and densely-fused attention mechanism, for fast and high-precision pest detection. To enhance feature fidelity, the model incorporates asymmetric dynamic downsampling (ADDS) and a multi-scale cascade pre-fusion (MCPF) module into the backbone network. To achieve dynamic, content-aware feature fusion, a hierarchical attention-driven dense fusion network (HADF-Net) is constructed, integrating an intra-scale self-attention module (ISAM) and an inter-scale cross-attention module (ICAM). Furthermore, the C2f module is upgraded to a multi-scale context (MSC) module to improve adaptability to variations in target scale. Experimental results on the self-built RicePest_12 dataset demonstrate that HDA-YOLO, while maintaining a lightweight architecture (3.93M parameters, 12.02 GFLOPs), achieves significant improvements over the baseline YOLOv8n model, with mAP@50, F1-score, and Recall increasing by 2.4%, 3.8%, and 4.8%, respectively. In comparison with the Transformer-based RT-DETR-R18 model, HDA-YOLO achieves a 4.8 percentage points higher mAP@50, while its computational cost is only 22% and its parameter count is only 20% of RT-DETR-R18. Moreover, the proposed model has been successfully deployed on a mobile application, achieving real-time and accurate identification of field pests and demonstrating significant potential in the field of smart rice agriculture.</p>
</abstract>
<kwd-group>
<kwd>attention mechanism</kwd>
<kwd>deep learning</kwd>
<kwd>mobile application</kwd>
<kwd>rice pest detection</kwd>
<kwd>smart rice agriculture</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was supported by the Key Research and Promotion Projects in Henan Province (Grant Number: 252102111180), and the National Natural Science Foundation of China (Grant Number: 32503263).</funding-statement>
</funding-group>
<counts>
<fig-count count="14"/>
<table-count count="4"/>
<equation-count count="17"/>
<ref-count count="30"/>
<page-count count="16"/>
<word-count count="7760"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Technical Advances in Plant Science</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Rice is one of the most vital food crops globally, and its stable production is intrinsically linked to food security and the livelihoods of farmers (<xref ref-type="bibr" rid="B3">Britto and Kronzucker, 2004</xref>). However, rice yield is severely threatened by diseases and infestations caused by a diverse range of insect pests, leading to substantial economic losses annually (<xref ref-type="bibr" rid="B1">Ali et&#xa0;al., 2021</xref>). Therefore, achieving efficient and accurate identification and monitoring of rice pests is not only a critical component of pest management and control but also holds profound significance for the advancement of smart agriculture (<xref ref-type="bibr" rid="B18">Santiteerakul et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B19">Savary et&#xa0;al., 2012</xref>).</p>
<p>Early studies primarily employed traditional machine learning algorithms, using handcrafted features such as scale-invariant feature transform (SIFT), histogram of oriented gradients (HOG), and local binary patterns (LBP) combined with classifiers such as support vector machine (SVM) and K-nearest neighbors (KNN) to achieve pest recognition (<xref ref-type="bibr" rid="B24">Waqas et&#xa0;al., 2025</xref>). Although these methods achieved certain accuracy under controlled conditions, their robustness in complex field environments was limited, being easily affected by variations in illumination, background interference, and the morphological diversity of pests (<xref ref-type="bibr" rid="B12">Liu et&#xa0;al., 2025</xref>). For instance, <xref ref-type="bibr" rid="B27">Xiong et&#xa0;al. (2024)</xref> pointed out that conventional pest detection methods are not only time-consuming and labor-intensive but also often fail to achieve real-time monitoring and rapid response, further highlighting their limitations in modern agricultural applications. Moreover, <xref ref-type="bibr" rid="B30">Zheng et&#xa0;al. (2024)</xref> emphasized that due to the high similarity among different pests, significant intra-class variations, and complex backgrounds, traditional methods struggle to accurately and quickly identify multiple rice pests, resulting in recognition accuracy significantly lower than that of deep learning models.</p>
<p>With the advancement of deep learning, convolutional neural networks (CNNs) and Vision Transformers have shown significant advantages in image recognition, leading to the widespread adoption of object detection algorithms in agricultural scenarios. Among these, one-stage object detectors, exemplified by the You Only Look Once (YOLO) series (<xref ref-type="bibr" rid="B16">Redmon et&#xa0;al., 2016</xref>; <xref ref-type="bibr" rid="B17">Redmon and Farhadi, 2018</xref>), have become a mainstream approach for agricultural pest identification due to their excellent balance between speed and accuracy (<xref ref-type="bibr" rid="B2">Badgujar et&#xa0;al., 2024</xref>). These models extract hierarchical features through deep backbone networks and leverage feature pyramid networks (FPN) (<xref ref-type="bibr" rid="B10">Lin et&#xa0;al., 2017</xref>) and their enhanced variants, like PANet (<xref ref-type="bibr" rid="B11">Liu et&#xa0;al., 2018</xref>), for multi-scale feature fusion, thereby enabling rapid localization and accurate classification of conventional targets.</p>
<p>To further enhance performance, a previous study has developed the MTD-YOLO model based on YOLOv8, which incorporates MobileNetV3 as the backbone network and integrates Triplet Attention and Dynamic Head, effectively improving feature representation capabilities and significantly increasing detection confidence and accuracy across multiple rice pest datasets (<xref ref-type="bibr" rid="B29">Zhang et&#xa0;al., 2025</xref>). Meanwhile, <xref ref-type="bibr" rid="B15">Rahman et&#xa0;al. (2020)</xref> proposed a CNN-based method for rice disease and pest recognition, demonstrating the feasibility of lightweight models for mobile deployment under complex and heterogeneous field conditions; <xref ref-type="bibr" rid="B13">Pan et&#xa0;al. (2023)</xref> further introduced the two-stage RiceNet method, which effectively enhances recognition robustness in challenging field backgrounds.</p>
<p>Nevertheless, when directly applying existing lightweight models, such as YOLOv8n, to real-world rice pest detection in the field, their performance is still constrained by a series of inherent challenges. First is the issue of information fidelity: pest images captured in the field often contain numerous small targets. As these weak visual features pass through the successive downsampling layers of a CNN, the sharp decline in spatial resolution can easily cause them to be submerged in background information, leading to irreversible information loss (<xref ref-type="bibr" rid="B4">Feng et&#xa0;al., 2023</xref>). Second is the challenge of feature discriminability: the background of rice paddies is exceedingly complex, and the color and texture of pests often bear a high resemblance to rice stems and leaves, creating a natural camouflage. This places stringent demands on the model&#x2019;s ability to extract highly discriminative features from a cluttered environment (<xref ref-type="bibr" rid="B8">Hu et&#xa0;al., 2023</xref>). Finally, there is the trade-off between efficiency and accuracy: to enable real-time deployment on mobile or edge devices, the model must remain lightweight. However, this is typically achieved at the cost of network depth and width, which further exacerbates the aforementioned challenges (<xref ref-type="bibr" rid="B5">Hafiz, 2023</xref>).</p>
<p>To address these challenges, researchers have explored various avenues, such as designing more efficient feature fusion necks [e.g., BiFPN (<xref ref-type="bibr" rid="B22">Tan et&#xa0;al., 2020</xref>)] to enhance the interaction of multi-scale information, or embedding different types of attention mechanisms (<xref ref-type="bibr" rid="B25">Woo et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B7">Hu et&#xa0;al., 2020</xref>) into the network to guide the model&#x2019;s focus toward critical feature regions. However, these improvements are often modular or plug-in in nature. Although such modular strategies can effectively optimize specific nodes, they remain insufficient to fundamentally overcome the systemic degradation and bottlenecks that feature information encounters throughout the end-to-end processes of extraction, transmission, and fusion. This underscores the need for a comprehensive and lightweight framework capable of systematically and synergistically addressing the aforementioned challenges.</p>
<p>The objective of this study is to address the challenges of rice pest detection in complex field environments by developing a lightweight yet high-performance object detection model, termed HDA-YOLO (Hierarchical and Densely-fused Attention YOLO). The core design philosophy of this model is the systematic and synergistic optimization of the three critical stages of the network&#x2014;feature extraction, feature fusion, and feature interpretation&#x2014;to maximize the fidelity and interaction efficiency of the information flow throughout the entire network. Specifically, four synergistic core innovations are introduced:</p>
<list list-type="order">
<list-item>
<p>an Asymmetric Dynamic Downsampling (ADDS) module, which reduces information loss during the downsampling process in a content-adaptive manner;</p></list-item>
<list-item>
<p>a Multi-scale Cascade Pre-fusion (MCPF) module, employed at the end of the backbone to pre-fuse features for an enhanced output;</p></list-item>
<list-item>
<p>a Hierarchical Attention-Driven Dense Fusion Network (HADF-Net), which is built upon a dense topology and guided by a hierarchical attention mechanism, elevating feature fusion from a &#x201c;static merge&#x201d; to a &#x201c;dynamic inference&#x201d; process;</p></list-item>
<list-item>
<p>a Multi-Scale Context (MSC) module, which performs fine-grained analysis through parallel branches with multiple receptive fields to improve adaptability to varying target scales.</p></list-item>
</list>
<p>Through the deep coupling of these four innovations, HDA-YOLO constructs an end-to-end collaborative feature evolution architecture, enabling global information flow optimization from low-level perception to high-level semantic interpretation. On the self-built dataset, HDA-YOLO demonstrates significantly superior detection accuracy compared to the baseline model and has been successfully deployed in a WeChat Mini Program, demonstrating its significant application potential for intelligent agricultural monitoring scenarios.</p>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Data acquisition</title>
<p>Given the scarcity and inherent limitations of currently available public rice pest datasets, this study aimed to construct a more challenging dataset that closely reflects real-world agricultural scenarios. To achieve this, images from two large public datasets were integrated and curated: IP102 (<xref ref-type="bibr" rid="B26">Wu et&#xa0;al., 2019</xref>) and Pest_V2 (<xref ref-type="bibr" rid="B14">Quach et&#xa0;al., 2024</xref>). A dual-construction strategy was employed to enhance the dataset&#x2019;s specificity and complexity.</p>
<p>First, acknowledging the significant morphological differences of rice pests across various life cycle stages, we subdivided the images within the Pest_V2 dataset. Different growth stages of the same pest (e.g., larva, adult) were annotated as distinct and independent categories. Second, to further increase the challenge posed by background complexity, pest images from the IP102 dataset that were both relevant to rice and featured more intricate backgrounds were manually selected and extracted to supplement the new dataset.</p>
<p>Through the aforementioned process, a customized dataset named RicePest_12 was ultimately constructed. This dataset comprises a total of 2,807 high-quality images, covering 12 common categories of rice pests, such as the Asiatic Rice Borer, Brown Plant Hopper, and Paddy Stem Maggot (as shown in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>). The creation of this dataset provides a high-quality and challenging experimental foundation for our research, ensuring the reliability and validity of the model evaluation.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Examples of the 12 rice pest species used in this study.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1763650-g001.tif">
<alt-text content-type="machine-generated">Twelve photographs arranged in a grid each show a different rice field pest, including adult and larval stages: asiatic rice borer, asiatic rice borer larva, brown plant hopper, paddy stem maggot, rice gall midge, rice leaf caterpillar, rice leaf roller, rice leaf roller larva, rice leaf hopper, rice water weevil, small brown plant hopper, and yellow rice borer. Each insect or larva is positioned on a green rice plant, labeled below with its name.</alt-text>
</graphic></fig>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Data preprocessing</title>
<p>Although the constructed RicePest_12 dataset is of high quality and presents significant challenges, its total volume of 2,807 images is still relatively insufficient. Furthermore, the dataset exhibits a notable disparity in the number of samples among different categories, presenting a typical long-tail distribution. This data imbalance issue can cause the model to develop a bias towards the majority classes during training, thereby weakening its generalization ability for rare categories and potentially leading to overfitting.</p>
<p>To address the aforementioned issues, this study employed a comprehensive data augmentation strategy to expand the dataset and balance the class distribution. A variety of data augmentation techniques were applied to the original images, including geometric transformations (e.g., random horizontal and vertical flips, rotation, scaling, translation, and perspective transformation) and appearance transformations (e.g., blurring). Furthermore, to enhance the model&#x2019;s ability to discern complex backgrounds and reduce the false detection rate, unlabeled pure background images were strategically introduced into the training set as negative samples.</p>
<p>Through this augmentation and expansion, the final dataset size was increased from 2,807 to 6,374 images. The distribution of sample quantities across the various categories is detailed in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>. Prior to model training, this augmented dataset was randomly partitioned into training, validation, and test sets according to an 8:1:1 ratio.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Distribution of images across categories in the RicePest_12 dataset before and after data augmentation.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Category</th>
<th valign="middle" align="center">Original number of images</th>
<th valign="middle" align="center">Augmented number of images</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">Asiatic Rice Borer (adult)</td>
<td valign="middle" align="center">187</td>
<td valign="middle" align="center">500</td>
</tr>
<tr>
<td valign="middle" align="center">Asiatic Rice Borer (larvae)</td>
<td valign="middle" align="center">237</td>
<td valign="middle" align="center">500</td>
</tr>
<tr>
<td valign="middle" align="center">Brown Plant Hopper</td>
<td valign="middle" align="center">251</td>
<td valign="middle" align="center">500</td>
</tr>
<tr>
<td valign="middle" align="center">Paddy Stem Maggot</td>
<td valign="middle" align="center">67</td>
<td valign="middle" align="center">500</td>
</tr>
<tr>
<td valign="middle" align="center">Rice Gall Midge</td>
<td valign="middle" align="center">124</td>
<td valign="middle" align="center">500</td>
</tr>
<tr>
<td valign="middle" align="center">Rice Leaf Caterpillar</td>
<td valign="middle" align="center">103</td>
<td valign="middle" align="center">500</td>
</tr>
<tr>
<td valign="middle" align="center">Rice Leaf Roller (adult)</td>
<td valign="middle" align="center">172</td>
<td valign="middle" align="center">500</td>
</tr>
<tr>
<td valign="middle" align="center">Rice Leaf Roller (larvae)</td>
<td valign="middle" align="center">574</td>
<td valign="middle" align="center">574</td>
</tr>
<tr>
<td valign="middle" align="center">Rice Leaf Hopper</td>
<td valign="middle" align="center">237</td>
<td valign="middle" align="center">500</td>
</tr>
<tr>
<td valign="middle" align="center">Rice Water Weevil</td>
<td valign="middle" align="center">420</td>
<td valign="middle" align="center">500</td>
</tr>
<tr>
<td valign="middle" align="center">Small Brown Plant Hopper</td>
<td valign="middle" align="center">245</td>
<td valign="middle" align="center">500</td>
</tr>
<tr>
<td valign="middle" align="center">Yellow Rice Borer</td>
<td valign="middle" align="center">190</td>
<td valign="middle" align="center">500</td>
</tr>
<tr>
<td valign="middle" align="center">Background</td>
<td valign="middle" align="center">0</td>
<td valign="middle" align="center">300</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Model overview</title>
<p>This paper proposes an object detection framework named HDA-YOLO, as illustrated in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>. This framework is designed to enhance the detection accuracy of multi-scale rice pests against complex backgrounds, achieving an effective integration of high precision and a lightweight structure. The overall architecture of the model is based on YOLOv8, but its core lies in the systemic, end-to-end optimization of the network&#x2019;s information flow.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Architecture of HDA-YOLO, featuring a backbone with Asymmetric Dynamic Downsampling (ADDS) and Multi-scale Cascade Pre-fusion (MCPF) modules for enhanced feature fidelity, and a neck with a Hierarchical Attention-Driven Dense Fusion network (HADF-Net) and Multi-Scale Context (MSC) module supporting dynamic multi-scale fusion.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1763650-g002.tif">
<alt-text content-type="machine-generated">Architecture of HDA-YOLO. The data flow starts with an input image, progressing through the backbone (incorporating ADDS and MCPF) and the neck (incorporating HADF-Net and MSC), and concludes at the prediction head. Side panels show block diagrams for ISAM and ICAM modules, each detailing calculation steps involving features, queries, keys, values, softmax, and output generation. The final output displays the original image with bounding boxes and detection confidence scores for brown plant hoppers.</alt-text>
</graphic></fig>
<p>In the backbone network, HDA-YOLO leverages the synergistic action of an ADDS module and a MCPF module. This combination enhances the fidelity of feature extraction right from the source, providing a high-quality feature foundation for subsequent network layers. Building upon this, a novel feature fusion system was constructed: the HADF-Net. Structurally, this network employs a dense aggregation topology, utilizing cross-layer &#x201c;shortcuts&#x201d; to counteract information dilution. Mechanistically, it is guided by hierarchical attention, which uses an intra-scale self-attention module (ISAM) to enhance features and an inter-scale cross-attention module (ICAM) to achieve an intelligent mapping from fused features to precise predictions. Simultaneously, to further bolster the network&#x2019;s multi-scale analysis capabilities during feature fusion, the C2f module in the model&#x2019;s neck has been upgraded to a MSC module. The MSC module performs fine-grained analysis through parallel branches with multiple receptive fields, significantly improving the model&#x2019;s adaptability to variations in target scale.</p>
<p>Regarding the training strategy, to enhance the model&#x2019;s generalization ability to real-world field environments, images of pests under diverse lighting conditions and at various growth stages, as well as pictures with a multitude of field backgrounds, were incorporated into the dataset. This was done to increase the overall diversity and complexity of the data.</p>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Model improvements</title>
<sec id="s2_4_1">
<label>2.4.1</label>
<title>MCPF module</title>
<p>To address the issue of shallow spatial information loss in the traditional spatial pyramid pooling fusion (SPPF) (<xref ref-type="bibr" rid="B6">He et&#xa0;al., 2015</xref>) module, which results from it operating solely on the single, deepest feature map (P5), this study proposes the MCPF module, and its core idea is to re-architect the terminal stage of the backbone network from a simple &#x201c;context aggregation&#x201d; unit into an active &#x201c;multi-source feature pre-fusion&#x201d; unit. The objective is to generate a more comprehensive and detail-rich feature map before the inputs proceed to the detection neck by cascading the fusion of multi-scale features from P2 through P5.</p>
<p>As illustrated in <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>, MCPF abandons the single-input constraint, instead accepting feature maps from multiple stages of the backbone network (P2, P3, P4, P5) as parallel inputs. Internally, it follows a &#x201c;downsample-and-fuse&#x201d; cascade workflow, progressively integrating high-resolution features into the deeper feature stream. The computational process of the module is formulated as shown in <xref ref-type="disp-formula" rid="eq1">Equations 1</xref>&#x2013;<xref ref-type="disp-formula" rid="eq4">4</xref>:</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>The architecture of the proposed MCPF module in comparison with the SPPF module. <bold>(A)</bold> SPPF operates on a single input feature map. <bold>(B)</bold> MCPF integrates multi-scale feature maps in a cascade.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1763650-g003.tif">
<alt-text content-type="machine-generated">Diagram comparing two neural network modules SPPF and MCPF. SPPF uses sequential Conv1x1, three yellow MaxPool2d layers, Concat, and another Conv1x1. MCPF incorporates FusionBlocks from P2 to P5, followed by yellow MaxPool2d and Conv1x1. A detailed sub-diagram shows the structure of FusionBlocks, including the stride-2 MaxPool2d and Concat layers. Yellow indicates max pooling with stride one, pink indicates stride two.</alt-text>
</graphic></fig>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mrow><mml:mtable equalrows="true" equalcolumns="true"><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>H</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mtext>M</mml:mtext><mml:mrow><mml:mtext>s</mml:mtext><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mtext>Concat</mml:mtext><mml:mo stretchy="false">[</mml:mo><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mtext>P</mml:mtext><mml:mn>3</mml:mn></mml:msub><mml:mo stretchy="false">]</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mrow><mml:mtable equalrows="true" equalcolumns="true"><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>H</mml:mi><mml:mn>4</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mtext>M</mml:mtext><mml:mrow><mml:mtext>s</mml:mtext><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mtext>Concat</mml:mtext><mml:mo stretchy="false">[</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mtext>P</mml:mtext><mml:mn>4</mml:mn></mml:msub><mml:mo stretchy="false">]</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq3"><label>(3)</label>
<mml:math display="block" id="M3"><mml:mrow><mml:mtable equalrows="true" equalcolumns="true"><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>H</mml:mi><mml:mn>5</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mtext>Concat</mml:mtext><mml:mo stretchy="false">[</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mn>4</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mtext>P</mml:mtext><mml:mn>5</mml:mn></mml:msub><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq4"><label>(4)</label>
<mml:math display="block" id="M4"><mml:mrow><mml:mtable equalrows="true" equalcolumns="true"><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mtext>MCPF</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mtext>s</mml:mtext><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mn>5</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im1"><mml:mrow><mml:msub><mml:mtext>P</mml:mtext><mml:mtext>i</mml:mtext></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mtext>R</mml:mtext><mml:mrow><mml:mtext>B</mml:mtext><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mtext>C</mml:mtext><mml:mtext>i</mml:mtext></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mtext>H</mml:mtext><mml:mtext>i</mml:mtext></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mtext>W</mml:mtext><mml:mtext>i</mml:mtext></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im2"><mml:mrow><mml:msub><mml:mtext>M</mml:mtext><mml:mrow><mml:mtext>s</mml:mtext><mml:mo>=</mml:mo><mml:mtext>k</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mo>.</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> represents the max-pooling operation with a stride of k, and the final output is <inline-formula>
<mml:math display="inline" id="im3"><mml:mrow><mml:msub><mml:mtext>Y</mml:mtext><mml:mrow><mml:mtext>MCPF</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>.</p>
</sec>
<sec id="s2_4_2">
<label>2.4.2</label>
<title>ADDS module</title>
<p>Starting from the fundamental operations of the network, this study designs an ADDS module. As illustrated in <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>, this module begins by applying average pooling to the input feature channels and then splits them into two halves. These halves are processed through two complementary, parallel paths:</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>The architecture of the proposed ADDS module and its key components. <bold>(A)</bold> The ODConv block. <bold>(B)</bold> The SimAM attention mechanism. <bold>(C)</bold> The overall architecture of the ADDS module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1763650-g004.tif">
<alt-text content-type="machine-generated">Three neural network module diagrams: (A) ODConv illustrates a sequence of weights and activations incorporating GAP, FC, ReLU, Sigmoid, and Softmax functions; (B) SimAM depicts 3D weight generation, expansion, and fusion for an input feature map; (C) ADDS shows an architecture where the input undergoes AvgPool before splitting into two parallel paths&#x2014;one featuring ODConv and the other consisting of MaxPool, SimAM, and Conv1x1&#x2014;which are then merged via Concat for the final output.</alt-text>
</graphic></fig>
<p>One path employs Omni-dimensional Dynamic Convolution (ODConv) (<xref ref-type="bibr" rid="B9">Li et&#xa0;al., 2022</xref>) to transform and downsample the features in a content-adaptive manner, aiming to preserve rich patterns and textures; the other path uses a combination of max-pooling and the SimAM attention mechanism (<xref ref-type="bibr" rid="B28">Yang et&#xa0;al., 2021</xref>) to focus on capturing and refining the most salient core structural features. Finally, the outputs of these two paths are concatenated, generating a downsampled feature map with higher information fidelity that contains both adaptively transformed information and preserved key salient structures. The overall computational process of this module can be expressed by the following <xref ref-type="disp-formula" rid="eq5">Equations 5</xref>&#x2013;<xref ref-type="disp-formula" rid="eq8">8</xref>:</p>
<disp-formula id="eq5"><label>(5)</label>
<mml:math display="block" id="M5"><mml:mrow><mml:mtable equalrows="true" equalcolumns="true"><mml:mtr><mml:mtd><mml:mrow><mml:mi>X</mml:mi><mml:mo>=</mml:mo><mml:mi>A</mml:mi><mml:mi>v</mml:mi><mml:mi>g</mml:mi><mml:mi>P</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq6"><label>(6)</label>
<mml:math display="block" id="M6"><mml:mrow><mml:mtable equalrows="true" equalcolumns="true"><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mi>a</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>O</mml:mi><mml:mi>D</mml:mi><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mi>a</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq7"><label>(7)</label>
<mml:math display="block" id="M7"><mml:mrow><mml:mtable equalrows="true" equalcolumns="true"><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mi>b</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>(</mml:mo><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mtext>SimAM</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mtext>MaxPool</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mi>b</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq8"><label>(8)</label>
<mml:math display="block" id="M8"><mml:mrow><mml:mtable equalrows="true" equalcolumns="true"><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mtext>ADDS</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext>Concat</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mi>a</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mi>b</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im4"><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mi>a</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im5"><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mi>b</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> are the two parts of the feature <inline-formula>
<mml:math display="inline" id="im6"><mml:mtext>X</mml:mtext></mml:math></inline-formula> after it has undergone average pooling and been split. <inline-formula>
<mml:math display="inline" id="im7"><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mi>a</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im8"><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mi>b</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> represent the outputs from the dynamic convolution downsampling branch and the salient feature retention branch, respectively. The final output is denoted as <inline-formula>
<mml:math display="inline" id="im9"><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mtext>ADDS</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>.</p>
</sec>
<sec id="s2_4_3">
<label>2.4.3</label>
<title>HADF-Net</title>
<p>To address the limitations of networks like PANet and BiFPN, which are employed by YOLOv8 and are constrained by static fusion pathways and issues with information fidelity, the study proposed a novel HADF-Net. The core of this network lies in its ability to elevate the feature fusion process from a passive &#x201c;static merge&#x201d; into an active, content-aware &#x201c;dynamic inference.&#x201d; This is achieved by using a hierarchical attention mechanism as the central driving force, built upon a dense aggregation topology. Consequently, the network can better adapt to complex and highly variable detection scenarios.</p>
<p>As illustrated in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref>, HADF-Net structurally establishes denser, wider-span cross-layer &#x201c;shortcuts&#x201d; than those found in BiFPN. This design repeatedly re-injects the original high-resolution features from the backbone network into the bottom-up fusion path. Such a dense aggregation topology creates more direct pathways for information flow within the network. This physically ensures the maximum possible fidelity of shallow spatial details and provides an extremely information-rich feature pool for the subsequent attention mechanisms to operate on.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Comparison of feature fusion network architectures. <bold>(A)</bold> PANet. <bold>(B)</bold> BiFPN. <bold>(C)</bold> Our proposed HADF-Net.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1763650-g005.tif">
<alt-text content-type="machine-generated">Three network architecture diagrams compare PANet, BiFPN, and HADF-Net. Each diagram consists of stacked colored nodes labeled P2 to P5 with connecting arrows. PANet shows straightforward horizontal and vertical connections. BiFPN adds curved cross-connections. HADF-Net integrates red ISAM nodes along vertical paths and teal ICAM nodes branching from several outputs, increasing connection complexity.</alt-text>
</graphic></fig>
<p>Simultaneously, the HADF-Net&#x2019;s information flow is governed by a hierarchical attention mechanism inspired by the Transformer (<xref ref-type="bibr" rid="B23">Vaswani et&#xa0;al., 2017</xref>). As shown in <xref ref-type="fig" rid="f6"><bold>Figure&#xa0;6</bold></xref>, this mechanism achieves dynamic feature enhancement and fusion by deploying specific types of attention at different levels. First, before features enter the fusion path, an ISAM is used to perform global contextual enhancement on the features at each level. Its computation is as follows (<xref ref-type="disp-formula" rid="eq9">Equation 9</xref>):</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>The core modules of the Hierarchical Attention mechanism. <bold>(A)</bold> ISAM module. <bold>(B)</bold> ICAM module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1763650-g006.tif">
<alt-text content-type="machine-generated">Two labeled diagrams compare ISAM and ICAM attention modules in neural networks. Both diagrams show query, key, and value pathways using pointwise convolutions, matrix multiplication, softmax, and output elements. ICAM includes an additional 2H by 2W by C value pathway with a 3x3 convolution before merging.</alt-text>
</graphic></fig>
<disp-formula id="eq9"><label>(9)</label>
<mml:math display="block" id="M9"><mml:mrow><mml:mtable equalrows="true" equalcolumns="true"><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>l</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>X</mml:mi><mml:mo>+</mml:mo><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mfrac><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msup><mml:mi>X</mml:mi><mml:mi>T</mml:mi></mml:msup><mml:msub><mml:mi>W</mml:mi><mml:mi>q</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msup><mml:mi>X</mml:mi><mml:mi>T</mml:mi></mml:msup><mml:msub><mml:mi>W</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>T</mml:mi></mml:msup></mml:mrow><mml:mrow><mml:msqrt><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:msqrt></mml:mrow></mml:mfrac><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msup><mml:mi>X</mml:mi><mml:mi>T</mml:mi></mml:msup><mml:msub><mml:mi>W</mml:mi><mml:mi>v</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>Subsequently, at the fusion nodes within the neck of the network, ICAM replaces traditional static fusion methods. This module utilizes the deep features, <inline-formula>
<mml:math display="inline" id="im10"><mml:mrow><mml:msub><mml:mtext>X</mml:mtext><mml:mrow><mml:mtext>deep</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>, as a &#x201c;query&#x201d; to dynamically &#x201c;probe&#x201d; and integrate information from the aligned shallow features, <inline-formula>
<mml:math display="inline" id="im11"><mml:mrow><mml:msubsup><mml:mtext>X</mml:mtext><mml:mrow><mml:mtext>shallow</mml:mtext></mml:mrow><mml:mo>'</mml:mo></mml:msubsup></mml:mrow></mml:math></inline-formula>. The core computations are as follows (<xref ref-type="disp-formula" rid="eq10">Equations 10</xref>, <xref ref-type="disp-formula" rid="eq11">11</xref>):</p>
<disp-formula id="eq10"><label>(10)</label>
<mml:math display="block" id="M10"><mml:mrow><mml:mtable equalrows="true" equalcolumns="true"><mml:mtr><mml:mtd><mml:mrow><mml:msubsup><mml:mtext>X</mml:mtext><mml:mrow><mml:mtext>shallow</mml:mtext></mml:mrow><mml:mo>'</mml:mo></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow><mml:mrow><mml:mtext>down</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mtext>shallow</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq11"><label>(11)</label>
<mml:math display="block" id="M11"><mml:mrow><mml:mtable equalrows="true" equalcolumns="true"><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mtext>cross</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>e</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mfrac><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>e</mml:mi><mml:mi>p</mml:mi></mml:mrow><mml:mi>T</mml:mi></mml:msubsup><mml:msub><mml:mi>W</mml:mi><mml:mi>q</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>e</mml:mi><mml:mi>p</mml:mi></mml:mrow><mml:mi>T</mml:mi></mml:msubsup><mml:msub><mml:mi>W</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>T</mml:mi></mml:msup></mml:mrow><mml:mrow><mml:msqrt><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:msqrt></mml:mrow></mml:mfrac><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mtext>shallow</mml:mtext></mml:mrow><mml:mo>'</mml:mo></mml:msubsup><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>T</mml:mi></mml:msup><mml:msub><mml:mi>W</mml:mi><mml:mi>v</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im12"><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is the dimension of the key vectors, used for scaling the dot-product results.</p>
<p>This hierarchical attention design, which operates on an &#x201c;enhance-then-select&#x201d; principle, enables the HADF-Net to intelligently adjust the information flow based on the input content. Consequently, it provides a higher-quality and more information-focused feature map for the final prediction.</p>
</sec>
<sec id="s2_4_4">
<label>2.4.4</label>
<title>MSC module</title>
<p>As illustrated in <xref ref-type="fig" rid="f7"><bold>Figure&#xa0;7</bold></xref>, to optimize the multi-scale contextual analysis capability during feature fusion, this study proposes an MSC module to replace the original C2f module. The primary innovation of the MSC lies in its core computational unit: the bottleneck unit from the C2f module is replaced by a multi-receptive field block (MRFB).</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Architectural comparison of the proposed MSC module and the original Bottleneck. <bold>(A)</bold> The proposed MSC module. <bold>(B)</bold> The core MRFB. <bold>(C)</bold> The original Bottleneck unit.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1763650-g007.tif">
<alt-text content-type="machine-generated">Three block diagrams compare neural network modules: (A) MSC with convolution, split, multi-stage residual fusion blocks, concatenation, and output; (B) MRFB with linear projection, multi-scale convolutions, concatenation, and skip connections; (C) Bottleneck with convolution, batch normalization, SiLU activation, and skip connections, all using labeled arrows to indicate data flow and tensor dimensions.</alt-text>
</graphic></fig>
<p>The Bottleneck in the C2f structure is formed by serially connected convolutions of a single size, and the homogeneity of its receptive field often makes it difficult to effectively handle variations in object scale. In contrast, the MRFB within the MSC employs a multi-branch parallel structure. It extracts multi-granularity features by using group convolution kernels of different sizes, thereby forming multiple subspaces with different receptive fields. This design allows the module to better adapt to changes in object scale.</p>
<p>Its subspace fusion strategy is inspired by the Inception architecture (<xref ref-type="bibr" rid="B21">Szegedy et&#xa0;al., 2015</xref>), employing channel concatenation to merge the subspaces from different receptive fields into a high-dimensional, nonlinear feature space. This enhances the multi-scale representation capability within a single feature map. Specifically, the module is designed with parallel 3&#xd7;3, 5&#xd7;5, and 7&#xd7;7 group convolutions. Subsequently, these subspaces are fused via channel concatenation, followed by batch normalization and an activation function. Finally, a 1&#xd7;1 convolution is used for information refinement. This core process can be mathematically represented as (<xref ref-type="disp-formula" rid="eq12">Equation 12</xref>):</p>
<disp-formula id="eq12"><label>(12)</label>
<mml:math display="block" id="M12"><mml:mrow><mml:mtable equalrows="true" equalcolumns="true"><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>M</mml:mi><mml:mi>R</mml:mi><mml:mi>F</mml:mi><mml:mi>B</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>X</mml:mi><mml:mo>+</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mi>G</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>p</mml:mi><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msub><mml:mo>}</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mn>5</mml:mn><mml:mo>,</mml:mo><mml:mn>7</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo></mml:mrow></mml:msub><mml:mo stretchy="false">]</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im13"><mml:mrow><mml:mi>G</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>p</mml:mi><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>(.) denotes the group convolution operation with a kernel size of k&#xd7;k.</p>
</sec>
</sec>
<sec id="s2_5">
<label>2.5</label>
<title>Model performance evaluation</title>
<sec id="s2_5_1">
<label>2.5.1</label>
<title>Experimental setup</title>
<p>The model training for this study was conducted using the PyTorch framework on GPU computing resources. The detailed configurations of the experimental environment and the model&#x2019;s hyperparameters are presented in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Experimental environment and hyperparameter settings for model training.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Configuration</th>
<th valign="middle" align="left">Parameter</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Operating System</td>
<td valign="middle" align="left">Ubuntu20.04</td>
</tr>
<tr>
<td valign="middle" align="left">Deep Learning Framework</td>
<td valign="middle" align="left">PyTorch 1.10.0</td>
</tr>
<tr>
<td valign="middle" align="left">CUDA Version</td>
<td valign="middle" align="left">11.3</td>
</tr>
<tr>
<td valign="middle" align="left">Python Version</td>
<td valign="middle" align="left">3.8</td>
</tr>
<tr>
<td valign="middle" align="left">GPU</td>
<td valign="middle" align="left">V100-32GB (32GB)</td>
</tr>
<tr>
<td valign="middle" align="left">Epochs</td>
<td valign="middle" align="left">100</td>
</tr>
<tr>
<td valign="middle" align="left">Input Image Size</td>
<td valign="middle" align="left">512 &#xd7; 512</td>
</tr>
<tr>
<td valign="middle" align="left">Batch Size</td>
<td valign="middle" align="left">32</td>
</tr>
<tr>
<td valign="middle" align="left">Dataloader Workers</td>
<td valign="middle" align="left">4</td>
</tr>
<tr>
<td valign="middle" align="left">Learning Rate</td>
<td valign="middle" align="left">0.01</td>
</tr>
<tr>
<td valign="middle" align="left">Optimizer</td>
<td valign="middle" align="left">Adam</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2_5_2">
<label>2.5.2</label>
<title>Evaluation metrics</title>
<p>To comprehensively evaluate the performance of the model, five key metrics were adopted: Precision, Recall, mean Average Precision (mAP), F1-score, and Giga Floating-Point Operations (GFLOPs). These evaluation metrics are defined in <xref ref-type="disp-formula" rid="eq13">Equations 13</xref>&#x2013;<xref ref-type="disp-formula" rid="eq17">17</xref> as follows:</p>
<disp-formula id="eq13"><label>(13)</label>
<mml:math display="block" id="M13"><mml:mrow><mml:mtable equalrows="true" equalcolumns="true"><mml:mtr><mml:mtd><mml:mrow><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq14"><label>(14)</label>
<mml:math display="block" id="M14"><mml:mrow><mml:mtable equalrows="true" equalcolumns="true"><mml:mtr><mml:mtd><mml:mrow><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq15"><label>(15)</label>
<mml:math display="block" id="M15"><mml:mrow><mml:mtable equalrows="true" equalcolumns="true"><mml:mtr><mml:mtd><mml:mrow><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:mo>=</mml:mo><mml:msubsup><mml:mo>&#x222b;</mml:mo><mml:mn>0</mml:mn><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mi>P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>R</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mi>d</mml:mi><mml:mi>R</mml:mi></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq16"><label>(16)</label>
<mml:math display="block" id="M16"><mml:mrow><mml:mtable equalrows="true" equalcolumns="true"><mml:mtr><mml:mtd><mml:mrow><mml:mi>m</mml:mi><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>C</mml:mi></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>C</mml:mi></mml:munderover><mml:mrow><mml:mi>A</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:mstyle></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq17"><label>(17)</label>
<mml:math display="block" id="M17"><mml:mrow><mml:mtable equalrows="true" equalcolumns="true"><mml:mtr><mml:mtd><mml:mrow><mml:mi>F</mml:mi><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>s</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>where TP (True Positives) is the number of correctly predicted positive samples, FP is the number of incorrectly predicted positive samples, and FN is the number of incorrectly predicted negative samples. C represents the total number of detection classes, and AP is the Average Precision for a single class. Furthermore, GFLOPs is used to measure the computational complexity of a model, defined as the number of billion floating-point operations required to complete a single forward pass. A lower GFLOPs value indicates higher computational efficiency and a lower demand on hardware resources.</p>
</sec>
</sec>
</sec>
<sec id="s3" sec-type="results">
<label>3</label>
<title>Results and analysis</title>
<sec id="s3_1">
<label>3.1</label>
<title>Ablation experiments</title>
<p>To systematically validate the effectiveness of each innovative design proposed in this study, a series of detailed ablation experiments were conducted using YOLOv8n as the baseline model. For ease of presentation and analysis, the four modules&#x2014;MCPF, ADDS, MSC, and HADF-Net&#x2014;are abbreviated as A, B, C, and D, respectively. The ablation study was performed using the control variable method. All experiments were conducted on our self-built rice pest dataset, with completely consistent data augmentation strategies and experimental environments maintained throughout to ensure the fairness of the comparisons.</p>
<p>The ablation study results in <xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref> clearly reveal how our proposed modules work synergistically to progressively enhance the model&#x2019;s performance. In the initial stage, integrating module A (MCPF) enhanced the model&#x2019;s ability to capture potential targets, increasing Recall by 0.7 percentage points, though this also introduced some discriminative ambiguity, causing a slight dip in mAP@50 to 87.2%. Building on this, the introduction of module B (ADDS) effectively resolved this issue. As a highly efficient downsampling module, ADDS significantly refined the features, greatly strengthening the model&#x2019;s discriminative power. This led to a 1.2 percentage point leap in mAP@50 to 88.4%, with Precision also reaching its peak of 91.0%. Furthermore, the addition of module C (MSC) promoted a rebalancing between Precision and Recall. The Recall was observed to improve by 1.3 percentage points, while mAP@50 concurrently exhibited a steady increase, reaching 89.0%. Moreover, the more comprehensive metric, mAP@50-95, also demonstrated an improvement of 1.3 percentage points. Finally, with the integration of module D (HADF-Net), the complete HDA-YOLO model achieved optimal performance across all metrics. Precision returned to its peak of 91.0%, while Recall achieved a significant growth of 3.8 percentage points, pushing the F1-score to 88.2%. The mAP@50 also successfully reached 90.0%. This series of performance evolutions provides strong evidence that our four proposed modules each contribute unique and crucial performance gains, working in synergy to ultimately achieve a leap in the model&#x2019;s overall performance.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Ablation study of HDA-YOLO components.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">A</th>
<th valign="middle" align="center">B</th>
<th valign="middle" align="center">C</th>
<th valign="middle" align="center">D</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">mAP@50</th>
<th valign="middle" align="center">mAP@50-95</th>
<th valign="middle" align="center">F1-score</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Baseline (YOLOv8n)</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">0.882</td>
<td valign="middle" align="center">0.808</td>
<td valign="middle" align="center">0.876</td>
<td valign="middle" align="center">0.618</td>
<td valign="middle" align="center">0.844</td>
</tr>
<tr>
<td valign="middle" align="left">Baseline+A</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">0.872</td>
<td valign="middle" align="center">0.815</td>
<td valign="middle" align="center">0.872</td>
<td valign="middle" align="center">0.616</td>
<td valign="middle" align="center">0.843</td>
</tr>
<tr>
<td valign="middle" align="left">Baseline+A+B</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">0.910</td>
<td valign="middle" align="center">0.805</td>
<td valign="middle" align="center">0.884</td>
<td valign="middle" align="center">0.623</td>
<td valign="middle" align="center">0.854</td>
</tr>
<tr>
<td valign="middle" align="left">Baseline+A+B+C</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">0.889</td>
<td valign="middle" align="center">0.818</td>
<td valign="middle" align="center">0.890</td>
<td valign="middle" align="center">0.636</td>
<td valign="middle" align="center">0.852</td>
</tr>
<tr>
<td valign="middle" align="left">HDA-YOLO (Ours)</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">0.910</td>
<td valign="middle" align="center">0.856</td>
<td valign="middle" align="center">0.900</td>
<td valign="middle" align="center">0.642</td>
<td valign="middle" align="center">0.882</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>A checkmark (&#x2713;) indicates that the corresponding module was added to the configuration. A, B, C, and D represent the MCPF, ADDS, MSC, and HADF-Net modules, respectively.</p></fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Comparative experiments with different models</title>
<p>To comprehensively evaluate the overall performance of the HDA-YOLO model, fair comparative experiments were conducted against several current mainstream lightweight object detection models on our self-built rice pest dataset. The comparison included models such as EfficientDet-D0, the Transformer-based RT-DETR-R18, and several representative models from the YOLO series. All models were evaluated using a unified training strategy and testing environment. The experimental results are summarized in <xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref>.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Performance comparison with state-of-the-art lightweight models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">mAP@50</th>
<th valign="middle" align="center">mAP@50-95</th>
<th valign="middle" align="center">F1-score</th>
<th valign="middle" align="center">GFLOPs</th>
<th valign="middle" align="center">Parameters (M)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">EfficientDet-D0</td>
<td valign="middle" align="center">0.848</td>
<td valign="middle" align="center">0.606</td>
<td valign="middle" align="center">0.649</td>
<td valign="middle" align="center">0.377</td>
<td valign="middle" align="center">0.707</td>
<td valign="middle" align="center">23.37</td>
<td valign="middle" align="center">10.10</td>
</tr>
<tr>
<td valign="middle" align="left">RT-DETR-R18</td>
<td valign="middle" align="center">0.885</td>
<td valign="middle" align="center">0.809</td>
<td valign="middle" align="center">0.852</td>
<td valign="middle" align="center">0.610</td>
<td valign="middle" align="center">0.845</td>
<td valign="middle" align="center">53.85</td>
<td valign="middle" align="center">19.51</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv5n</td>
<td valign="middle" align="center">0.860</td>
<td valign="middle" align="center">0.738</td>
<td valign="middle" align="center">0.840</td>
<td valign="middle" align="center">0.581</td>
<td valign="middle" align="center">0.795</td>
<td valign="middle" align="center">7.19</td>
<td valign="middle" align="center">2.51</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv8n</td>
<td valign="middle" align="center">0.882</td>
<td valign="middle" align="center">0.808</td>
<td valign="middle" align="center">0.876</td>
<td valign="middle" align="center">0.618</td>
<td valign="middle" align="center">0.844</td>
<td valign="middle" align="center">8.21</td>
<td valign="middle" align="center">3.01</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv9t</td>
<td valign="middle" align="center">0.888</td>
<td valign="middle" align="center">0.744</td>
<td valign="middle" align="center">0.852</td>
<td valign="middle" align="center">0.601</td>
<td valign="middle" align="center">0.810</td>
<td valign="middle" align="center">7.86</td>
<td valign="middle" align="center"><bold>2.01</bold></td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv11n</td>
<td valign="middle" align="center">0.828</td>
<td valign="middle" align="center">0.771</td>
<td valign="middle" align="center">0.843</td>
<td valign="middle" align="center">0.594</td>
<td valign="middle" align="center">0.799</td>
<td valign="middle" align="center"><bold>6.45</bold></td>
<td valign="middle" align="center">2.59</td>
</tr>
<tr>
<td valign="middle" align="left">HDA-YOLO</td>
<td valign="middle" align="center"><bold>0.910</bold></td>
<td valign="middle" align="center"><bold>0.856</bold></td>
<td valign="middle" align="center"><bold>0.900</bold></td>
<td valign="middle" align="center"><bold>0.642</bold></td>
<td valign="middle" align="center"><bold>0.882</bold></td>
<td valign="middle" align="center">12.02</td>
<td valign="middle" align="center">3.93</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The best performance for each metric is highlighted in bold.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>As shown by the detection metrics in <xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref>, the proposed HDA-YOLO model significantly outperforms all other compared models across all core metrics related to detection accuracy. Compared to the YOLOv8n baseline, our model achieves notable improvements of 2.4 and 3.8 percentage points in mAP@50 and F1-score, respectively. Furthermore, its Precision and Recall are higher by 2.8 and 4.8 percentage points. It is particularly noteworthy that when compared to the Transformer-based RT-DETR-R18, although the latter also exhibits strong performance, its computational cost (53.85 GFLOPs) and parameter count (19.51M) are far higher than our model&#x2019;s. HDA-YOLO achieves superior detection accuracy while using only approximately 22% of the computational resources and 20% of the parameters, demonstrating the remarkable efficiency of deep optimization within a CNN architecture. Furthermore, when compared with YOLOv9t, which is extremely optimized for a low parameter count, HDA-YOLO leverages a moderate resource investment (3.93M vs. 2.01M parameters) in exchange for a substantial performance advantage of nearly 5 percentage points in mAP@50 (90.0% vs. 85.2%). Meanwhile, compared with YOLOv11n, which has the lowest computational complexity (6.45 GFLOPs), its Precision, Recall, and mAP@50 are 82.8%, 77.1%, and 84.3%, respectively. In contrast, HDA-YOLO improves these metrics to 91.0%, 85.6%, and 90.0% with only a slight increase in computational cost, demonstrating a more favorable trade-off between performance and efficiency.</p>
<p>The above experimental results demonstrate that for the task of rice pest detection, HDA-YOLO establishes a new benchmark for maximizing detection accuracy within an acceptable computational budget. By delivering significant precision gains (ranging from 2.4 to 6.0 percentage points in <inline-formula>
<mml:math display="inline" id="im14"><mml:mrow><mml:mi>m</mml:mi><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:mo>@</mml:mo><mml:mn>50</mml:mn></mml:mrow></mml:math></inline-formula>) compared to various mainstream lightweight models&#x2014;including the YOLO series and the Transformer-based RT-DETR-R18&#x2014;the model demonstrates a superior performance-efficiency trade-off. Although there is a modest increase in GFLOPs and parameter count, HDA-YOLO&#x2019;s overall complexity remains strictly within the lightweight range (12.02 GFLOPs and 3.93M parameters). This highlights its significant application potential for intelligent agricultural monitoring scenarios with high demands on detection accuracy.</p>
<p>To visually demonstrate the practical advantages of HDA-YOLO, <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8</bold></xref> provides a comparison of its detection results against those of several mainstream lightweight models on a set of challenging images. It can be clearly observed that HDA-YOLO, through its systematic end-to-end optimization, exhibits significant superiority across multiple complex scenarios. For the detection of small and dense objects (as shown in the fourth and sixth columns of <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8</bold></xref>), HDA-YOLO accurately detects and distinguishes each individual, effectively resolving issues of missed detections, false detections, and inaccurate bounding box localization that are present in other models. This performance improvement reflects the synergistic effect of its dense fusion topology and the MSC module in combating information dilution and refining multi-scale analysis. Simultaneously, in distinguishing between the background and morphologically similar targets (as seen in the first, second, and third columns of <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8</bold></xref>), HDA-YOLO demonstrates stronger robustness. It accurately identifies pests while effectively suppressing interference from similar-looking stems and leaves in the background. This enhanced capability stems from the effectiveness of the hierarchical attention mechanism and the ADDS module in optimizing feature representation. These qualitative results visually validate the synergistic effect of HDA-YOLO&#x2019;s innovations in improving model accuracy while reducing false and missed detection rates, proving its significant application potential for pest monitoring in smart agriculture.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Qualitative comparison of detection results on challenging images. (Red solid circles indicate false detections, while yellow solid circles highlight missed detections.)</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1763650-g008.tif">
<alt-text content-type="machine-generated">Comparison grid of insect detection models applied to agricultural images, showing original images in the top row and outputs from seven detection algorithms in subsequent rows, each with labeled bounding boxes or markers indicating identified pests within crops.</alt-text>
</graphic></fig>
<p>To evaluate model performance at a fine-grained level, <xref ref-type="fig" rid="f9"><bold>Figure&#xa0;9</bold></xref> once again presents a comparison of the confusion matrices between the YOLOv8 baseline model (left) and the proposed HDA-YOLO model (right) on the test set. Confusion matrices not only provide an intuitive representation of overall classification accuracy but also reveal misclassification patterns and confusion relationships among different classes.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Comparison of confusion matrices: baseline model (left) vs. our model (right).</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1763650-g009.tif">
<alt-text content-type="machine-generated">Side-by-side confusion matrices illustrating the classification performance of YOLOv8 (Baseline) and HDA-YOLO (Ours) across twelve rice pest classes and background. The matrices visualize the distribution of true versus predicted labels, providing a comparative view of the classification results for both models.</alt-text>
</graphic></fig>
<p>As can be seen from the statistical results in <xref ref-type="fig" rid="f9"><bold>Figure&#xa0;9</bold></xref>, the overall detection accuracy of the proposed HDA-YOLO in this study is significantly better than that of the YOLOv8 baseline model. The comparison further shows that the proposed model exhibits a significant increase in the proportion of main diagonal entries (indicating correct classifications) and a noticeable reduction in off-diagonal misclassifications. This systematically validates its superior recognition robustness and classification reliability in complex scenarios. Specifically, the YOLOv8 model achieved the lowest detection accuracy of only 69% for the brown plant hopper category, followed by the Asiatic rice borer larva category with an accuracy of 70%. This outcome is likely attributable to the high morphological similarity between these two pest species and the rice field background, resulting in substantial pixel-level interference in the images. In contrast, the proposed model in this study achieved detection accuracies of 76% and 74% for the two aforementioned categories, respectively. These results demonstrate a notable improvement in accuracy and highlight the model&#x2019;s enhanced capability to extract pest features in complex field environments. Nevertheless, despite the significant overall improvement in accuracy, a small number of failure modes can still be observed from the confusion matrix. One type of error arises from background confusion; for example, the brown plant hopper still exhibits an approximately 24% miss-detection rate due to its strong camouflage and high visual similarity to the background. Another source of error stems from inter-class similarity, where feature overlap among morphologically similar species leads to misclassification; for instance, about 12% of brown plant hoppers are misclassified as small brown plant hoppers, and 12% of Asiatic rice borers are misidentified as yellow rice borers. These results indicate that there remains room for further optimization when addressing extreme camouflage conditions and fine-grained recognition of very small targets.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Heatmap analysis of the model&#x2019;s internal states</title>
<p>To further enhance the interpretability of the improved model and provide intuitive qualitative evidence for our proposed innovations, this study focuses on the analysis of the MCPF module, the MSC module, and the HADF-Net. By employing the Grad-CAM technique (<xref ref-type="bibr" rid="B20">Selvaraju et&#xa0;al., 2019</xref>), we generate comparative feature heatmaps from before and after processing by our modules, or against the baseline modules, to improve the model&#x2019;s explainability. Grad-CAM (Gradient-weighted Class Activation Mapping) is a technique designed to explain the decision-making process of convolutional neural networks. This method generates heatmaps to visualize the key image regions the model focuses on when making a decision, thereby helping to understand the reasoning behind a specific prediction.</p>
<p>In the heatmaps generated by Grad-CAM, deep red indicates that the model pays extremely high attention to that image region, making it a key basis for its decision. Yellow areas represent regions of lower, yet still significant, importance, while blue areas indicate that the features in that location have a minimal impact on the model&#x2019;s prediction. As shown in <xref ref-type="fig" rid="f10"><bold>Figure&#xa0;10</bold></xref>, the feature heatmaps output at the terminal end of the backbone network by the baseline YOLOv8n&#x2019;s SPPF module and the proposed MCPF module were first compared. The heatmap from the baseline&#x2019;s SPPF module exhibits distinct global context characteristics and a strong center bias. Its activation area broadly covers the center of the image but lacks a precise response to specific, particularly small-sized, pest targets, for which the activation signal is weak. In contrast, the MCPF module proposed in this paper generates a feature map with extremely high spatial information fidelity by performing a cascade fusion of multi-scale features from P2 to P5. Its heatmap activation regions are more concentrated and effectively cover the key areas of the target. This result demonstrates that the design of the MCPF effectively overcomes the issue of spatial detail loss in deep networks, providing a higher-quality feature foundation with richer localization information for the subsequent detection neck than that provided by SPPF.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Class activation map (CAM) comparison of SPPF and MCPF. <bold>(A)</bold> Original image. <bold>(B)</bold> SPPF heatmap. <bold>(C)</bold> MCPF heatmap.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1763650-g010.tif">
<alt-text content-type="machine-generated">Three rows and three columns compare pest detection on green plant leaves. Column one shows original images of pests on leaves. Column two shows SPPF heatmaps highlighting detected pest regions. Column three presents MCPF heatmaps with different highlighted regions for pest detection.</alt-text>
</graphic></fig>
<p>As depicted in <xref ref-type="fig" rid="f11"><bold>Figure&#xa0;11</bold></xref>, a further analysis was conducted on the feature heatmaps output at the same network depth by the standard C2f module and our proposed MSC module. When processing images with complex background textures that are similar to the pest targets, the heatmap of the standard C2f module exhibits large-scale redundant attention on background interference. Its activation area is diffuse and fails to precisely focus on the pest itself. Considering the complex background characteristics of rice paddies, where numerous stems and leaves closely resemble pests in both color and shape, insufficient attention to key target regions may introduce significant interference and increase the risk of false detections. In contrast, the proposed MSC module, through its multi-receptive field parallel branches, can simultaneously capture both the fine-grained details and the broader context of the target. As a result, the activation area in its heatmap is more compact and more accurately focused on the pest target, effectively distinguishing between critical and non-critical regions.</p>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>CAM comparison of C2f and MSC. <bold>(A)</bold> Original image. <bold>(B)</bold> C2f heatmap. <bold>(C)</bold> MSC heatmap.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1763650-g011.tif">
<alt-text content-type="machine-generated">Three rows labeled Image 1, Image 2, and Image 3 each show three columns of plant close-ups with insects: the first column presents original photos, the second displays C2f heatmaps, and the third shows MSC heatmaps highlighting areas of interest with color gradients overlaying the insect and leaf areas.</alt-text>
</graphic></fig>
<p>Finally, <xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12</bold></xref> illustrates the complete process of end-to-end optimization and progressive refinement that the HADF-Net performs on the feature maps. The initial feature map generated by standard convolution (<xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12B</bold></xref>) exhibits a very weak and diffuse activation response, with the entire heatmap showing large blue areas of low response. The process begins in the backbone with the ISAM. After processing (<xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12C</bold></xref>), the activation pattern transforms into a coherent region with a holistic perception of the target, indicating that the model has initially constructed long-range contextual information. Building on this, the enhanced features enter the dense aggregation neck for fusion. At this stage, the feature activation area (<xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12D</bold></xref>) becomes more compact and enriched with detailed information from different levels, which visually demonstrates the effectiveness of HADF-Net&#x2019;s dense topology in counteracting information dilution. As the culmination of the workflow, this information flow is integrated by the ICAM, where attention is ultimately focused on the most discriminative features that the prediction task relies upon. This forms highly refined activation hotspots with clear contours (<xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12E</bold></xref>). This series of progressive changes clearly showcases how HADF-Net upgrades feature processing from a passive pipeline into an active, multi-level, synergistic, and intelligent perception system that transitions from global enhancement to precise localization.</p>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Visualization of the feature map&#x2019;s evolution at different stages of the HADF-Net.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1763650-g012.tif">
<alt-text content-type="machine-generated">Three rows labeled Image 1, Image 2, and Image 3 each show a sequence of five panels: a clear macro photograph of insects or pests on green plant leaves, followed by four progressively transformed or processed feature maps using different neural network stages. The rightmost column shows more defined heatmap patterns highlighting detected pests.</alt-text>
</graphic></fig>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Mobile application for pest recognition</title>
<p>Based on the proposed improved model for rice pests, an agricultural pest recognition application was developed in the form of a WeChat Mini Program. This application integrates the lightweight deep learning model with a user-friendly interface to achieve intelligent recognition and visual analysis of rice pest images. The system follows the detection workflow illustrated in <xref ref-type="fig" rid="f13"><bold>Figure&#xa0;13</bold></xref>: First, a farmer captures or uploads an image of a field crop through the Mini Program. Subsequently, the image is transmitted to a cloud server for real-time detection. Finally, the recognition result, along with corresponding pest control recommendations, is returned to the user&#x2019;s device and automatically saved to the local history records.</p>
<fig id="f13" position="float">
<label>Figure&#xa0;13</label>
<caption>
<p>The workflow of the intelligent pest recognition system based on the WeChat Mini Program.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1763650-g013.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a pest identification system using a WeChat mini program, where users capture images on a mobile device, upload to a cloud server for intelligent pest recognition via an HDA-YOLO model, and receive results for visualization and storage.</alt-text>
</graphic></fig>
<p>The system adopts a multi-terminal collaborative architecture. The WeChat Mini Program frontend is developed based on WXML/WXSS and integrates camera access and image preview functionalities. The backend program is based on the Flask framework, developed using PyCharm, and deployed on a rented Tencent Cloud server, providing RESTful API endpoints. The history module utilizes a local caching strategy, supporting the persistent storage and visual retrieval of detection data. Currently, the system supports the recognition of 12 common rice pest species. In addition, it is equipped with a professional knowledge base that provides information on pest morphological characteristics and control methods. Through a simple image upload operation, users can instantly obtain accurate identification results.</p>
<p>Regarding the display and analysis of detection results, the system interface is shown in <xref ref-type="fig" rid="f14"><bold>Figure&#xa0;14</bold></xref>. The results screen intuitively displays the category and bounding box of the target pest in the uploaded image, along with the model&#x2019;s prediction confidence. It also provides descriptions of the corresponding pest&#x2019;s morphological characteristics and recommended control measures. Users can click on a result to view detailed information and save it to their history records. Experimental results show that the system achieves an average recognition accuracy of 88.2% on the 12 rice pest classes. For high-frequency pests such as rice planthoppers and rice leaf rollers, the recognition accuracy exceeds 92%, which is sufficient to meet the demands of practical field applications. Future work aims to improve transmission efficiency by implementing image tiling and to cover a broader range of pests, offering farmers more precise field management support.</p>
<fig id="f14" position="float">
<label>Figure&#xa0;14</label>
<caption>
<p>The user interface for displaying detection results in the WeChat Mini Program.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1763650-g014.tif">
<alt-text content-type="machine-generated">Three smartphone screenshots display a rice pest identification app interface. Left panel: rice water weevil on a leaf, outlined in red boxes, confidence 96.43 percent. Center panel: multiple small brown plant hoppers identified on a green stalk, confidence 96.23 percent. Right panel: yellow rice borer moth outlined on a rice plant, confidence 90.05 percent. Each panel includes sections for pest features and prevention methods.</alt-text>
</graphic></fig>
</sec>
<sec id="s5" sec-type="conclusions">
<label>5</label>
<title>Conclusion and future work</title>
<p>Existing lightweight detection models often suffer from insufficient accuracy when detecting multi-scale and easily camouflaged rice pests in complex agricultural scenarios. To address this issue, this study proposes a novel and deeply optimized YOLOv8-based object detection model, termed HDA-YOLO. The core of this model is an end-to-end synergistic optimization framework. It jointly optimizes three key stages: feature extraction, feature fusion, and feature interpretation, thereby enhancing the fidelity and interaction efficiency of information flow across the entire network. The model systematically introduces several synergistic technical innovations. In the backbone network, a high-fidelity feature foundation is constructed using the ADDS module and the MCPF module. These foundational components provide high-quality input to the core HADF-Net, which significantly boosts model performance through dynamic fusion and interpretation. Simultaneously, to further enhance the network&#x2019;s multi-scale analysis capabilities during feature fusion, the C2f module in the neck was upgraded to a MSC module.</p>
<p>Extensive experimental results have fully validated the effectiveness of the proposed solution. On our self-built rice pest dataset, HDA-YOLO achieved a mAP@50 of 90.0% and an F1-score of 88.2%, representing significant improvements of 2.4 and 3.8 percentage points over the baseline model, respectively, and demonstrated particularly excellent performance in small object detection. Furthermore, the feature map visualization analysis intuitively confirmed the synergistic effect of the various improved modules in enhancing feature discriminability and precisely focusing on targets. Ultimately, the HDA-YOLO model was integrated into a WeChat Mini Program-based automatic rice pest identification system, demonstrating its high accuracy and significant application potential for intelligent agricultural monitoring scenarios.</p>
<p>While HDA-YOLO demonstrates superior performance, this study acknowledges certain limitations. In the pursuit of maximum identification accuracy, the model&#x2019;s computational load has increased moderately. Currently, experiments are primarily focused on a single-crop dataset, and the model&#x2019;s robustness across diverse datasets and extreme environmental variations&#x2014;such as severe occlusion, challenging illumination, and motion blur&#x2014;requires further reinforcement. Additionally, energy management and long-term stability in resource-constrained scenarios have not yet been comprehensively investigated. Future research will prioritize model quantization, multi-environment data augmentation, and domain adaptation techniques. These efforts aim to continuously enhance the model&#x2019;s generalization capability and application potential in complex agricultural scenarios, ultimately providing more robust technical support for the advancement of intelligent agriculture.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>The data analyzed in this study is subject to the following licenses/restrictions: The data will be made available upon reasonable request and with permission from the corresponding author. Requests to access these datasets should be directed to Xinhui Zhou, <email xlink:href="mailto:zxhui@henu.edu.cn">zxhui@henu.edu.cn</email>.</p></sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>SY: Investigation, Resources, Writing &#x2013; original draft, Data curation, Writing &#x2013; review &amp; editing, Validation, Methodology. YD: Validation, Data curation, Methodology, Writing &#x2013; original draft, Software, Investigation. HS: Investigation, Writing &#x2013; original draft, Data curation, Methodology. XZ: Funding acquisition, Writing &#x2013; review &amp; editing, Resources, Project administration, Methodology, Conceptualization, Supervision. YH: Writing &#x2013; review &amp; editing, Supervision, Conceptualization, Methodology, Resources, Project administration.</p></sec>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s10" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s11" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ali</surname> <given-names>M.</given-names></name>
<name><surname>Nessa</surname> <given-names>B.</given-names></name>
<name><surname>Khatun</surname> <given-names>M.</given-names></name>
<name><surname>Salam</surname> <given-names>M.</given-names></name>
<name><surname>Kabir</surname> <given-names>M.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>A way forward to combat insect pest in rice</article-title>. <source>Bangladesh Rice J.</source> <volume>25</volume>, <fpage>1</fpage>&#x2013;<lpage>22</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3329/brj.v25i1.55176</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Badgujar</surname> <given-names>C. M.</given-names></name>
<name><surname>Poulose</surname> <given-names>A.</given-names></name>
<name><surname>Gan</surname> <given-names>H.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Agricultural object detection with You Only Look Once (YOLO) algorithm: a bibliometric and systematic literature review</article-title>. <source>Comput. Electron. Agric.</source> <volume>223</volume>, <elocation-id>109090</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.109090</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Britto</surname> <given-names>D. T.</given-names></name>
<name><surname>Kronzucker</surname> <given-names>H. J.</given-names></name>
</person-group> (<year>2004</year>). 
<article-title>Bioengineering nitrogen acquisition in rice: can novel initiatives in rice genomics and physiology contribute to global food security</article-title>? <source>BioEssays</source> <volume>26</volume>, <fpage>683</fpage>&#x2013;<lpage>692</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/bies.20040</pub-id>, PMID: <pub-id pub-id-type="pmid">15170866</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Feng</surname> <given-names>Q.</given-names></name>
<name><surname>Xu</surname> <given-names>X.</given-names></name>
<name><surname>Wang</surname> <given-names>Z.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Deep learning-based small object detection: a survey</article-title>. <source>Math. Biosci. Eng.</source> <volume>20</volume>, <fpage>6551</fpage>&#x2013;<lpage>6590</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3934/mbe.2023282</pub-id>, PMID: <pub-id pub-id-type="pmid">37161118</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hafiz</surname> <given-names>A. M.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>A survey on light-weight convolutional neural networks: trends, issues and future scope</article-title>. <source>J. Mobile Multimedia</source> <volume>19</volume>, <fpage>1277</fpage>&#x2013;<lpage>1298</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.13052/jmm1550-4646.1957</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>He</surname> <given-names>K.</given-names></name>
<name><surname>Zhang</surname> <given-names>X.</given-names></name>
<name><surname>Ren</surname> <given-names>S.</given-names></name>
<name><surname>Sun</surname> <given-names>J.</given-names></name>
</person-group> (<year>2015</year>). 
<article-title>Spatial pyramid pooling in deep convolutional networks for visual recognition</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>37</volume>, <fpage>1904</fpage>&#x2013;<lpage>1916</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2015.2389824</pub-id>, PMID: <pub-id pub-id-type="pmid">26353135</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hu</surname> <given-names>J.</given-names></name>
<name><surname>Shen</surname> <given-names>L.</given-names></name>
<name><surname>Albanie</surname> <given-names>S.</given-names></name>
<name><surname>Sun</surname> <given-names>G.</given-names></name>
<name><surname>Wu</surname> <given-names>E.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Squeeze-and-excitation networks</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>42</volume>, <fpage>2011</fpage>&#x2013;<lpage>2023</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2019.2913372</pub-id>, PMID: <pub-id pub-id-type="pmid">31034408</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hu</surname> <given-names>Y.</given-names></name>
<name><surname>Deng</surname> <given-names>X.</given-names></name>
<name><surname>Lan</surname> <given-names>Y.</given-names></name>
<name><surname>Chen</surname> <given-names>X.</given-names></name>
<name><surname>Long</surname> <given-names>Y.</given-names></name>
<name><surname>Liu</surname> <given-names>C.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Detection of rice pests based on self-attention mechanism and multi-scale feature fusion</article-title>. <source>Insects</source> <volume>14</volume>, <elocation-id>280</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/insects14030280</pub-id>, PMID: <pub-id pub-id-type="pmid">36975965</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>C.</given-names></name>
<name><surname>Zhou</surname> <given-names>A.</given-names></name>
<name><surname>Yao</surname> <given-names>A.</given-names></name>
</person-group> (<year>2022</year>). &#x201c;
<article-title>Omni-dimensional dynamic convolution</article-title>,&#x201d; in <conf-name> 10th International Conference on Learning Representations, ICLR 2022. (Virtual Event: OpenReview.net)</conf-name>. Available online at: <uri xlink:href="https://openreview.net/forum?id=DmpCfq6Mg39">https://openreview.net/forum?id=DmpCfq6Mg39</uri>. (Accessed February 9, 2026).
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Lin</surname> <given-names>T.-Y.</given-names></name>
<name><surname>Doll&#xe1;r</surname> <given-names>P.</given-names></name>
<name><surname>Girshick</surname> <given-names>R.</given-names></name>
<name><surname>He</surname> <given-names>K.</given-names></name>
<name><surname>Hariharan</surname> <given-names>B.</given-names></name>
<name><surname>Belongie</surname> <given-names>S.</given-names></name>
</person-group> (<year>2017</year>). &#x201c;
<article-title>Feature pyramid networks for object detection</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source> (
<publisher-name>IEEE Computer Society</publisher-name>, <publisher-loc>Los Alamitos, CA</publisher-loc>), <fpage>936</fpage>&#x2013;<lpage>944</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2017.106</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>S.</given-names></name>
<name><surname>Qi</surname> <given-names>L.</given-names></name>
<name><surname>Qin</surname> <given-names>H.</given-names></name>
<name><surname>Shi</surname> <given-names>J.</given-names></name>
<name><surname>Jia</surname> <given-names>J.</given-names></name>
</person-group> (<year>2018</year>). &#x201c;
<article-title>Path aggregation network for instance segmentation</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source> (
<publisher-name>IEEE Computer Society</publisher-name>, <publisher-loc>Los Alamitos, CA</publisher-loc>), <fpage>8759</fpage>&#x2013;<lpage>8768</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2018.00913</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>X.</given-names></name>
<name><surname>Zhang</surname> <given-names>Q.</given-names></name>
<name><surname>Min</surname> <given-names>W.</given-names></name>
<name><surname>Geng</surname> <given-names>G.</given-names></name>
<name><surname>Jiang</surname> <given-names>S.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Solutions and challenges in AI-based pest and disease recognition</article-title>. <source>Comput. Electron. Agric.</source> <volume>238</volume>, <elocation-id>110775</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2025.110775</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Pan</surname> <given-names>J.</given-names></name>
<name><surname>Wang</surname> <given-names>T.</given-names></name>
<name><surname>Wu</surname> <given-names>Q.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>RiceNet: a two-stage machine learning method for rice disease identification</article-title>. <source>Biosyst. Eng.</source> <volume>225</volume>, <fpage>25</fpage>&#x2013;<lpage>40</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2022.11.007</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Quach</surname> <given-names>L.-D.</given-names></name>
<name><surname>Nguyen</surname> <given-names>Q. K.</given-names></name>
<name><surname>Nguyen</surname> <given-names>Q. A.</given-names></name>
<name><surname>Le</surname> <given-names>T. T. L.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Rice pest dataset supports the construction of smart farming systems</article-title>. <source>Data Brief</source> <volume>52</volume>, <elocation-id>110046</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.dib.2024.110046</pub-id>, PMID: <pub-id pub-id-type="pmid">38299106</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Rahman</surname> <given-names>C. R.</given-names></name>
<name><surname>Arko</surname> <given-names>P. S.</given-names></name>
<name><surname>Ali</surname> <given-names>M. E.</given-names></name>
<name><surname>Khan</surname> <given-names>M. A. I.</given-names></name>
<name><surname>Apon</surname> <given-names>S. H.</given-names></name>
<name><surname>Nowrin</surname> <given-names>F.</given-names></name>
<etal/>
</person-group>. (<year>2020</year>). 
<article-title>Identification and recognition of rice diseases and pests using convolutional neural networks</article-title>. <source>Biosyst. Eng.</source> <volume>194</volume>, <fpage>112</fpage>&#x2013;<lpage>120</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2020.03.020</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Redmon</surname> <given-names>J.</given-names></name>
<name><surname>Divvala</surname> <given-names>S.</given-names></name>
<name><surname>Girshick</surname> <given-names>R.</given-names></name>
<name><surname>Farhadi</surname> <given-names>A.</given-names></name>
</person-group> (<year>2016</year>). &#x201c;
<article-title>You only look once: Unified, real-time object detection</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source> (
<publisher-name>IEEE Computer Society</publisher-name>, <publisher-loc>Los Alamitos, CA</publisher-loc>), <fpage>779</fpage>&#x2013;<lpage>788</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2016.91</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Redmon</surname> <given-names>J.</given-names></name>
<name><surname>Farhadi</surname> <given-names>A.</given-names></name>
</person-group> (<year>2018</year>). 
<article-title>YOLOv3: An incremental improvement</article-title>. <source>arXiv [Preprint]. abs/1804.02767</source>. Available online at: <uri xlink:href="https://arxiv.org/abs/1804.02767">https://arxiv.org/abs/1804.02767</uri>. (Accessed February 9, 2026).
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Santiteerakul</surname> <given-names>S.</given-names></name>
<name><surname>Sopadang</surname> <given-names>A.</given-names></name>
<name><surname>Tippayawong</surname> <given-names>K. Y.</given-names></name>
<name><surname>Tamvimol</surname> <given-names>K.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>The role of smart technology in sustainable agriculture: a case study of Wangree plant factory</article-title>. <source>Sustainability</source> <volume>12</volume>, <elocation-id>4640</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/su12114640</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Savary</surname> <given-names>S.</given-names></name>
<name><surname>Horgan</surname> <given-names>F.</given-names></name>
<name><surname>Willocquet</surname> <given-names>L.</given-names></name>
<name><surname>Heong</surname> <given-names>K. L.</given-names></name>
</person-group> (<year>2012</year>). 
<article-title>A review of principles for sustainable pest management in rice</article-title>. <source>Crop Prot.</source> <volume>32</volume>, <fpage>54</fpage>&#x2013;<lpage>63</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.cropro.2011.10.012</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Selvaraju</surname> <given-names>R. R.</given-names></name>
<name><surname>Cogswell</surname> <given-names>M.</given-names></name>
<name><surname>Das</surname> <given-names>A.</given-names></name>
<name><surname>Vedantam</surname> <given-names>R.</given-names></name>
<name><surname>Parikh</surname> <given-names>D.</given-names></name>
<name><surname>Batra</surname> <given-names>D.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>Grad-CAM: visual explanations from deep networks via gradient-based localization</article-title>. <source>Int. J. Comput. Vis.</source> <volume>128</volume>, <fpage>336</fpage>&#x2013;<lpage>359</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11263-019-01228-7</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Szegedy</surname> <given-names>C.</given-names></name>
<name><surname>Liu</surname> <given-names>W.</given-names></name>
<name><surname>Jia</surname> <given-names>Y.</given-names></name>
<name><surname>Sermanet</surname> <given-names>P.</given-names></name>
<name><surname>Reed</surname> <given-names>S.</given-names></name>
<name><surname>Anguelov</surname> <given-names>D.</given-names></name>
<etal/>
</person-group>. (<year>2015</year>). &#x201c;
<article-title>Going deeper with convolutions</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source> (
<publisher-name>IEEE Computer Society</publisher-name>, <publisher-loc>Los Alamitos, CA</publisher-loc>), <fpage>1</fpage>&#x2013;<lpage>9</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2015.7298594</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Tan</surname> <given-names>M.</given-names></name>
<name><surname>Pang</surname> <given-names>R.</given-names></name>
<name><surname>Le</surname> <given-names>Q. V.</given-names></name>
</person-group> (<year>2020</year>). &#x201c;
<article-title>EfficientDet: Scalable and efficient object detection</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source> (
<publisher-name>IEEE Computer Society</publisher-name>, <publisher-loc>Los Alamitos, CA</publisher-loc>), <fpage>10778</fpage>&#x2013;<lpage>10787</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR42600.2020.01079</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Vaswani</surname> <given-names>A.</given-names></name>
<name><surname>Shazeer</surname> <given-names>N.</given-names></name>
<name><surname>Parmar</surname> <given-names>N.</given-names></name>
<name><surname>Uszkoreit</surname> <given-names>J.</given-names></name>
<name><surname>Jones</surname> <given-names>L.</given-names></name>
<name><surname>Gomez</surname> <given-names>A. N.</given-names></name>
<etal/>
</person-group>. (<year>2017</year>). &#x201c;
<article-title>Attention is all you need</article-title>,&#x201d; in <source>Advances in Neural Information Processing Systems (NeurIPS)</source>, vol. <volume>30</volume>. (
<publisher-name>Curran Associates, Inc</publisher-name>, <publisher-loc>Red Hook, NY</publisher-loc>).
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Waqas</surname> <given-names>M.</given-names></name>
<name><surname>Naseem</surname> <given-names>A.</given-names></name>
<name><surname>Humphries</surname> <given-names>U. W.</given-names></name>
<name><surname>Hlaing</surname> <given-names>P. T.</given-names></name>
<name><surname>Dechpichai</surname> <given-names>P.</given-names></name>
<name><surname>Wangwongchai</surname> <given-names>A.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Applications of machine learning and deep learning in agriculture: a comprehensive review</article-title>. <source>Green Technol. Sustain.</source> <volume>3</volume>, <elocation-id>100199</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.grets.2025.100199</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Woo</surname> <given-names>S.</given-names></name>
<name><surname>Park</surname> <given-names>J.</given-names></name>
<name><surname>Lee</surname> <given-names>J.-Y.</given-names></name>
<name><surname>Kweon</surname> <given-names>I. S.</given-names></name>
</person-group> (<year>2018</year>). &#x201c;
<article-title>CBAM: Convolutional block attention module</article-title>,&#x201d; in <conf-name>Proceedings of the European Conference on Computer Vision ECCV 2018: 15th European Conference, Munich, Germany, September 8&#x2013;14, 2018, Proceedings, Part VII</conf-name>. (<publisher-loc>Cham, Switzerland</publisher-loc>: 
<publisher-name>Springer</publisher-name>), <fpage>3</fpage>&#x2013;<lpage>19</lpage>.
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Wu</surname> <given-names>X.</given-names></name>
<name><surname>Zhan</surname> <given-names>C.</given-names></name>
<name><surname>Lai</surname> <given-names>Y.-K.</given-names></name>
<name><surname>Cheng</surname> <given-names>M.-M.</given-names></name>
<name><surname>Yang</surname> <given-names>J.</given-names></name>
</person-group> (<year>2019</year>). &#x201c;
<article-title>IP102: A large-scale benchmark dataset for insect pest recognition</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source> (
<publisher-name>IEEE Computer Society</publisher-name>, <publisher-loc>Los Alamitos, CA</publisher-loc>), <fpage>8779</fpage>&#x2013;<lpage>8788</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2019.00899</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Xiong</surname> <given-names>P.</given-names></name>
<name><surname>Zhang</surname> <given-names>C.</given-names></name>
<name><surname>He</surname> <given-names>L.</given-names></name>
<name><surname>Zhan</surname> <given-names>X.</given-names></name>
<name><surname>Han</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Deep learning-based rice pest detection research</article-title>. <source>PloS One</source> <volume>19</volume>, <fpage>e0313387</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1371/journal.pone.0313387</pub-id>, PMID: <pub-id pub-id-type="pmid">39509376</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>L.</given-names></name>
<name><surname>Zhang</surname> <given-names>R.-Y.</given-names></name>
<name><surname>Li</surname> <given-names>L.</given-names></name>
<name><surname>Xie</surname> <given-names>X.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>SimAM: A simple, parameter-free attention module for convolutional neural networks</article-title>,&#x201d; in <source>Proceedings of the 38th International Conference on Machine Learning (ICML)</source>, vol. <volume>139</volume>. Eds. 
<person-group person-group-type="editor">
<name><surname>Meila</surname> <given-names>M.</given-names></name>
<name><surname>Zhang</surname> <given-names>T.</given-names></name>
</person-group> (
<publisher-name>PMLR</publisher-name>), <fpage>11863</fpage>&#x2013;<lpage>11874</lpage>. Available online at: <uri xlink:href="https://proceedings.mlr.press/v139/yang21o.html">https://proceedings.mlr.press/v139/yang21o.html</uri>. (Accessed February 9, 2026).
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>F.</given-names></name>
<name><surname>Tian</surname> <given-names>C.</given-names></name>
<name><surname>Li</surname> <given-names>X.</given-names></name>
<name><surname>Yang</surname> <given-names>N.</given-names></name>
<name><surname>Zhang</surname> <given-names>Y.</given-names></name>
<name><surname>Gao</surname> <given-names>Q.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>MTD-YOLO: an improved YOLOv8-based rice pest detection model</article-title>. <source>Electronics</source> <volume>14</volume>, <elocation-id>2912</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/electronics14142912</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zheng</surname> <given-names>Y.</given-names></name>
<name><surname>Zheng</surname> <given-names>W.</given-names></name>
<name><surname>Du</surname> <given-names>X.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>A lightweight rice pest detection algorithm based on improved YOLOv8</article-title>. <source>Sci. Rep.</source> <volume>14</volume>, <fpage>29888</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-024-81587-5</pub-id>, PMID: <pub-id pub-id-type="pmid">39623058</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1937850">Parvathaneni Naga Srinivasu</ext-link>, Amrita Vishwa Vidyapeetham University, India</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2773845">Aruna Pavate</ext-link>, Thakur College of Engineering and Technology, India</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3324799">Rohini Patil</ext-link>, Terna Engineering College, India</p></fn>
</fn-group>
</back>
</article>