<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2026.1778883</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Focal-HAIN: a lightweight model with adaptive modulation and hierarchical interaction for real-time crop pest and disease monitoring</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Liu</surname><given-names>Wei</given-names></name>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3328039/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Xu</surname><given-names>Li</given-names></name>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Chang</surname><given-names>Xingzhi</given-names></name>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Long</surname><given-names>Xiaohan</given-names></name>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
</contrib>
</contrib-group>
<aff id="aff1"><institution>School of Cyberspace Security, Changzhou College of Information Technology</institution>, <city>Changzhou</city>,&#xa0;<country country="CN">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Wei Liu, <email xlink:href="mailto:liuwei1@czcit.edu.cn">liuwei1@czcit.edu.cn</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-03-04">
<day>04</day>
<month>03</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>17</volume>
<elocation-id>1778883</elocation-id>
<history>
<date date-type="received">
<day>31</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>14</day>
<month>02</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>06</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Liu, Xu, Chang and Long.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Liu, Xu, Chang and Long</copyright-holder>
<license>
<ali:license_ref start_date="2026-03-04">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>To address the problems of low detection accuracy, severe background interference, and poor real-time performance existing in current object detection models in complex agricultural monitoring scenarios, we proposed Focal-HAIN (F-HAIN), a lightweight object detection model tailored for embedded platforms.</p>
</sec>
<sec>
<title>Methods</title>
<p>Built on the YOLOv5 architecture with design insights from RT-DETR, the proposed model incorporates two key structural enhancements to improve multi-scale feature representation and localization precision. Firstly, focus modulation was integrated into the neck network, and the F-SPPELAN module was designed to achieve adaptive and precise modulation of the feature channel based on the focus loss-guided attention mechanism. This effectively suppresses background noise and enhances the model&#x2019;s response to small targets. Secondly, the HAIN module was constructed. By introducing a deep interlacing fusion strategy, feature interaction operations within the scale are embedded into the cross-scale feature aggregation path, thereby enhancing the correlation among multi-scale features and improving positioning accuracy. This study conducted comprehensive experiments on the IP102 dataset and deployed the model on a Raspberry Pi 4B embedded device for real-time performance verification.</p>
</sec>
<sec>
<title>Results</title>
<p>The experimental results show that the mAP50 of F-HAIN can reach 90.1%. Under the same experimental conditions, compared with models such as RT-DETR, YOLOv5, YOLOv8, YOLOv10, and YOLOv11, the performance of F-HAIN on mAP50 increased by 5.5%, 6.8%, 4.9%, 5.4%, and 3.0%, respectively. Meanwhile, F-HAIN maintains a high-speed inference of 161 FPS on a high-performance workstation and was successfully deployed in an IoT-based collaborative system where a Raspberry Pi 4B serves as the edge acquisition terminal.</p>
</sec>
<sec>
<title>Discussion</title>
<p>These findings demonstrate that F-HAIN effectively balances high detection accuracy with computational efficiency, providing a robust and deployable solution for real-time agricultural monitoring on resource-constrained edge devices.</p>
</sec>
</abstract>
<kwd-group>
<kwd>crop pest and disease detection</kwd>
<kwd>focal modulation</kwd>
<kwd>HAIN</kwd>
<kwd>lightweight</kwd>
<kwd>real-time monitoring</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. The funding for this research was provided by Professor Xingzhi Chang&#x2019;s Qing-Lan Project of Jiangsu Province and Engineering Technology Research and Development Center of Jiangsu Provincial Department of Education-Research and Development Center for Industrial Big Data and Industrial Intelligence Engineering Technology.</funding-statement>
</funding-group>
<counts>
<fig-count count="15"/>
<table-count count="4"/>
<equation-count count="6"/>
<ref-count count="26"/>
<page-count count="18"/>
<word-count count="7696"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Sustainable and Intelligent Phytoprotection</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>For modern precision agriculture, real-time, accurate detection of crop pests and diseases is a vital technical underpinning, as it directly bears on global food security and agricultural product quality (<xref ref-type="bibr" rid="B15">Nyawose et&#xa0;al., 2025</xref>). According to FAO statistics and related studies, pathogens and pests cause annual crop losses of 20%&#x2013;40%, translating to billions of US dollars in direct economic damages for the agricultural sector (<xref ref-type="bibr" rid="B1">FAO, 2025</xref>). Currently, farmland pest and disease monitoring relies heavily on expert field surveys and farmer visual observations. These methods are labor-consuming, costly, and highly subjective, with judgments varying significantly between observers. Most critically, they perform poorly in detecting early-stage diseases or small-scale micropest outbreaks&#x2014;where distinguishing visual cues are faint or absent&#x2014;failing to meet the real-time monitoring demands of large-scale intensive farming. This often leads to missed optimal intervention windows and subsequent widespread pest and disease spread (<xref ref-type="bibr" rid="B5">Chu et&#xa0;al., 2023</xref>). The problem is exacerbated in on-site mobile monitoring scenarios requiring rapid responses, which is precisely the target application of the embedded platform-based detection system developed in this study.</p>
<p>In smart agriculture, IoT frameworks have significantly advanced context-aware decision support. For instance, A. Khan et&#xa0;al. proposed an IoT-assisted system for real-time soil fertility mapping to optimize fertilizer recommendations (<xref ref-type="bibr" rid="B9">Khan et&#xa0;al., 2022a</xref>). Similarly, ensemble machine learning models have been utilized by R. N. Bashir et&#xa0;al. for intelligent Reference Evapotranspiration (ETo) forecasting to enhance irrigation precision (<xref ref-type="bibr" rid="B3">Bashir et&#xa0;al., 2023</xref>). Furthermore, A. Khan et&#xa0;al. addressed saline soil reclamation by employing LSTM-based architectures to predict soil evapotranspiration (ETs) and improve the leaching process (<xref ref-type="bibr" rid="B10">Khan et&#xa0;al., 2022b</xref>). However, while these systems effectively manage abiotic factors like water and nutrients, they often lack the high-fidelity visual sensing required for early biological threat detection.</p>
<p>In recent years, YOLO-based target-detection algorithms have been widely adopted for crop disease and pest monitoring. Researchers have proposed various task-specific enhancements for different crops and use cases. Chen et&#xa0;al. improved localization of disease regions in complex backgrounds by adding an attention mechanism and a feature-pyramid enhancement strategy (<xref ref-type="bibr" rid="B4">Chen et&#xa0;al., 2022</xref>). To address detection of small or dense disease targets, Liu et&#xa0;al. integrated a nested residual Transformer module into the YOLOv5 model, enhancing feature extraction for tiny lesions (<xref ref-type="bibr" rid="B13">Li et&#xa0;al., 2022</xref>). Lightweight design has also increased model practicality (<xref ref-type="bibr" rid="B13">Li et&#xa0;al., 2022</xref>). Li et&#xa0;al. developed a lightweight YOLO-JD network for jute pest and disease identification, reducing computational cost while maintaining high accuracy (<xref ref-type="bibr" rid="B13">Li et&#xa0;al., 2022</xref>). Qi et&#xa0;al. modified YOLOv5 with visual attention for tomato virus detection, improving specificity by focusing on key lesion areas (<xref ref-type="bibr" rid="B17">Qi et&#xa0;al., 2022</xref>).</p>
<p>At the crop-application level, different detection methods have been developed for specific crops. Soeb et&#xa0;al. built a tea disease detection system, YOLO-T, based on YOLOv7 for rapid identification of common leaf diseases (<xref ref-type="bibr" rid="B18">Soeb et&#xa0;al., 2023</xref>), and Bao et&#xa0;al. applied unmanned aerial vehicle remote sensing with an improved DDMA-YOLO model for large-scale leaf disease monitoring in tea gardens (<xref ref-type="bibr" rid="B2">Bao et&#xa0;al., 2023</xref>). For apple diseases, Zhu et&#xa0;al. proposed the EADD-YOLO model to identify diverse leaf diseases in complex backgrounds (<xref ref-type="bibr" rid="B26">Zhu et&#xa0;al., 2023</xref>). Gao et&#xa0;al. developed BAM-Net, which incorporates spatial and channel attention to segment overlapping lesions (<xref ref-type="bibr" rid="B7">Gao et&#xa0;al., 2023</xref>). Khan et&#xa0;al. created a real-time apple leaf disease diagnostic system to support immediate field decisions (<xref ref-type="bibr" rid="B11">Khan et&#xa0;al., 2022</xref>).</p>
<p>In the field of agricultural pest and disease detection, targeted optimization of YOLO-series models has become a mainstream research direction, with most studies focusing on improving detection performance for specific crops or scenarios. For instance, aiming at vulnerable crops like strawberries, Li et&#xa0;al. introduced the DAC module based on YOLOv4, which effectively enhanced the detection accuracy of powdery mildew and other diseases (<xref ref-type="bibr" rid="B13">Li et&#xa0;al., 2022</xref>). Similarly, Maize-YOLO achieved high-precision detection of maize pests by leveraging multi-scale feature fusion (<xref ref-type="bibr" rid="B21">Yang et&#xa0;al., 2023</xref>), while Yue et&#xa0;al. proposed the YOLOv7-GCA model for pepper diseases, optimizing multi-scale lesion recognition through global context attention (<xref ref-type="bibr" rid="B22">Yue et&#xa0;al., 2024</xref>). Collectively, these studies validate that scenario-specific optimizations&#x2014;such as adapting to plastic-film covering or complex weed backgrounds&#x2014;can significantly improve the adaptability of deep learning models in unstructured agricultural environments, laying a foundation for the deployment of detection algorithms from laboratories to embedded edge devices (<xref ref-type="bibr" rid="B16">Qi et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B6">Gao et&#xa0;al., 2026</xref>).</p>
<p>Beyond model structure optimization, advances in loss functions and practical function expansion have also promoted the development of this field. Zhang et&#xa0;al. proposed the Focal EIoU loss function, which enhances the stability and convergence speed of bounding box regression, providing a theoretical basis for improving detection precision (<xref ref-type="bibr" rid="B24">Zhang et&#xa0;al., 2022</xref>). Wen et&#xa0;al. proposed the pest-yolo model, which further achieved the simultaneous detection and counting of multiple pests, addressing the practical needs of agricultural production (<xref ref-type="bibr" rid="B20">Wen et&#xa0;al., 2022</xref>). However, existing research still faces prominent bottlenecks, with small target representation being the most persistent challenge. In the early stages of the disease, the lesions are scarce and small; similarly, early-stage pests are diminutive, and their visual features are easily obscured by complex leaf textures or crop shadows (<xref ref-type="bibr" rid="B14">Liu et&#xa0;al., 2022</xref>). This feature-scarcity issue is exacerbated in dynamic field environments with varying illumination or severe occlusion (<xref ref-type="bibr" rid="B19">Wang et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B23">Zhang et&#xa0;al., 2025</xref>). Notably, most current studies focus on single-crop or single-pest scenarios, lacking sufficient exploration of cross-crop generalization capabilities. Moreover, the optimization for small targets is often limited to general multi-scale fusion strategies, failing to fully account for the unique morphological and spatial distribution characteristics of agricultural small targets, which restricts the further improvement of detection performance in real-world field applications.</p>
<p>While the YOLO series algorithms deliver passable performance in certain application scenarios, they still fall short in unstructured natural environments. When it comes to small targets&#x2014;such as mild crop disease symptoms&#x2014;their detection accuracy can barely meet real-world agricultural requirements. In addition, in the face of complex on-site backgrounds (e.g., weed interference, fluctuating lighting conditions), the algorithms&#x2019; stability remains wanting (<xref ref-type="bibr" rid="B8">Jiang et&#xa0;al., 2023</xref>). To tackle these pain points, researchers have devised Transformer-based detection architectures, with DETR (<xref ref-type="bibr" rid="B25">Zhao et&#xa0;al., 2024</xref>) standing out as a typical representative. Building on this foundation, the real-time variant RT-DETR has been optimized to boost inference speed significantly. RT-DETR demonstrates advantages in global feature modeling and end-to-end design, offering improved handling of occluded targets. However, for embedded agricultural applications requiring real-time performance, YOLO-based architectures provide a better balance of accuracy and computational efficiency.</p>
<p>Based on the YOLOv5 architecture with design insights from RT-DETR&#x2019;s attention mechanisms, we proposed F-HAIN, a new lightweight method for real-time crop pest and disease monitoring. F-HAIN improves small-target accuracy and robustness in complex scenes via two innovations: the F-SPPELAN module and the HAIN module. Deployed on a Raspberry Pi 4B embedded platform, the enhanced model enables efficient operation under the hardware constraints typical of grassroots agricultural applications. The aim is to meet the accuracy requirements of practical agricultural monitoring while offering a cost-effective, user-friendly surveillance solution for pest and disease detection at the grassroots.</p>
<p>The key contributions of this study are summarized as follows:</p>
<p>A novel lightweight model, F-HAIN, is proposed for accurate, real-time crop pest and disease monitoring, built upon the YOLOv5 architecture while incorporating design principles from RT-DETR.</p>
<p>The F-SPPELAN Module is introduced to the Neck network, integrating the Focal Modulation mechanism to achieve precise adaptive modulation of feature channels, which effectively suppresses background noise and enhances the response of small targets.</p>
<p>The Hierarchical Adaptive Interaction Network (HAIN) module is designed, utilizing a deep interlaced fusion strategy to integrate intra-scale feature interaction into the cross-scale feature aggregation path, which significantly improves multi-scale feature representation capacity and localization accuracy.</p>
<p>The F-HAIN model is deployed and validated on the Raspberry Pi 4B embedded device, demonstrating its high efficiency and practicality for low-latency monitoring.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Experimental data</title>
<sec id="s2_1">
<label>2.1</label>
<title>Dataset acquisition</title>
<p>This study utilizes the IP102 dataset as the primary data source for experimentation. As the first large-scale benchmark dataset dedicated to crop pest detection in agricultural computer vision, IP102, a dataset specifically designed for agricultural pest detection, is extensively utilized across various research domains, including plant image recognition and pest management. The IP102 dataset, for instance, encompasses a wide range of crops such as strawberries, beans, and tomatoes, and includes various conditions like mold and leaf-spot diseases. Unlike smaller datasets, IP102&#x2019;s key advantage is that all images were captured in real field environments, exhibiting complex background clutter, partial occlusions, and overlapping leaves. The dataset&#x2019;s intrinsic characteristics closely replicate the real-world challenges encountered by detection algorithms in grassroots agricultural monitoring, rendering it ideal for assessing the robustness of proposed models, particularly their capacity to detect small and inconspicuous targets.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Dataset filtering and partitioning</title>
<p>To enhance the experiment&#x2019;s relevance, this study manually selected images from the IP102 dataset and chose three crops&#x2014;bean, strawberry, and tomato&#x2014;for evaluation. This study retained 5,483 annotated images representing 12 pest and disease types. Typical examples are shown in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Examples of all classes of pest and disease in the dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778883-g001.tif">
<alt-text content-type="machine-generated">Twelve-panel grid showing close-up photos of infected leaves and fruits from strawberries, beans, and tomatoes. Each panel depicts a different plant disease including leaf spots, blights, molds, mildew, rust, and spider mites, with visual symptoms like lesions, discoloration, blight, mold growth, and leaf damage clearly visible on each affected plant part.</alt-text>
</graphic></fig>
<p>The detailed distribution of samples is as follows:</p>
<p>Strawberry samples included 7 pest and disease categories: &#x201c;Angular Leaf Spot&#x201d; (541 images), &#x201c;Anthracnose Fruit Rot&#x201d; (194 images), &#x201c;Blossom Blight&#x201d; (356 images), &#x201c;Gray Mold&#x201d; (539 images), &#x201c;Leaf Spot&#x201d; (650 images), &#x201c;Powdery Mildew Fruit&#x201d; (258 images), and &#x201c;Powdery Mildew Leaf&#x201d; (558 images). Tomato samples included 3 categories: &#x201c;Tomato Blight&#x201d; (531 images), &#x201c;Leaf Mold&#x201d; (477 images), and &#x201c;Spider Mites&#x201d; (446 images). Bean samples included 2 categories: &#x201c;Angular Leaf Spot&#x201d; (489 images) and &#x201c;Bean Rust&#x201d; (414 images). The detailed images of the 12 selected pests and diseases are shown in <xref ref-type="table" rid="T1"><bold>Table 1</bold></xref>.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Image information of 12 selected pest and disease.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Class number</th>
<th valign="middle" align="center">Name</th>
<th valign="middle" align="center">Abbreviation</th>
<th valign="middle" align="center">Number of class images</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">0</td>
<td valign="middle" align="center">Strawberry Leaf Spot</td>
<td valign="middle" align="center">S-LS</td>
<td valign="middle" align="center">650</td>
</tr>
<tr>
<td valign="middle" align="center">1</td>
<td valign="middle" align="center">Strawberry Anthracnose Fruit Rot</td>
<td valign="middle" align="center">S-AFR</td>
<td valign="middle" align="center">194</td>
</tr>
<tr>
<td valign="middle" align="center">2</td>
<td valign="middle" align="center">Strawberry Angular Leaf Spot</td>
<td valign="middle" align="center">S-ALS</td>
<td valign="middle" align="center">541</td>
</tr>
<tr>
<td valign="middle" align="center">3</td>
<td valign="middle" align="center">Strawberry Blossom Blight</td>
<td valign="middle" align="center">S-BB</td>
<td valign="middle" align="center">356</td>
</tr>
<tr>
<td valign="middle" align="center">4</td>
<td valign="middle" align="center">Strawberry Gray Mold</td>
<td valign="middle" align="center">S-GM</td>
<td valign="middle" align="center">539</td>
</tr>
<tr>
<td valign="middle" align="center">5</td>
<td valign="middle" align="center">Strawberry Powdery Mildew Fruit</td>
<td valign="middle" align="center">S-PMF</td>
<td valign="middle" align="center">258</td>
</tr>
<tr>
<td valign="middle" align="center">6</td>
<td valign="middle" align="center">Strawberry Powdery Mildew Leaf</td>
<td valign="middle" align="center">S-PML</td>
<td valign="middle" align="center">558</td>
</tr>
<tr>
<td valign="middle" align="center">7</td>
<td valign="middle" align="center">Beans Angular Leaf Spot</td>
<td valign="middle" align="center">B-ALS</td>
<td valign="middle" align="center">489</td>
</tr>
<tr>
<td valign="middle" align="center">8</td>
<td valign="middle" align="center">Beans Rust</td>
<td valign="middle" align="center">B-R</td>
<td valign="middle" align="center">414</td>
</tr>
<tr>
<td valign="middle" align="center">9</td>
<td valign="middle" align="center">Tomato Leaf Mold</td>
<td valign="middle" align="center">T-LM</td>
<td valign="middle" align="center">477</td>
</tr>
<tr>
<td valign="middle" align="center">10</td>
<td valign="middle" align="center">Tomato Blight</td>
<td valign="middle" align="center">T-B</td>
<td valign="middle" align="center">531</td>
</tr>
<tr>
<td valign="middle" align="center">11</td>
<td valign="middle" align="center">Tomato Spider Mites</td>
<td valign="middle" align="center">T-SM</td>
<td valign="middle" align="center">446</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To prevent bias from data partitioning and ensure reproducibility, the dataset was randomly divided into training, validation, and test sets in a 7:1:2 ratio. The training set contains 3,838 images used to learn model parameters, the validation set contains 548 images used for hyperparameter tuning and periodic performance checks during training, and the test set contains 1,097 images reserved for independent evaluation of the final model&#x2019;s detection performance.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Model design and improvement</title>
<sec id="s3_1">
<label>3.1</label>
<title>Basic model: YOLOv5</title>
<p>YOLOv5, a typical single-stage object detector under the YOLO family, simplifies object detection into an end-to-end regression task. This allows it to predict both the location and category of objects in one single forward pass. YOLOv5 was selected as the baseline architecture due to its proven efficiency on embedded platforms and well-established implementation ecosystem. While RT-DETR offers advantages in certain scenarios, YOLOv5&#x2019;s balance of accuracy, speed, and resource efficiency makes it more suitable for agricultural edge deployment. Such a structural design cuts down computational overhead markedly and achieves high inference speed, making it a promising option for real-time detection on resource-limited embedded platforms&#x2014;well-aligned with the on-site monitoring needs of agricultural micropest detection. As illustrated in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>, the YOLOv5 architecture comprises four core modules: input module, Backbone network, Neck network, and detection Head.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Network structure diagram of YOLOv5.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778883-g002.tif">
<alt-text content-type="machine-generated">Neural network diagram illustrating a plant disease detection process. Input and output images of a leaf with visible lesions are connected through layers titled SPPF, CBL, Conv2d, BatchNorm, SiLU, Upsample, Concat, and Head, representing different model components and data flow.</alt-text>
</graphic></fig>
<p>For multi-scale feature extraction, the Backbone employs CSPDarknet53 to extract multi-scale features from the input images. The input image (640&#xd7;640) is downsampled via convolutional layers to yield three feature maps at different scales: P3, P4, and P5. P3 is a high-resolution feature map with rich spatial detail and is best suited for detecting small objects, whereas P4 and P5 have lower spatial resolution but contain more abstract semantic information and are therefore better for medium and large objects.</p>
<p>The Neck module integrates Feature Pyramid Network (FPN) and Path Aggregation Network (PAN) to realize multi-scale feature fusion: FPN transmits high-level semantic information from top to bottom to lower layers, while PAN conveys low-level precise localization information from bottom to top. This bidirectional aggregation approach merges contextual and spatial information across different scales, thereby enhancing the detection capability for objects of various sizes. Nevertheless, in practical agricultural micropest detection scenarios, the original YOLOv5 still has certain limitations: the feature fusion in the Neck module lacks targeted enhancement for small targets, and the computational cost of the backbone network still needs optimization to adapt to long-term stable operation on embedded devices (e.g., Raspberry Pi 4B). For the final prediction, an anchor-based strategy is adopted: each feature map is preconfigured with fixed-size anchor boxes, and the network predicts bounding-box coordinate offsets, confidence scores, and class probabilities via convolution operations. Subsequently, non-maximum suppression (NMS) is used for post-processing to remove redundant detections and obtain the final results.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Proposed improved architecture: F-HAIN</title>
<p>Although YOLOv5 can achieve efficient real-time detection, in actual agricultural scenarios, due to the complex background and the small size of pest and disease targets, it still encounters some challenges. To overcome these limitations while maintaining computational efficiency, this paper proposes a novel model, F-HAIN, whose network architecture is illustrated in <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>. The model replaces the core SPPF module in YOLOv5 with a newly designed F-SPPELAN module, which enhances multi-scale target feature expression through parallel multi-scale pooling and dynamic feature weighting while preserving real-time performance. Additionally, F-HAIN incorporates the efficient hybrid encoder&#x2019;s HAIN module, which processes high-level backbone features. By implementing global interaction on feature maps via a multi-head attention mechanism, the HAIN module reduces computational overhead and improves processing speed. These architectural modifications collectively mitigate critical challenges in complex scenarios, including multi-target occlusion, small target missed detection, and insufficient detection speed.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Network structure diagram of F-HAIN.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778883-g003.tif">
<alt-text content-type="machine-generated">Deep learning network architecture diagram for leaf lesion detection, showing labeled blocks for CBS, F-SPPELAN, RepC3, HAIN, and query selection modules, with arrows indicating data flow from input image of infected leaf to segmented output.</alt-text>
</graphic></fig>
<sec id="s3_2_1">
<label>3.2.1</label>
<title>F-SPPELAN module design</title>
<sec id="s3_2_1_1">
<label>3.2.1.1</label>
<title>Deep integration of focal modulation</title>
<p>Detecting crop pests and diseases is challenging because targets are small and backgrounds are complex. The SPPELAN module of YOLOv9, which extracts and enhances features across multiple scales, has difficulty balancing a global receptive field with local-detail preservation and could improve spatial feature refinement. We integrate Focal Modulation into the SPPELAN module to address these limitations. Unlike channel-attention methods that rely on global average pooling, Focal Modulation adaptively adjusts spatial weights for each pixel in the feature map. Its main advantages are: (1) extracting multi-scale context by capturing spatial information from near to far using hierarchical depthwise convolutions; (2) dynamically evaluating the contribution of different scales to each position via a gating mechanism; and (3) enhancing features of small lesions while suppressing background noise to achieve finer spatial refinement. Traditional multi-scale extraction and enhancement methods struggle to suppress redundant information and emphasize discriminative features when targets are small and backgrounds are cluttered. Although the original SPPELAN module handled multi-scale receptive fields and adaptively improved channel features, it lacked this fine-grained spatial modulation.</p>
<p>To address this challenge, we propose the F-SPPELAN module, which integrates focal modulation into the SPPELAN architecture. This design enables fine-grained, adaptive modulation of feature channels to amplify salient information and suppress background noise.</p>
</sec>
<sec id="s3_2_1_2">
<label>3.2.1.2</label>
<title>Mathematical description of F-SPPELAN</title>
<p>The F-SPPELAN implementation proceeds in three stages: linear projection, hierarchical context aggregation with gating selection, and feature modulation. The working principle diagram of the F-SPPELAN is shown in <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Principle diagram of the F-SPPELAN.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778883-g004.tif">
<alt-text content-type="machine-generated">Diagram of a neural network module showing input feature processing through a one-by-one convolution, hierarchical context aggregation with three depth-wise convolutions, gated aggregation, modulator generation, and output feature computation. Mathematical operations and flow directions are annotated throughout.</alt-text>
</graphic></fig>
<p>(1) Linear projection: the input feature map <inline-formula>
<mml:math display="inline" id="im1"><mml:mrow><mml:mi>F</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>B</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>C</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> is projected by a 1&#xd7;1 convolution and split into three parts: the query vector <italic>Query(Q)</italic>, the initial context quantity <italic>Ctx</italic>, and the gating signal <italic>Gate(G)</italic>. As shown in <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>.</p>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mi>Q</mml:mi><mml:mo>,</mml:mo><mml:mi>C</mml:mi><mml:mi>t</mml:mi><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>G</mml:mi><mml:mo stretchy="false">]</mml:mo><mml:mo>=</mml:mo><mml:mtext>Linear</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:mi>F</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mtext>&#x2003;</mml:mtext><mml:mi>Q</mml:mi><mml:mo>,</mml:mo><mml:mi>C</mml:mi><mml:mi>t</mml:mi><mml:mi>x</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mi>C</mml:mi></mml:msup><mml:mo>,</mml:mo><mml:mtext>&#x2003;</mml:mtext><mml:mi>G</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>L</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math>
</disp-formula>
<p>Here, <italic>L</italic> denotes the preset number of focal levels.</p>
<p>(2) Hierarchical Aggregation and Gating: contexts at multiple scales are extracted via a sequence of depth-wise convolutions whose kernel sizes increase with level <italic>l</italic>. As shown in <xref ref-type="disp-formula" rid="eq2">Equation 2</xref>.</p>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mrow><mml:msub><mml:mi>K</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mtext>focal</mml:mtext><mml:mo>_</mml:mo><mml:mtext>factor</mml:mtext><mml:mo>&#xd7;</mml:mo><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mtext>focal</mml:mtext><mml:mo>_</mml:mo><mml:mtext>window</mml:mtext></mml:mrow></mml:math>
</disp-formula>
<p>At each layer the aggregated context <inline-formula>
<mml:math display="inline" id="im2"><mml:mrow><mml:mi>C</mml:mi><mml:mi>t</mml:mi><mml:msub><mml:mi>x</mml:mi><mml:mi>l</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is multiplied by its corresponding gating signal <inline-formula>
<mml:math display="inline" id="im3"><mml:mrow><mml:msub><mml:mi>G</mml:mi><mml:mi>l</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> for adaptive fusion. As shown in <xref ref-type="disp-formula" rid="eq3">Equation 3</xref>.</p>
<disp-formula id="eq3"><label>(3)</label>
<mml:math display="block" id="M3"><mml:mrow><mml:mi>C</mml:mi><mml:mi>t</mml:mi><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>l</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>L</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:munderover><mml:mo stretchy="false">(</mml:mo><mml:mi>C</mml:mi><mml:mi>t</mml:mi><mml:msub><mml:mi>x</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mi>C</mml:mi><mml:mi>t</mml:mi><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mi>L</mml:mi></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>The resulting <inline-formula>
<mml:math display="inline" id="im4"><mml:mrow><mml:mi>C</mml:mi><mml:mi>t</mml:mi><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> thus encodes multi-scale features from local to global.</p>
<p>(3) Feature modulation: the aggregated context is passed through another linear layer h and used as a modulator, performing element-wise (Hadamard) multiplication with the query vector <italic>Q</italic> to achieve adaptive feature enhancement. The final F-SPPELAN output is produced by the output projection layer. As shown in <xref ref-type="disp-formula" rid="eq4">Equation 4</xref>.</p>
<disp-formula id="eq4"><label>(4)</label>
<mml:math display="block" id="M4"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>Q</mml:mi><mml:mo>&#x2299;</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>C</mml:mi><mml:mi>t</mml:mi><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>It is essential to distinguish the proposed Focal Modulation mechanism from existing attention modules and recent YOLO variants. Unlike channel-based attention such as SE or spatial-channel fusion like CBAM, which primarily perform feature re-weighting through global pooling, Focal Modulation utilizes a hierarchical gated aggregation strategy. This approach enables the model to capture multi-scale contexts and establish long-range dependencies with linear computational complexity, avoiding the quadratic overhead and loss of inductive bias associated with Transformer-based architectures. Compared to the standard CSP-based feature fusion in YOLO models, Focal Modulation provides a more refined spatial awareness, which is critical for isolating subtle pest features from complex field environments.</p>
</sec>
</sec>
<sec id="s3_2_2">
<label>3.2.2</label>
<title>HAIN module application and optimization</title>
<p>F-SPPELAN improves channel feature quality and small-target feature extraction, but cross-scale fusion remains limited by information inconsistency and insufficient long-range dependency modeling in the feature-pyramid Neck. To address this, we replace the FPN with the HAIN module and integrate it into the network. The HAIN working-principle diagram is shown in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref>.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Principle diagram of the HAIN.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778883-g005.tif">
<alt-text content-type="machine-generated">Block diagram of a neural network feature refinement process showing backbone feature inputs C3, C4, C5 passing through gates, attention subnet, and an AIFI self-attention module, resulting in refined outputs P3, P4, and P5.</alt-text>
</graphic></fig>
<p>HAIN&#x2019;s core innovation lies in the deep interlaced fusion strategy, which embeds intra-scale adaptive interaction (AIFI) mechanisms into the hierarchical feature aggregation (HIFA) path. This ensures that the feature pyramid not only aggregates multi-scale information but also dynamically refines the information flow at every fusion step.</p>
<p>(1). Structure of the HAIN Module</p>
<p>The HAIN module is constructed as an enhanced Feature Pyramid Network (FPN), specifically designed for multi-scale feature fusion. It follows the common FPN structure of a top-down pathway combined with lateral connections, but with adaptive interaction embedded at key fusion points.</p>
<p>The HAIN process involves:</p>
<p>a) Top-down Aggregation (HIFA): High-level semantic features <italic>P<sub>i</sub></italic><sub>+ 1</sub> are upsampled and fused with low-level features (<italic>C<sub>i</sub></italic>) via lateral connections. This constitutes the hierarchical path.</p>
<p>b) Adaptive Interaction (AIFI): At each lateral connection, an AIFI module dynamically calibrates the fused features, enriching them with requisite local and global context before forwarding to the next level.</p>
<p>Different from standard feature pyramids, HAIN establishes a deep interlaced fusion strategy. While conventional FPN/PAN architectures focus on sequential cross-scale summation, HAIN embeds Intra-scale Adaptive Interaction (AIFI) directly into the Hierarchical Feature Aggregation (HIFA) path. This architecture ensures that semantic refinement and multi-scale aggregation occur simultaneously, mitigating the information loss typical of static weighting methods.</p>
<p>(2). Deep Interlaced Fusion Mechanism</p>
<p>HAIN achieves deep fusion by treating AIFI not as a standalone sequential block, but as a gate and refinement unit within the HIFA path.</p>
<p>a) Intra-Scale Interaction (AIFI) for Context Modeling: For the highest-level feature <italic>C</italic>5 (which has the largest receptive field but lowest spatial resolution), a simplified AIFI block based on Transformer self-attention is first applied to capture global contextual dependencies. This operation converts the global feature to <inline-formula>
<mml:math display="inline" id="im5"><mml:mrow><mml:msup><mml:mi>C</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup><mml:mn>5</mml:mn></mml:mrow></mml:math></inline-formula>, as shown in <xref ref-type="disp-formula" rid="eq5">Equation 5</xref>.</p>
<disp-formula id="eq5"><label>(5)</label>
<mml:math display="block" id="M5"><mml:mrow><mml:msub><mml:msup><mml:mi>C</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup><mml:mn>5</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mi>A</mml:mi><mml:mi>I</mml:mi><mml:mi>F</mml:mi><mml:mi>I</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mn>5</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mn>5</mml:mn></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>b) Adaptive Gated Fusion (Interlaced HIFA and AIFI): The core depth fusion occurs at the lateral connections. Instead of a simple element-wise addition, HAIN uses an Adaptive Gated Fusion (AGF) mechanism guided by AIFI principles. This mechanism dynamically weights the contribution of the high-level semantic feature (<inline-formula>
<mml:math display="inline" id="im6"><mml:mrow><mml:mi>U</mml:mi><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>P</mml:mi><mml:mi>i</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>) and the low-level spatial feature (<italic>C<sub>i</sub></italic>). The refined feature <italic>P<sub>i</sub></italic> at level <italic>i</italic> is calculated by <xref ref-type="disp-formula" rid="eq6">Equation 6</xref>.</p>
<disp-formula id="eq6"><label>(6)</label>
<mml:math display="block" id="M6"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>f</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>p</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2297;</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2297;</mml:mo><mml:mi>U</mml:mi><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>Where:</p>
<p><inline-formula>
<mml:math display="inline" id="im7"><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>: The spatial feature from the backbone network. <inline-formula>
<mml:math display="inline" id="im8"><mml:mrow><mml:mi>U</mml:mi><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>P</mml:mi><mml:mi>i</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>: The upsampled semantic feature from the higher layer.</p>
<p><italic>W<sub>spatial</sub></italic> and <italic>W<sub>semantic</sub></italic>: Adaptive Weight Maps generated by a lightweight attention sub-network (derived from AIFI principles), acting as dynamic gates to balance the contribution of the two features.</p>
<p><italic>Refine</italic> (&#xb7;): A lightweight local refinement convolution (e.g., 3&#xd7;3 Conv) applied after the fusion.</p>
<p>By applying this adaptive gating at every step of the hierarchical fusion, HAIN effectively models long-range dependencies through <italic>C</italic><sup>&#x2032;</sup>5 and simultaneously ensures spatial-semantic consistency across different scales, which is critical for accurate bounding box regression and improving small target detection.</p>
</sec>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Comparative analysis and architectural innovation</title>
<p>To clarify the structural novelty of Focal-HAIN, we contrast its design rationale with incremental YOLO neck variants through two fundamental paradigm shifts tailored for agricultural monitoring.</p>
<p>(1) From Static Pooling to Focal-Aware Modulation: Unlike recent YOLO models (e.g., YOLOv9 to v11) that utilize static hierarchical pooling (such as SPPELAN) to expand receptive fields, our F-SPPELAN implements a focal-aware spatial modulation mechanism. While static pooling often leads to the loss of fine-grained textures in micro-pests, Focal Modulation adaptively weights spatial contexts. This allows the neck to selectively amplify pest-related features while attenuating high-frequency environmental noise like soil and leaf veins.</p>
<p>(2) From Sequential Fusion to Deep Interlaced Interaction: As summarized in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>, the HAIN module distinguishes itself from existing FPN/PAN variants through its structural positioning. While traditional adaptive-weighting structures primarily rely on simple learnable scalars for cross-scale summation, HAIN introduces a Deep Interlaced Fusion strategy. This strategy embeds Intra-scale Adaptive Interaction (AIFI) directly into the Hierarchical Feature Aggregation (HIFA) pathway. By doing so, it ensures that semantic refinement and multi-scale fusion occur simultaneously, effectively mitigating the semantic inconsistency often encountered when detecting micro-scale pests against chaotic agricultural backgrounds.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Functional comparison between the proposed modules and existing representative methods.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Module</th>
<th valign="middle" align="left">Reference method</th>
<th valign="middle" align="center">Comparison of mechanisms</th>
<th valign="middle" align="left">Solved bottleneck</th>
<th valign="middle" align="center">Key distinction</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">F-SPPELAN</td>
<td valign="middle" align="left">SPPF/<break/>CBAM</td>
<td valign="middle" align="center">Replaces static pooling with<break/>Focal Modulation.</td>
<td valign="middle" align="center">Background noise and scale sensitivity.</td>
<td valign="middle" align="center">Adaptive focal-aware spatial-channel modulation.</td>
</tr>
<tr>
<td valign="middle" align="left">HAIN</td>
<td valign="middle" align="left">FPN/PAN<break/>variants</td>
<td valign="middle" align="center">Replaces weighted sum with Deep Interlaced Fusion (AIFI+HIFA).</td>
<td valign="middle" align="center">Semantic inconsistency in micro-pests.</td>
<td valign="middle" align="center">Embedded intra-scale interaction during aggregation.</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experimental analysis and performance evaluation</title>
<p>To ensure a rigorous and objective evaluation, we conducted all experiments under standardized conditions. To ensure the objectivity of the comparative experiments, we adopted a set of standardized hyperparameters derived from the empirical defaults of the YOLOv5 and RT-DETR frameworks. While it is acknowledged that specific architectures might benefit from bespoke tuning, our extensive pre-experiments indicated that this unified configuration allows all candidate models to reach stable convergence without significant performance bias. Specifically, we employed a warm-up strategy and a cosine annealing learning rate scheduler to mitigate the sensitivity of different architectures to the initial learning rate, thereby ensuring that the performance gains of Focal-HAIN stem from structural innovations rather than hyperparameter optimization.</p>
<p>Based on this unified experimental framework, we evaluated algorithm performance by comparing the detection outcomes of the baseline and improved models. The evaluation metrics employed include precision (P), recall (R), mean average precision (mAP) and F1 score. To evaluate the real-time potential, the inference speed (FPS) was measured on PC. For practical application, the model was integrated into a cloud-edge collaborative framework where the Raspberry Pi 4B captures pest images and transmits them to the server for near-instantaneous detection.</p>
<sec id="s4_1">
<label>4.1</label>
<title>Experimental configuration</title>
<p>All training and evaluations used a single, consistent parameter set. Experiments ran on an Intel(R) Core(TM) i9-14900K (24 cores) with CUDA 12.6 and an NVIDIA RTX 4090 GPU (24GB VRAM), using the PyTorch 2.0.1 framework on Windows 11. Models were trained from scratch without using any pre-trained weights to ensure that performance gains were solely attributable to architectural innovations. Hyperparameters were: batch size 16, total epochs 300, and an initial learning rate of 1&#xd7;10<sup>&#x2212;4</sup>, with image resolution standardized to 640&#xd7;640. To enhance data diversity, we applied standard data augmentation strategies, including Mosaic, Mixup, and random horizontal flipping.</p>
<p>Considering the structural differences in the Neck between Focal-HAIN and the baseline models, a specialized learning rate protocol was adopted: a Warm-up strategy (first 3 epochs) was used to stabilize initial gradients, followed by a Cosine Annealing scheduler (minimum learning rate of 1 &#xd7; 10<sup>&#x2212;6</sup>) to ensure convergence. During the inference phase, we set the Non-Maximum Suppression (NMS) IoU threshold at 0.45 and the confidence threshold at 0.25. GIoU loss was applied uniformly. This standardized configuration demonstrates that the improvements in Focal-HAIN are a direct result of architectural innovations rather than hyperparameter bias.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Analysis of basic stage results</title>
<p>To assess model performance across categories, we visualized the predictions and computed a normalized confusion matrix. The normalized confusion matrix reports, for each category, the proportion of correct and incorrect predictions, thereby removing the effects of class imbalance and more intuitively conveying classification performance. <xref ref-type="fig" rid="f6"><bold>Figure&#xa0;6</bold></xref> presents the model&#x2019;s normalized confusion matrix on the data. Overall, the model performs well in most categories, as evidenced by the high diagonal values. However, a granular analysis of the off-diagonal elements reveals two primary failure patterns. First, inter-class misclassification is observed among visually similar diseases on the same host, such as Strawberry Leaf Spot (S-LS) and Strawberry Angular Leaf Spot (S-ALS), due to their overlapping color distributions and lesion morphologies. Second, background confusion is most noticeable in Beans Angular Leaf Spot (B-ALS) and Tomato Spider Mites (T-SM). For B-ALS, the sharp geometric edges of the lesions are occasionally confused with fragmented ground textures or soil shadows in the background. In the case of T-SM, the micro-scale, granular appearance of mites against the leaf veins poses a significant challenge for distinguishing target features from high-frequency background noise.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Normalized confusion matrix.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778883-g006.tif">
<alt-text content-type="machine-generated">Confusion matrix normalized heatmap with categories on both axes, displaying high prediction accuracy along the diagonal for most classes, ranging from zero point eight zero to one point zero zero, and a color bar indicating intensity.</alt-text>
</graphic></fig>
<p><xref ref-type="fig" rid="f7"><bold>Figure&#xa0;7</bold></xref> presents distributions of bounding-box labels. The top-left bar chart reports the instance count per label category, indicating the categorical distribution. The top-right plot maps bounding boxes in coordinate space: the central light-cyan cluster denotes many small, concentrated boxes, whereas the peripheral purple frames denote larger boxes, indicating hierarchical size variation. The bottom-left scatter of (x, y) coordinates shows a dense cluster near (0.4, 0.6), indicating most objects lie in the central region. The bottom-right (width, height) heatmap reveals a strong positive correlation: small widths align with small heights (dark cluster at low values) and larger widths align with larger heights, indicating bounding boxes in this dataset generally preserve relatively stable aspect ratios.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Correlogram distribution graphs of the bounding box labels.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778883-g007.tif">
<alt-text content-type="machine-generated">Four-panel data visualization showing: top left, a multicolored vertical bar chart depicting instance counts for various categories labeled along the x-axis; top right, a geometric pattern of overlapping cyan and magenta rectangles centered on a white background; bottom left, a blue two-dimensional histogram with density increasing towards the center, x and y axes labeled zero to one; bottom right, another blue two-dimensional histogram with density increasing from lower left to upper right, width and height axes labeled zero to one.</alt-text>
</graphic></fig>
<p><xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8</bold></xref> assesses model performance via three curves: precision-recall (a), precision-confidence (b), and recall-confidence (c). In <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8a</bold></xref> classes like S-BB, T-LM and T-SM maintain around 0.9 precision across wide recall ranges, reflecting robust feature learning, while B-ALS underperforms (0.654) likely due to insufficient samples or high intra-class variability. The all classes curve yields an mAP50 of 0.899, verifying strong cross-category generalization. Linking subplots, <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8b</bold></xref> shows precision surges with confidence and plateaus near 1.0, hitting perfect precision at a 0.983 threshold&#x2014;proving high-confidence predictions are error-free. <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8c</bold></xref> reveals a precision-recall trade-off: recall peaks at 0.96 at low confidence but drops sharply above 0.8. Collectively, these curves guide practical deployment: low thresholds maximize capture, while 0.983 filters noise for reliable detections.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Model performance curves. <bold>(a)</bold> Model Precision curve. <bold>(b)</bold> Model Recall curve. <bold>(c)</bold> Precision-Recall curve.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778883-g008.tif">
<alt-text content-type="machine-generated">Triple-panel figure displaying model evaluation metrics. Top left: Line chart showing precision versus confidence for multiple models, with a legend indicating model names and a blue line for &#x201c;all classes.&#x201d; Top right: Line chart showing recall versus confidence for the same models and legend. Bottom: Precision-recall curve with lines for each model, corresponding legend, and average precision metrics listed beside model names. All charts use confidence or recall on the x-axis and precision or recall on the y-axis.</alt-text>
</graphic></fig>
<p><xref ref-type="fig" rid="f9"><bold>Figure&#xa0;9</bold></xref> depicts the 300-epoch training dynamics of the F-HAIN model, with raw (blue) and smoothed (orange) curves tracking both loss metrics and detection performance. During the first 50 epochs, all loss curves&#x2014;including train/box loss, train/cls loss, train/dfl loss and validation losses&#x2014;plummet sharply, then stabilize at 0.5&#x2013;1.0; the tight alignment between training and validation losses clearly rules out overfitting, a key advantage for model generalization. In parallel, performance metrics show steady learning progress: metrics/precision(B) and metrics/recall(B) rise rapidly in the initial 50 epochs and plateau at 0.8-0.9, while metrics/mAP50(B) hits 0.85 by epoch 100 and peaks near 0.9. Notably, the more rigorous metrics/mAP50-95(B) stabilizes at 0.6-0.7, implying the model still has room to improve on hard targets. Collectively, these trends confirm F-HAIN&#x2019;s fast convergence, stable training process and robust detection capability across target difficulty levels.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Training metrics of the F-HAIN model.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778883-g009.tif">
<alt-text content-type="machine-generated">Grid of ten line graphs showing model training and validation progress over three hundred epochs. Top row displays training losses and precision/recall metrics, all showing decreasing loss and increasing metric values. Bottom row shows corresponding validation losses and mAP metrics with similar improvements across epochs.</alt-text>
</graphic></fig>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Ablation experiment</title>
<p>To systematically evaluate the contribution of each proposed module, we conducted ablation experiments based on the YOLOv5 baseline model. The ablation study has followed a systematic, incremental design to isolate the contribution of each proposed module, where we have defined three experimental groups: the Baseline (original YOLOv5 model without any modifications), single-module variants (Model A: YOLOv5 integrated only with the F-SPPELAN module; Model B: YOLOv5 integrated only with the HAIN module), and the Complete model (F-HAIN, which has incorporated both F-SPPELAN and HAIN modules). This incremental design has enabled us to quantify the individual contribution of each proposed module, assess potential synergistic effects between the two modules, and maintain experimental consistency by adopting YOLOv5 as the unified baseline across all experimental groups. The specific data are shown in <xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref>.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Ablation experiment data.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Model</th>
<th valign="middle" align="center">Configuration</th>
<th valign="middle" align="center">Params/Mb</th>
<th valign="middle" align="center">GFLOPs/Gb</th>
<th valign="middle" align="center">FPS(PC)</th>
<th valign="middle" align="center">mAP50/%</th>
<th valign="middle" align="center">&#x3c7;mAP50/%</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Baseline</td>
<td valign="middle" align="center">Original YOLOv5</td>
<td valign="middle" align="center">26.86</td>
<td valign="middle" align="center">15.9</td>
<td valign="middle" align="center">153.85</td>
<td valign="middle" align="center">84.3</td>
<td valign="middle" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="middle" align="left">Model A</td>
<td valign="middle" align="center">+F-SPPELAN</td>
<td valign="middle" align="center">27.76</td>
<td valign="middle" align="center">16.1</td>
<td valign="middle" align="center">149.25</td>
<td valign="middle" align="center">87.1</td>
<td valign="middle" align="center">2.8</td>
</tr>
<tr>
<td valign="middle" align="left">Model B</td>
<td valign="middle" align="center">+HAIN</td>
<td valign="middle" align="center">28.44</td>
<td valign="middle" align="center">16.6</td>
<td valign="middle" align="center">155.85</td>
<td valign="middle" align="center">87.8</td>
<td valign="middle" align="center">3.5</td>
</tr>
<tr>
<td valign="middle" align="left">F-HAIN</td>
<td valign="middle" align="center">+F-SPPELAN+HAIN</td>
<td valign="middle" align="center">24.93</td>
<td valign="middle" align="center">16.8</td>
<td valign="middle" align="center">161.29</td>
<td valign="middle" align="center">90.1</td>
<td valign="middle" align="center">5.8</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The ablation results systematically demonstrate the contribution of each proposed module, linking specific structural behaviors to performance gains. The original YOLOv5 baseline achieved a mAP50 of 84.3%. Model A (YOLOv5 + F-SPPELAN) incorporates the F-SPPELAN module, which raises mAP50 to 87.1% (&#x394;+2.8). This improvement stems from F-SPPELAN&#x2019;s focal-aware spatial modulation behavior, which adaptively suppresses high-frequency environmental noise (e.g., soil and leaf textures) while amplifying pest-related spatial contexts. This targeted modulation ensures higher recall for targets in cluttered backgrounds with only a marginal GFLOPs increase (+0.2G). Model B (YOLOv5 + HAIN) introduces the HAIN module, raising mAP50 to 87.8% (&#x394;+3.5). The gain is attributed to HAIN&#x2019;s deep interlaced interaction behavior, where Intra-scale Adaptive Interaction (AIFI) eliminates the semantic gap between feature levels during the aggregation process. While the self-attention-based Query-Key-Value (QKV) interactions increase computational complexity (+0.7G), they provide the necessary global context to resolve semantic inconsistencies in micro-pest detection. The complete Focal-HAIN model achieves a mAP50 of 90.1% (&#x394;+5.8). This synergy demonstrates that the model effectively balances localized focal refinement with hierarchical global interaction, maintaining high inference speed (161.29 FPS) while&#xa0;ensuring the precision required for embedded agricultural deployment.</p>
<p>To evaluate the statistical significance and stability of the F-HAIN model, we conducted six independent runs of the F-HAIN model using different random seeds under the same hardware and hyperparameter settings. The final average mAP50 of the model was 89.73%, with a standard deviation of &#xb1;0.47%. Although random factors during the training process may introduce slight fluctuations, the performance of the model remained within a high-precision range. This variance analysis confirmed that the proposed architectural improvements have extremely high robustness and training stability.</p>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Comparative experiments with other models</title>
<p>To further verify the validity of the proposed model, various performance indicators of the model were compared with those of other models, including RT-DETR, YOLOv5, YOLOv8, YOLOv10, and YOLOv11. The specific data are shown in <xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref>.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Comparison data with other models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">Precision/%</th>
<th valign="middle" align="center">Recall/%</th>
<th valign="middle" align="center">F1 Score</th>
<th valign="middle" align="center">mAP50/%</th>
<th valign="middle" align="center">mAP50:95/%</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">RT-DETR</td>
<td valign="middle" align="center">91.7</td>
<td valign="middle" align="center">88.3</td>
<td valign="middle" align="center">0.89</td>
<td valign="middle" align="center">84.6</td>
<td valign="middle" align="center">71.4</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5</td>
<td valign="middle" align="center">92.3</td>
<td valign="middle" align="center">88.5</td>
<td valign="middle" align="center">0.90</td>
<td valign="middle" align="center">83.3</td>
<td valign="middle" align="center">72.3</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv8</td>
<td valign="middle" align="center">91.3</td>
<td valign="middle" align="center">88.7</td>
<td valign="middle" align="center">0.89</td>
<td valign="middle" align="center">85.2</td>
<td valign="middle" align="center">70.7</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv10</td>
<td valign="middle" align="center">93.3</td>
<td valign="middle" align="center">89.6</td>
<td valign="middle" align="center">0.91</td>
<td valign="middle" align="center">84.7</td>
<td valign="middle" align="center">72.2</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv11</td>
<td valign="middle" align="center">93.7</td>
<td valign="middle" align="center">88.6</td>
<td valign="middle" align="center">0.91</td>
<td valign="middle" align="center">87.1</td>
<td valign="middle" align="center">70.9</td>
</tr>
<tr>
<td valign="middle" align="center">F-HAIN</td>
<td valign="middle" align="center">94.4</td>
<td valign="middle" align="center">91.5</td>
<td valign="middle" align="center">0.93</td>
<td valign="middle" align="center">90.1</td>
<td valign="middle" align="center">79.1</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Compared with RT-DETR, YOLOv5, YOLOv8, YOLOv10 and YOLOv11, the mAP50 of the improved model increased by 5.5%, 6.8%, 4.9%, 5.4% and 3.0% respectively. Meanwhile, compared with these comparison models, the mAP50:95 of the improved model increased by 7.7%, 6.8%, 8.4%, 6.9% and 8.2% respectively. These results indicate that our model outperforms the compared detectors for crop pest and disease detection in natural environments.</p>
</sec>
<sec id="s4_5">
<label>4.5</label>
<title>Comparison of detection results</title>
<p>Controlled experiments were conducted on RT-DETR, YOLOv5, YOLOv8, YOLOv10, YOLOv11, and F-HAIN under identical datasets, training epochs, and hyperparameters to guarantee a fair comparison. Training logs were parsed to plot the comprehensive performance metrics, as illustrated in <xref ref-type="fig" rid="f10"><bold>Figure&#xa0;10</bold></xref>.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Performance comparison between the proposed F-HAIN and other SOTA models.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778883-g010.tif">
<alt-text content-type="machine-generated">Four-panel figure comparing object detection models. Top left: line graph shows mAP50 over 300 epochs, with F-HAIN outperforming YOLOv5, YOLOv8, and RT-DETR. Top right: line graph shows mAP50-95, with F-HAIN maintaining higher values across epochs. Bottom left: horizontal bar chart compares False Detection Rate (FDR), where F-HAIN achieves the lowest FDR at 5.6% and RT-DETR the highest at 8.3%. Bottom right: horizontal bar chart shows mAPsmall, where F-HAIN scores highest at 82.6% and RT-DETR the lowest at 76.8%.</alt-text>
</graphic></fig>
<p>Specifically, the two upper panels display the mAP50 and mAP50:95 curves throughout the training process, where yellow, green, red, and blue curves correspond to YOLOv5, YOLOv8, RT-DETR, and F-HAIN, respectively. To further evaluate the models under specific agricultural constraints, granular testing was performed on targeted disease subsets, as shown in the bottom panels of <xref ref-type="fig" rid="f10"><bold>Figure&#xa0;10</bold></xref>. The lower-left bar chart presents the mAPsmall results, which were evaluated using 806 strawberry samples across five categories (S-AFR, S-BB, S-GM, S-PMF, and S-PML) where pest instances are characterized by micro-scale dimensions. The lower-right bar chart illustrates the False Discovery Rate (FDR), calculated on 518 soybean samples (B-ALS and B-R) featuring complex backgrounds such as soil and leaf interference.</p>
<p>Quantitative results demonstrate that F-HAIN outperforms all comparative models by notable margins across all four metrics, achieving the highest mAPsmall (82.6%) and the lowest FDR (5.6%). These performance gains are directly attributed to the HAIN and F-SPPELAN modules. Unlike conventional attention-based or Transformer-based models that often suffer from over-smoothing of fine textures, the Focal Modulation mechanism within our architecture preserves sharper boundary information through hierarchical gated aggregation. This spatial awareness allows F-HAIN to effectively enhance micro-scale feature representation for strawberry pests while suppressing non-target environmental noise in complex soybean field backgrounds, providing a more robust solution than recent YOLO variants.</p>
<p>To evaluate the statistical significance and stability of the Focal-HAIN model, we conducted three independent training runs using different random seeds. The model achieved a mean mAP@0.5 of 86.10% with a standard deviation of &#xb1;0.87%. Although stochastic factors in training, such as data augmentation and weight initialization, introduce minor fluctuations, the performance consistently remains within a high-accuracy range. This variance analysis confirms that the proposed architectural improvements provide robust and reproducible gains over the baseline models.</p>
<p>Specifically, yellow, green, red and blue curves correspond to YOLOv5, YOLOv8, RT-DETR and F-HAIN, respectively. Quantitative results demonstrate that F-HAIN outperforms all comparative models by notable margins in both metrics, with the performance gain attributed to its F-SPPELAN and HAIN modules that strengthen multi-scale feature representation and localization precision.</p>
<p><xref ref-type="fig" rid="f11"><bold>Figure&#xa0;11</bold></xref> presents detection results for four pest and disease types affecting beans and tomatoes. The figure shows that F-HAIN achieves higher confidence scores than the other three models. Detection accuracy is also improved: for example, F-HAIN identified areas in B-ALS and B-R cases that the other models missed.</p>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>Detection results of bean and tomato.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778883-g011.tif">
<alt-text content-type="machine-generated">Grid of plant leaf images shows results of four detection models (YOLOv5, YOLOv8, RT-DETR, F-HAIN) across four classes (B-ALS, B-R, T-LM, T-SM). Bounding boxes with confidence scores indicate detected regions on each leaf image.</alt-text>
</graphic></fig>
<p><xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12</bold></xref> presents detection results for four strawberry diseases and pests. The F-HAIN detections exhibit higher confidence than those produced by the other three models. F-HAIN also yields fewer false positives for the S-LS and S-PML categories compared with the other three models.</p>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Detection results of strawberry.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778883-g012.tif">
<alt-text content-type="machine-generated">Grid of sixteen images comparing object detection models (YOLOv5, YOLOv8, RT-DETR, F-HAIN) across four strawberry leaf disease classes (S-AFR, S-BB, S-PML, S-LS). Each cell displays bounding boxes and class confidence scores in colored annotations.</alt-text>
</graphic></fig>
<p>In order to compare the performance differences of the four models in the object detection task more clearly, the EigenGradCAM heat map method is adopted to conduct a visual analysis of the three diseases and pests. The specific results are shown in <xref ref-type="fig" rid="f13"><bold>Figure&#xa0;13</bold></xref>.</p>
<fig id="f13" position="float">
<label>Figure&#xa0;13</label>
<caption>
<p>Heatmap comparison results.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778883-g013.tif">
<alt-text content-type="machine-generated">Comparison chart displaying three original plant images in the first column and four heatmap visualizations for each image in subsequent columns labeled YOLOv5, YOLOv8, RT-DETR, and F-HAIN, illustrating detection and attention regions for each model.</alt-text>
</graphic></fig>
<p>The heat-map graphs show that YOLOv5 and YOLOv8 produce widely dispersed response patterns, particularly near object boundaries. RT-DETR&#x2019;s feature extraction within target areas is limited, which can lead to false detections; for example, in the third type there is a distinct thermal pattern beneath the petals. By contrast, the improved model yields broader high-response regions centered on key targets and shows greater robustness in edge and other non-target areas. These results indicate that the improved model captures targets more completely and thus reduces missed detections.</p>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Experimental system based on Raspberry Pi</title>
<p>The proposed F-HAIN model was trained on a crop pest and disease dataset and the optimized model was deployed on a Raspberry Pi 4B to evaluate detection performance and platform adaptability. Details of this deployment are shown in <xref ref-type="fig" rid="f14"><bold>Figure&#xa0;14</bold></xref>. Experimental results demonstrate that the improved model can display detection parameters in real time on the Raspberry Pi platform and accurately detect different pests and diseases.</p>
<fig id="f14" position="float">
<label>Figure&#xa0;14</label>
<caption>
<p>Flowchart of the Raspberry Pi detection system.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778883-g014.tif">
<alt-text content-type="machine-generated">Flowchart diagram showing two sections: PC and Terminal. Under PC, arrows connect Dataset preprocessing to Model training. Terminal section contains vertically arranged boxes: Camera, Image storage, Image preprocessing, Image recognition, and Display module, labeled Raspberry Pi, with arrows indicating workflow and a dashed bracket grouping the Terminal steps.</alt-text>
</graphic></fig>
<p>To evaluate the F-HAIN detection model on a Raspberry Pi 4B, images were acquired with a real-time camera. On the Raspberry Pi 4B, the camera captures images, which are then processed for detection on a PC. <xref ref-type="fig" rid="f15"><bold>Figure&#xa0;15a</bold></xref> shows the experimental setup of the embedded crop pest and disease detection system, and <xref ref-type="fig" rid="f15"><bold>Figure&#xa0;15b</bold></xref> presents selected magnified screenshots.</p>
<fig id="f15" position="float">
<label>Figure&#xa0;15</label>
<caption>
<p>Embedded platform experiment. <bold>(a)</bold> Experimental process. <bold>(b)</bold> Experimental result.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778883-g015.tif">
<alt-text content-type="machine-generated">Panel a shows a digital camera setup connected to a computer displaying an image of a green leaf with brown spots. Panel b presents a close-up of the same leaf with a magenta bounding box labeled &#x201c;T&#x2013;B 0.95,&#x201d; indicating experimental detection or classification results.</alt-text>
</graphic></fig>
<p>The F-HAIN lightweight algorithm was tested on the portable Raspberry Pi embedded platform. The experimental results show that the Raspberry Pi can accurately identify pests and diseases on crops. This not only confirms the effectiveness of the algorithm in practical applications, but also provides an important reference for engineering deployment.</p>
</sec>
<sec id="s6" sec-type="conclusions">
<label>6</label>
<title>Conclusion</title>
<p>In this study, we proposed Focal-HAIN, a lightweight and efficient object detection model designed for real-time crop pest and disease monitoring in complex agricultural environments. By integrating the F-SPPELAN module with Focal Modulation and the HAIN module with a deep interlaced fusion strategy, the model significantly enhances the feature representation of small targets and improves localization precision under background interference. Experimental results on the IP102 dataset demonstrate that Focal-HAIN achieves a superior balance between accuracy and inference speed.</p>
<p>In practical deployment, the Focal-HAIN model achieves 161 FPS on the server side, providing sufficient throughput to support multiple Raspberry Pi 4B edge nodes simultaneously in a large-scale monitoring network. This edge-to-cloud collaborative architecture ensures real-time response capabilities while maintaining low power consumption at the terminal nodes.</p>
<p>Despite its performance gains, the proposed method has certain limitations that warrant further investigation. Specifically, the model&#x2019;s sensitivity decreases in scenarios involving low-contrast early disease spots where the lesion color closely mimics the healthy leaf tissue. In environments with highly reflective leaf surfaces, the focal modulation mechanism occasionally fails to distinguish between specular noise and small-scale pests. Furthermore, high inter-class similarity&#x2014;such as different fungal diseases manifesting similar necrotic patterns on the same crop species&#x2014;remains a challenge for accurate classification. Future work will focus on incorporating self-supervised pre-training to enhance fine-grained feature discrimination and exploring temporal consistency in video streams to improve the robustness of detection in dynamic, high-interference field conditions.</p>
</sec>
</body>
<back>
<sec id="s7" sec-type="data-availability">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found in the article/supplementary material.</p></sec>
<sec id="s8" sec-type="author-contributions">
<title>Author contributions</title>
<p>WL: Visualization, Writing &#x2013; original draft, Conceptualization, Methodology, Data curation, Writing &#x2013; review &amp; editing. LX: Writing &#x2013; original draft, Validation, Writing &#x2013; review &amp; editing, Visualization. XC: Formal analysis, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing, Funding acquisition. XL: Writing &#x2013; review &amp; editing, Writing &#x2013; original draft, Software, Resources.</p></sec>
<sec id="s10" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s11" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s12" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="book">
<person-group person-group-type="author"><collab>(FAO)</collab>
</person-group> (<year>2025</year>). <source>World food and agriculture statistical yearbook 2025.</source> (<publisher-loc>Rome</publisher-loc>: 
<publisher-name>Food and Agriculture Organization of the United Nations</publisher-name>).
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Bao</surname> <given-names>W. X.</given-names></name>
<name><surname>Zhu</surname> <given-names>Z. Q.</given-names></name>
<name><surname>Hu</surname> <given-names>G. S.</given-names></name>
<name><surname>Zhou</surname> <given-names>X.</given-names></name>
<name><surname>Zhang</surname> <given-names>D. Y.</given-names></name>
<name><surname>Yang</surname> <given-names>X. J.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>UAV remote sensing detection of tea leaf blight based on DDMA-YOLO</article-title>. <source>Comput. Electron. Agric.</source> <volume>205</volume>, <fpage>107637</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.107637</pub-id>, PMID: <pub-id pub-id-type="pmid">41760527</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Bashir</surname> <given-names>R. N.</given-names></name>
<name><surname>Khan</surname> <given-names>F. A.</given-names></name>
<name><surname>Khan</surname> <given-names>A. A.</given-names></name>
<name><surname>Tausif</surname> <given-names>M.</given-names></name>
<name><surname>Abbas</surname> <given-names>M. Z.</given-names></name>
<name><surname>Shahid</surname> <given-names>M. M. A.</given-names></name>
<etal/>
</person-group>. (<year>2023</year>). 
<article-title>Intelligent optimization of Reference Evapotranspiration (ETo) for precision irrigation</article-title>. <source>J. Comput. Sci.</source> <volume>69</volume>, <elocation-id>102025</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jocs.2023.102025</pub-id>, PMID: <pub-id pub-id-type="pmid">41760527</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Chen</surname> <given-names>Z.</given-names></name>
<name><surname>Wu</surname> <given-names>R.</given-names></name>
<name><surname>Lin</surname> <given-names>Y.</given-names></name>
<name><surname>Li</surname> <given-names>C.</given-names></name>
<name><surname>Chen</surname> <given-names>S.</given-names></name>
<name><surname>Yuan</surname> <given-names>Z.</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). 
<article-title>Plant disease recognition model based on improved YOLOv5</article-title>. <source>Agronomy</source> <volume>12</volume>, <fpage>365</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy12020365</pub-id>, PMID: <pub-id pub-id-type="pmid">41725453</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Chu</surname> <given-names>J.</given-names></name>
<name><surname>Li</surname> <given-names>Y.</given-names></name>
<name><surname>Feng</surname> <given-names>H.</given-names></name>
<name><surname>Weng</surname> <given-names>X.</given-names></name>
<name><surname>Ruan</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Research on Multi-Scale Pest Detection and Identification Method in Granary Based on Improved YOLOv5</article-title>. <source>Agriculture</source>. <volume>13</volume>, <fpage>364</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agriculture13020364</pub-id>, PMID: <pub-id pub-id-type="pmid">41725453</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gao</surname> <given-names>G. H.</given-names></name>
<name><surname>Fang</surname> <given-names>L. F.</given-names></name>
<name><surname>Zhang</surname> <given-names>Z. H.</given-names></name>
<name><surname>Li</surname> <given-names>J. H.</given-names></name>
</person-group> (<year>2026</year>). 
<article-title>Advancing lightweight and efficient detection of tomato main stems for edge device deployment</article-title>. <source>Artificial Intelligence in Agriculture</source>. <volume>16</volume>, <fpage>458</fpage>&#x2013;<lpage>479</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.aiia.2025.10.016</pub-id>, PMID: <pub-id pub-id-type="pmid">41760527</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gao</surname> <given-names>Y. X.</given-names></name>
<name><surname>Cao</surname> <given-names>Z. Z.</given-names></name>
<name><surname>Cai</surname> <given-names>W. W.</given-names></name>
<name><surname>Gong</surname> <given-names>G. F.</given-names></name>
<name><surname>Zhou</surname> <given-names>G. X.</given-names></name>
<name><surname>Li</surname> <given-names>L. J.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Apple leaf disease identification in complex background based on BAM-Net</article-title>. <source>Agronomy</source> <volume>13</volume>, <fpage>1240</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy13051240</pub-id>, PMID: <pub-id pub-id-type="pmid">41725453</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Jiang</surname> <given-names>Y.</given-names></name>
<name><surname>Cai</surname> <given-names>M.</given-names></name>
<name><surname>Zhang</surname> <given-names>D.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Lightweight network DCR-YOLO for surface defect detection on printed circuit boards</article-title>. <source>Sensors</source> <volume>23</volume>, <fpage>7310</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s23177310</pub-id>, PMID: <pub-id pub-id-type="pmid">37687766</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Khan</surname> <given-names>A. A.</given-names></name>
<name><surname>Faheem</surname> <given-names>M.</given-names></name>
<name><surname>Bashir</surname> <given-names>R. N.</given-names></name>
<name><surname>Wechtaisong</surname> <given-names>C.</given-names></name>
<name><surname>Abbas</surname> <given-names>M. Z.</given-names></name>
</person-group> (<year>2022</year>a). 
<article-title>Internet of things (IoT) assisted context aware fertilizer recommendation</article-title>. <source>IEEE Access</source> <volume>10</volume>, <fpage>129505</fpage>&#x2013;<lpage>129519</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2022.3228160</pub-id>, PMID: <pub-id pub-id-type="pmid">41116384</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Khan</surname> <given-names>A. A.</given-names></name>
<name><surname>Nauman</surname> <given-names>M. A.</given-names></name>
<name><surname>Bashir</surname> <given-names>R. N.</given-names></name>
<name><surname>Jahangir</surname> <given-names>R.</given-names></name>
<name><surname>Alroobaea</surname> <given-names>R.</given-names></name>
<name><surname>Binmahfoudh</surname> <given-names>A.</given-names></name>
</person-group> (<year>2022</year>b). 
<article-title>Context aware evapotranspiration (ETs) for saline soils reclamation</article-title>. <source>IEEE Access</source> <volume>10</volume>, <fpage>110050</fpage>&#x2013;<lpage>110063</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2022.3206009</pub-id>, PMID: <pub-id pub-id-type="pmid">41116384</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Khan</surname> <given-names>A. I.</given-names></name>
<name><surname>Quadri</surname> <given-names>S. M. K.</given-names></name>
<name><surname>Banday</surname> <given-names>S.</given-names></name>
<name><surname>Shah</surname> <given-names>J. L.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Deep diagnosis: A real-time apple leaf disease detection system based on deep learning</article-title>. <source>Comput. Electron. Agric.</source> <volume>198</volume>, <fpage>107093</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2022.107093</pub-id>, PMID: <pub-id pub-id-type="pmid">41760527</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>D. W.</given-names></name>
<name><surname>Ahmed</surname> <given-names>F.</given-names></name>
<name><surname>Wu</surname> <given-names>N. L.</given-names></name>
<name><surname>Sethi</surname> <given-names>A. I.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>YOLO-JD: A deep learning network for jute diseases and pests detection from images</article-title>. <source>Plants</source> <volume>11</volume>, <fpage>937</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/plants11070937</pub-id>, PMID: <pub-id pub-id-type="pmid">35406915</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>L. L.</given-names></name>
<name><surname>Zhang</surname> <given-names>S. J.</given-names></name>
<name><surname>Wang</surname> <given-names>B.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Apple leaf disease identification with a small and imbalanced dataset based on lightweight convolutional networks</article-title>. <source>Sensors</source> <volume>22</volume>, <fpage>173</fpage>.
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>Y.</given-names></name>
<name><surname>He</surname> <given-names>G.</given-names></name>
<name><surname>Wang</surname> <given-names>Z.</given-names></name>
<name><surname>Huang</surname> <given-names>H.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>NRT-YOLO: Improved YOLOv5 based on nested residual transformer for tiny remote sensing object detection</article-title>. <source>Sensors</source> <volume>22</volume>, <fpage>4953</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s22134953</pub-id>, PMID: <pub-id pub-id-type="pmid">35808445</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Nyawose</surname> <given-names>T.</given-names></name>
<name><surname>Maswanganyi</surname> <given-names>R. C.</given-names></name>
<name><surname>Khumalo</surname> <given-names>P.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>A review on the detection of plant disease using machine learning and deep learning approaches</article-title>. <source>J. Imaging</source> <volume>11</volume>, <elocation-id>326</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/jimaging11100326</pub-id>, PMID: <pub-id pub-id-type="pmid">41150002</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Qi</surname> <given-names>F.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Tang</surname> <given-names>Z.</given-names></name>
<name><surname>Chen</surname> <given-names>S. H.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Real-time and effective detection of agricultural pest using an improved YOLOv5 network</article-title>. <source>J. Real-Time Image Proc.</source> <volume>20</volume>, <fpage>33</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11554-023-01264-0</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Qi</surname> <given-names>J. T.</given-names></name>
<name><surname>Liu</surname> <given-names>X. N.</given-names></name>
<name><surname>Liu</surname> <given-names>K.</given-names></name>
<name><surname>Xu</surname> <given-names>F. R.</given-names></name>
<name><surname>Guo</surname> <given-names>H.</given-names></name>
<name><surname>Tian</surname> <given-names>X. L.</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). 
<article-title>An improved YOLO v5 model based on visual attention mechanism: Application to recognition of tomato virus disease</article-title>. <source>Comput. Electron. Agric.</source> <volume>194</volume>, <fpage>106780</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2022.106780</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Soeb</surname> <given-names>M. J. A.</given-names></name>
<name><surname>Jubayer</surname> <given-names>M. F.</given-names></name>
<name><surname>Tarin</surname> <given-names>T. A.</given-names></name>
<name><surname>Al</surname> <given-names>M. M. R.</given-names></name>
<name><surname>Ruhad</surname> <given-names>F. M.</given-names></name>
<name><surname>Parven</surname> <given-names>A.</given-names></name>
<etal/>
</person-group>. (<year>2023</year>). 
<article-title>Tea leaf disease detection and identification based on YOLOv7 (YOLO-T)</article-title>. <source>Sci. Rep.</source> <volume>13</volume>, <fpage>6078</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-023-33270-4</pub-id>, PMID: <pub-id pub-id-type="pmid">37055480</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Zhang</surname> <given-names>P.</given-names></name>
<name><surname>Tian</surname> <given-names>S.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Tomato leaf disease detection based on attention mechanism and multi-scale feature fusion</article-title>. <source>Front. Plant Sci.</source> <volume>15</volume>, <elocation-id>1382802</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2024.1382802</pub-id>, PMID: <pub-id pub-id-type="pmid">38654901</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wen</surname> <given-names>C.</given-names></name>
<name><surname>Chen</surname> <given-names>H.</given-names></name>
<name><surname>Ma</surname> <given-names>Z.</given-names></name>
<name><surname>Zhang</surname> <given-names>T.</given-names></name>
<name><surname>Yang</surname> <given-names>C.</given-names></name>
<name><surname>Su</surname> <given-names>H.</given-names></name>
<name><surname>Chen</surname> <given-names>H.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Pest-YOLO: A model for large-scale multi-class dense and tiny pest detection and counting</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>, <elocation-id>973985</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.973985</pub-id>, PMID: <pub-id pub-id-type="pmid">36570910</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>S.</given-names></name>
<name><surname>Xing</surname> <given-names>Z.</given-names></name>
<name><surname>Wang</surname> <given-names>H.</given-names></name>
<name><surname>Dong</surname> <given-names>X.</given-names></name>
<name><surname>Gao</surname> <given-names>X.</given-names></name>
<name><surname>Liu</surname> <given-names>Z.</given-names></name>
<etal/>
</person-group>. (<year>2023</year>). 
<article-title>Maize-YOLO: A new high-precision and real-time method for maize pest detection</article-title>. <source>Insects</source> <volume>14</volume>, <elocation-id>278</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/insects14030278</pub-id>, PMID: <pub-id pub-id-type="pmid">36975962</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yue</surname> <given-names>X.</given-names></name>
<name><surname>Li</surname> <given-names>H.</given-names></name>
<name><surname>Song</surname> <given-names>Q.</given-names></name>
<name><surname>Zeng</surname> <given-names>F.</given-names></name>
<name><surname>Zheng</surname> <given-names>J.</given-names></name>
<name><surname>Ding</surname> <given-names>Z.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>YOLOv7&#x2014;GCA: A lightweight and high-performance model for pepper disease detection</article-title>. <source>Agronomy</source> <volume>14</volume>, <fpage>618</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy14030618</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>G.</given-names></name>
<name><surname>Li</surname> <given-names>C. F.</given-names></name>
<name><surname>Li</surname> <given-names>G. Y.</given-names></name>
<name><surname>Lu</surname> <given-names>W. D.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Small Target Detection Algorithm for UAV Aerial Images Based on Improved YOLOv7-tiny</article-title>. <source>Advanced Engineering Sciences</source>. <volume>57</volume>, <fpage>235</fpage>&#x2013;<lpage>246</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.15961/j.jsuese.202300593</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>Y.</given-names></name>
<name><surname>Ren</surname> <given-names>W.</given-names></name>
<name><surname>Zhang</surname> <given-names>Z.</given-names></name>
<name><surname>Jia</surname> <given-names>Z.</given-names></name>
<name><surname>Wang</surname> <given-names>L.</given-names></name>
<name><surname>Tan</surname> <given-names>T. N.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Focal and efficient IoU loss for accurate bounding box regression</article-title>. <source>Neurocomputing</source> <volume>506</volume>, <fpage>146</fpage>&#x2013;<lpage>157</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.neucom.2022.07.042</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Zhao</surname> <given-names>Y.</given-names></name>
<name><surname>Lv</surname> <given-names>W.</given-names></name>
<name><surname>Xu</surname> <given-names>S.</given-names></name>
<name><surname>Wang</surname> <given-names>G. Z.</given-names></name>
<name><surname>Wei</surname> <given-names>J. M.</given-names></name>
<name><surname>Cui</surname> <given-names>C.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). &#x201c;
<article-title>DETRs beat YOLOs on real-time object detection</article-title>,&#x201d; in <source>Proceedings of the 2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-loc>Seattle, WA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>16965</fpage>&#x2013;<lpage>16974</lpage>.
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhu</surname> <given-names>S.</given-names></name>
<name><surname>Ma</surname> <given-names>W.</given-names></name>
<name><surname>Wang</surname> <given-names>J.</given-names></name>
<name><surname>Yang</surname> <given-names>M.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Wang</surname> <given-names>C.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>EADD-YOLO: An efficient and accurate disease detector for apple leaf using improved lightweight YOLOv5</article-title>. <source>Front. Plant Sci.</source> <volume>14</volume>, <elocation-id>1120724</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2023.1120724</pub-id>, PMID: <pub-id pub-id-type="pmid">36909428</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1596395">Alireza Sanaeifar</ext-link>, University of Minnesota Twin Cities, United States</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1751748">Qing Yao</ext-link>, Zhejiang Sci-Tech University, China</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2147374">Arfat Ahmad Khan</ext-link>, Khon Kaen University, Thailand</p></fn>
</fn-group>
</back>
</article>