<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2026.1748419</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Rice pest detection via multi-scale edge network and wavelet attention enhancement</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Huang</surname><given-names>Xinyue</given-names></name>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Zhou</surname><given-names>Ruoxuan</given-names></name>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3298786/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><institution>School of Software Engineering, Jiangxi University of Science and Technology</institution>, <city>Nanchang</city>, <country country="CN">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Ruoxuan Zhou, <email xlink:href="mailto:zhouruoxuan@jxust.edu.cn">zhouruoxuan@jxust.edu.cn</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-17">
<day>17</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>17</volume>
<elocation-id>1748419</elocation-id>
<history>
<date date-type="received">
<day>17</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>20</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>18</day>
<month>12</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Huang and Zhou.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Huang and Zhou</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-17">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Rice pest detection faces critical challenges including small target recognition difficulties, high morphological similarities, and complex field backgrounds. This study proposes BEAM-YOLO (Bi-branch Edge Attention Multi-scale YOLO) to address these limitations. We constructed the JRICE-PD dataset encompassing 11 economically significant rice pests (4,565 images) and developed four innovative modules: a Multi-scale morphological Edge Network (MEN) enhancing feature discrimination; a Bi-branch Attention Feature Enhancement (BAFE) module utilizing Haar wavelet transform for foreground-background separation; an Enhanced Multi-scale Bidirectional Feature Pyramid Network (EM-BFPN) optimizing information interaction; and a Spatial-Channel Augmented Upsampling (SCAU) improving small target detection. BEAM-YOLO achieves 86.6&#xb1;0.5% mAP@50 and 72.7&#xb1;0.9% mAP@50-95, outperforming YOLOv11 by 3.3% and 3.0% respectively, while maintaining relatively low computational overhead and parameter count. This research provides reliable algorithmic support for intelligent agricultural pest monitoring systems, contributing to precision agriculture advancement and application.</p>
</abstract>
<kwd-group>
<kwd>deep learning</kwd>
<kwd>edge feature enhancement</kwd>
<kwd>object detection</kwd>
<kwd>rice pest detection</kwd>
<kwd>wavelet attention mechanism</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="15"/>
<table-count count="7"/>
<equation-count count="18"/>
<ref-count count="40"/>
<page-count count="20"/>
<word-count count="9973"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Sustainable and Intelligent Phytoprotection</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Rice, as one of the world&#x2019;s most crucial food crops, provides a fundamental source of nutrition for more than half the global population and plays a vital role in maintaining global food security (<xref ref-type="bibr" rid="B1">Bailey-Serres et&#xa0;al., 2019</xref>). However, rice production faces severe threats from pests and diseases. According to statistics from the Food and Agriculture Organization (FAO), annual global crop losses due to pests are estimated at 20-40% (<xref ref-type="bibr" rid="B21">Rizzo et&#xa0;al., 2021</xref>). Therefore, timely and accurate identification and detection of rice pests and diseases is of significant importance for ensuring food security, improving agricultural production efficiency, reducing pesticide use, decreasing environmental pollution, and promoting sustainable agricultural development (<xref ref-type="bibr" rid="B11">Li S. et&#xa0;al., 2022</xref>).</p>
<p>Traditional rice pest and disease detection primarily relies on the experiential judgment of agricultural experts and field surveys. This approach is limited in scope, time-consuming, labor-intensive, and inherently subjective (<xref ref-type="bibr" rid="B7">Jiang et&#xa0;al., 2008</xref>). Specifically, conventional methods include manual field inspection, yellow sticky trap attraction, light trapping techniques, and laboratory microscopic examination (<xref ref-type="bibr" rid="B25">Thenmozhi and Reddy, 2019</xref>). Though these traditional methods have certain value under specific conditions, they generally suffer from low efficiency, unstable precision, and difficulties in large-scale automation, making them inadequate for meeting the precise and intelligent development needs of modern agriculture (<xref ref-type="bibr" rid="B20">Preti et&#xa0;al., 2021</xref>).</p>
<p>In recent years, numerous classical deep learning architectures such as AlexNet, VGGNet, Inception, ResNet, and DenseNet have been applied to agricultural pest detection (<xref ref-type="bibr" rid="B10">Li et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B16">Lu et&#xa0;al., 2025a</xref>). These methods can automatically extract hierarchical features from images without requiring manually designed feature extraction algorithms, significantly improving detection accuracy and efficiency. For instance, Hasan et&#xa0;al. (<xref ref-type="bibr" rid="B6">Hasan et&#xa0;al., 2019</xref>) proposed an integrated approach combining support vector machines with deep convolutional neural networks, which markedly improved the accuracy of rice pest and disease recognition.</p>
<p>Object detection-based methods have become increasingly widespread across various fields and, due to their advantages in real-time performance and localization accuracy, have emerged as one of the mainstream approaches for rice pest and disease detection (<xref ref-type="bibr" rid="B26">Venkateswara and Padmanabhan, 2025</xref>). Object detection technology not only identifies the types of pests and diseases in images but also precisely locates their positions, enabling the possibility of precision pesticide application. As a leading method in object detection research, the YOLO series has inspired numerous improvements. Zheng et&#xa0;al. (<xref ref-type="bibr" rid="B39">Zheng et&#xa0;al., 2024</xref>) proposed Rice-YOLO, a lightweight pest detection algorithm based on YOLOv5. This model, built upon YOLOv8-N, incorporates an efficient detection head designed for the complex characteristics of pests, while introducing deep supervision techniques and an improved dynamic upsampling module, achieving excellent detection performance on the large-scale public IP102 pest dataset and R2000 dataset. Lu et&#xa0;al. (<xref ref-type="bibr" rid="B17">Lu et&#xa0;al., 2025b</xref>) developed IMobileTransformer, a fusion-based lightweight model that integrates MobileNet&#x2019;s local feature extraction with Transformer&#x2019;s global dependency modeling through a three-branch architecture, demonstrating the effectiveness of hybrid lightweight architectures for rice disease identification. Xiong et&#xa0;al. (<xref ref-type="bibr" rid="B32">Xiong et&#xa0;al., 2024</xref>) addressed the problem of rice pest detection in complex agricultural environments by optimizing the YOLOv8 model, introducing the CBAM (Convolutional Block Attention Module) attention mechanism and BiFPN (Bidirectional Feature Pyramid Network) for feature fusion, significantly improving detection precision in complex agricultural environments.</p>
<p>Despite significant progress in YOLO-based rice pest and disease detection, several challenges persist in practical applications. Firstly, conventional convolutional structures struggle to extract detailed features of minute pests, and increasing network depth leads to degradation of edge information, affecting the ability to distinguish morphologically similar pests. Existing models have not achieved optimal balance between computational efficiency and detection precision (<xref ref-type="bibr" rid="B2">Chakrabarty et&#xa0;al., 2024</xref>). Secondly, the low contrast between pests and complex field backgrounds causes feature confusion problems, particularly under non-uniform lighting and occlusion conditions. Single spatial attention mechanisms cannot simultaneously capture multi-scale morphological variations and low-contrast features, resulting in detection instability (<xref ref-type="bibr" rid="B29">Wang et&#xa0;al., 2025</xref>). Finally, traditional feature pyramid networks lack adaptive fusion capabilities for multi-scale features and fail to differentiate contribution degree variations among different resolution features, making individual recognition difficult in densely distributed scenarios and limiting overall detection performance (<xref ref-type="bibr" rid="B19">Prasath and Akila, 2023</xref>).</p>
<p>To address these challenges, this paper proposes an improved YOLOv11 (<xref ref-type="bibr" rid="B9">Khanam and Hussain, 2024</xref>) model&#x2014;Bifurcated Edge-Attention Multi-scale YOLO (BEAM-YOLO)&#x2014;for high-precision real-time detection of rice pests and diseases. The main contributions of this research are as follows:</p>
<list list-type="simple">
<list-item>
<p>1. We constructed a high-quality rice pest dataset named JRICE-PD, comprising 4,565 images across 11 economically significant pest species. The dataset integrates multi-source acquisition with a three-tier expert review mechanism, ensuring both ecological authenticity and annotation consistency.</p></list-item>
<list-item>
<p>2. We propose the Morphological Edge Network (MEN) module to address edge information degradation in deep networks. Through multi-scale adaptive pooling and an EdgeEnhancer mechanism, MEN effectively captures fine morphological features of minute pests and enhances discrimination capability for morphologically similar species.</p></list-item>
<list-item>
<p>3. We design the Bifurcated Attention Feature Enhancement (BAFE) module to resolve foreground-background feature confusion. By employing Haar wavelet transform for frequency domain decomposition and a cascaded dual-attention mechanism, BAFE effectively separates pest targets from complex agricultural backgrounds under varying lighting and occlusion conditions.</p></list-item>
<list-item>
<p>4. We propose an Enhanced Multi-scale Bidirectional Feature Pyramid Network (EM-BFPN) to overcome the limitations of traditional FPNs in adaptive feature fusion. The Adaptive Feature Fusion Mechanism (AFFM) dynamically adjusts feature contributions across scales, while the Multi-scale Convolution Module (MSCM) enables gradient receptive field coverage for improved detection in dense distribution scenarios.</p></list-item>
<list-item>
<p>5. We introduce a Spatial-Channel Augmented Upsampling (SCAU) module that combines channel shuffling with Multi-Directional Feature Shifting (MDFS) to enhance small target detection sensitivity without increasing computational overhead.</p></list-item>
</list>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Data collection</title>
<p>This study established a rice pest dataset named JRICE-PD. The data collection site was located in Nanchang City, Jiangxi Province (28&#xb0;40&#x2032;~29&#xb0;05&#x2032;N, 115&#xb0;45&#x2032;~116&#xb0;15&#x2032;E), one of China&#x2019;s major rice-growing regions (<xref ref-type="bibr" rid="B33">Yang et&#xa0;al., 2021</xref>).</p>
<p>The dataset encompasses 11 common and destructive rice pests, as shown in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>: Curculionidae, Delphacidae, Cicadellidae, Phlaeothripidae, Cecidomyiidae, Hesperiidae, Crambidae, Chloropidae, Ephydridae, Noctuidae, and Thripidae. Field data collection was conducted between June and September 2024 across approximately 50 hectares of paddy fields, covering the rice growing season from tillering to maturation stages. Images were captured using iPhone 13 (12MP, f/1.6 aperture) at shooting distances of 10&#x2013;50 cm, yielding 2,164 original images at 4032&#xd7;3024 pixels resolution. Camera settings included auto-focus with exposure compensation of -1.0 to +1.0 EV for varying lighting conditions. Additionally, 2,401 supplementary images were collected from online resources including Google Scholar and Baidu Images, constructing a comprehensive dataset of 4,565 images (<xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>). All images were resized to 640&#xd7;640 pixels for training.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Rice pest species, where <bold>(A)</bold> is Delphacidae, <bold>(B)</bold> is Crambidae, <bold>(C)</bold> is Cecidomyiidae, <bold>(D)</bold> is Phlaeothripidae, <bold>(E)</bold> is Thripidae, <bold>(F)</bold> is Chloropidae, <bold>(G)</bold> is Ephydridae, <bold>(H)</bold> is Cicadellidae, <bold>(I)</bold> is Noctuidae, <bold>(J)</bold> is Hesperiidae, <bold>(K)</bold> is Curculionidae.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1748419-g001.tif">
<alt-text content-type="machine-generated">A grid of twelve close-up images of various insects and larvae on green plant leaves. Each insect is highlighted within a green bounding box. The images depict different species, providing a detailed look at their physical features and interactions with the leaves.</alt-text>
</graphic></fig>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Bar chart showing specific quantities of pest species in the dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1748419-g002.tif">
<alt-text content-type="machine-generated">Bar chart titled &#x201c;Pest Dataset Histogram&#x201d; showing counts of different pest families. Thripidae has the highest count at 823, followed by Noctuidae at 727, and Ephydridae at 632. Other families include Chloropidae (290), Crambidae (370), Hesperiidae (393), Cecidomyiidae (461), Phlaeothripidae (153), Cicadellidae (167), Delphacidae (206), with another Noctuidae at 329.</alt-text>
</graphic></fig>
<p>Data annotation employed the LabelImg tool following YOLO format with a &#x201c;tight bounding box&#x201d; principle (<xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>). An annotation team comprising two entomological experts and three trained annotators performed annotations under a three-tier quality control mechanism. Annotation consistency was quantified using Intersection over Union (IoU) between independent annotations on 200 randomly sampled images, achieving a mean IoU of 0.91. Annotations with IoU below 0.85 were re-annotated until consensus was reached.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Dataset annotation process using the LabelImg tool.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1748419-g003.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a neural network architecture. It begins with an image input, processed through multiple stages labeled MEN, Conv, AFFM, MSCM, and BAFE. Arrows indicate connections: red for SCAU, blue for downsampling, and black for copying. The architecture is divided into sections labeled Backbone, Neck, and Head, leading to outputs for reg loss, cls loss, and dfl loss.</alt-text>
</graphic></fig>
<p>The dataset was divided into training (3,652), validation (456), and test (457) sets at an 8:1:1 ratio.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>YOLOv11 baseline framework</title>
<p>YOLOv11 represents one of the most advanced single-stage object detection algorithms currently available, incorporating multiple improvements in network structure and feature extraction compared to previous versions. The basic architecture of YOLOv11 comprises three main components: Backbone, Neck, and Head, with its network structure illustrated in <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>YOLOv11 network structure diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1748419-g004.tif">
<alt-text content-type="machine-generated">Diagram of the Yolov11 architecture, showing three main sections: Backbone, Neck, and Head. The Backbone includes layers like Conv, C3k2, SPPF, and C2PSA. The Neck has layers such as Contact, Upsample, and C3k2, while the Head has layers marked as V11Detect. Blocks connect these sections, indicating the flow of data through the model.</alt-text>
</graphic></fig>
<p>The backbone network is responsible for extracting multi-scale features from input images. It first downsamples the image using initial convolutional layers, then generates feature maps of different resolutions through stacked convolutional layers and specialized modules. The neck network aggregates multi-scale feature maps from the backbone network, fusing and enhancing features before passing them to the detection head. It primarily consists of multiple convolutional layers, C3k2 blocks, Concat operations, and upsampling modules. The neck network first upsamples the low-level features (P5) processed by SPPF and C2PSA to the size of mid-level features (P4) and connects them with P4 features; it then upsamples the connected features to the size of high-level features (P3) and connects them.</p>
<p>The detection head is the final component of the model, responsible for generating prediction results. It receives three features from the neck network, corresponding to high-level, mid-level, and low-level features. The detection head utilizes these three features for focal loss calculation, bounding box detection, and class&#xa0;detection. This design enables YOLOv11 to achieve efficient&#xa0;and accurate object localization and classification in application scenarios.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Methods</title>
<sec id="s3_1">
<label>3.1</label>
<title>BEAM-YOLO network structure</title>
<p>The BEAM-YOLO network architecture proposed in this study is algorithmically optimized for rice pest detection tasks, establishing an efficient and precise end-to-end detection algorithm through the synergistic action of four innovative modules. The workflow of BEAM-YOLO can be divided into three key stages: feature extraction, feature fusion, and multi-scale detection, with the overall network structure shown in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref>.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>BEAM-YOLO network structure.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1748419-g005.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a neural network architecture. It begins with an input labeled HWC, splitting into two paths for F_high and F_low. Both paths undergo linear transformation, reshaping, and pass through Softmax layers. Another path involves a 3x3 CBR, linear transformation labeled K, unfolding, and combination into a final output labeled W and H.</alt-text>
</graphic></fig>
<p>In the feature extraction stage, the network first establishes initial feature representation through standard convolutional layers, followed by deployment of the MEN module for feature enhancement. The MEN module constructs multi-scale feature representation through the MSF structure while simultaneously reinforcing edge feature extraction, significantly improving the discriminative ability for morphologically similar pests. The Bifurcated Attention Feature Enhancement (BAFE) module integrated at the backbone network terminus utilizes Haar wavelet transform to decompose features into foreground and background components, and establishes contrast enhancement effects through a dual attention mechanism, effectively resolving foreground-background feature confusion problems in complex field environments.</p>
<p>The feature fusion stage employs the EM-BFPN structure to bidirectionally fuse P3, P4, and P5 three-level features extracted by the backbone. EM-BFPN adopts top-down transmission of high-level semantic information and bottom-up aggregation of refined spatial details. The network integrates the MSCM module for feature processing, which applies convolution kernel combinations of {1,3,5}, {3,5,7}, and {5,7,9} respectively according to the receptive field requirements of different feature levels, forming a gradient multi-scale feature expression. In the feature upsampling process, the innovative SCAU module is employed, significantly expanding the receptive field range without increasing computational burden through depthwise separable convolution, channel shuffling, and multi-directional spatial shifting operations.</p>
<p>In the multi-scale detection stage, the network constructs a highly optimized detection head structure, outputting detection results in parallel at P3, P4, and P5 feature levels, achieving precise localization and classification of pest targets of different scales. The three feature levels work collaboratively, forming a detection range covering multiple scales, effectively addressing the detection challenges posed by significant size variations among rice pests.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>MEN module</title>
<p>Conventional convolution operations often neglect fine morphological features of small pests, reducing detection precision. Moreover, edge information progressively attenuates as network depth increases. To address these issues, this study proposes the Morphological Edge Network (MEN) module for restructuring the backbone network architecture, as shown in <xref ref-type="fig" rid="f6"><bold>Figure&#xa0;6</bold></xref>. This module significantly enhances rice pest detection accuracy and environmental robustness through the organic combination of multi-scale feature representation and edge information enhancement mechanisms.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>MEN model workflow.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1748419-g006.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a neural network model architecture. Input starts with an image of a frog. Arrows indicate data flow through modules labeled MEN, Conv, AFFM, MSCM, and more, transitioning from backbone to neck to head stages. Color-coded arrows represent SCAU, downsample, and copy processes. Loss functions reg loss, cls loss, and dfl loss are noted on the right.</alt-text>
</graphic></fig>
<p>The MEN module is innovatively designed based on the Cross Stage Partial (CSP) structure, organically combining feature extraction with information enhancement mechanisms. This module receives an input feature map <inline-formula>
<mml:math display="inline" id="im1"><mml:mrow><mml:mi>X</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> and generates an enhanced feature map <inline-formula>
<mml:math display="inline" id="im2"><mml:mrow><mml:mi>Y</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> through complex nonlinear transformations. The overall transformation process can be described by the following unified expression, as shown in <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>:</p>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:mi>Y</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mrow><mml:mi>f</mml:mi><mml:mi>u</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mtext>Concat</mml:mtext><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mo>{</mml:mo><mml:msub><mml:mi>&#x2133;</mml:mi><mml:mrow><mml:msub><mml:mi>&#x3b8;</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:msubsup></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>In this formula, <inline-formula>
<mml:math display="inline" id="im3"><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im4"><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula> are two parts of features split along the channel dimension after the input feature <inline-formula>
<mml:math display="inline" id="im5"><mml:mi>X</mml:mi></mml:math></inline-formula> undergoes <inline-formula>
<mml:math display="inline" id="im6"><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:math></inline-formula> convolution, <inline-formula>
<mml:math display="inline" id="im7"><mml:mrow><mml:msub><mml:mi>&#x2133;</mml:mi><mml:mrow><mml:msub><mml:mi>&#x3b8;</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> represents the MPM transformation function with parameters <inline-formula>
<mml:math display="inline" id="im8"><mml:mrow><mml:msub><mml:mi>&#x3b8;</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im9"><mml:mi>n</mml:mi></mml:math></inline-formula> denotes the number of enhancement units in the module, and <inline-formula>
<mml:math display="inline" id="im10"><mml:mrow><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mrow><mml:mi>f</mml:mi><mml:mi>u</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is the final feature fusion function, typically implemented by <inline-formula>
<mml:math display="inline" id="im11"><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:math></inline-formula> convolution.</p>
<p>Within the module, DPM serves as the basic unit implementing cross-feature enhancement. This unit adopts a dual-path design, significantly enhancing the model&#x2019;s feature expression capability. Assuming an input feature <inline-formula>
<mml:math display="inline" id="im12"><mml:mi>X</mml:mi></mml:math></inline-formula>, the output feature <inline-formula>
<mml:math display="inline" id="im13"><mml:mi>Z</mml:mi></mml:math></inline-formula> of the unit can be expressed as shown in <xref ref-type="disp-formula" rid="eq2">Equation 2</xref>:</p>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:mi>Z</mml:mi><mml:mo>=</mml:mo><mml:mi mathvariant="script">G</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext>Concat</mml:mtext><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mi>&#x2130;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi mathvariant="script">P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mi mathvariant="script">Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im14"><mml:mi mathvariant="script">P</mml:mi></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im15"><mml:mi mathvariant="script">Q</mml:mi></mml:math></inline-formula> represent two parallel <inline-formula>
<mml:math display="inline" id="im16"><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:math></inline-formula> convolution paths, <inline-formula>
<mml:math display="inline" id="im17"><mml:mi>&#x2130;</mml:mi></mml:math></inline-formula> denotes a tandem sequence composed of MSF units, and <inline-formula>
<mml:math display="inline" id="im18"><mml:mi mathvariant="script">G</mml:mi></mml:math></inline-formula> is the fusion function integrating features from both paths. Compared to traditional CSP structures, our design introduces more complex edge-aware mechanisms in the feature extraction path, substantially enhancing the model&#x2019;s detection capability for pest contours.</p>
<p>MSF focuses on achieving multi-scale feature acquisition and edge information enhancement, with its structure shown in <xref ref-type="fig" rid="f7"><bold>Figure&#xa0;7</bold></xref>. This module adopts a multi-branch parallel architecture, simultaneously processing feature expressions of different abstraction levels. Given an input feature <inline-formula>
<mml:math display="inline" id="im19"><mml:mrow><mml:mi>X</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>, the mathematical expression of the entire module can be unified as shown in <xref ref-type="disp-formula" rid="eq3">Equation 3</xref>:</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>MSF module workflow, where &#x2295; represents element-wise addition operation, &#x2296; represents element-wise subtraction operation.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1748419-g007.tif">
<alt-text content-type="machine-generated">A flowchart depicting two neural network modules, EdgeEnhancer and MSF. The EdgeEnhancer module processes Input X with average pooling, a convolutional layer, and summation, producing X output. The MSF module includes multiple adaptive average pooling layers, convolutional layers, depthwise convolutions, upsampling, and edge enhancement. Outputs are concatenated and passed through a convolutional layer, resulting in X output.</alt-text>
</graphic></fig>
<disp-formula id="eq3"><label>(3)</label>
<mml:math display="block" id="M3"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x3a6;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mtext>Concat</mml:mtext><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mo>{</mml:mo><mml:msub><mml:mi mathvariant="script">T</mml:mi><mml:mi>s</mml:mi></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi mathvariant="script">A</mml:mi><mml:mi>s</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi mathvariant="script">S</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>In this expression, <inline-formula>
<mml:math display="inline" id="im20"><mml:mi>&#x2112;</mml:mi></mml:math></inline-formula> represents the local feature extraction function, implemented through <inline-formula>
<mml:math display="inline" id="im21"><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:math></inline-formula> convolution; <inline-formula>
<mml:math display="inline" id="im22"><mml:mrow><mml:mi mathvariant="script">S</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mi>s</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> denotes a set of predefined scale parameters (e.g., <inline-formula>
<mml:math display="inline" id="im23"><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mn>6</mml:mn><mml:mo>,</mml:mo><mml:mn>9</mml:mn><mml:mo>,</mml:mo><mml:mn>12</mml:mn></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>); <inline-formula>
<mml:math display="inline" id="im24"><mml:mrow><mml:msub><mml:mi mathvariant="script">A</mml:mi><mml:mi>s</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is the adaptive pooling and feature transformation function at scale <inline-formula>
<mml:math display="inline" id="im25"><mml:mi>s</mml:mi></mml:math></inline-formula>; <inline-formula>
<mml:math display="inline" id="im26"><mml:mrow><mml:msub><mml:mi mathvariant="script">T</mml:mi><mml:mi>s</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> represents the transformation function including upsampling and edge enhancement; and <inline-formula>
<mml:math display="inline" id="im27"><mml:mi>&#x3a6;</mml:mi></mml:math></inline-formula> is the final feature fusion function.</p>
<p>In the multi-scale feature path, the EdgeEnhancer submodule plays a crucial role, designed to strengthen target edge information and enhance the model&#x2019;s perception of pest contours. Given an input feature <inline-formula>
<mml:math display="inline" id="im28"><mml:mi>X</mml:mi></mml:math></inline-formula>, the processing of EdgeEnhancer can be mathematically formulated as shown in <xref ref-type="disp-formula" rid="eq4">Equation 4</xref>:</p>
<disp-formula id="eq4"><label>(4)</label>
<mml:math display="block" id="M4"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>X</mml:mi><mml:mo>+</mml:mo><mml:mi>X</mml:mi><mml:mo>&#x2299;</mml:mo><mml:mi>&#x3c3;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>&#x210b;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>X</mml:mi><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi mathvariant="script">P</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>v</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>In this formula, <inline-formula>
<mml:math display="inline" id="im29"><mml:mrow><mml:msub><mml:mi mathvariant="script">P</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>v</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> represents the average pooling operation, used to simulate background information of local regions; <inline-formula>
<mml:math display="inline" id="im30"><mml:mrow><mml:mi>X</mml:mi><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi mathvariant="script">P</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>v</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> calculates the difference between the original feature and its smoothed version, explicitly extracting edge information; <inline-formula>
<mml:math display="inline" id="im31"><mml:mi>&#x210b;</mml:mi></mml:math></inline-formula> is a nonlinear transformation function composed of convolutional layers; <inline-formula>
<mml:math display="inline" id="im32"><mml:mi>&#x3c3;</mml:mi></mml:math></inline-formula> denotes the Sigmoid activation function, mapping edge features to the <inline-formula>
<mml:math display="inline" id="im33"><mml:mrow><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> interval as attention weights; and <inline-formula>
<mml:math display="inline" id="im34"><mml:mo>&#x2299;</mml:mo></mml:math></inline-formula> represents the Hadamard product (element-wise multiplication). Through this adaptive attention mechanism, the module can precisely enhance feature responses in pest edge regions while suppressing background noise, significantly improving detection accuracy and robustness.</p>
<p>The MEN module proposed in this study, by parallel integration of adaptive average pooling operations with different receptive fields, constructs a feature extraction path with hierarchical multi-scale representation capabilities, effectively capturing discriminative feature information of pests of different sizes. The EdgeEnhancer design in this module, through efficient extraction and enhancement of edge information, significantly improves the model&#x2019;s discrimination capability for fine morphological features of pests, demonstrating superior recognition performance and classification accuracy especially for morphologically similar pest species.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>BAFE module</title>
<p>Although the Cross-Stage Partial Spatial Attention (C2PSA) module has achieved remarkable success in object detection, it struggles to discriminate features between pest targets and rice plants, resulting in false positives and false negatives in complex field environments. Single spatial attention mechanisms struggle to capture multi-scale morphological variation features of pests. To address these issues, this paper proposes a novel Bifurcated Attention Feature Enhancement (BAFE) module, with its structure shown in <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8</bold></xref>. This module significantly improves rice pest detection precision through wavelet transform separation of foreground and background information and the introduction of a dual attention mechanism.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>BAFE workflow diagram, where HWC represents HaarWaveletConv, &#x2297; represents broadcast element-wise multiplication.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1748419-g008.tif">
<alt-text content-type="machine-generated">Two flowcharts labeled MEN and DPM. MEN starts with Input X, leading to a Conv layer, Splitting, then multiple DPM layers, Concatenation, another Conv layer, and finally Output. DPM begins with Input X, then Conv, multiple MSF layers, Concatenation, another Conv layer, and Output. Both involve iterative processes denoted by 'n'.</alt-text>
</graphic></fig>
<p>The BAFE module employs frequency domain deconstruction and contrast-driven cascaded attention mechanisms, achieving efficient feature enhancement and aggregation. Given an input feature map <inline-formula>
<mml:math display="inline" id="im35"><mml:mrow><mml:mi>X</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>B</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>C</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>, the module first performs initial feature optimization through an input preprocessing network <inline-formula>
<mml:math display="inline" id="im36"><mml:mrow><mml:msub><mml:mi>&#x3a6;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>, then obtains enhanced features <inline-formula>
<mml:math display="inline" id="im37"><mml:mi>Y</mml:mi></mml:math></inline-formula> through a series of transformation operations. The core component HaarWaveletConv (<xref ref-type="bibr" rid="B24">Su et&#xa0;al., 2024</xref>) achieves frequency domain decomposition of features based on discrete Haar wavelet transform principles. This component maps input features to different subbands in the wavelet domain through a set of predefined convolution kernels. Specifically, for input feature <inline-formula>
<mml:math display="inline" id="im38"><mml:mi>X</mml:mi></mml:math></inline-formula>, the wavelet transform process can be expressed as shown in <xref ref-type="disp-formula" rid="eq5">Equation 5</xref>:</p>
<disp-formula id="eq5"><label>(5)</label>
<mml:math display="block" id="M5"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:mi>W</mml:mi><mml:mo>=</mml:mo><mml:mi>H</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mi>X</mml:mi><mml:mo>*</mml:mo><mml:msub><mml:mi>&#x3a8;</mml:mi><mml:mi>d</mml:mi></mml:msub><mml:mo>|</mml:mo><mml:mi>d</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo>,</mml:mo><mml:mi>d</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im39"><mml:mrow><mml:msub><mml:mi>&#x3a8;</mml:mi><mml:mi>a</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im40"><mml:mrow><mml:msub><mml:mi>&#x3a8;</mml:mi><mml:mi>h</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im41"><mml:mrow><mml:msub><mml:mi>&#x3a8;</mml:mi><mml:mi>v</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, and <inline-formula>
<mml:math display="inline" id="im42"><mml:mrow><mml:msub><mml:mi>&#x3a8;</mml:mi><mml:mi>d</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> represent Haar filters for approximation (low-pass), horizontal, vertical, and diagonal (high-pass) directions respectively, and <inline-formula>
<mml:math display="inline" id="im43"><mml:mo>&#x2217;</mml:mo></mml:math></inline-formula> denotes the convolution operation.</p>
<p>Specifically, the HaarWaveletConv employs fixed (non-learnable) Haar wavelet filters initialized according to the standard discrete Haar transform coefficients: <inline-formula>
<mml:math display="inline" id="im44"><mml:mrow><mml:msub><mml:mtext>W</mml:mtext><mml:mtext>a</mml:mtext></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>;</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im45"><mml:mrow><mml:msub><mml:mtext>W</mml:mtext><mml:mtext>h</mml:mtext></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo>;</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im46"><mml:mrow><mml:msub><mml:mtext>W</mml:mtext><mml:mtext>v</mml:mtext></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>;</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, and <inline-formula>
<mml:math display="inline" id="im47"><mml:mrow><mml:msub><mml:mtext>W</mml:mtext><mml:mtext>d</mml:mtext></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo>;</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>. The convolution operation uses a stride of 2 and no padding, effectively downsampling the feature map by a factor of 2 while decomposing it into frequency subbands. These filter weights remain frozen during training to preserve the mathematical properties of the Haar wavelet transform.</p>
<p>Among these four subbands, the approximation subband <inline-formula>
<mml:math display="inline" id="im48"><mml:mrow><mml:msub><mml:mi>W</mml:mi><mml:mi>a</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> contains low-frequency structural information of the image, while the remaining three high-frequency subbands <inline-formula>
<mml:math display="inline" id="im49"><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mi>W</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>v</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>d</mml:mi></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> capture edge and texture details in different directions. In our implementation, these high-frequency subbands are fused into a single high-frequency feature representation <inline-formula>
<mml:math display="inline" id="im50"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>v</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>d</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, which, together with the low-frequency feature <inline-formula>
<mml:math display="inline" id="im51"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>a</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, provides the foundation for subsequent contrast-driven processing. After obtaining frequency domain decomposed features, the module uses a dual attention mechanism to process high-frequency and low-frequency information separately. The foreground attention stage first constructs a spatially sensitive attention map based on high-frequency features. Given the deformed feature <inline-formula>
<mml:math display="inline" id="im52"><mml:mrow><mml:mi>V</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>B</mml:mi><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>N</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:mi>L</mml:mi><mml:mo>&#xd7;</mml:mo><mml:msup><mml:mi>k</mml:mi><mml:mn>2</mml:mn></mml:msup><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mi>h</mml:mi></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>, the foreground attention calculation can be represented as shown in <xref ref-type="disp-formula" rid="eq6">Equation 6</xref>:</p>
<disp-formula id="eq6"><label>(6)</label>
<mml:math display="block" id="M6"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>f</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:msub><mml:mi mathvariant="script">Q</mml:mi><mml:mrow><mml:mi>f</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>F</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mi>h</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi mathvariant="script">K</mml:mi><mml:mrow><mml:mi>f</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>F</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mi>h</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>&#x22a4;</mml:mo></mml:msup></mml:mrow><mml:mrow><mml:msqrt><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>h</mml:mi></mml:msub></mml:mrow></mml:msqrt></mml:mrow></mml:mfrac><mml:mo>+</mml:mo><mml:mi>B</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>B</mml:mi><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>N</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:mi>L</mml:mi><mml:mo>&#xd7;</mml:mo><mml:msup><mml:mi>k</mml:mi><mml:mn>2</mml:mn></mml:msup><mml:mo>&#xd7;</mml:mo><mml:msup><mml:mi>k</mml:mi><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:msup></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im53"><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>F</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mi>h</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>h</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is the high-frequency feature after pooling downsampling, <inline-formula>
<mml:math display="inline" id="im54"><mml:mrow><mml:msub><mml:mi mathvariant="script">Q</mml:mi><mml:mrow><mml:mi>f</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im55"><mml:mrow><mml:msub><mml:mi mathvariant="script">K</mml:mi><mml:mrow><mml:mi>f</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> are query and key transformation functions respectively, <inline-formula>
<mml:math display="inline" id="im56"><mml:mi>B</mml:mi></mml:math></inline-formula> is positional encoding, <inline-formula>
<mml:math display="inline" id="im57"><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mi>h</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is the number of attention heads, <inline-formula>
<mml:math display="inline" id="im58"><mml:mi>L</mml:mi></mml:math></inline-formula> is the number of feature spatial positions, <inline-formula>
<mml:math display="inline" id="im59"><mml:mi>k</mml:mi></mml:math></inline-formula> is the convolution kernel size, and <inline-formula>
<mml:math display="inline" id="im60"><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>h</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is the feature dimension of each head. This attention mechanism enhances perception of edges and textures of small pests by considering local spatial dependencies. After applying attention weights to value feature <inline-formula>
<mml:math display="inline" id="im61"><mml:mi>V</mml:mi></mml:math></inline-formula>, an enhanced feature representation is obtained as shown in <xref ref-type="disp-formula" rid="eq7">Equation 7</xref>:</p>
<disp-formula id="eq7"><label>(7)</label>
<mml:math display="block" id="M7"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>f</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msup><mml:mi>k</mml:mi><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:munderover><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>f</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mo>:</mml:mo><mml:mo>,</mml:mo><mml:mo>:</mml:mo><mml:mo>,</mml:mo><mml:mo>:</mml:mo><mml:mo>,</mml:mo><mml:mo>:</mml:mo><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow><mml:mo>&#x2299;</mml:mo><mml:mi>V</mml:mi><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mo>:</mml:mo><mml:mo>,</mml:mo><mml:mo>:</mml:mo><mml:mo>,</mml:mo><mml:mo>:</mml:mo><mml:mo>,</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mo>:</mml:mo></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im62"><mml:mo>&#x2299;</mml:mo></mml:math></inline-formula> represents broadcast element-wise multiplication, and <inline-formula>
<mml:math display="inline" id="im63"><mml:mrow><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is projection transformation. Similarly, the background attention stage utilizes low-frequency features to guide further enhancement of the first stage output, forming the final feature representation. This process can be formalized as shown in <xref ref-type="disp-formula" rid="eq8">Equations 8</xref>, <xref ref-type="disp-formula" rid="eq9">9</xref>:</p>
<disp-formula id="eq8"><label>(8)</label>
<mml:math display="block" id="M8"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:msub><mml:mi mathvariant="script">Q</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>F</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi mathvariant="script">K</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>F</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>&#x22a4;</mml:mo></mml:msup></mml:mrow><mml:mrow><mml:msqrt><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>h</mml:mi></mml:msub></mml:mrow></mml:msqrt></mml:mrow></mml:mfrac><mml:mo>+</mml:mo><mml:mi>B</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq9"><label>(9)</label>
<mml:math display="block" id="M9"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:mi>Y</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x3a6;</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msup><mml:mi>k</mml:mi><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:munderover><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mo>:</mml:mo><mml:mo>,</mml:mo><mml:mo>:</mml:mo><mml:mo>,</mml:mo><mml:mo>:</mml:mo><mml:mo>,</mml:mo><mml:mo>:</mml:mo><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow><mml:mo>&#x2299;</mml:mo><mml:msup><mml:mi>V</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mo>:</mml:mo><mml:mo>,</mml:mo><mml:mo>:</mml:mo><mml:mo>,</mml:mo><mml:mo>:</mml:mo><mml:mo>,</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mo>:</mml:mo></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>This cascaded contrast attention design enables the module to progressively refine feature representations under the guidance of different frequency domain information, particularly suitable for capturing subtle differences of rice pests in complex backgrounds. Notably, by using sliding window (unfold) operations and position-sensitive attention mapping, the module can efficiently process spatial dependencies, enhancing representation capability for small targets and pests with complex morphologies.</p>
<p>The BAFE module proposed in this paper achieves adaptive decomposition of features through wavelet transform, effectively separating low-frequency background information from high-frequency foreground information, resolving the foreground-background feature confusion problem in traditional detection networks and enabling the model to focus more precisely on pest targets. It also designed a dual attention mechanism, processing foreground and background information separately and forming a contrast enhancement effect through a stacked approach, significantly enhancing the model&#x2019;s recognition capacity for morphologically variable and minuscule rice pests.</p>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>EM-BFPN</title>
<p>Traditional neck networks exhibit evident limitations in feature fusion, typically employing simple concatenation or weighted summation without adequately considering inter-layer correlation and complementarity. This results in poor detection performance for small, dense, and morphologically variable targets such as rice pests. To address these issues, this paper proposes an Enhanced Multi-scale Bidirectional Feature Pyramid Network (EM-BFPN), with its structure shown in <xref ref-type="fig" rid="f9"><bold>Figure&#xa0;9</bold></xref>. This network achieves efficient information interaction and complementary information mining between different feature layers through the design of an Adaptive Feature Fusion Mechanism (AFFM) and Multi-scale Convolution Module (MSCM).</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>EM-BFPN structure.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1748419-g009.tif">
<alt-text content-type="machine-generated">Diagram of an EM-BFPN architecture with nodes labeled P2 to P5 on the left, connected by arrows indicating processes: copy (black), upsample (red), and downsample (blue). Nodes represent Conv, AFFM, and MSCM.</alt-text>
</graphic></fig>
<p>The workflow of EM-BFPN can be divided into three stages: feature preprocessing, multi-scale bidirectional feature fusion, and feature enhancement. In the feature preprocessing stage, P3, P4, and P5 feature maps from the backbone network are first unified in channels through 1&#xd7;1 convolution to reduce computational complexity and improve feature fusion efficiency. In the multi-scale bidirectional feature fusion stage, EM-BFPN implements bidirectional feature transmission from top-down and bottom-up, forming a closed-loop feedback mechanism. The feature fusion process is implemented by the AFFM mechanism, which performs weighted fusion of features from different sources through learnable weight coefficients. The feature enhancement stage employs the innovative MSCM module, which can extract features under multiple receptive fields, effectively enhancing the model&#x2019;s adaptability to pests of different scales. Finally, P3, P4, and P5 features enhanced through multi-level feature interaction are sent to the detection head for target classification and localization.</p>
<p>The AFFM module in EM-BFPN is a key component for achieving efficient feature fusion. Unlike simple feature concatenation or summation, this module adaptively adjusts the contribution of different features through learnable weight parameters.</p>
<p>For a set of input features <inline-formula>
<mml:math display="inline" id="im64"><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mi>n</mml:mi></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, a learnable weight parameter <inline-formula>
<mml:math display="inline" id="im65"><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is first defined for each input feature. Then, <inline-formula>
<mml:math display="inline" id="im66"><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>'</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>'</mml:mo><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mi>n</mml:mi></mml:msub><mml:mo>'</mml:mo></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> are obtained through ReLU activation, ensuring that each input feature&#x2019;s contribution in the fusion process is non-negative, avoiding mutual cancellation between features. Finally, after weight normalization, the weighted sum of input features is output as shown in <xref ref-type="disp-formula" rid="eq10">Equation 10</xref>:</p>
<disp-formula id="eq10"><label>(10)</label>
<mml:math display="block" id="M10"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mtext>fused</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:mfrac><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>'</mml:mo><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mrow><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:msubsup><mml:msub><mml:mi>w</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo>'</mml:mo><mml:mo>+</mml:mo><mml:mi>&#x3f5;</mml:mi></mml:mrow></mml:mfrac><mml:mo>,</mml:mo><mml:mi>i</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im67"><mml:mrow><mml:mi>&#x3f5;</mml:mi><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mn>10</mml:mn></mml:mrow><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> is a small constant to prevent division by zero. Compared to fixed-weight fusion methods, this learning-based fusion mechanism can dynamically adjust the importance of different features, adapting to different detection scenarios. Through end-to-end training, weight parameters can be automatically optimized according to the loss function without manual adjustment, maintaining computational efficiency.</p>
<p>The MSCM module integrates the advantages of cross-stage partial networks and multi-scale convolution, effectively extracting and processing multi-scale feature information, with its structure shown in <xref ref-type="fig" rid="f10"><bold>Figure&#xa0;10</bold></xref>. Given an input feature <inline-formula>
<mml:math display="inline" id="im68"><mml:mrow><mml:mi>X</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>, the forward propagation process of MSCM can be represented as shown in <xref ref-type="disp-formula" rid="eq11">Equation 11</xref>:</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>MSCM internal structure.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1748419-g010.tif">
<alt-text content-type="machine-generated">Diagram illustrating neural network components MSCM, PRFFM, and DSARB with flowcharts. MSCM includes input, convolution, DSARB, concatenation, and output. PRFFM comprises input, depth-wise convolutions, batch normalization, ReLU activations, and output. DSARB features input, convolution, PRFFM, channel shuffle, and output with stride equal to one.</alt-text>
</graphic></fig>
<disp-formula id="eq11"><label>(11)</label>
<mml:math display="block" id="M11"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>&#x3a6;</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mi>S</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mtext>Concat</mml:mtext><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msub><mml:mi>&#x3d5;</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x3d5;</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:munderover><mml:mo>&#x220f;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:mi>D</mml:mi><mml:mi>S</mml:mi><mml:mi>A</mml:mi><mml:mi>R</mml:mi><mml:msub><mml:mi>B</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>&#x3d5;</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im69"><mml:mrow><mml:msub><mml:mi>&#x3d5;</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x3d5;</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo>:</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:msup><mml:mo>&#x2192;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>&#xb7;</mml:mo><mml:mi>e</mml:mi><mml:mo stretchy="false">/</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im70"><mml:mrow><mml:msub><mml:mi>&#x3d5;</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>:</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>&#xb7;</mml:mo><mml:mi>e</mml:mi><mml:mo stretchy="false">/</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>&#x2192;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>&#xb7;</mml:mo><mml:mi>e</mml:mi><mml:mo stretchy="false">/</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> represent feature transformation functions respectively, <inline-formula>
<mml:math display="inline" id="im71"><mml:mi>e</mml:mi></mml:math></inline-formula> is the expansion coefficient, <inline-formula>
<mml:math display="inline" id="im72"><mml:mrow><mml:mtext>Concat[</mml:mtext><mml:mo>&#xb7;</mml:mo><mml:mo>]</mml:mo></mml:mrow></mml:math></inline-formula> denotes feature concatenation operation along the channel dimension, and <inline-formula>
<mml:math display="inline" id="im73"><mml:mrow><mml:msubsup><mml:mo>&#x220f;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:msubsup><mml:mi>D</mml:mi><mml:mi>S</mml:mi><mml:mi>A</mml:mi><mml:mi>R</mml:mi><mml:msub><mml:mi>B</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> represents the combined operation of multiple cascaded Dynamic Scale-Adaptive Residual Blocks (DSARB).</p>
<p>DSARB employs parallel multi-scale depth-separable convolution to enhance feature extraction capability. The complete computation process of DSARB can be represented as shown in <xref ref-type="disp-formula" rid="eq12">Equation 12</xref>:</p>
<disp-formula id="eq12"><label>(12)</label>
<mml:math display="block" id="M12"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:mi>Y</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:mi>X</mml:mi><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>&#x3a8;</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>&#x2133;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mrow><mml:mtext>exp</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>if</mml:mtext><mml:mo>&#x2009;</mml:mo><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>&#x3a8;</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>&#x2133;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mrow><mml:mtext>exp</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo 
stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>if</mml:mtext><mml:mo>&#x2009;</mml:mo><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im74"><mml:mrow><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mrow><mml:mtext>exp</mml:mtext></mml:mrow></mml:msub><mml:mo>:</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup><mml:mo>&#x2192;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>e</mml:mi><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> is pointwise convolution for channel expansion, <inline-formula>
<mml:math display="inline" id="im75"><mml:mi>e</mml:mi></mml:math></inline-formula> is the expansion coefficient; <inline-formula><mml:math display="inline" id="im76"><mml:mi>&#x2133;</mml:mi></mml:math></inline-formula> is the multi-scale feature aggregation function, representing the feature aggregation result after the Parallel Receptive Field Fusion Module (PRFFM); <inline-formula>
<mml:math display="inline" id="im77"><mml:mi>&#x3a8;</mml:mi></mml:math></inline-formula> represents the Channel Shuffle operation (<xref ref-type="bibr" rid="B36">Zhang and Yang, 2021</xref>); <inline-formula>
<mml:math display="inline" id="im78"><mml:mrow><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>:</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>e</mml:mi><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup><mml:mo>&#x2192;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> is pointwise convolution for channel projection; <inline-formula>
<mml:math display="inline" id="im79"><mml:mrow><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>:</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup><mml:mo>&#x2192;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> is 1&#xd7;1 convolution for residual connection; and <inline-formula>
<mml:math display="inline" id="im80"><mml:mi>s</mml:mi></mml:math></inline-formula> represents the convolution stride.</p>
<p>The PRFFM module is the core component of DSARB, implementing parallel multi-scale depth-separable convolution. Given an input feature <inline-formula>
<mml:math display="inline" id="im81"><mml:mrow><mml:mi>X</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> and a set of convolution kernel sizes <inline-formula>
<mml:math display="inline" id="im82"><mml:mrow><mml:mi mathvariant="script">K</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mi>k</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, the output of PRFFM can be represented as shown in <xref ref-type="disp-formula" rid="eq13">Equation 13</xref>:</p>
<disp-formula id="eq13"><label>(13)</label>
<mml:math display="block" id="M13"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>&#x2131;</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>s</mml:mi><mml:mi>d</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x2009;</mml:mtext><mml:mtext>&#x2009;</mml:mtext><mml:msub><mml:mi>Y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mi>&#x2131;</mml:mi><mml:mrow><mml:mi>D</mml:mi><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mi>k</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>&#x3c3;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>&#x212c;</mml:mi><mml:mi mathvariant="script">N</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>C</mml:mi></mml:munderover><mml:msubsup><mml:mi>W</mml:mi><mml:mi>c</mml:mi><mml:mrow><mml:msub><mml:mi>k</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msubsup><mml:mo>&#x2217;</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mi>c</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im83"><mml:mrow><mml:msubsup><mml:mi>&#x2131;</mml:mi><mml:mrow><mml:mi>D</mml:mi><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mi>k</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msubsup><mml:mo>:</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup><mml:mo>&#x2192;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>'</mml:mo><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi><mml:mo>'</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> represents the depth-separable convolution operation with kernel size <inline-formula>
<mml:math display="inline" id="im84"><mml:mrow><mml:msub><mml:mi>k</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, where <inline-formula>
<mml:math display="inline" id="im85"><mml:mrow><mml:msubsup><mml:mi>W</mml:mi><mml:mi>c</mml:mi><mml:mrow><mml:msub><mml:mi>k</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msubsup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> is the convolution kernel weight of the <inline-formula>
<mml:math display="inline" id="im86"><mml:mi>c</mml:mi></mml:math></inline-formula>-th channel, <inline-formula>
<mml:math display="inline" id="im87"><mml:mo>*</mml:mo></mml:math></inline-formula> represents the two-dimensional convolution operation, <inline-formula>
<mml:math display="inline" id="im88"><mml:mrow><mml:mi>&#x212c;</mml:mi><mml:mi mathvariant="script">N</mml:mi></mml:mrow></mml:math></inline-formula> represents batch normalization, and <inline-formula>
<mml:math display="inline" id="im89"><mml:mi>&#x3c3;</mml:mi></mml:math></inline-formula> represents the ReLU nonlinear activation function.</p>
<p>The EM-BFPN neck network structure proposed in this paper has been specially optimized for rice pest detection scenarios. Through multi-path feature flow and iterative feature fusion, it significantly enhances feature reuse efficiency and multi-scale information interaction, addressing the limitations of traditional FPN in processing rice pest targets with significant scale variations. By adopting the AFFM fusion mechanism, the network can adaptively adjust the importance of features at different scales, enhancing selective extraction of key information and avoiding information redundancy and noise interference problems caused by traditional simple fusion methods.</p>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>SCAU upsampling submodule</title>
<p>Traditional upsampling modules typically employ bilinear or nearest-neighbor interpolation. While computationally efficient, these methods often cause information loss and spatial blurring during feature map reconstruction. Particularly in rice pest detection tasks, where pest targets are typically small in volume, morphologically similar, and in complex backgrounds, such information loss significantly affects detection precision. To address these issues, this paper proposes an upsampling module called Spatial-Channel Augmented Upsampling (SCAU), as shown in <xref ref-type="fig" rid="f11"><bold>Figure&#xa0;11</bold></xref>. This module effectively enhances the model&#x2019;s capability to detect small-sized rice pests and distinguish between similar pest categories by employing a Channel Shuffle mechanism and innovative Multi-Directional Feature Shifting (MDFS) units.</p>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>SCAU module structure.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1748419-g011.tif">
<alt-text content-type="machine-generated">3D bar chart comparing performance metrics for different combinations of components: A, A+B, A+B+C, and A+B+C+D. The vertical axis represents metric values, with a legend indicating blue bars for mAP@0.5 and orange bars for mAP@0.5-0.95. The highest values are observed for A+B+C+D.</alt-text>
</graphic></fig>
<p>The workflow of the SCAU module primarily includes four stages: upsampling, Channel Shuffle, Multi-Directional Feature Shifting, and pointwise convolution. The input feature map <inline-formula>
<mml:math display="inline" id="im90"><mml:mi>X</mml:mi></mml:math></inline-formula> is processed through an upsampling layer combined with depth-separable convolution (DWConv), then processed through the Channel Shuffle mechanism to obtain <inline-formula>
<mml:math display="inline" id="im91"><mml:mrow><mml:mi>X</mml:mi><mml:mo>"</mml:mo></mml:mrow></mml:math></inline-formula>. This operation ensures that channel information from different groups can be thoroughly mixed, enhancing the diversity of feature expression. After channel shuffling, the feature map enters the MDFS unit for spatial shift mixing, followed by further fusion of channel information through pointwise convolution (PWConv) to obtain the final output map.</p>
<p>The MDFS unit is the core innovation of the SCAU module, aimed at enhancing the spatial context perception capability of feature maps without increasing parameter count. For an input feature map <inline-formula>
<mml:math display="inline" id="im92"><mml:mrow><mml:mi>X</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>, the MDFS unit first evenly divides it along the channel dimension into four sub-feature maps, obtaining <inline-formula>
<mml:math display="inline" id="im93"><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mn>4</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula>. Each sub-feature map <inline-formula>
<mml:math display="inline" id="im94"><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mfrac><mml:mi>C</mml:mi><mml:mn>4</mml:mn></mml:mfrac><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im95"><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mn>4</mml:mn></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, contains one-quarter of the channels of the input feature map.</p>
<p>After division, different directions and magnitudes of circular shift operations are applied to these four sub-feature maps, as shown in <xref ref-type="disp-formula" rid="eq14">Equations 14</xref>&#x2013;<xref ref-type="disp-formula" rid="eq17">17</xref>:</p>
<disp-formula id="eq14"><label>(14)</label>
<mml:math display="block" id="M14"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msubsup><mml:mi>X</mml:mi><mml:mn>1</mml:mn><mml:mo>'</mml:mo></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mi mathvariant="script">T</mml:mi><mml:mrow><mml:mi>H</mml:mi><mml:mo>,</mml:mo><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>R</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mi>s</mml:mi><mml:mi>h</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>m</mml:mi><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq15"><label>(15)</label>
<mml:math display="block" id="M15"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msubsup><mml:mi>X</mml:mi><mml:mn>2</mml:mn><mml:mo>'</mml:mo></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mi mathvariant="script">T</mml:mi><mml:mrow><mml:mi>H</mml:mi><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>R</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mi>s</mml:mi><mml:mi>h</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>m</mml:mi><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq16"><label>(16)</label>
<mml:math display="block" id="M16"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msubsup><mml:mi>X</mml:mi><mml:mn>3</mml:mn><mml:mo>'</mml:mo></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mi mathvariant="script">T</mml:mi><mml:mrow><mml:mi>W</mml:mi><mml:mo>,</mml:mo><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>3</mml:mn></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>R</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mi>s</mml:mi><mml:mi>h</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>m</mml:mi><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mn>3</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq17"><label>(17)</label>
<mml:math display="block" id="M17"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msubsup><mml:mi>X</mml:mi><mml:mn>4</mml:mn><mml:mo>'</mml:mo></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mi mathvariant="script">T</mml:mi><mml:mrow><mml:mi>W</mml:mi><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>4</mml:mn></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>R</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>4</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mi>s</mml:mi><mml:mi>h</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>m</mml:mi><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mn>3</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>In these formulas, <inline-formula>
<mml:math display="inline" id="im96"><mml:mrow><mml:mi>R</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:math></inline-formula> represents the circular shift operation, <inline-formula>
<mml:math display="inline" id="im97"><mml:mrow><mml:msub><mml:mi mathvariant="script">T</mml:mi><mml:mrow><mml:mi>H</mml:mi><mml:mo>,</mml:mo><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im98"><mml:mrow><mml:msub><mml:mi mathvariant="script">T</mml:mi><mml:mrow><mml:mi>W</mml:mi><mml:mo>,</mml:mo><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> represent shift transformation functions in height and width dimensions respectively, and <inline-formula>
<mml:math display="inline" id="im99"><mml:mi>s</mml:mi></mml:math></inline-formula> is the shift amount. Specifically, <inline-formula>
<mml:math display="inline" id="im100"><mml:mrow><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>m</mml:mi><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:math></inline-formula> indicates shifting in the height dimension, and <inline-formula>
<mml:math display="inline" id="im101"><mml:mrow><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>m</mml:mi><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:math></inline-formula> indicates shifting in the width dimension. Positive shift values indicate downward or rightward movement of features,&#xa0;while negative shift values indicate upward or leftward movement. After the shift operations, the four sub-feature maps&#xa0;are&#xa0;reconnected along the channel dimension, as shown in <xref ref-type="disp-formula" rid="eq18">Equation 18</xref>:</p>
<disp-formula id="eq18"><label>(18)</label>
<mml:math display="block" id="M18"><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>'</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>'</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo>'</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mn>4</mml:mn></mml:msub><mml:mo>'</mml:mo></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>m</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im102"><mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:math></inline-formula> represents the tensor concatenation operation, and <inline-formula>
<mml:math display="inline" id="im103"><mml:mrow><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>m</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:math></inline-formula> specifies concatenation along the channel dimension. The final output feature map <inline-formula>
<mml:math display="inline" id="im104"><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> has the same shape as the input feature map <inline-formula>
<mml:math display="inline" id="im105"><mml:mi>X</mml:mi></mml:math></inline-formula> but contains enhanced spatial context information.</p>
<p>The SCAU module proposed in this paper addresses small target recognition and similar category discrimination problems in rice pest detection tasks through upsampling combined with depth-separable convolution, channel shuffling mechanism, and spatial shift mixing strategy, effectively enhancing feature map expression capability without significantly increasing computational complexity. Compared to traditional upsampling methods, the SCAU module reduces computation while preserving spatial information by employing depth-separable convolution. The module adopts a channel shuffling mechanism to promote interactive fusion of different channel features, enhancing the model&#x2019;s representation capability.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments</title>
<sec id="s4_1">
<label>4.1</label>
<title>Experimental environment and hyperparameter settings</title>
<p>The hardware configuration for this experiment comprised computing nodes and graphics processors, with specific environmental settings and experimental parameters detailed in <xref ref-type="table" rid="T1"><bold>Tables&#xa0;1</bold></xref>, <xref ref-type="table" rid="T2"><bold>2</bold></xref>. The central processing unit utilized was a 12th Gen Intel(R) Core(TM)i7-12650H, while the graphics processor employed was an NVIDIA GeForce RTX 4060. The software environment was deployed on the Windows 11 operating system, with a programming environment based on Python 3.9 and the PyTorch 11.7 deep learning framework. Specific parameters for model training were configured as follows: iteration period (epoch) was set to 250, batch size to 16, and the optimization algorithm utilized&#xa0;was Stochastic Gradient Descent (SGD) with an initial learning rate of 0.01 and a momentum factor of 0.937, with other parameters adopting default values. Input data dimensions were uniformly adjusted to 640&#xd7;640 pixel resolution through standardized preprocessing.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Experimental platform.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Platform</th>
<th valign="middle" align="center">Name</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">CPU</td>
<td valign="middle" align="center">12th Gen Intel(R) Core(TM)i7-12650H</td>
</tr>
<tr>
<td valign="middle" align="center">GPU</td>
<td valign="middle" align="center">NVIDIA GeForce RTX 4060</td>
</tr>
<tr>
<td valign="middle" align="center">The operating system</td>
<td valign="middle" align="center">Windows 11</td>
</tr>
<tr>
<td valign="middle" align="center">Programming language</td>
<td valign="middle" align="center">Python 3.9</td>
</tr>
<tr>
<td valign="middle" align="center">Deep learning framework</td>
<td valign="middle" align="center">PyTorch 11.7</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Some experimental details of each framework.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Name</th>
<th valign="middle" align="center">Value</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">epoch</td>
<td valign="middle" align="center">250</td>
</tr>
<tr>
<td valign="middle" align="center">Batch size</td>
<td valign="middle" align="center">16</td>
</tr>
<tr>
<td valign="middle" align="center">optimizer</td>
<td valign="middle" align="center">SGD</td>
</tr>
<tr>
<td valign="middle" align="center">Initial learning rate</td>
<td valign="middle" align="center">0.01</td>
</tr>
<tr>
<td valign="middle" align="center">momentum</td>
<td valign="middle" align="center">0.937</td>
</tr>
<tr>
<td valign="middle" align="center">Image size</td>
<td valign="middle" align="center">640&#xd7;640</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Ablation experiments</title>
<p>To verify the effectiveness of the proposed BEAM-YOLO algorithm in rice pest detection tasks, we conducted ablation experiments on four core innovations. Here, A represents the MEN module, B represents the BAFE module, C represents the EM-BFPN module, and D represents the SCAU module. Starting from the baseline model, each innovative module was progressively added to evaluate their independent and combined contributions to detection performance. The ablation experiment results are shown in <xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref>.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Ablation experiment results.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Method</th>
<th valign="middle" align="center">mAP@50</th>
<th valign="middle" align="center">mAP@50-95</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">FLOPS</th>
<th valign="middle" align="center">Params</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">BaseLine</td>
<td valign="middle" align="center">83.3 &#xb1; 0.5%</td>
<td valign="middle" align="center">69.7 &#xb1; 1.1%</td>
<td valign="middle" align="center">85.1 &#xb1; 0.3%</td>
<td valign="middle" align="center">74.9 &#xb1; 0.6%</td>
<td valign="middle" align="center">6.3G</td>
<td valign="middle" align="center">2.5M</td>
</tr>
<tr>
<td valign="middle" align="center">A</td>
<td valign="middle" align="center">83.7 &#xb1; 0.6%</td>
<td valign="middle" align="center">69.9 &#xb1; 0.8%</td>
<td valign="middle" align="center">80.5 &#xb1; 0.7%</td>
<td valign="middle" align="center">78.3 &#xb1; 0.8%</td>
<td valign="middle" align="center">6.5G</td>
<td valign="middle" align="center">2.56M</td>
</tr>
<tr>
<td valign="middle" align="center">B</td>
<td valign="middle" align="center">83.5 &#xb1; 0.5%</td>
<td valign="middle" align="center">69.8 &#xb1; 1.0%</td>
<td valign="middle" align="center">86.2 &#xb1; 0.6%</td>
<td valign="middle" align="center">76.1 &#xb1; 0.6%</td>
<td valign="middle" align="center">8.6G</td>
<td valign="middle" align="center">5.20M</td>
</tr>
<tr>
<td valign="middle" align="center">C</td>
<td valign="middle" align="center">84.1 &#xb1; 0.6%</td>
<td valign="middle" align="center">70.3 &#xb1; 1.0%</td>
<td valign="middle" align="center">82.7 &#xb1; 0.6%</td>
<td valign="middle" align="center">76.8 &#xb1; 0.7%</td>
<td valign="middle" align="center">6.8G</td>
<td valign="middle" align="center">2.14M</td>
</tr>
<tr>
<td valign="middle" align="center">D</td>
<td valign="middle" align="center">83.9 &#xb1; 0.5%</td>
<td valign="middle" align="center">70.0 &#xb1; 0.7%</td>
<td valign="middle" align="center">84.2 &#xb1; 0.8%</td>
<td valign="middle" align="center">75.7 &#xb1; 0.6%</td>
<td valign="middle" align="center">6.9G</td>
<td valign="middle" align="center">2.70M</td>
</tr>
<tr>
<td valign="middle" align="center">A+C</td>
<td valign="middle" align="center">85.2 &#xb1; 0.3%</td>
<td valign="middle" align="center">71.5 &#xb1; 0.9%</td>
<td valign="middle" align="center">82.3 &#xb1; 0.5%</td>
<td valign="middle" align="center">79.1 &#xb1; 1.0%</td>
<td valign="middle" align="center">6.6G</td>
<td valign="middle" align="center">2.09M</td>
</tr>
<tr>
<td valign="middle" align="center">A+D</td>
<td valign="middle" align="center">84.5 &#xb1; 0.5%</td>
<td valign="middle" align="center">70.8 &#xb1; 0.5%</td>
<td valign="middle" align="center">82.7 &#xb1; 0.6%</td>
<td valign="middle" align="center">79.5 &#xb1; 0.8%</td>
<td valign="middle" align="center">6.9G</td>
<td valign="middle" align="center">2.65M</td>
</tr>
<tr>
<td valign="middle" align="center">B+C</td>
<td valign="middle" align="center">85.7 &#xb1; 0.5%</td>
<td valign="middle" align="center">71.7 &#xb1; 0.9%</td>
<td valign="middle" align="center">87.2 &#xb1; 0.3%</td>
<td valign="middle" align="center">77.5 &#xb1; 0.6%</td>
<td valign="middle" align="center">8.7G</td>
<td valign="middle" align="center">4.72M</td>
</tr>
<tr>
<td valign="middle" align="center">B+D</td>
<td valign="middle" align="center">85.3 &#xb1; 0.3%</td>
<td valign="middle" align="center">71.4 &#xb1; 0.8%</td>
<td valign="middle" align="center">87.5 &#xb1; 0.5%</td>
<td valign="middle" align="center">77.2 &#xb1; 0.7%</td>
<td valign="middle" align="center">9.0G</td>
<td valign="middle" align="center">5.28M</td>
</tr>
<tr>
<td valign="middle" align="center">A+B+C</td>
<td valign="middle" align="center">86.0 &#xb1; 0.3%</td>
<td valign="middle" align="center">72.0 &#xb1; 0.8%</td>
<td valign="middle" align="center">81.5 &#xb1; 0.5%</td>
<td valign="middle" align="center">78.0 &#xb1; 0.8%</td>
<td valign="middle" align="center">8.7G</td>
<td valign="middle" align="center">4.67M</td>
</tr>
<tr>
<td valign="middle" align="center">A+B+D</td>
<td valign="middle" align="center">85.9 &#xb1; 0.4%</td>
<td valign="middle" align="center">72.3 &#xb1; 1.0%</td>
<td valign="middle" align="center">85.4 &#xb1; 0.5%</td>
<td valign="middle" align="center">79.3 &#xb1; 0.8%</td>
<td valign="middle" align="center">9.0G</td>
<td valign="middle" align="center">5.23M</td>
</tr>
<tr>
<td valign="middle" align="center">B+C+D</td>
<td valign="middle" align="center">86.2 &#xb1; 0.6%</td>
<td valign="middle" align="center">72.4 &#xb1; 0.7%</td>
<td valign="middle" align="center">86.8 &#xb1; 0.6%</td>
<td valign="middle" align="center">78.4 &#xb1; 0.6%</td>
<td valign="middle" align="center">8.8G</td>
<td valign="middle" align="center">4.73M</td>
</tr>
<tr>
<td valign="middle" align="center">A+B+C+D</td>
<td valign="middle" align="center">86.6 &#xb1; 0.5%</td>
<td valign="middle" align="center">72.7 &#xb1; 0.9%</td>
<td valign="middle" align="center">83.3 &#xb1; 0.5%</td>
<td valign="middle" align="center">78.8 &#xb1; 0.7%</td>
<td valign="middle" align="center">8.8G</td>
<td valign="middle" align="center">4.68M</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The experimental results clearly demonstrate the significant performance improvement brought by the proposed innovative modules. When all four modules were combined, our proposed BEAM-YOLO model achieved 86.6% mAP@50, a 3.3% improvement over the baseline, and reached 72.7% under the more stringent mAP@50&#x2013;95 evaluation criterion, a 3.0% improvement. Notably, the combination of the MEN module and EM-BFPN achieved significant performance improvement while maintaining relatively low computational complexity and parameter count. The BAFE module showed exceptional performance in improving Recall, enabling the model to better detect difficult-to-recognize pest targets. <xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12</bold></xref> illustrates the ablation experiment results for different network component combinations through a three-dimensional bar chart, intuitively presenting the performance of each configuration on two key metrics: mAP@50 and mAP@50-95. A clear progressive improvement trend in model performance can be observed as components are accumulated.</p>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Ablation parameter bar chart.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1748419-g012.tif">
<alt-text content-type="machine-generated">Three-dimensional bar chart of ablation experiment results for combinations of the modules A (MEN), B (BAFE), C (EM-BFPN), and D (SCAU), comparing mAP@50 and mAP@50-95 for each configuration. Performance improves progressively as components are accumulated, with the full A+B+C+D configuration reaching the highest values.</alt-text>
</graphic></fig>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Comparative experiments</title>
<sec id="s4_3_1">
<label>4.3.1</label>
<title>Comparison of different backbones</title>
<p>To verify the effectiveness of the proposed improved YOLOv11 backbone network in rice pest detection tasks, we conducted comprehensive comparative experiments. Four mainstream backbone networks in the current object detection field were selected, including Resnet18 (<xref ref-type="bibr" rid="B18">Odusami et&#xa0;al., 2021</xref>), Repvit (<xref ref-type="bibr" rid="B27">Wang et&#xa0;al., 2024a</xref>), EfficientViT (<xref ref-type="bibr" rid="B15">Liu et&#xa0;al., 2023</xref>), and Unireplknet (<xref ref-type="bibr" rid="B5">Ding et&#xa0;al., 2024</xref>), for performance comparison with our improved network that integrates the MEN and BAFE modules. Through these comparative experiments, we aimed to evaluate the enhancement effect of the proposed multi-scale edge enhancement and contrastive wavelet attention mechanism on rice pest detection performance.</p>
<p>The experimental results show that our proposed backbone network achieved significant advantages across all metrics. As shown in <xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref>, our improved backbone network model demonstrated optimal performance, with mAP@50 improved by 1.5%, mAP@50&#x2013;95 by 1.5%, recall rate by 1.6%, and precision by 3.7% compared to the baseline YOLOv11 model. Meanwhile, our model used only 5.1M parameters with a computational load of 8.6G, reducing parameter count by 60.8% and computational load by 74.4% compared to Resnet18, and reducing parameter count by 20.3% and computational load by 49.4% compared to Repvit. These results fully validate the effectiveness of the backbone network reconstituted with the MEN module and BAFE module, where the MEN module significantly enhanced detection capability for pests of different sizes through multi-scale feature extraction and edge enhancement, while the BAFE module improved recognition precision for morphologically variable and tiny rice pests through its dual attention mechanism.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Comparison experiment results of different backbones.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Method</th>
<th valign="middle" align="center">mAP@50</th>
<th valign="middle" align="center">mAP@50-95</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">FLOPS</th>
<th valign="middle" align="center">Params</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">BaseLine</td>
<td valign="middle" align="center">83.3 &#xb1; 0.5%</td>
<td valign="middle" align="center">69.7 &#xb1; 1.1%</td>
<td valign="middle" align="center">85.1 &#xb1; 0.3%</td>
<td valign="middle" align="center">74.9 &#xb1; 0.6%</td>
<td valign="middle" align="center">6.3G</td>
<td valign="middle" align="center">2.5M</td>
</tr>
<tr>
<td valign="middle" align="center">Resnet18</td>
<td valign="middle" align="center">82.6 &#xb1; 0.7%</td>
<td valign="middle" align="center">68.3 &#xb1; 0.7%</td>
<td valign="middle" align="center">86.1 &#xb1; 0.4%</td>
<td valign="middle" align="center">68.4 &#xb1; 1.0%</td>
<td valign="middle" align="center">33.6G</td>
<td valign="middle" align="center">13.0M</td>
</tr>
<tr>
<td valign="middle" align="center">Repvit</td>
<td valign="middle" align="center">82.7 &#xb1; 0.6%</td>
<td valign="middle" align="center">70.7 &#xb1; 0.5%</td>
<td valign="middle" align="center">77.2 &#xb1; 0.7%</td>
<td valign="middle" align="center">76.1 &#xb1; 0.6%</td>
<td valign="middle" align="center">17.0G</td>
<td valign="middle" align="center">6.4M</td>
</tr>
<tr>
<td valign="middle" align="center">EfficientViT</td>
<td valign="middle" align="center">81.9 &#xb1; 0.5%</td>
<td valign="middle" align="center">68.0 &#xb1; 0.9%</td>
<td valign="middle" align="center">82.8 &#xb1; 0.8%</td>
<td valign="middle" align="center">76.6 &#xb1; 0.8%</td>
<td valign="middle" align="center">7.9G</td>
<td valign="middle" align="center">3.7M</td>
</tr>
<tr>
<td valign="middle" align="center">Unireplknet</td>
<td valign="middle" align="center">79.6 &#xb1; 0.8%</td>
<td valign="middle" align="center">65.2 &#xb1; 0.6%</td>
<td valign="middle" align="center">86.0 &#xb1; 0.3%</td>
<td valign="middle" align="center">66.9 &#xb1; 0.6%</td>
<td valign="middle" align="center">14.1G</td>
<td valign="middle" align="center">5.8M</td>
</tr>
<tr>
<td valign="middle" align="center">Ours</td>
<td valign="middle" align="center">84.8 &#xb1; 0.6%</td>
<td valign="middle" align="center">71.2 &#xb1; 0.9%</td>
<td valign="middle" align="center">86.7 &#xb1; 0.6%</td>
<td valign="middle" align="center">78.6 &#xb1; 0.6%</td>
<td valign="middle" align="center">8.6G</td>
<td valign="middle" align="center">5.1M</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To further verify the feature extraction performance of our proposed model, we generated feature response heatmaps for various backbone networks on the rice pest dataset, as shown in <xref ref-type="fig" rid="f13"><bold>Figure&#xa0;13</bold></xref>. The visualization results quantitatively demonstrate the significant advantage of our proposed backbone network integrating MEN and BAFE modules in target feature extraction. Compared to the control group networks, our proposed network architecture exhibited higher feature selectivity, more precise localization of pest target regions, higher intensity of concentrated activation, and clearer boundary definition in the heatmaps. This result further confirms that the multi-scale edge enhancement mechanism of the proposed MEN module and the foreground-background contrastive attention mechanism of the BAFE module can work synergistically to effectively improve the detection performance of small rice pest targets.</p>
<fig id="f13" position="float">
<label>Figure&#xa0;13</label>
<caption>
<p>Heatmap visualization of different backbone networks.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1748419-g013.tif">
<alt-text content-type="machine-generated">Heatmap visualizations comparing feature responses of different backbone networks on rice pest images. The proposed backbone integrating the MEN and BAFE modules shows more concentrated, higher-intensity activation on pest target regions with clearer boundary definition than the comparison networks.</alt-text>
</graphic></fig>
</sec>
<sec id="s4_3_2">
<label>4.3.2</label>
<title>Comparison of different Neck-FPNs</title>
<p>To verify the effectiveness of our proposed neck network in rice pest detection tasks, we compared five mainstream neck network architectures, including Slim-neck (<xref ref-type="bibr" rid="B12">Li H. et&#xa0;al., 2022</xref>), MAFPN (<xref ref-type="bibr" rid="B35">Zhang et&#xa0;al., 2023</xref>), GFPN (<xref ref-type="bibr" rid="B38">Zhao et&#xa0;al., 2021</xref>), EfficientRepBiPAN (<xref ref-type="bibr" rid="B34">Ye et&#xa0;al., 2024</xref>), and Bifpn (<xref ref-type="bibr" rid="B3">Chen et&#xa0;al., 2021</xref>), with the experimental results shown in <xref ref-type="table" rid="T5"><bold>Table&#xa0;5</bold></xref>. All comparative experiments were conducted based on the same backbone network and detection head, ensuring fair comparison of the performance differences between various neck networks. The experiments focused on evaluating various model metrics, comprehensively analyzing the advantages and limitations of different neck networks in detecting small targets and multi-morphological rice pests.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Experiment results of different neck networks.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Method</th>
<th valign="middle" align="center">mAP@50</th>
<th valign="middle" align="center">mAP@50-95</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">FLOPS</th>
<th valign="middle" align="center">Params</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">BaseLine</td>
<td valign="middle" align="center">83.3 &#xb1; 0.5%</td>
<td valign="middle" align="center">69.7 &#xb1; 1.1%</td>
<td valign="middle" align="center">85.1 &#xb1; 0.3%</td>
<td valign="middle" align="center">74.9 &#xb1; 0.6%</td>
<td valign="middle" align="center">6.3G</td>
<td valign="middle" align="center">2.5M</td>
</tr>
<tr>
<td valign="middle" align="center">Slim-neck</td>
<td valign="middle" align="center">81.6 &#xb1; 0.5%</td>
<td valign="middle" align="center">68.0 &#xb1; 0.9%</td>
<td valign="middle" align="center">86.0 &#xb1; 0.3%</td>
<td valign="middle" align="center">70.0 &#xb1; 0.7%</td>
<td valign="middle" align="center">5.9G</td>
<td valign="middle" align="center">2.5M</td>
</tr>
<tr>
<td valign="middle" align="center">MAFPN</td>
<td valign="middle" align="center">77.9 &#xb1; 1.0%</td>
<td valign="middle" align="center">65.3 &#xb1; 0.7%</td>
<td valign="middle" align="center">78.5 &#xb1; 0.9%</td>
<td valign="middle" align="center">69.7 &#xb1; 1.1%</td>
<td valign="middle" align="center">7.1G</td>
<td valign="middle" align="center">2.6M</td>
</tr>
<tr>
<td valign="middle" align="center">GFPN</td>
<td valign="middle" align="center">80.5 &#xb1; 0.7%</td>
<td valign="middle" align="center">67.9 &#xb1; 1.0%</td>
<td valign="middle" align="center">81.0 &#xb1; 0.5%</td>
<td valign="middle" align="center">72.4 &#xb1; 0.7%</td>
<td valign="middle" align="center">8.2G</td>
<td valign="middle" align="center">3.6M</td>
</tr>
<tr>
<td valign="middle" align="center">EfficientRepBiPAN</td>
<td valign="middle" align="center">77.5 &#xb1; 0.6%</td>
<td valign="middle" align="center">62.9 &#xb1; 1.0%</td>
<td valign="middle" align="center">78.8 &#xb1; 0.7%</td>
<td valign="middle" align="center">68.0 &#xb1; 0.9%</td>
<td valign="middle" align="center">7.8G</td>
<td valign="middle" align="center">3.1M</td>
</tr>
<tr>
<td valign="middle" align="center">Bifpn</td>
<td valign="middle" align="center">77.9 &#xb1; 1.0%</td>
<td valign="middle" align="center">64.8 &#xb1; 0.9%</td>
<td valign="middle" align="center">76.0 &#xb1; 0.6%</td>
<td valign="middle" align="center">72.0 &#xb1; 0.8%</td>
<td valign="middle" align="center">6.3G</td>
<td valign="middle" align="center">1.9M</td>
</tr>
<tr>
<td valign="middle" align="center">Ours</td>
<td valign="middle" align="center">84.6 &#xb1; 0.8%</td>
<td valign="middle" align="center">70.5 &#xb1; 0.7%</td>
<td valign="middle" align="center">81.6 &#xb1; 0.5%</td>
<td valign="middle" align="center">78.4 &#xb1; 0.6%</td>
<td valign="middle" align="center">6.7G</td>
<td valign="middle" align="center">2.1M</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The experimental results prove that our improved neck network outperforms the comparative methods across all performance metrics. Compared to the baseline YOLOv11 model, our approach improved mAP@50 by 1.3%, mAP@50&#x2013;95 by 0.8%, and precision by 3.5%. Particularly noteworthy is that our model uses only 2.1M parameters, 16% fewer than YOLOv11 and 41.7% fewer than GFPN. In terms of computational efficiency, our model&#x2019;s FLOPS is 6.7G, only 6.3% higher than the baseline model, but 18.3% lower than GFPN and 14.1% lower than EfficientRepBiPAN. These results indicate that through the EM-BFPN&#x2019;s adaptive feature fusion mechanism and SCAU&#x2019;s multi-directional feature shifting strategy, our neck network improves rice pest detection precision while maintaining relatively low computational complexity, making it particularly suitable for resource-constrained environments in practical applications.</p>
</sec>
<sec id="s4_3_3">
<label>4.3.3</label>
<title>Comparison of different datasets</title>
<p>To verify the generalization capability of the proposed BEAM-YOLO model, we conducted cross-validation experiments based on the public dataset pest-dc2xk (<xref ref-type="bibr" rid="B22">Serasinghe, 2022</xref>) from the Roboflow Universe platform. This dataset, created by Aryan Serasinghe, contains 1,003 high-quality rice pest images covering 10 common rice pest categories, including yellow rice borer, rice leaf roller, rice leafhopper, rice water weevil, rice gall midge, and other morphologically diverse pest species. These images were collected under varying environmental conditions, lighting intensities, and shooting angles, providing an ideal testing foundation for evaluating model robustness in complex real-world scenarios. We divided the dataset into training, validation, and test sets at an 8:1:1 ratio and conducted comparative experiments with other mainstream object detection algorithms, with experimental results shown in <xref ref-type="table" rid="T6"><bold>Table&#xa0;6</bold></xref>.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Comparative experiments across different datasets.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Dataset</th>
<th valign="middle" align="center">Method</th>
<th valign="middle" align="center">mAP@50</th>
<th valign="middle" align="center">mAP@50-95</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">Precision</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="2" align="center">JRICE-PD</td>
<td valign="middle" align="center">YOLOv11</td>
<td valign="middle" align="center">83.3 &#xb1; 0.5%</td>
<td valign="middle" align="center">69.7 &#xb1; 1.1%</td>
<td valign="middle" align="center">85.1 &#xb1; 0.3%</td>
<td valign="middle" align="center">74.9 &#xb1; 0.6%</td>
</tr>
<tr>
<td valign="middle" align="center">BEAM-YOLO</td>
<td valign="middle" align="center">86.6 &#xb1; 0.5%</td>
<td valign="middle" align="center">72.7 &#xb1; 0.9%</td>
<td valign="middle" align="center">83.3 &#xb1; 0.5%</td>
<td valign="middle" align="center">78.8 &#xb1; 0.7%</td>
</tr>
<tr>
<td valign="middle" rowspan="2" align="center">pest-dc2xk</td>
<td valign="middle" align="center">YOLOv11</td>
<td valign="middle" align="center">79.1 &#xb1; 1.0%</td>
<td valign="middle" align="center">51.2 &#xb1; 1.0%</td>
<td valign="middle" align="center">85.7 &#xb1; 0.5%</td>
<td valign="middle" align="center">74.2 &#xb1; 0.5%</td>
</tr>
<tr>
<td valign="middle" align="center">BEAM-YOLO</td>
<td valign="middle" align="center">82.6 &#xb1; 0.7%</td>
<td valign="middle" align="center">53.5 &#xb1; 1.0%</td>
<td valign="middle" align="center">84.8 &#xb1; 0.6%</td>
<td valign="middle" align="center">78.6 &#xb1; 0.6%</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The experimental results demonstrate that BEAM-YOLO outperforms the original YOLOv11 across all evaluation metrics, fully validating the effectiveness of our four innovative components. On the challenging pest-dc2xk dataset, BEAM-YOLO also performed excellently, achieving an mAP@50 of 82.6%, a 3.5% improvement over YOLOv11&#x2019;s 79.1%; precision also increased from 74.2% to 78.6%, a growth of 4.4%. These results confirm that our MEN module effectively enhanced the ability to extract pest edge features, the BAFE module significantly improved foreground-background feature discrimination, the EM-BFPN optimized fusion efficiency of features at different scales, and the SCAU module improved detection precision for small target pests. Particularly in complex agricultural backgrounds, BEAM-YOLO demonstrated stronger recognition capability and environmental adaptability.</p>
</sec>
<sec id="s4_3_4">
<label>4.3.4</label>
<title>Comparison of different models</title>
<p>To verify the effectiveness of the proposed BEAM-YOLO model in rice pest detection tasks, we compared current mainstream object detection algorithms, including Transformer-based RTDETR series, the classic SSD, and YOLO series models, with our BEAM-YOLO model. To ensure fair comparison, all models were retrained from scratch on the JRICE-PD dataset under identical experimental conditions: 250 epochs, 640&#xd7;640 input resolution, batch size of 16, SGD optimizer with initial learning rate of 0.01, and the same data augmentation pipeline. The experimental results are shown in <xref ref-type="table" rid="T7"><bold>Table&#xa0;7</bold></xref>, comprehensively evaluating the performance of each model in terms of detection precision, recall rate, computational efficiency, and other aspects.</p>
<table-wrap id="T7" position="float">
<label>Table&#xa0;7</label>
<caption>
<p>Comparison experiment results of different models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Method</th>
<th valign="middle" align="center">Type</th>
<th valign="middle" align="center">mAP@50</th>
<th valign="middle" align="center">mAP@50-95</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">FLOPS</th>
<th valign="middle" align="center">Params</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">RTDETR-R18 (<xref ref-type="bibr" rid="B40">Zhu and Kong, 2024</xref>)</td>
<td valign="middle" align="center">DETR</td>
<td valign="middle" align="center">73.9 &#xb1; 0.9%</td>
<td valign="middle" align="center">51.2 &#xb1; 1.0%</td>
<td valign="middle" align="center">69.1 &#xb1; 0.6%</td>
<td valign="middle" align="center">84.8 &#xb1; 0.6%</td>
<td valign="middle" align="center">57.0G</td>
<td valign="middle" align="center">19.9M</td>
</tr>
<tr>
<td valign="middle" align="center">RTDETR-R50 (<xref ref-type="bibr" rid="B37">Zhao et&#xa0;al., 2024</xref>)</td>
<td valign="middle" align="center">DETR</td>
<td valign="middle" align="center">69.7 &#xb1; 1.1%</td>
<td valign="middle" align="center">47.5 &#xb1; 1.2%</td>
<td valign="middle" align="center">72.9 &#xb1; 0.8%</td>
<td valign="middle" align="center">78.6 &#xb1; 0.6%</td>
<td valign="middle" align="center">129.6G</td>
<td valign="middle" align="center">42.0M</td>
</tr>
<tr>
<td valign="middle" align="center">RTDETR-L (<xref ref-type="bibr" rid="B8">Jun et&#xa0;al., 2024</xref>)</td>
<td valign="middle" align="center">DETR</td>
<td valign="middle" align="center">70.5 &#xb1; 0.7%</td>
<td valign="middle" align="center">47.9 &#xb1; 0.8%</td>
<td valign="middle" align="center">77.2 &#xb1; 0.7%</td>
<td valign="middle" align="center">81.3 &#xb1; 0.4%</td>
<td valign="middle" align="center">103.5G</td>
<td valign="middle" align="center">33.0M</td>
</tr>
<tr>
<td valign="middle" align="center">SSD (<xref ref-type="bibr" rid="B14">Liu et&#xa0;al., 2016</xref>)</td>
<td valign="middle" align="center">One-Stage</td>
<td valign="middle" align="center">82.2 &#xb1; 0.8%</td>
<td valign="middle" align="center">64.4 &#xb1; 0.7%</td>
<td valign="middle" align="center">77.3 &#xb1; 0.6%</td>
<td valign="middle" align="center">82.1 &#xb1; 0.8%</td>
<td valign="middle" align="center">61.2G</td>
<td valign="middle" align="center">24.8M</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv3-tiny (<xref ref-type="bibr" rid="B4">Cheng et&#xa0;al., 2021</xref>)</td>
<td valign="middle" align="center">One-Stage</td>
<td valign="middle" align="center">81.9 &#xb1; 0.5%</td>
<td valign="middle" align="center">64.2 &#xb1; 1.1%</td>
<td valign="middle" align="center">83.1%</td>
<td valign="middle" align="center">73.2 &#xb1; 0.9%</td>
<td valign="middle" align="center">18.9G</td>
<td valign="middle" align="center">12.1M</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5 (<xref ref-type="bibr" rid="B31">Wu et&#xa0;al., 2021</xref>)</td>
<td valign="middle" align="center">One-Stage</td>
<td valign="middle" align="center">80.2 &#xb1; 0.7%</td>
<td valign="middle" align="center">65.6 &#xb1; 0.9%</td>
<td valign="middle" align="center">76.0 &#xb1; 0.6%</td>
<td valign="middle" align="center">75.2 &#xb1; 0.5%</td>
<td valign="middle" align="center">7.1G</td>
<td valign="middle" align="center">25.0M</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv6 (<xref ref-type="bibr" rid="B13">Li C. et&#xa0;al., 2022</xref>)</td>
<td valign="middle" align="center">One-Stage</td>
<td valign="middle" align="center">74.8%</td>
<td valign="middle" align="center">61.0%</td>
<td valign="middle" align="center">78.3%</td>
<td valign="middle" align="center">67.6%</td>
<td valign="middle" align="center">11.5G</td>
<td valign="middle" align="center">4.1M</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv8 (<xref ref-type="bibr" rid="B23">Sohan et&#xa0;al., 2024</xref>)</td>
<td valign="middle" align="center">One-Stage</td>
<td valign="middle" align="center">81.6 &#xb1; 0.5%</td>
<td valign="middle" align="center">68.0 &#xb1; 0.9%</td>
<td valign="middle" align="center">78.4 &#xb1; 0.6%</td>
<td valign="middle" align="center">73.6 &#xb1; 0.8%</td>
<td valign="middle" align="center">8.1G</td>
<td valign="middle" align="center">3.0M</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv9 (<xref ref-type="bibr" rid="B30">Wang C-Y. et&#xa0;al., 2024</xref>)</td>
<td valign="middle" align="center">One-Stage</td>
<td valign="middle" align="center">84.4 &#xb1; 0.6%</td>
<td valign="middle" align="center">72.5 &#xb1; 0.8%</td>
<td valign="middle" align="center">83.3 &#xb1; 0.5%</td>
<td valign="middle" align="center">76.1 &#xb1; 0.6%</td>
<td valign="middle" align="center">87.2G</td>
<td valign="middle" align="center">21.1M</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv10 (<xref ref-type="bibr" rid="B28">Wang et&#xa0;al., 2024b</xref>)</td>
<td valign="middle" align="center">One-Stage</td>
<td valign="middle" align="center">81.6 &#xb1; 0.5%</td>
<td valign="middle" align="center">69.0 &#xb1; 1.1%</td>
<td valign="middle" align="center">83.9 &#xb1; 0.5%</td>
<td valign="middle" align="center">72.7 &#xb1; 0.9%</td>
<td valign="middle" align="center">6.5G</td>
<td valign="middle" align="center">2.2M</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv11</td>
<td valign="middle" align="center">One-Stage</td>
<td valign="middle" align="center">83.3 &#xb1; 0.5%</td>
<td valign="middle" align="center">69.7 &#xb1; 1.1%</td>
<td valign="middle" align="center">85.1 &#xb1; 0.3%</td>
<td valign="middle" align="center">74.9 &#xb1; 0.6%</td>
<td valign="middle" align="center">6.3G</td>
<td valign="middle" align="center">2.5M</td>
</tr>
<tr>
<td valign="middle" align="center">BEAM-YOLO</td>
<td valign="middle" align="center">One-Stage</td>
<td valign="middle" align="center">86.6 &#xb1; 0.5%</td>
<td valign="middle" align="center">72.7 &#xb1; 0.9%</td>
<td valign="middle" align="center">83.3 &#xb1; 0.5%</td>
<td valign="middle" align="center">78.8 &#xb1; 0.7%</td>
<td valign="middle" align="center">8.8G</td>
<td valign="middle" align="center">4.6M</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>All models in <xref ref-type="table" rid="T7"><bold>Table&#xa0;7</bold></xref> were retrained by the authors using the hyperparameters specified in Section 4.1.1. Model-specific configurations are provided in the supplementary YAML files.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>The experimental results clearly demonstrate that the proposed BEAM-YOLO model achieved optimal performance in rice pest detection tasks. On the key metric mAP@50, BEAM-YOLO reached a detection precision of 86.6%, significantly outperforming all comparative models. Compared to Transformer-based RTDETR series, BEAM-YOLO&#x2019;s mAP@50 surpassed RTDETR-R18 by 12.7% while its computational load was only 15.4% of the latter; compared to the traditional SSD model, it improved by 4.4% with a parameter count of only 18.5%. Within the YOLO series, BEAM-YOLO improved by 3.3% over YOLOv11&#x2019;s 83.3%; on the mAP@50&#x2013;95 metric reflecting the model&#x2019;s scale adaptability, BEAM-YOLO reached 72.7%, representing improvements of 3.0% and 8.3% over YOLOv11&#x2019;s 69.7% and SSD&#x2019;s 64.4%, respectively. Most significantly, BEAM-YOLO maintained high detection precision while substantially reducing computational burden compared to models with similar performance, with only 8.8G FLOPS and 4.6M parameters, dramatically lower than YOLOv9.</p>
<p>Comprehensive analysis indicates that the BEAM-YOLO model effectively captured pest morphological features through the MEN edge enhancement module, successfully separated pest targets from complex agricultural backgrounds through the BAFE module, while the EM-BFPN and SCAU modules enhanced detection capability for multi-scale pests, jointly constructing a rice pest detection algorithm with high precision and strong generalization ability, achieving significant improvement in detection performance while maintaining computational efficiency. <xref ref-type="fig" rid="f14"><bold>Figure&#xa0;14</bold></xref> shows a scatter plot of the relationship between detection performance and computational complexity for different YOLO model variants. From the figure, it can be clearly observed that the BEAM-YOLO model exhibits excellent performance in the trade-off between performance and efficiency.</p>
<fig id="f14" position="float">
<label>Figure&#xa0;14</label>
<caption>
<p>Scatter plot of different model parameters.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1748419-g014.tif">
<alt-text content-type="machine-generated">Scatter plot showing the relationship between detection performance and computational complexity for different YOLO model variants. Each point represents one model, plotting its detection accuracy against its computational cost; BEAM-YOLO occupies a favorable position, indicating a strong trade-off between performance and efficiency compared with the other models.</alt-text>
</graphic></fig>
</sec>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Visualization comparison of detection results</title>
<p>To comprehensively evaluate the practical performance of the BEAM-YOLO model in rice pest detection tasks, this study comparatively analyzed the detection results of this model and mainstream YOLO series models in multiple typical scenarios, as shown in <xref ref-type="fig" rid="f15"><bold>Figure&#xa0;15</bold></xref>. The experimental design included four representative detection scenarios: (1) Ephydridae pests in green leaf backgrounds, for evaluating model precision in target recognition under similar backgrounds; (2) tiny Hesperiidae pests on green leaves, for testing the model&#x2019;s ability to capture fine morphological features; (3) extremely small-sized Thripidae pests, for examining the sensitivity of detection models to minuscule targets; and (4) multiple Noctuidae pests in complex backgrounds, for verifying model robustness in multi-target complex environments. Through these challenging and representative scenarios, we evaluated the performance differences between BEAM-YOLO, which is constructed from the four innovative modules, and existing YOLO variants.</p>
<fig id="f15" position="float">
<label>Figure&#xa0;15</label>
<caption>
<p>Visualization comparison of different model detection results, where <bold>(A)</bold> represents BEAM-YOLO, <bold>(B)</bold> represents YOLOv11, <bold>(C)</bold> represents YOLOv10, <bold>(D)</bold> represents YOLOv9, <bold>(E)</bold> represents YOLOv8, <bold>(F)</bold> represents YOLOv6, <bold>(G)</bold> represents YOLOv5, <bold>(H)</bold> represents YOLOv3-tiny.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1748419-g015.tif">
<alt-text content-type="machine-generated">Diagram showing a sequence of panels labeled (a) to (h), each with four columns labeled (1) to (4). Column (1) shows insects labeled &#x201c;Ephydridae&#x201d; with confidence scores. Column (2) shows plants labeled &#x201c;Hesperiidae&#x201d; and others, with confidence scores. Column (3) depicts insects labeled &#x201c;Thripidae&#x201d; with confidence scores. Column (4) includes bar graphs with different labels such as &#x201c;Noctuidae&#x201d; and &#x201c;Cecidomyiidae&#x201d; with varying confidence scores. Each panel demonstrates image recognition and classification processes.</alt-text>
</graphic></fig>
<p>The experimental results intuitively demonstrate the detection advantages of the BEAM-YOLO model in various complex scenarios. In scenario (1), BEAM-YOLO&#x2019;s detection confidence for Ephydridae pests reached 0.95, higher than other models, particularly 0.21 higher than YOLOv3-tiny; in scenario (2) with extremely small Hesperiidae targets, BEAM-YOLO&#x2019;s recognition confidence reached 0.95, while YOLOv6 incorrectly identified it as Cecidomyiidae, and YOLOv5 exhibited confusion with double tagging; in scenario (3), all models performed relatively similarly, but BEAM-YOLO still led with the highest confidence of 0.96; in the most challenging scenario (4), BEAM-YOLO successfully detected all Noctuidae targets with more balanced confidence distribution, while YOLOv6 and YOLOv3-tiny misclassified some targets as Ephydridae, and YOLOv10, YOLOv9, and YOLOv5 showed obvious missed detections. These results fully validate the synergistic effect of the innovative modules in BEAM-YOLO: the MEN edge enhancement module precisely captures pest morphological features, the BAFE contrastive wavelet attention effectively separates pest targets from complex backgrounds, and the EM-BFPN feature pyramid network and SCAU upsampling module significantly enhance multi-scale feature extraction and fusion capabilities, giving BEAM-YOLO comprehensive performance advantages in detection tasks involving small targets, similar morphologies, and complex backgrounds like rice pests.</p>
</sec>
</sec>
<sec id="s5" sec-type="conclusions">
<label>5</label>
<title>Conclusion</title>
<p>This study presents BEAM-YOLO, a detection framework that addresses technical bottlenecks in intelligent rice pest detection while achieving an optimal balance between detection accuracy and computational efficiency. The four innovative modules work synergistically to effectively resolve practical application challenges faced by existing algorithms in actual field environments. The MEN module enhances morphological feature capture through multi-scale feature representation and edge enhancement mechanisms. The BAFE module innovatively employs wavelet decomposition combined with a dual attention mechanism, overcoming foreground-background confusion problems in complex agricultural environments. The EM-BFPN network strengthens information exchange between different feature levels through adaptive feature fusion and multi-scale convolution processing. The SCAU upsampling module improves detection sensitivity for small-sized targets by introducing channel shuffling and spatial displacement strategies.</p>
<p>Extensive comparative experiments demonstrate that our proposed model outperforms current state-of-the-art detection algorithms across various evaluation scenarios, showing particularly significant advantages when handling challenging situations such as complex agricultural environments, minute targets, and morphologically similar pests. Future research will focus on exploring model lightweight strategies to enable edge device deployment, integrating temporal information to develop real-time monitoring systems based on video sequences. Additionally, establishing shared data platforms utilizing blockchain technology and promoting industry standards will further facilitate large-scale application of precision agricultural pest control technologies, contributing to food security and sustainable agricultural development.</p>
<p>This study has several limitations that should be acknowledged. First, the JRICE-PD dataset was collected from a single region (Jiangxi Province), which may limit generalization to other geographical areas with different pest species distributions. Second, the dataset exhibits class imbalance (5.4:1 ratio), potentially affecting detection performance on minority classes. Third, all experiments were conducted on static images; real-time performance on video streams and temporal consistency remain to be validated. Fourth, inference latency was not measured on edge devices; preliminary tests on RTX 4060 show 8.2ms per frame, but embedded platform performance requires further investigation.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material. Further inquiries can be directed to the corresponding author.</p></sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>XH: Investigation, Writing &#x2013; original draft, Software, Funding acquisition, Conceptualization, Resources, Writing &#x2013; review &amp; editing, Data curation. RZ: Visualization, Project administration, Supervision, Validation, Software, Writing &#x2013; review &amp; editing.</p></sec>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s10" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s11" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Bailey-Serres</surname> <given-names>J.</given-names></name>
<name><surname>Parker</surname> <given-names>J. E.</given-names></name>
<name><surname>Ainsworth</surname> <given-names>E. A.</given-names></name>
<name><surname>Oldroyd</surname> <given-names>G. E. D.</given-names></name>
<name><surname>Schroeder</surname> <given-names>J. I.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>Genetic strategies for improving crop yields</article-title>. <source>Nature</source> <volume>575</volume>, <fpage>109</fpage>&#x2013;<lpage>118</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41586-019-1679-0</pub-id>, PMID: <pub-id pub-id-type="pmid">31695205</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Chakrabarty</surname> <given-names>S.</given-names></name>
<name><surname>Shashank</surname> <given-names>P. R.</given-names></name>
<name><surname>Deb</surname> <given-names>C. K.</given-names></name>
<name><surname>Haque</surname> <given-names>M. A.</given-names></name>
<name><surname>Thakur</surname> <given-names>P.</given-names></name>
<name><surname>Kamil</surname> <given-names>D.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>Deep learning-based accurate detection of insects and damage in cruciferous crops using YOLOv5</article-title>. <source>Smart Agric. Technol.</source> <volume>9</volume>, <fpage>100663</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.atech.2024.100663</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Chen</surname> <given-names>J.</given-names></name>
<name><surname>Mai</surname> <given-names>H.</given-names></name>
<name><surname>Luo</surname> <given-names>L.</given-names></name>
<name><surname>Chen</surname> <given-names>X.</given-names></name>
<name><surname>Wu</surname> <given-names>K.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>&#x201c;Effective feature fusion network in BIFPN for small object detection&#x201d;</article-title>. in <source>2021 IEEE international conference on image processing (ICIP)</source>. (
<publisher-name>IEEE</publisher-name>), <fpage>699</fpage>&#x2013;<lpage>703</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICIP42928.2021.9506347</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Cheng</surname> <given-names>R.</given-names></name>
<name><surname>He</surname> <given-names>X.</given-names></name>
<name><surname>Zheng</surname> <given-names>Z.</given-names></name>
<name><surname>Wang</surname> <given-names>Z.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Multi-scale safety helmet detection based on SAS-YOLOv3-tiny</article-title>. <source>Appl. Sci.</source> <volume>11</volume>, <fpage>3652</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/app11083652</pub-id>, PMID: <pub-id pub-id-type="pmid">36015821</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ding</surname> <given-names>X.</given-names></name>
<name><surname>Zhang</surname> <given-names>Y.</given-names></name>
<name><surname>Ge</surname> <given-names>Y.</given-names></name>
<name><surname>Zhao</surname> <given-names>S.</given-names></name>
<name><surname>Song</surname> <given-names>L.</given-names></name>
<name><surname>Yue</surname> <given-names>X.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>&#x201c;UniRepLKNet: a universal perception large-kernel ConvNet for audio, video, point cloud, time-series and image recognition.&#x201d;</article-title> in: <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source>, <fpage>5513</fpage>&#x2013;<lpage>5524</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52733.2024.00527</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hasan</surname> <given-names>M. J.</given-names></name>
<name><surname>Mahbub</surname> <given-names>S.</given-names></name>
<name><surname>Alom</surname> <given-names>M. S.</given-names></name>
<name><surname>Nasim</surname> <given-names>M. A.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>&#x201c;Rice disease identification and classification by integrating support vector machine with deep convolutional neural network.&#x201d;</article-title> in: <source>Proceedings of the 2019 1st International Conference on Advances in Science, Engineering and Robotics Technology (ICASERT)</source> (
<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>6</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICASERT46338.2019</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Jiang</surname> <given-names>J.-A.</given-names></name>
<name><surname>Tseng</surname> <given-names>C.-L.</given-names></name>
<name><surname>Lu</surname> <given-names>F.-M.</given-names></name>
<name><surname>Yang</surname> <given-names>E.-C.</given-names></name>
<name><surname>Wu</surname> <given-names>Z.-S.</given-names></name>
<name><surname>Chen</surname> <given-names>C.-P.</given-names></name>
<etal/>
</person-group>. (<year>2008</year>). 
<article-title>A GSM-based remote wireless automatic monitoring system for field information: a case study for ecological monitoring of the oriental fruit fly, Bactrocera dorsalis (Hendel)</article-title>. <source>Comput. Electron. Agric.</source> <volume>62</volume>, <fpage>243</fpage>&#x2013;<lpage>259</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2008.01.005</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Jun</surname> <given-names>E. L. T.</given-names></name>
<name><surname>Tham</surname> <given-names>M.-L.</given-names></name>
<name><surname>Kwan</surname> <given-names>B.-H.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>&#x201c;A comparative analysis of RT-DETR and YOLOv8 for urban zone aerial object detection.&#x201d;</article-title> In: <source>Proceedings of the 2024 IEEE International Conference on Automatic Control and Intelligent Systems (I2CACIS)</source> (
<publisher-name>IEEE</publisher-name>), <fpage>340</fpage>&#x2013;<lpage>345</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/I2CACIS61270.2024.10649836</pub-id> 
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="web">
<person-group person-group-type="author">
<name><surname>Khanam</surname> <given-names>R.</given-names></name>
<name><surname>Hussain</surname> <given-names>M.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Yolov11: an overview of the key architectural enhancements</article-title>. <source>arXiv preprint</source> <volume>arXiv</volume>:<fpage>2410.17725</fpage>. Available online at: <uri xlink:href="https://arxiv.org/abs/2410.17725">https://arxiv.org/abs/2410.17725</uri> (Accessed <date-in-citation content-type="access-date">January 30, 2026</date-in-citation>).
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>D.</given-names></name>
<name><surname>Wang</surname> <given-names>R.</given-names></name>
<name><surname>Xie</surname> <given-names>C.</given-names></name>
<name><surname>Liu</surname> <given-names>L.</given-names></name>
<name><surname>Zhang</surname> <given-names>J.</given-names></name>
<name><surname>Li</surname> <given-names>R.</given-names></name>
<etal/>
</person-group>. (<year>2020</year>). 
<article-title>A recognition method for rice plant diseases and pests video detection based on deep convolutional neural network</article-title>. <source>Sensors</source> <volume>20</volume>, <fpage>578</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s20030578</pub-id>, PMID: <pub-id pub-id-type="pmid">31973039</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>S.</given-names></name>
<name><surname>Feng</surname> <given-names>Z.</given-names></name>
<name><surname>Yang</surname> <given-names>B.</given-names></name>
<name><surname>Li</surname> <given-names>H.</given-names></name>
<name><surname>Liao</surname> <given-names>F.</given-names></name>
<name><surname>Gao</surname> <given-names>Y.</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). 
<article-title>An intelligent monitoring system of diseases and pests on rice canopy</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>, <elocation-id>972286</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.972286</pub-id>, PMID: <pub-id pub-id-type="pmid">36035691</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="web">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>H.</given-names></name>
<name><surname>Li</surname> <given-names>J.</given-names></name>
<name><surname>Wei</surname> <given-names>H.</given-names></name>
<name><surname>Liu</surname> <given-names>Z.</given-names></name>
<name><surname>Zhan</surname> <given-names>Z.</given-names></name>
<name><surname>Ren</surname> <given-names>Q.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Slim-neck by GSConv: a better design paradigm of detector architectures for autonomous vehicles</article-title>. <source>arXiv preprint</source> <volume>arXiv</volume>:<fpage>2206.02424</fpage>. Available online at: <uri xlink:href="https://arxiv.org/abs/2206.02424">https://arxiv.org/abs/2206.02424</uri> (Accessed <date-in-citation content-type="access-date">January 30, 2026</date-in-citation>).
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="web">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>C.</given-names></name>
<name><surname>Li</surname> <given-names>L.</given-names></name>
<name><surname>Jiang</surname> <given-names>H.</given-names></name>
<name><surname>Weng</surname> <given-names>K.</given-names></name>
<name><surname>Geng</surname> <given-names>Y.</given-names></name>
<name><surname>Li</surname> <given-names>L.</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). 
<article-title>YOLOv6: a single-stage object detection framework for industrial applications</article-title>. <source>arXiv preprint</source> <volume>arXiv</volume>:<fpage>2209.02976</fpage>. Available online at: <uri xlink:href="https://arxiv.org/abs/2209.02976">https://arxiv.org/abs/2209.02976</uri> (Accessed <date-in-citation content-type="access-date">January 30, 2026</date-in-citation>).
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>W.</given-names></name>
<name><surname>Anguelov</surname> <given-names>D.</given-names></name>
<name><surname>Erhan</surname> <given-names>D.</given-names></name>
<name><surname>Szegedy</surname> <given-names>C.</given-names></name>
<name><surname>Reed</surname> <given-names>S.</given-names></name>
<name><surname>Fu</surname> <given-names>C.-Y.</given-names></name>
<etal/>
</person-group>. (<year>2016</year>). 
<article-title>&#x201c;SSD: single shot multibox detector.&#x201d;</article-title> In: <source>Computer Vision, ECCV 2016. Lecture Notes in Computer Science</source>, eds. 
<person-group person-group-type="editor">
<name><surname>Leibe</surname> <given-names>B.</given-names></name>
<name><surname>Matas</surname> <given-names>J.</given-names></name>
<name><surname>Sebe</surname> <given-names>N.</given-names></name>
<name><surname>Welling</surname> <given-names>M.</given-names></name>
</person-group> (<publisher-loc>Cham</publisher-loc>: 
<publisher-name>Springer</publisher-name>), <volume>14</volume>, <fpage>21</fpage>&#x2013;<lpage>37</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-319-46448-0_2</pub-id> 
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>X.</given-names></name>
<name><surname>Peng</surname> <given-names>H.</given-names></name>
<name><surname>Zheng</surname> <given-names>N.</given-names></name>
<name><surname>Yang</surname> <given-names>Y.</given-names></name>
<name><surname>Hu</surname> <given-names>H.</given-names></name>
<name><surname>Yuan</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>&#x201c;Efficientvit: Memory efficient vision transformer with cascaded group attention&#x201d;</article-title>, in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>14420</fpage>&#x2013;<lpage>14430</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52729.2023.01386</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Lu</surname> <given-names>Y.</given-names></name>
<name><surname>Li</surname> <given-names>P.</given-names></name>
<name><surname>Wang</surname> <given-names>P.</given-names></name>
<name><surname>Li</surname> <given-names>T.</given-names></name>
<name><surname>Li</surname> <given-names>G.</given-names></name>
</person-group> (<year>2025</year>a). 
<article-title>A method of rice yield prediction based on the QRBILSTM-MHSA network and hyperspectral image</article-title>. <source>Comput. Electron. Agric.</source> <volume>239</volume>, <fpage>110884</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2025.110884</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Lu</surname> <given-names>Y.</given-names></name>
<name><surname>Zhou</surname> <given-names>H.</given-names></name>
<name><surname>Wang</surname> <given-names>P.</given-names></name>
<name><surname>Wang</surname> <given-names>E.</given-names></name>
<name><surname>Li</surname> <given-names>G.</given-names></name>
<name><surname>Yu</surname> <given-names>T.</given-names></name>
</person-group> (<year>2025</year>b). 
<article-title>IMobileTransformer: A fusion-based lightweight model for rice disease identification</article-title>. <source>Eng. Appl. Artif. Intell.</source> <volume>161</volume>, <fpage>112271</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.engappai.2025.112271</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Odusami</surname> <given-names>M.</given-names></name>
<name><surname>Maskeliunas</surname> <given-names>R.</given-names></name>
<name><surname>Dama&#x161;evi&#x10D;ius</surname> <given-names>R.</given-names></name>
<name><surname>Krilavicius</surname> <given-names>T.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Analysis of features of Alzheimer&#x2019;s disease: detection of early stage from functional brain changes in magnetic resonance images using a finetuned ResNet18 network</article-title>. <source>Diagnostics</source> <volume>11</volume>, <fpage>1071</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/diagnostics11061071</pub-id>, PMID: <pub-id pub-id-type="pmid">34200832</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Prasath</surname> <given-names>B.</given-names></name>
<name><surname>Akila</surname> <given-names>M.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>IoT-based pest detection and classification using deep features with enhanced deep learning strategies</article-title>. <source>Eng. Appl. Artif. Intell.</source> <volume>121</volume>, <fpage>105985</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.engappai.2023.105985</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Preti</surname> <given-names>M.</given-names></name>
<name><surname>Verheggen</surname> <given-names>F.</given-names></name>
<name><surname>Angeli</surname> <given-names>S.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Insect pest monitoring with camera-equipped traps: strengths and limitations</article-title>. <source>J. Pest Sci.</source> <volume>94</volume>, <fpage>203</fpage>&#x2013;<lpage>217</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10340-020-01309-4</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Rizzo</surname> <given-names>D. M.</given-names></name>
<name><surname>Lichtveld</surname> <given-names>M.</given-names></name>
<name><surname>Mazet</surname> <given-names>J. A.</given-names></name>
<name><surname>Togami</surname> <given-names>E.</given-names></name>
<name><surname>Miller</surname> <given-names>S. A.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Plant health and its effects on food safety and security in a one health framework: four case studies</article-title>. <source>One Health Outlook</source> <volume>3</volume>, <fpage>6</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s42522-021-00038-7</pub-id>, PMID: <pub-id pub-id-type="pmid">33829143</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="web">
<person-group person-group-type="author">
<name><surname>Serasinghe</surname> <given-names>A.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Pest dataset. Roboflow Universe</article-title>. Available online at: <uri xlink:href="https://universe.roboflow.com/aryan-serasinghe-mwhtn/pest-dc2xk">https://universe.roboflow.com/aryan-serasinghe-mwhtn/pest-dc2xk</uri> (Accessed <date-in-citation content-type="access-date">January 30, 2026</date-in-citation>).
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Sohan</surname> <given-names>M.</given-names></name>
<name><surname>Sai Ram</surname> <given-names>T.</given-names></name>
<name><surname>Rami Reddy</surname> <given-names>C. V.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>&#x201c;A review on YOLOv8 and its advancements.&#x201d;</article-title> In: <source>Data Intelligence and Cognitive Informatics. ICDICI 2023. Algorithms for Intelligent Systems</source>, eds. 
<person-group person-group-type="editor">
<name><surname>Jacob</surname> <given-names>I. J.</given-names></name>
<name><surname>Piramuthu</surname> <given-names>S.</given-names></name>
<name><surname>Falkowski-Gilski</surname> <given-names>P.</given-names></name>
</person-group> (<publisher-loc>Singapore</publisher-loc>: 
<publisher-name>Springer</publisher-name>), <fpage>529</fpage>&#x2013;<lpage>545</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-981-99-7962-2_39</pub-id> 
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Su</surname> <given-names>Y.</given-names></name>
<name><surname>Tan</surname> <given-names>W.</given-names></name>
<name><surname>Dong</surname> <given-names>Y.</given-names></name>
<name><surname>Xu</surname> <given-names>W.</given-names></name>
<name><surname>Huang</surname> <given-names>P.</given-names></name>
<name><surname>Zhang</surname> <given-names>J.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>Enhancing concealed object detection in active millimeter wave images using wavelet transform</article-title>. <source>Signal Process.</source> <volume>216</volume>, <fpage>109303</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.sigpro.2023.109303</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Thenmozhi</surname> <given-names>K.</given-names></name>
<name><surname>Reddy</surname> <given-names>U. S.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>Crop pest classification based on deep convolutional neural network and transfer learning</article-title>. <source>Comput. Electron. Agric.</source> <volume>164</volume>, <fpage>104906</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2019.104906</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Venkateswara</surname> <given-names>S. M.</given-names></name>
<name><surname>Padmanabhan</surname> <given-names>J.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Deep learning based agricultural pest monitoring and classification</article-title>. <source>Sci. Rep.</source> <volume>15</volume>, <fpage>8684</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-025-92659-5</pub-id>, PMID: <pub-id pub-id-type="pmid">40082501</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>A.</given-names></name>
<name><surname>Chen</surname> <given-names>H.</given-names></name>
<name><surname>Lin</surname> <given-names>Z.</given-names></name>
<name><surname>Han</surname> <given-names>J.</given-names></name>
<name><surname>Ding</surname> <given-names>G.</given-names></name>
</person-group> (<year>2024</year>a). 
<article-title>&#x201c;Repvit: Revisiting mobile cnn from vit perspective&#x201d;</article-title>, in: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>15909</fpage>&#x2013;<lpage>15920</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52733.2024.01506</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>A.</given-names></name>
<name><surname>Chen</surname> <given-names>H.</given-names></name>
<name><surname>Liu</surname> <given-names>L.</given-names></name>
<name><surname>Chen</surname> <given-names>K.</given-names></name>
<name><surname>Lin</surname> <given-names>Z.</given-names></name>
<name><surname>Han</surname> <given-names>J.</given-names></name>
</person-group> (<year>2024</year>b). 
<article-title>Yolov10: real-time end-to-end object detection</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>37</volume>, <fpage>107984</fpage>&#x2013;<lpage>108011</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.52202/079017</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>N.</given-names></name>
<name><surname>Fu</surname> <given-names>S.</given-names></name>
<name><surname>Rao</surname> <given-names>Q.</given-names></name>
<name><surname>Zhang</surname> <given-names>G.</given-names></name>
<name><surname>Ding</surname> <given-names>M.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Insect-YOLO: a new method of crop insect detection</article-title>. <source>Comput. Electron. Agric.</source> <volume>232</volume>, <fpage>110085</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2025.110085</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>C.-Y.</given-names></name>
<name><surname>Yeh</surname> <given-names>I.-H.</given-names></name>
<name><surname>Liao</surname> <given-names>H.-Y. M.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Yolov9: learning what you want to learn using programmable gradient information</article-title>. <source>Eur. Conf. Comput. Vis</source>.
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wu</surname> <given-names>W.</given-names></name>
<name><surname>Liu</surname> <given-names>H.</given-names></name>
<name><surname>Li</surname> <given-names>L.</given-names></name>
<name><surname>Long</surname> <given-names>Y.</given-names></name>
<name><surname>Wang</surname> <given-names>X.</given-names></name>
<name><surname>Wang</surname> <given-names>Z.</given-names></name>
<etal/>
</person-group>. (<year>2021</year>). 
<article-title>Application of local fully convolutional neural network combined with YOLO v5 algorithm in small target detection of remote sensing image</article-title>. <source>PloS One</source> <volume>16</volume>, <elocation-id>e0259283</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1371/journal.pone.0259283</pub-id>, PMID: <pub-id pub-id-type="pmid">34714878</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Xiong</surname> <given-names>P.</given-names></name>
<name><surname>Zhang</surname> <given-names>C.</given-names></name>
<name><surname>He</surname> <given-names>L.</given-names></name>
<name><surname>Zhan</surname> <given-names>X.</given-names></name>
<name><surname>Han</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Deep learning-based rice pest detection research</article-title>. <source>PloS One</source> <volume>19</volume>, <elocation-id>e0313387</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1371/journal.pone.0313387</pub-id>, PMID: <pub-id pub-id-type="pmid">39509376</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>G.</given-names></name>
<name><surname>Chen</surname> <given-names>G.</given-names></name>
<name><surname>Li</surname> <given-names>C.</given-names></name>
<name><surname>Fu</surname> <given-names>J.</given-names></name>
<name><surname>Guo</surname> <given-names>Y.</given-names></name>
<name><surname>Liang</surname> <given-names>H.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Convolutional rebalancing network for the classification of large imbalanced rice pest and disease datasets in the field</article-title>. <source>Front. Plant Sci.</source> <volume>12</volume>, <elocation-id>671134</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2021.671134</pub-id>, PMID: <pub-id pub-id-type="pmid">34290724</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ye</surname> <given-names>R.</given-names></name>
<name><surname>Shao</surname> <given-names>G.</given-names></name>
<name><surname>He</surname> <given-names>Y.</given-names></name>
<name><surname>Gao</surname> <given-names>Q.</given-names></name>
<name><surname>Li</surname> <given-names>T.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>YOLOv8-RMDA: lightweight YOLOv8 network for early detection of small target diseases in tea</article-title>. <source>Sensors</source> <volume>24</volume>, <fpage>2896</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s24092896</pub-id>, PMID: <pub-id pub-id-type="pmid">38733002</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>X.</given-names></name>
<name><surname>Guo</surname> <given-names>W.</given-names></name>
<name><surname>Xing</surname> <given-names>Y.</given-names></name>
<name><surname>Wang</surname> <given-names>W.</given-names></name>
<name><surname>Yin</surname> <given-names>H.</given-names></name>
<name><surname>Zhang</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>AugFCOS: augmented fully convolutional one-stage object detection network</article-title>. <source>Pattern Recognit.</source> <volume>134</volume>, <elocation-id>109098</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.patcog.2022.109098</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>Q.-L.</given-names></name>
<name><surname>Yang</surname> <given-names>Y.-B.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Sa-net: shuffle attention for deep convolutional neural networks</article-title>. <source>ICASSP</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICASSP39728.2021.9414568</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhao</surname> <given-names>Y.</given-names></name>
<name><surname>Lv</surname> <given-names>W.</given-names></name>
<name><surname>Xu</surname> <given-names>S.</given-names></name>
<name><surname>Wei</surname> <given-names>J.</given-names></name>
<name><surname>Wang</surname> <given-names>G.</given-names></name>
<name><surname>Dang</surname> <given-names>Q.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>&#x201c;Detrs beat yolos on real-time object detection&#x201d;</article-title>, in: <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>16965</fpage>&#x2013;<lpage>16974</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52733.2024.01605</pub-id>
</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Zhao</surname> <given-names>G.</given-names></name>
<name><surname>Ge</surname> <given-names>W.</given-names></name>
<name><surname>Yu</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>GraphFPN: Graph feature pyramid network for object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name>, <fpage>2763</fpage>&#x2013;<lpage>2772</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV48922.2021.00276</pub-id> 
</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zheng</surname> <given-names>Y.</given-names></name>
<name><surname>Zheng</surname> <given-names>W.</given-names></name>
<name><surname>Du</surname> <given-names>X.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>A lightweight rice pest detection algorithm based on improved YOLOv8</article-title>. <source>Sci. Rep.</source> <volume>14</volume>, <fpage>1</fpage>&#x2013;<lpage>18</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-024-81587-5</pub-id>, PMID: <pub-id pub-id-type="pmid">39623058</pub-id>
</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhu</surname> <given-names>M.</given-names></name>
<name><surname>Kong</surname> <given-names>E.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Multi-scale fusion uncrewed aerial vehicle detection based on RT-DETR</article-title>. <source>Electronics</source> <volume>13</volume>, <fpage>1489</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/electronics13081489</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2878096">Lang Qiao</ext-link>, University of Minnesota Twin Cities, United States</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1060134">Yang Lu</ext-link>, Heilongjiang Bayi Agricultural University, China</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1487422">Nevien Adel Ismaeil</ext-link>, Al-Azhar University, Egypt</p></fn>
</fn-group>
</back>
</article>