<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2026.1775987</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>DFSNet: directional feature aggregation and shape-aware supervision for eggplant pest and disease detection</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Sun</surname><given-names>Hui</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3057750/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Fan</surname><given-names>Weicun</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Zhang</surname><given-names>Junbo</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Feng</surname><given-names>Minghan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname><given-names>Fulin</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Fu</surname><given-names>Rui</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3056321/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Weifang University of Science and Technology</institution>, <city>Weifang</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff2"><label>2</label><institution>Shandong First Medical University &amp; Shandong Academy of Medical Sciences</institution>, <city>Jinan</city>,&#xa0;<country country="cn">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Rui Fu, <email xlink:href="mailto:furui19891209@wfust.edu.cn">furui19891209@wfust.edu.cn</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-09">
<day>09</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>17</volume>
<elocation-id>1775987</elocation-id>
<history>
<date date-type="received">
<day>26</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>20</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>17</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Sun, Fan, Zhang, Feng, Wang and Fu.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Sun, Fan, Zhang, Feng, Wang and Fu</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-09">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>In natural planting environments, pest and disease detection on eggplant fruits is characterized by small lesion sizes, weak edge feature information, significant scale variations, and complex backgrounds. Particularly, fruit borer holes, fruit rot lesions, and melon thrips bite marks exhibit obvious differences in size, edge structure, and spatial distribution, posing considerable challenges for real-time accurate detection. This paper proposes the DFSNet, a lightweight improved network for pest and disease detection on eggplant fruits in natural scenes. First, PConv is introduced in the P1, P2 shallow feature extraction stages of the baseline model&#x2019;s backbone network to enhance the modeling capability for fine-grained directional textures and weak edge information. Subsequently, an MSDA (Multi-Scale Directional Aggregation) module is designed and embedded into the feature enhancement modules at the P3, P4, and P5 layers of the backbone, which effectively improves the perception capability for insect hole edges and lesion contours through multi-directional depthwise separable convolution and Directional Edge Enhancer (DEE). Furthermore, a CSP-MSLA structure is introduced into the neck network, combining multi-scale linear attention mechanism with cross-stage partial connections to achieve selective enhancement of key pest and disease regions while maintaining low computational complexity. Finally, an SDDH (Shape-based Dynamic Detection Head) is introduced, which enhances the model&#x2019;s adaptive capability to different pest and disease geometric features and scale variations by introducing Scale-based Dynamic Loss. Experimental results demonstrate that the model achieves Precision of 81.0%, Recall of 78.3%, and mAP@50 of 80.5% on a self-constructed eggplant pest and disease dataset under natural scenes, representing improvements of 6.9, 8.8%, and 7.8% percentage points respectively compared to the baseline model. 
Meanwhile, the model parameters and computational cost are compressed to 1.8M and 5.4G respectively, with an inference speed of up to 378.13 FPS. The proposed method effectively improves small target detection accuracy and robustness under complex backgrounds while ensuring real-time performance, demonstrating particularly significant advantages in detecting small targets such as fruit borer holes and melon thrips bite marks, proving that this model is an efficient and robust real-time detection model for eggplant fruit pests and diseases.</p>
</abstract>
<kwd-group>
<kwd>deep learning</kwd>
<kwd>edge feature enhancement</kwd>
<kwd>eggplant disease detection</kwd>
<kwd>multi-scale attention</kwd>
<kwd>real-time detection</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. Shandong Provincial Natural Science Foundation (Grant No. ZR2025QC649). Shandong Province Higher Education Institutions 2025 Young Innovative Research Team (Grant No. 2025KJH190).</funding-statement>
</funding-group>
<counts>
<fig-count count="13"/>
<table-count count="5"/>
<equation-count count="29"/>
<ref-count count="36"/>
<page-count count="20"/>
<word-count count="9429"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Sustainable and Intelligent Phytoprotection</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Eggplant (Solanum melongena L.), rich in various vitamins and bioactive substances, is one of the widely cultivated vegetables around the world. However, during the growth cycle, fruit diseases become a significant obstacle to eggplant production, such as internal boring caused by fruit borers, fruit rot caused by fungi or bacteria, and surface scars and banded stripes caused by thrips feeding (such as melon thrips), as shown in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>. These pests and diseases lead to the loss of fruit commercial value, severely causing enormous economic losses, affecting the economic income of plantations and the production enthusiasm of practitioners (<xref ref-type="bibr" rid="B9">Kellab et&#xa0;al., 2025</xref>). Traditional pest and disease diagnosis mainly relies on field inspection and empirical judgment by agricultural experts. This method depends on growers&#x2019; visual observation and production experience, with diagnostic results easily influenced by personal experience and conditions. Meanwhile, it is difficult to achieve rapid inspection over large areas and cannot meet the requirements for accurate identification operations in modern large-scale agricultural production. Therefore, developing more efficient and intelligent detection technologies has become an urgent problem to be solved in modern agriculture.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Fruit borer holes (nearly circular cavities) and banded stripes caused by melon thrips feeding, as well as fruit rot caused by fungi, occurring in eggplant cultivation.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1775987-g001.tif">
<alt-text content-type="machine-generated">Four eggplants on plants, each showing signs of damage. The first is slightly twisted, the second has scratches, the third shows yellow and brown marks, and the fourth has several holes.</alt-text>
</graphic></fig>
<p>Early research was mostly based on traditional machine learning (<xref ref-type="bibr" rid="B1">Agarwal et&#xa0;al., 2020</xref>), performing disease classification through manually designed features, such as lesion segmentation (<xref ref-type="bibr" rid="B23">Spisni et&#xa0;al., 2020</xref>) and SVM classification (<xref ref-type="bibr" rid="B28">Wu et&#xa0;al., 2014</xref>). However, although such methods have high computational efficiency, their feature representation capability is limited, and generalization performance is poor under complex field conditions such as illumination variations and background interference. In contrast, deep learning methods (<xref ref-type="bibr" rid="B4">Fu et&#xa0;al., 2025</xref>) have achieved significant progress in the fields of agricultural monitoring and disease detection, opening up new pathways for intelligent agricultural management. Among them, Convolutional Neural Networks (CNNs) have demonstrated excellent performance in crop disease recognition and classification tasks, with accuracy rates substantially surpassing traditional methods. For example (<xref ref-type="bibr" rid="B2">Ashurov et&#xa0;al., 2025</xref>), proposed a DCNN model integrating depthwise separable convolution, SE module, and improved residual connections, achieving a significant reduction in computational complexity and enhancing disease recognition capability in resource-constrained environments with an accuracy of 99.47% (PlantVillage dataset) (<xref ref-type="bibr" rid="B21">Shafik et&#xa0;al., 2025</xref>). proposed a plant pest and disease detection method based on ResNet-9 deep convolutional neural network, which not only improved detection accuracy to 97.4%, but also effectively alleviated the class imbalance problem in the dataset through data augmentation strategies (<xref ref-type="bibr" rid="B19">Salka et&#xa0;al., 2025</xref>). 
reviewed and compared CNN-based plant disease detection architectures, establishing EfficientNet-B4 with an accuracy of 99.97% as the current accuracy benchmark. Recent studies have explored target perception and image quality enhancement in complex environments (<xref ref-type="bibr" rid="B12">Li et&#xa0;al., 2025a</xref>). introduced a joint detection and tracking framework based on reinforcement learning, which improves the perception of weak targets under heavy clutter (<xref ref-type="bibr" rid="B26">Wang et&#xa0;al., 2023</xref>). addressed underwater image degradation through color compensation and multi-attribute adjustment. Later (<xref ref-type="bibr" rid="B27">Wang et&#xa0;al., 2026</xref>), proposed a multimodal diffusion model to enhance color fidelity and detail representation under limited data conditions. In addition (<xref ref-type="bibr" rid="B13">Li et&#xa0;al., 2025b</xref>), employed graph convolutional networks to exploit echo topology, improving target discrimination in low signal-to-noise scenarios. These studies provide useful insights for perception modeling in challenging environments. Although existing research has made certain progress, disease detection still faces severe challenges in complex agricultural scenarios. As shown in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>, the lesions of fruit rot, fruit borer damage, and melon thrips on eggplant fruits are extremely small in size and easily interfered with by complex backgrounds such as leaf textures, illumination shadows, and fruit surfaces, resulting in extremely low pixel proportions of lesion regions and weak visual features. 
This is because the spatial resolution of single-scale features is insufficient, making it difficult for the model to effectively capture the fine-grained geometric and texture information of small targets, thus causing high miss detection rates and false detection rates, which severely restricts the improvement of detection performance.</p>
<p>To address this, researchers have enhanced the model&#x2019;s representation capability for targets at different scales by introducing multi-scale feature fusion and attention mechanisms (<xref ref-type="bibr" rid="B34">Zhang et&#xa0;al., 2025a</xref>). proposed MAVM-UNet based on multi-scale aggregated vision Mamba, achieving pixel accuracy and MIoU of 82.07% and 81.48% respectively, with performance superior to HCFormer and VM-UNet (<xref ref-type="bibr" rid="B33">Zhang et&#xa0;al., 2023</xref>). designed DBCLNet, a dual-branch collaborative network combining multi-scale convolution and Focal Loss, achieving an accuracy of 99.89% on the PlantVillage dataset, significantly surpassing existing mainstream models. Furthermore, WMC-RTDETR proposed by (<xref ref-type="bibr" rid="B32">Zhang et&#xa0;al., 2025b</xref>) enhanced multi-scale feature extraction by integrating CSRFPN, achieving 97.7% mAP50 while reducing computational cost by 40.42%, enabling real-time detection on edge devices.</p>
<p>However, although multi-scale feature fusion and attention mechanisms have achieved positive progress, existing methods still have the following limitations: (1) channel and spatial attention based on global pooling are difficult to effectively model cross-scale high-dimensional feature dependencies; (2) complex attention structures are sensitive to noisy features, affecting the robustness of small target detection; (3) the introduction of multi-scale structures often leads to a significant increase in network parameters and computational complexity, which is unfavorable for lightweight deployment and real-time applications.</p>
<p>To address the above problems, this paper proposes a lightweight detection framework integrating edge feature enhancement, multi-scale feature modeling, and efficient feature selection. The main contributions of this study are as follows:</p>
<list list-type="bullet">
<list-item>
<p>PConv backbone design for shallow detail perception. To address the problems of blurred edges and weak directional texture features of pest and disease targets, this paper introduces PConv (Pinwheel Convolution) in the shallow stages (P1&#x2013;P2) of the lightweight detection network. This module models local directional structural information in parallel through multi-directional asymmetric convolution kernels. It significantly enhances the representation capability of direction-sensitive features with minimal parameter increase, effectively improving the feature representation quality of small insect holes and early lesions, and lays a reliable low-level feature foundation for subsequent multi-scale feature fusion.</p></list-item>
<list-item>
<p>MSDA structure for multi-directional multi-scale feature aggregation. To address the limitations of the C3K2 module in directional information modeling, this paper designs the MSDA (Multi-Scale Directional Aggregation) structure and embeds it into C3K2. This module models horizontal and vertical structural information in parallel through multi-directional depthwise separable convolution branches, while introducing a dual-branch edge enhancement module (DEE) to explicitly enhance the edge response of pest and disease targets, thereby improving the model&#x2019;s recognition capability for fruit rot lesion contours and irregular insect bite marks.</p></list-item>
<list-item>
<p>Lightweight attention-driven CSP-MSLA neck structure. To address the problems of complex background interference and redundant features, this paper constructs the CSP-MSLA structure in the neck network, combining Multi-Scale Linear Attention (MSLA) with the CSP mechanism to achieve adaptive enhancement of key pest and disease regions while controlling computational complexity. This structure improves the discriminability of multi scale feature fusion and enhances the detection robustness of the model under complex illumination variations and local occlusion conditions.</p></list-item>
<list-item>
<p>Shape-aware dynamic detection head SDDH. To address the significant differences in scale distribution and geometric morphology among different pest and disease targets, this paper designs a shape-aware detection head structure and adopts Scale-based Dynamic Loss as the regression supervision strategy. This method alleviates the problem of insufficient gradient contribution of small-scale targets during the training process through a scale-adaptive dynamic supervision mechanism, further improving the generalization capability of the detection head for multi-scale pest and disease targets.</p></list-item>
</list>
</sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<sec id="s2_1">
<label>2.1</label>
<title>Lightweight object detection backbone network structure</title>
<p>In recent years, with the widespread application of object detection algorithms in mobile and embedded scenarios, lightweight network design has become a research hotspot. CNN-based MobileNets (<xref ref-type="bibr" rid="B6">Howard et&#xa0;al., 2019</xref>, <xref ref-type="bibr" rid="B7">2017</xref>; <xref ref-type="bibr" rid="B20">Sandler et&#xa0;al., 2018</xref>) significantly reduced computational complexity by replacing standard convolutions with depthwise separable convolutions, while GhostNets (<xref ref-type="bibr" rid="B5">Han et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B14">Liu et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B25">Tang et&#xa0;al., 2022</xref>) further reduced the number of parameters by generating feature maps on half of the channels using cheap operations. However, these methods are limited by local receptive fields and struggle to capture global context information. In contrast, Vision Transformer (ViT) demonstrates advantages with its global receptive field and long-range dependency modeling capability. However, the quadratic computational complexity of its self-attention mechanism brings higher computational overhead. To achieve a better trade-off between speed and accuracy, single-stage detection models represented by the YOLO (You Only Look Once) series have achieved a balance between efficiency and performance in real-time object detection tasks through collaborative optimization design of the backbone network, neck structure, and detection head.</p>
<p>Among them (<xref ref-type="bibr" rid="B16">Pan et&#xa0;al., 2025</xref>), constructed the SSD-YOLO model by integrating the SENetV2 mechanism and DySample lightweight sampling module, achieving efficient and accurate detection of rice diseases with only 6MB parameters (<xref ref-type="bibr" rid="B22">Song et&#xa0;al., 2024</xref>). constructed the extremely lightweight model DODN by fusing deformable convolution and Transformer components, achieving efficient and accurate detection of cucumber diseases in complex scenarios with only 3.7 MB parameter scale and 3.9 GFLOPs low power consumption. However, such methods rely on spatial convolution stacking, which is not only limited by computational resources, but also difficult to adapt to pest and disease detection due to the neglect of fine-grained textures, resulting in existing models often being in a suboptimal state in real agricultural scenarios, which is also the core problem that this paper urgently needs to solve.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Frequency domain feature modeling and the application of wavelet transform in visual tasks</title>
<p>In addition to traditional spatial domain convolution, frequency domain feature modeling has gradually received attention in recent years. MWCNN proposed by (<xref ref-type="bibr" rid="B15">Liu et&#xa0;al., 2018</xref>) introduced Discrete Wavelet Transform (DWT) to replace traditional downsampling, retaining frequency domain information while compressing feature maps, effectively alleviating the information loss problem (<xref ref-type="bibr" rid="B10">Li et&#xa0;al., 2020</xref>). achieved decoupling of high-frequency and low-frequency components through DWT, significantly enhancing the anti-noise robustness of the model. Wavelet-SRNet proposed by (<xref ref-type="bibr" rid="B8">Huang et&#xa0;al., 2017</xref>) utilized wavelet coefficient prediction to reconstruct facial details, solving the over-smoothing problem in super-resolution tasks. Wavelet transform, with its excellent time-frequency localization characteristics, achieves effective decoupling of structure and details through multi-scale frequency band decomposition, demonstrating significant advantages in tasks such as image restoration, super-resolution, and semantic segmentation.</p>
<p>In the field of pest and disease detection, existing lightweight models neglect fine-grained textures due to their reliance on spatial convolution stacking, making it difficult to cope with complex detection scenarios. To address this, researchers have attempted to introduce wavelet transform to enhance texture perception capability (<xref ref-type="bibr" rid="B11">Li et&#xa0;al., 2022</xref>). introduced Discrete Wavelet Transform (DWT) into YOLOv4, strengthening the extraction of pest and disease detail textures and achieving accurate detection of small targets under complex backgrounds (<xref ref-type="bibr" rid="B11">Li et&#xa0;al., 2022</xref>). utilized Continuous Wavelet Analysis (CWA) to process hyperspectral data, accurately discriminating the stress states of tea plants affected by tea green leafhoppers, anthracnose, and other similar symptoms (<xref ref-type="bibr" rid="B17">Panchananam et&#xa0;al., 2025</xref>). proposed WFS-YOLO, which enhanced features in both frequency domain and spatial domain through Discrete Wavelet Transform (DWT), improving the perception accuracy of small pests and diseases in complex environments.</p>
<p>However, such methods are difficult to adapt to lightweight deployment due to their structural complexity and computational expense. How to efficiently utilize frequency domain information under conditions of limited computing power remains a core challenge in current model design.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Edge and high-frequency feature enhancement methods</title>
<p>Edge information is an important basis for target contour and shape discrimination. In fine-grained visual tasks such as pest and disease detection, high-frequency textures and boundary features are crucial for distinguishing lesions, insect holes, and healthy regions. However, low-resolution images lose a large amount of high-frequency details during the imaging process, resulting in edge blurring and texture degradation. To solve this problem, researchers have explored enhancement strategies for edge and high-frequency features from multiple perspectives. For example (<xref ref-type="bibr" rid="B35">Zhao et&#xa0;al., 2019</xref>), proposed EGNet, which guides target localization by explicitly modeling edge features (Edge Guidance Stream), compensating for the loss of boundary information in deep networks (<xref ref-type="bibr" rid="B18">Qiu et&#xa0;al., 2024</xref>). proposed an Adaptive Compressed Sensing (ACS) architecture that captures key edge regions through a cascaded guidance mechanism, providing a low-overhead solution for pest and disease detail preservation (<xref ref-type="bibr" rid="B36">Zheng and Yang, 2024</xref>). proposed Contextual Boundary Aware Network (CBA-Net), which strengthens the model&#x2019;s capture of salient object contours through a contextual boundary awareness mechanism.</p>
<p>Although existing methods have made progress in edge and high-frequency feature enhancement, they still have limitations in fine-grained tasks such as pest and disease detection. Existing methods mostly focus on salient edges, with insufficient reconstruction capability for small textures such as early lesions and insect holes; frequency domain and spatial domain mechanisms are often designed independently, making it difficult to collaboratively capture global and local features; in addition, improper module design can easily lead to a surge in computational overhead or feature distribution imbalance. Therefore, how to design a lightweight and efficient enhancement mechanism that balances fine-grained texture recovery and global reconstruction quality is the core problem of this paper.</p>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Application of multi-scale attention mechanisms</title>
<p>Multi-scale feature fusion is a key technology for improving object detection performance. Classic structures such as FPN and PAN fuse features at different scales through top-down and bottom-up pathways, but their information interaction mainly relies on element-wise addition or concatenation, lacking explicit modeling of cross-scale semantic relationships. In recent years, attention mechanisms have been introduced into detection networks to enhance feature selectivity and context awareness capability. For example (<xref ref-type="bibr" rid="B24">Sun et&#xa0;al., 2025</xref>), proposed the SRCA attention module, which effectively integrates high and low-resolution features through adaptive weighting and bidirectional fusion, significantly improving the multi-scale perception capability of tomato leaf lesions (<xref ref-type="bibr" rid="B30">Zhang, 2025</xref>). designed the MHCF encoder, which enhances multi-scale feature fusion using the Transformer structure, achieving a balance between accuracy and efficiency in pomegranate detection in complex orchard environments.</p>
<p>However, existing attention-based multi-scale fusion methods are usually accompanied by high computational complexity, making it difficult to directly adapt to resource-constrained lightweight detection models. How to design computationally efficient attention mechanisms while maintaining the effectiveness of multi-scale feature fusion, achieving a balance between detection accuracy and model lightweight, is the core problem that this paper is committed to solving.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Methods</title>
<sec id="s3_1">
<label>3.1</label>
<title>Overall network architecture</title>
<p>Based on the baseline model lightweight detection framework, this paper constructs an improved model DFSNet (Directional Feature Aggregation and Shape-Aware Supervision for Eggplant Pest and Disease Detection) for pest and disease detection on eggplant fruits in natural scenes. The structure is shown in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2a</bold></xref>. While maintaining the original inference efficiency advantages, DFSNet performs targeted optimization on backbone feature extraction, feature fusion, and detection head, specifically addressing the characteristics of pest and disease targets such as &#x201c;small scale, multiple morphologies, and weak edges&#x201d;.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Overall architecture of DFSNet: <bold>(a)</bold> the complete network, <bold>(b)</bold> the MSDA module and <bold>(c)</bold> the CSP-MSLA module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1775987-g002.tif">
<alt-text content-type="machine-generated">Illustration of a neural network architecture with three sections: (a) Backbone, (b) Neck and Head, and (c) Detailed CSP-MSLA module. Section (a) shows sequential processing steps like PConv and MSDA. Section (b) highlights the Neck including concatenations, upsampling, and the CSP-MSLA leading to the Head, specifically SDDH. Section (c) details the CSP-MSLA module with Conv, Split, MSLA, and SiLU layers. A small image inset shows a plant stem with pore-like structures. The design is color-coded and labeled with module names and parameters.</alt-text>
</graphic></fig>
<p>In the backbone network, Conv is replaced by PConv (Pinwheel Convolution) in the P1&#x2013;P2 layers to enhance shallow directional texture and fine-grained edge feature representation; meanwhile, the MSDA (Multi-Scale Directional Aggregation, as shown in <xref ref-type="fig" rid="f2"><bold>Figure 2b</bold></xref>) module is introduced into the C3K2 structure to improve the aggregation capability for multi-directional structural information. In the neck network, the CSP-MSLA structure (as shown in <xref ref-type="fig" rid="f2"><bold>Figure 2c</bold></xref>) is designed to integrate the multi-scale linear attention mechanism into the cross-stage partial connection framework to achieve selective enhancement of key pest and disease regions. Finally, SDDH (Shape-aware Dynamic Detection Head) is introduced to improve the model&#x2019;s adaptive capability to different pest and disease geometric features through a shape-aware dynamic loss function.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>PConv-enhanced shallow feature extraction</title>
<p>In the baseline model YOLOv11 original network, backbone feature extraction mainly relies on standard two-dimensional convolution operators. The convolution kernels in standard convolution have a unified response form in all spatial directions. This modeling approach implicitly assumes that local structures have similar statistical characteristics in different directions. However, in natural scene eggplant fruit pest and disease detection, this assumption is difficult to establish. For example, fruit borer holes typically exhibit extremely small scale and weak edge local structures, while melon thrips bite marks present obvious elongated stripe morphology with strong directional dependence. Standard convolution has limited discriminative capability for these directional features in the shallow stages, easily leading to the weakening of key information during the feature downsampling process. Therefore, this paper introduces the PConv (Pinwheel Convolution) structure in the P1, P2 layers of the YOLOv11 backbone network, as shown in <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>, to replace standard convolution.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Pinwheel-shaped convolution module. The CBS module consists of three parts: Conv, BN (Batch Normalization), and SiLU. Concat stands for concatenate.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1775987-g003.tif">
<alt-text content-type="machine-generated">Diagram illustrating a pinwheel-shaped convolution module. The process begins with a grid, forming an overlapping structure. Multiple CBS blocks are concatenated into a 3D grid. This undergoes a Conv(2,2) operation, resulting in a receptive field visualization, featuring a hash operation with a pinwheel pattern.</alt-text>
</graphic></fig>
<p>PConv (<xref ref-type="bibr" rid="B29">Yang et&#xa0;al., 2025</xref>) achieves explicit modeling of local structural directionality by introducing asymmetric padding strategies in different spatial directions of the input feature map and combining parallel convolution operations. Specifically, for the four directions of left, right, top, and bottom, different forms of asymmetric padding are applied respectively, as shown in <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>:</p>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mrow><mml:msup><mml:mi>X</mml:mi><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>d</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:msup><mml:mi>P</mml:mi><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>d</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x2003;</mml:mtext><mml:mi>d</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtext>left</mml:mtext><mml:mo>,</mml:mo><mml:mtext>right</mml:mtext><mml:mo>,</mml:mo><mml:mtext>top</mml:mtext><mml:mo>,</mml:mo><mml:mtext>bottom</mml:mtext></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im1"><mml:mrow><mml:msup><mml:mi>P</mml:mi><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>d</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> represents the asymmetric padding operation applied along the <italic>d</italic>-th direction, used to introduce directionally biased spatial context information. Subsequently, CBS operations are performed on the four directionally enhanced feature maps respectively, with the expression given by <xref ref-type="disp-formula" rid="eq2">Equation 2</xref>:</p>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mtext>SiLU</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext>BN</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mi>X</mml:mi><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>d</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x2217;</mml:mo><mml:msub><mml:mi>k</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x2003;</mml:mtext><mml:mi>i</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mn>4</mml:mn></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>k<sub>i</sub></italic> is the convolution kernel. Finally, the output feature <italic>X</italic><sub>out</sub> is obtained, as shown in <xref ref-type="disp-formula" rid="eq3">Equation 3</xref>:</p>
<disp-formula id="eq3"><label>(3)</label>
<mml:math display="block" id="M3"><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext>SiLU</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext>BN</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext>Concat</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mn>4</mml:mn></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x2217;</mml:mo><mml:msub><mml:mi>K</mml:mi><mml:mrow><mml:mn>2</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>Compared to traditional convolution, this approach can more sensitively capture target structures with obvious directional features, such as lesion edge contours and elongated insect bodies, while maintaining a low number of parameters. By introducing PConv before attention modeling, the model can proactively highlight morphological information related to pests and diseases and suppress the interference of complex background textures on subsequent feature fusion processes. Meanwhile, explicit directional structural modeling capability is introduced at the shallow stage, enhancing the discriminability of edge and texture features while maintaining lightweight characteristics.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>C3K2-MSDA</title>
<p>Although introducing directional convolution PConv at shallow layers can enhance detail representation, the baseline model adopting small convolution kernels and shallow design is prone to insufficient context modeling and insufficient receptive field problems when dealing with large targets and complex backgrounds. This paper introduces a wavelet-enhanced multi-scale directional aggregation module MSDA (Multi-Scale Directional Aggregation) into C3K2 at the P3, P4, and P5 stages of the backbone, obtaining C3K2-MSDA as shown in <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>. In this module, one path maintains the original cross-stage connection path of C3K2, while the other path introduces MSDA for enhanced modeling, with the structure shown in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2b</bold></xref>-MSDA. This design ensures the continuity of the original C3K2 feature flow while providing additional multi-scale attention supplementation to the network. The MSDA structure consists of two important branches: the multi-scale modeling branch and the EdgeEnhance branch. Let the input feature be <inline-formula>
<mml:math display="inline" id="im2"><mml:mrow><mml:mi>F</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>, where C, H, and W represent the number of channels, height, and width respectively.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Architecture of C3K2-MSDA.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1775987-g004.tif">
<alt-text content-type="machine-generated">Flowchart of the C3K2-MSDA model showing sequential components: a 1x1 convolution, a split, MSDA, another 1x1 convolution, SiLU activation, and more convolutions, concluding with a concatenation process. Arrows depict data flow between elements, with feedback loops and repetition indicated.</alt-text>
</graphic></fig>
<p>First, a 1&#xd7;1 convolution is used to complete channel mapping as shown in <xref ref-type="disp-formula" rid="eq4">Equation 4</xref>:</p>
<disp-formula id="eq4"><label>(4)</label>
<mml:math display="block" id="M4"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mtext>conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>F</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<sec id="s3_3_1">
<label>3.3.1</label>
<title>Multi-scale modeling branch</title>
<p>The multi-scale modeling branch adopts a continuous WTConv structure combined with the GELU nonlinear function as shown in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2b</bold></xref>; the input feature <italic>F</italic><sub>0</sub> is first split into two parts, as shown in <xref ref-type="disp-formula" rid="eq5">Equation 5</xref>:</p>
<disp-formula id="eq5"><label>(5)</label>
<mml:math display="block" id="M5"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mi>m</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mi>a</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>F<sub>m</sub></italic> is used for multi-scale feature modeling, and <italic>F<sub>a</sub></italic> is used for attention weight generation. In the multi-scale branch, a continuous feature transformation structure based on WTConv is introduced to enhance the perception capability for local patterns at different scales. The feature extraction process of this branch can be expressed as <xref ref-type="disp-formula" rid="eq6">Equation 6</xref>:</p>
<disp-formula id="eq6"><label>(6)</label>
<mml:math display="block" id="M6"><mml:mrow><mml:mi>F</mml:mi><mml:msub><mml:mo>'</mml:mo><mml:mi>m</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x3d5;</mml:mi><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x3b4;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>&#x3d5;</mml:mi><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im3"><mml:mrow><mml:msub><mml:mi>&#x3d5;</mml:mi><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>&#xb7;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im4"><mml:mrow><mml:msub><mml:mi>&#x3d5;</mml:mi><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>&#xb7;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> represent WTConv operations, and <inline-formula>
<mml:math display="inline" id="im5"><mml:mrow><mml:mi>&#x3b4;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>&#xb7;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> is the GELU nonlinear activation function. To avoid information attenuation in deep networks while enhancing feature stability, the feature <inline-formula>
<mml:math display="inline" id="im6"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> after introducing the residual connection is shown in <xref ref-type="disp-formula" rid="eq7">Equation 7</xref>:</p>
<disp-formula id="eq7"><label>(7)</label>
<mml:math display="block" id="M7"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mi>m</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:msub><mml:mo>'</mml:mo><mml:mi>m</mml:mi></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>This structure enables multi-scale features to obtain richer contextual representations while maintaining original structural information. WTConv (<xref ref-type="bibr" rid="B3">Finder et&#xa0;al., 2024</xref>) (as shown in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref>), as a key component for implementing large receptive field modeling in the MSDA module, is used to enhance the global representation capability of features while maintaining local structural information. WTConv decomposes the input feature map into four frequency subbands through discrete wavelet transform, including LL, LH, HL, and HH. Among them, LL mainly reflects the overall structure and semantic information of the target, while (LH, HL, and HH) correspond to detail features such as edges and textures. By independently modeling features in different frequency bands and fusing them in subsequent stages, WTConv can introduce cross-scale and cross-frequency feature responses without significantly increasing computational complexity. This multi-band feature aggregation approach enables the network to simultaneously perceive local details and larger-scale contextual information, thereby effectively expanding the receptive field in the MSDA module and enhancing the representation capability for edges and structural changes of pest and disease targets.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Architecture of WTConv.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1775987-g005.tif">
<alt-text content-type="machine-generated">Diagram illustrating the WTConv architecture. Input \(X\) undergoes a convolution, producing intermediary outputs. Wavelet transform (WT) decomposes \(X\) into sub-bands \(X^{1}_{LL}, X^{1}_{LH}, X^{1}_{HL}, X^{1}_{HH}\). These are processed through convolution and additional WT steps, generating transformed sub-bands \(X^{2}_{LL}, X^{2}_{LH}, X^{2}_{HL}, X^{2}_{HH}\). After further transformations, inverse wavelet transform (IWT) recomposes the signals into \(X'\), contributing to the final output. Multiple operations merge through addition at various stages.</alt-text>
</graphic></fig>
</sec>
<sec id="s3_3_2">
<label>3.3.2</label>
<title>Edge feature enhancement module DEE</title>
<p>In pest and disease detection tasks, the edges of pests and diseases such as fruit borer holes, fruit rot lesions, and melon thrips stripes present rapidly changing pixel intensity regions on eggplant fruit surface images. These regions change dramatically and constitute high-frequency information, which is also the core feature for distinguishing pest and disease features from healthy fruit images. Therefore, effective modeling of high frequency edge features can effectively highlight the contour and shape features of pests and diseases. However, traditional convolutional neural networks tend to produce smoothing effects on high-frequency details during layer-by-layer downsampling and feature fusion processes, resulting in the gradual weakening of edge information. To address this problem, this paper designs DEE (Edge feature Enhancement Module) as shown in <xref ref-type="fig" rid="f6"><bold>Figure&#xa0;6</bold></xref>, which explicitly highlights high-frequency change regions in the input features, enabling the network to perceive the contour and shape information of pest and disease regions during the feature extraction stage.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Architecture of DEE module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1775987-g006.tif">
<alt-text content-type="machine-generated">Diagram depicting an edge feature enhancement module with a flowchart of operations. Input feature \( F_a \) is split and passed through two depthwise convolution (DWConv) units. Outputs \( F_{d1} \) and \( F_{d2} \) are combined by element-wise addition, then processed through a \( 1 \times 1 \) convolution and a sigmoid function, producing output \( A \). This is element-wise multiplied with the initial split input. A legend describes symbols for element-wise addition and multiplication.</alt-text>
</graphic></fig>
<p>This module models gradient changes in the feature map and injects the enhanced edge response into the original features in residual form, thereby avoiding interference with the overall semantic structure. It is used to effectively strengthen the representation capability of edge and high-frequency information without changing the spatial resolution of the input feature map. The DEE module acts on intermediate layer features of the network, and its enhancement process is learnable, capable of adaptively adjusting the response intensity to different edge patterns according to task requirements.</p>
<p>This module takes intermediate layer feature mapping as input and first calculates feature gradients (F_d1, F_d2) in the horizontal and vertical directions respectively, representing regions with relatively drastic pixel intensity changes in the feature map, thereby explicitly extracting high-frequency information such as edges and textures. Subsequently, a comprehensive edge response is obtained through gradient magnitude fusion, and 1&#xd7;1 convolution is used for channel mapping and adaptive reweighting of edge features. Finally, the enhanced edge features are combined with the original features in residual form, achieving effective enhancement of target boundaries and local structures while avoiding disruption of the original semantic information distribution. <xref ref-type="disp-formula" rid="eq8">Equations 8</xref>-<xref ref-type="disp-formula" rid="eq12">12</xref> express the feature information flow process.</p>
<disp-formula id="eq8"><label>(8)</label>
<mml:math display="block" id="M8"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x3d5;</mml:mi><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq9"><label>(9)</label>
<mml:math display="block" id="M9"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x3d5;</mml:mi><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im7"><mml:mrow><mml:msub><mml:mi>&#x3d5;</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>w</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>&#xb7;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> represents depthwise convolution (i.e., the operators <italic>&#x3d5;</italic><sub><italic>w</italic>1</sub> and <italic>&#x3d5;</italic><sub><italic>w</italic>2</sub> in <xref ref-type="disp-formula" rid="eq8">Equations 8</xref>, <xref ref-type="disp-formula" rid="eq9">9</xref>), used to obtain spatial structural information at lower computational complexity.</p>
<disp-formula id="eq10"><label>(10)</label>
<mml:math display="block" id="M10"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mi>d</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>Subsequently, 1&#xd7;1 convolution and Sigmoid function are used to generate attention weight <italic>A</italic>, as shown in <xref ref-type="disp-formula" rid="eq11">Equation 11</xref>:</p>
<disp-formula id="eq11"><label>(11)</label>
<mml:math display="block" id="M11"><mml:mrow><mml:mi>A</mml:mi><mml:mo>=</mml:mo><mml:mi>&#x3c3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>&#x3d5;</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mi>d</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>and perform element-wise recalibration on the input features:</p>
<disp-formula id="eq12"><label>(12)</label>
<mml:math display="block" id="M12"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mi>d</mml:mi></mml:msub><mml:mo>&#x2299;</mml:mo><mml:mi>A</mml:mi></mml:mrow></mml:math>
</disp-formula>
<p>where &#x2299; represents element-wise multiplication. This process achieves joint feature selection at both spatial and channel levels without introducing global pooling or high-complexity operators.</p>
<p>DEE enhances the activation intensity of pest and disease target edge regions through adaptive weighting of multi-directional feature responses, improving the model&#x2019;s perception capability for fruit borer hole edges and fruit rot lesion contours. This module maintains a lightweight design in structure, effectively balancing directional information modeling capability and computational efficiency.</p>
</sec>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>CSP-MSLA neck and shape-aware detection head</title>
<sec id="s3_4_1">
<label>3.4.1</label>
<title>CSP-MSLA neck design</title>
<p>In natural scenes, complex illumination, leaf occlusion, and background texture similarity easily introduce a large amount of redundant features, weakening the multi-scale feature fusion effect. In object detection, the Neck structure plays an important role in connecting the backbone network and detection head, with its core objective being to achieve effective alignment and fusion of multi-scale features. The original Neck of YOLOv11n mainly relies on lightweight convolution modules such as C3K2 for feature transformation, possessing certain local modeling capability while ensuring computational efficiency. However, such structures are still essentially dominated by spatial domain convolution, with limited modeling capability for cross-scale contextual relationships and long-range dependencies. Especially in complex agricultural scenarios, the semantic associations between small scale lesions and medium-scale fruit regions are difficult to fully characterize. On the other hand, attention mechanisms demonstrate obvious advantages in modeling long-distance dependencies and global information interaction, but directly introducing standard self-attention structures often brings high computational and storage overhead, making them unsuitable for lightweight detection frameworks. Based on this, in the baseline Neck part, this paper introduces the Multi-Scale Linear Attention (MSLA) module for key scale feature layers P3, P4, and P5, and deeply integrates it with the C3K2 structure to reconstruct the module into CSP-MSLA units as shown in CSP-MSLA in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2c</bold></xref>. The CSP structure reduces redundant computation and enhances gradient flow through cross-stage feature splitting and recombination. 
The MSLA structure, as shown in <xref ref-type="fig" rid="f7"><bold>Figure&#xa0;7</bold></xref>, explicitly introduces multi-scale global modeling capability, modeling cross-position and cross-scale global association relationships in multi-scale feature space. The combination of the two enables the neck network to significantly enhance the model&#x2019;s capability to localize pest and disease target regions of interest and suppress irrelevant information while maintaining lightweight characteristics.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Architecture of MSLA.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1775987-g007.tif">
<alt-text content-type="machine-generated">Diagram illustrating a multi-scale linear attention model. It shows a process from input channels through multi-scale feature extraction using depthwise convolutions (DWConv) with different kernel sizes (3x3, 5x5, 7x7, 9x9) and ReLU activation. Outputs are fed into a multi-head efficient attention mechanism involving linear transformations and matrix multiplications (Matmul). Combined results are processed via 1x1 convolution, resulting in an output. Various stages and connections are visually labeled, detailing the flow and transformations of data.</alt-text>
</graphic></fig>
<p>In the MSLA (Multi-Scale Linear Attention) module, the three branches Q, K, and V are retained, but to reduce computational complexity, the global computation of <italic>QK</italic><sup><italic>T</italic></sup> is approximated linearly. Meanwhile, multi-scale convolutions (e.g., 3 &#xd7; 3, 5 &#xd7; 5, 7 &#xd7; 7, 9 &#xd7; 9) are applied to enhance the features of Q and K, and matrix multiplication is used to calculate weights for local regions. That is,</p>
<disp-formula>
<mml:math display="block" id="M13"><mml:mrow><mml:mi>Y</mml:mi><mml:mo>=</mml:mo><mml:mi>&#x3d5;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>Q</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#xb7;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>&#x3d5;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>K</mml:mi><mml:msup><mml:mo stretchy="false">)</mml:mo><mml:mi>T</mml:mi></mml:msup><mml:mo>&#xb7;</mml:mo><mml:mi>V</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im8"><mml:mi>&#x3d5;</mml:mi></mml:math></inline-formula> represents a kernel function approximation or multi-scale feature transformation. This transformation bypasses explicit Softmax, achieving attention intensity distribution through kernel function approximation or weight normalization. Due to the adoption of block computation (i.e., first calculating <italic>K<sup>T</sup>V</italic>, then multiplying with <italic>Q</italic>), the overall computational complexity is reduced from <italic>O</italic>(<italic>N</italic><sup>2</sup>) in traditional self-attention to <italic>O</italic>(<italic>N</italic>). By constructing multi-scale parallel convolution branches, MSLA can capture feature responses under different receptive fields and utilize the linear attention mechanism to perform weighted fusion of multi-scale features, enabling coordinated representation of local detail information and global semantic information. This design promotes effective transfer of multi-scale features in the backbone network and helps improve cross-scale feature modeling capability.</p>
</sec>
<sec id="s3_4_2">
<label>3.4.2</label>
<title>Shape-aware dynamic detection head</title>
<sec id="s3_4_2_1">
<label>3.4.2.1</label>
<title>Baseline model loss function</title>
<p>The baseline model adopts a fixed loss function in the detection head, which consists of the classification loss <italic>L</italic><sub>cls</sub>, the objectness confidence loss <italic>L</italic><sub>obj</sub>, and the bounding box regression loss <italic>L</italic><sub>reg</sub>. The overall formulation can be expressed as <xref ref-type="disp-formula" rid="eq14">Equation 13</xref>.</p>
<disp-formula id="eq14"><label>(13)</label>
<mml:math display="block" id="M14"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mtext>YOLOv</mml:mtext><mml:mn>11</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mtext>cls</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mtext>obj</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>&#x3bb;</mml:mi><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mtext>reg</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>Here, <italic>L</italic><sub>reg</sub> is typically optimized based on IoU or its variants, and <italic>&#x3bb;</italic> denotes the weighting coefficient of the regression loss term <italic>L</italic><sub>reg</sub>. This loss function adopts a unified weight allocation strategy for all targets, without explicitly distinguishing the contributions of objects with different scales or shapes during training. In pest and disease detection tasks, such approximately uniform supervision is prone to causing gradient imbalance. On the one hand, small-scale targets (e.g., borer holes) occupy a relatively small proportion of pixels in the feature maps, making their <italic>L</italic><sub>reg</sub> easily overwhelmed in the overall loss. On the other hand, elongated and stripe-like targets (e.g., feeding traces of thrips) are highly sensitive to slight localization deviations under IoU-based constraints, which leads to instability in the regression process. To address these issues, this paper introduces a Scale-based Dynamic Loss on the basis of the original regression structure to dynamically adjust the regression supervision. This strategy is applied to the detection head, forming the SDDH (Shape-aware Dynamic Detection Head).</p>
</sec>
<sec id="s3_4_2_2">
<label>3.4.2.2</label>
<title>Scale-based dynamic loss</title>
<p>It is well known that IoU-based losses (<italic>S</italic><sub>loss</sub>) exhibit relatively large fluctuations in small object detection, which negatively affect model stability and regression performance. In <italic>S</italic><sub>loss</sub> with bounding box (BBox) annotations, smaller objects usually receive lower attention weights, whereas mask annotations have a greater impact on small or irregularly shaped objects. Therefore, some studies dynamically adjust the influence coefficients <italic>&#x3b2;</italic> of <italic>S</italic><sub>loss</sub> and <italic>L</italic><sub>loss</sub> according to object scale, so as to enhance the influence of <italic>S</italic><sub>loss</sub> on mask annotations and reduce the adverse effects of inaccurate annotations on the stability of the loss function, thereby ensuring that the model pays more attention to small or irregularly shaped objects. This loss function mainly consists of two components: LSDB (the Scale-based Dynamic Loss for the BBox) and LSDM (the Scale-based Dynamic Loss for the Mask). The computation of LSDB and its related parameters are given in <xref ref-type="disp-formula" rid="eq15">Equations 14</xref>-<xref ref-type="disp-formula" rid="eq21">20</xref>, while the computation of LSDM and its related parameters are presented in <xref ref-type="disp-formula" rid="eq22">Equations 21</xref>-<xref ref-type="disp-formula" rid="eq24">23</xref>.</p>
<p>&#x2022; The <inline-formula>
<mml:math display="inline" id="im9"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>SDB</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula></p>
<p>The scale-based dynamic loss for the bounding box is composed of a scale consistency loss <inline-formula>
<mml:math display="inline" id="im10"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>BS</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> and a localization loss <inline-formula>
<mml:math display="inline" id="im11"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>BL</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> with corresponding weights. It is defined as <xref ref-type="disp-formula" rid="eq15">Equation 14</xref>:</p>
<disp-formula id="eq15"><label>(14)</label>
<mml:math display="block" id="M15"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>SDB</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x3b2;</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>BS</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x3b2;</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>BL</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>Here, <inline-formula>
<mml:math display="inline" id="im12"><mml:mrow><mml:msub><mml:mi>&#x3b2;</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mn>0.5</mml:mn><mml:mo>,</mml:mo><mml:mn>1.0</mml:mn></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im13"><mml:mrow><mml:msub><mml:mi>&#x3b2;</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mn>1.0</mml:mn><mml:mo>,</mml:mo><mml:mn>1.5</mml:mn></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> as defined in <xref ref-type="disp-formula" rid="eq19">Equation 18</xref>, denote the dynamic weighting coefficients for the bounding box scale loss <inline-formula>
<mml:math display="inline" id="im14"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>BS</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> (see <xref ref-type="disp-formula" rid="eq16">Equation 15</xref>) and the localization loss <inline-formula>
<mml:math display="inline" id="im15"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>BL</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> (see <xref ref-type="disp-formula" rid="eq17">Equation 16</xref>), respectively. The scale loss is defined as:</p>
<disp-formula id="eq16"><label>(15)</label>
<mml:math display="block" id="M16"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>BS</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mtext>IoU</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>&#x3b3;</mml:mi></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq17"><label>(16)</label>
<mml:math display="block" id="M17"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>BL</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mi>d</mml:mi><mml:mn>2</mml:mn></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mtext>sbp</mml:mtext></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mtext>sbp</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mtext>sgt</mml:mtext></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mtext>sgt</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:msup><mml:mi>L</mml:mi><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<p>Let <italic>B</italic><sub>P</sub> and <italic>B</italic><sub>gt</sub> denote the predicted bounding box and the ground-truth bounding box, respectively. The scale-aware <italic>S</italic><sub>IoU</sub> is defined as <xref ref-type="disp-formula" rid="eq18">Equation 17</xref>:</p>
<disp-formula id="eq18"><label>(17)</label>
<mml:math display="block" id="M18"><mml:mrow><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mtext>IoU</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>B</mml:mi><mml:mtext>P</mml:mtext></mml:msub><mml:mo>&#x2229;</mml:mo><mml:msub><mml:mi>B</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>B</mml:mi><mml:mtext>P</mml:mtext></mml:msub><mml:mo>&#x222a;</mml:mo><mml:msub><mml:mi>B</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<p><italic>&#x3b2;</italic><sub>1</sub> and <italic>&#x3b2;</italic><sub>2</sub> are defined in <xref ref-type="disp-formula" rid="eq19">Equation 18</xref>. These coefficients dynamically adjust the loss weights according to the object scale. Here a scale influence factor <italic>&#x3b2;</italic><sub>3</sub> is introduced as shown in <xref ref-type="disp-formula" rid="eq20">Equation 19</xref>:</p>
<disp-formula id="eq19"><label>(18)</label>
<mml:math display="block" id="M19"><mml:mrow><mml:msub><mml:mi>&#x3b2;</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x3b4;</mml:mi><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x3b2;</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mtext>&#x2003;</mml:mtext><mml:msub><mml:mi>&#x3b2;</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>+</mml:mo><mml:mi>&#x3b4;</mml:mi><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x3b2;</mml:mi><mml:mn>3</mml:mn></mml:msub></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq20"><label>(19)</label>
<mml:math display="block" id="M20"><mml:mrow><mml:msub><mml:mi>&#x3b2;</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mtext>min&#xa0;</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>B</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>max</mml:mi><mml:mo>&#xa0;</mml:mo><mml:msub><mml:mi>B</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mo>&#xd7;</mml:mo><mml:mi>&#x3b8;</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>&#x3b4;</mml:mi><mml:mo>,</mml:mo><mml:mtext>&#x2009;</mml:mtext><mml:mi>&#x3b4;</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im16"><mml:mi>&#x3b4;</mml:mi></mml:math></inline-formula> = 0.5 is the upper limit for scale adjustment, used to constrain the range of weight variation and prevent instability during training, and <inline-formula>
<mml:math display="inline" id="im17"><mml:mrow><mml:mtext>max&#xa0;</mml:mtext><mml:msub><mml:mi>B</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>81</mml:mn></mml:mrow></mml:math></inline-formula> is the maximum size of IRST as defined by the Society of Photo-Optical
Instrumentation Engineers (<xref ref-type="bibr" rid="B31">Zhang et&#xa0;al., 2003</xref>). The scale mapping factor <italic>&#x3b8;</italic> is defined as <xref ref-type="disp-formula" rid="eq21">Equation 20</xref>:</p>
<disp-formula id="eq21"><label>(20)</label>
<mml:math display="block" id="M21"><mml:mrow><mml:mi>&#x3b8;</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mtext>size</mml:mtext></mml:mrow><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mtext>size</mml:mtext></mml:mrow><mml:mi>f</mml:mi></mml:msub></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<p>Here, <inline-formula>
<mml:math display="inline" id="im18"><mml:mrow><mml:msub><mml:mi>&#x3b2;</mml:mi><mml:mn>3</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula> is the scale influence factor for both the BBox and Mask branches; the function <inline-formula>
<mml:math display="inline" id="im19"><mml:mrow><mml:mi>d</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>&#xb7;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> denotes the Euclidean distance; <inline-formula>
<mml:math display="inline" id="im20"><mml:mi>L</mml:mi></mml:math></inline-formula> represents the diagonal length of the minimum enclosing rectangle that simultaneously bounds the predicted box <inline-formula>
<mml:math display="inline" id="im21"><mml:mrow><mml:msub><mml:mi>B</mml:mi><mml:mtext>P</mml:mtext></mml:msub></mml:mrow></mml:math></inline-formula> and the ground-truth box <inline-formula>
<mml:math display="inline" id="im22"><mml:mrow><mml:msub><mml:mi>B</mml:mi><mml:mtext>G</mml:mtext></mml:msub></mml:mrow></mml:math></inline-formula>, used to normalize the center point distance. <inline-formula>
<mml:math display="inline" id="im23"><mml:mrow><mml:msub><mml:mrow><mml:mtext>size</mml:mtext></mml:mrow><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im24"><mml:mrow><mml:msub><mml:mrow><mml:mtext>size</mml:mtext></mml:mrow><mml:mi>f</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> denote the dimensions of the original image and the current feature map, respectively.</p>
<p>&#x2022; The <inline-formula>
<mml:math display="inline" id="im25"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>SDM</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula></p>
<p>The <inline-formula>
<mml:math display="inline" id="im26"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>SDM</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is similarly composed of the mask scale loss <inline-formula>
<mml:math display="inline" id="im27"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>MS</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> and the mask localization loss <inline-formula>
<mml:math display="inline" id="im28"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>ML</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> with corresponding weights <inline-formula>
<mml:math display="inline" id="im29"><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mi>&#x3b2;</mml:mi><mml:mn>1</mml:mn><mml:mo>'</mml:mo></mml:msubsup><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mn>1.0</mml:mn><mml:mo>,</mml:mo><mml:mn>1.5</mml:mn></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msubsup><mml:mi>&#x3b2;</mml:mi><mml:mn>2</mml:mn><mml:mo>'</mml:mo></mml:msubsup><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mn>0.5</mml:mn><mml:mo>,</mml:mo><mml:mn>1.0</mml:mn></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, as defined in <xref ref-type="disp-formula" rid="eq22">Equation 21</xref>:
<disp-formula id="eq22"><label>(21)</label>
<mml:math display="block" id="M22"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>SDM</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mi>&#x3b2;</mml:mi><mml:mn>1</mml:mn><mml:mo>'</mml:mo></mml:msubsup><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>MS</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msubsup><mml:mi>&#x3b2;</mml:mi><mml:mn>2</mml:mn><mml:mo>'</mml:mo></mml:msubsup><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>ML</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>Let <italic>M</italic><sub>P</sub> and <italic>M</italic><sub>gt</sub> denote the sets of pixels in the predicted mask and the ground-truth mask, respectively, and let <italic>p</italic> be a weighting coefficient. The mask scale loss <inline-formula>
<mml:math display="inline" id="im30"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>MS</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>is defined as <xref ref-type="disp-formula" rid="eq23">Equations 22</xref>, <xref ref-type="disp-formula" rid="eq24">23</xref>:</p>
<disp-formula id="eq23"><label>(22)</label>
<mml:math display="block" id="M23"><mml:mrow><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mtext>IoU</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>M</mml:mi><mml:mtext>P</mml:mtext></mml:msub><mml:mo>&#x2229;</mml:mo><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>M</mml:mi><mml:mtext>P</mml:mtext></mml:msub><mml:mo>&#x222a;</mml:mo><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq24"><label>(23)</label>
<mml:math display="block" id="M24"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>MS</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>p</mml:mi><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mtext>IoU</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>The mask localization loss <inline-formula>
<mml:math display="inline" id="im31"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>ML</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is defined as <xref ref-type="disp-formula" rid="eq25">Equation 24</xref>:</p>
<disp-formula id="eq25"><label>(24)</label>
<mml:math display="block" id="M25"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>ML</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mrow><mml:mi>min</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mtext>mp</mml:mtext></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mtext>mgt</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mi>max</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mtext>mp</mml:mtext></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mtext>mgt</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mfrac><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mn>4</mml:mn><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>&#x3b8;</mml:mi><mml:mrow><mml:mtext>mp</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x3b8;</mml:mi><mml:mrow><mml:mtext>mgt</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:msup><mml:mi>&#x3c0;</mml:mi><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<p>Here, <italic>d</italic><sub>mp</sub> and <italic>d</italic><sub>mgt</sub> denote the average distances of the predicted mask pixels and the ground-truth mask pixels from the origin in polar coordinates, respectively; <italic>&#x3b8;</italic><sub>mp</sub> and <italic>&#x3b8;</italic><sub>mgt</sub> represent the average angles of the predicted mask pixels and the ground-truth mask pixels in polar coordinates, respectively.</p>
<p>The Scale-based Dynamic Loss (SDLoss) employed in this study incorporates object scale information into the loss computation process to achieve adaptive constraints for targets of different scales. In the bounding box regression branch, the scale consistency term and the localization term are combined with weighted summation, and the weights are adjusted using scale factors, thereby applying differentiated supervision to targets of varying scales without altering the original regression formulation. In the mask branch, SDLoss integrates pixel-level overlap constraints with polar-coordinate-based spatial distribution modeling, providing joint constraints on the regional consistency and spatial distribution of mask predictions, which helps improve the modeling stability for irregularly shaped targets. It should be noted that the mathematical definition of the Scale-based Dynamic Loss is not modified in this work; rather, it is applied to pest and disease detection tasks and combined with the proposed detection head structure to accommodate the variations in scale and shape of pest and disease targets in natural scenes.</p>
</sec>
</sec>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments and results</title>
<sec id="s4_1">
<label>4.1</label>
<title>Dataset construction</title>
<p>In this study, a custom dataset of eggplant fruit pests and diseases was established, comprising four categories: FruitBorer, FruitRot, MelonThrips, and Healthy, as shown in <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8</bold></xref>. The dataset was primarily derived from two sources. The first source consists of sample images collected on October 3, 2025, in a vegetable greenhouse in Shouguang, Shandong Province, China. The original images, captured using an iPhone 14 at a resolution of 1920&#xd7;1080, encompassed various lighting conditions and shooting angles to enhance data diversity, resulting in 1074 images in JPEG format. After removing blurred, duplicate, and invalid images, 673 valid samples remained. These images were annotated using the Label Studio tool to label the pest and disease regions and their corresponding categories, and the annotations were saved in YOLO format. The second source is the publicly available Eggplant Fruit Disease dataset from the Roboflow platform, from which 2,177 pest and disease images were randomly selected. Combined with the first source, a total of 3,250 sample images were obtained, and all images were resized to 640&#xd7;640 pixels. To improve model generalization and robustness, data augmentation techniques, including Mosaic, random translation, horizontal/vertical flipping, non-uniform scaling, brightness adjustment, and Gaussian noise injection, were applied to expand the number of training images to 7,256. The dataset was randomly split into training (5,080 images), validation (1,451 images), and test sets (725 images) in a 7:2:1 ratio.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Some samples of eggplant fruit disease dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1775987-g008.tif">
<alt-text content-type="machine-generated">Four eggplants are shown in separate stages of condition. The first is affected by a fruit borer with visible holes. The second has fruit rot with dark patches. The third is damaged by melon thrips, showing a scarred area. The fourth is a healthy eggplant with no visible damage, hanging on a plant with green leaves.</alt-text>
</graphic></fig>
<p>To further evaluate the generalization capability of the proposed method in agricultural vision tasks, comparative experiments were conducted on the publicly available PlantDoc dataset. PlantDoc is an open dataset for disease detection in real agricultural scenarios, covering multiple crop types and their corresponding disease categories. The dataset exhibits uneven object scale distribution, diverse lesion morphologies, and complex backgrounds, which effectively reflect the practical challenges of object detection tasks in natural cultivation environments. Representative sample images are shown in <xref ref-type="fig" rid="f9"><bold>Figure&#xa0;9</bold></xref>.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Some samples of PlantDoc dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1775987-g009.tif">
<alt-text content-type="machine-generated">A grid of plant images shows various diseases and conditions affecting crops. Top row: corn with gray leaf spots, bean with rust, grape with black rot, tomato with rust. Middle row: apple with scab, leaves with bacterial spot, tomato with septoria, potato with early blight. Bottom row: healthy blueberry plant, strawberry with yellow leaf, blueberry with bilberry, and a peach with a leaf.</alt-text>
</graphic></fig>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Experimental environment</title>
<p>All experiments in this study were conducted using the same model parameters and environmental settings. The experimental environment and model parameter configurations are listed in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>; the number of training epochs is 150 and the batch size is 16.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Experimental environment parameters.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Name</th>
<th valign="middle" align="left">Hardware configuration version</th>
<th valign="middle" align="left">Name</th>
<th valign="middle" align="left">Value</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">GPU</td>
<td valign="middle" align="left">NVIDIA RTX 4090</td>
<td valign="middle" align="left">Learning rate (Lr)</td>
<td valign="middle" align="left">0.001</td>
</tr>
<tr>
<td valign="middle" align="left">CPU</td>
<td valign="middle" align="left">Intel Core i7</td>
<td valign="middle" align="left">Batch size</td>
<td valign="middle" align="left">16</td>
</tr>
<tr>
<td valign="middle" align="left">CUDA</td>
<td valign="middle" align="left">11.8</td>
<td valign="middle" align="left">Optimizer</td>
<td valign="middle" align="left">AdamW</td>
</tr>
<tr>
<td valign="middle" align="left">Worker</td>
<td valign="middle" align="left">8</td>
<td valign="middle" align="left">Momentum</td>
<td valign="middle" align="left">0.9</td>
</tr>
<tr>
<td valign="middle" align="left">PyTorch</td>
<td valign="middle" align="left">2.6.0</td>
<td valign="middle" align="left">Torchvision</td>
<td valign="middle" align="left">0.21.0</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Performance evaluation metrics</title>
<p>In the eggplant fruit pest and disease detection task, multiple evaluation metrics were employed
to comprehensively assess the detection accuracy and computational efficiency of the lightweight
model in complex natural environments. The metrics include Precision (<italic>P</italic>), Recall (<italic>R</italic>), mean Average Precision (<italic>mAP</italic>@50 and <italic>mAP</italic>@50 &#x2212; 95), number of parameters (Params), and floating-point operations (<italic>GFLOPs</italic>). True positives (<italic>TP</italic>) are defined as correctly detected fruit borer holes, fruit rot lesions, or thrips feeding traces; false positives (<italic>FP</italic>) occur when fruit surface textures, glare, or other regions are incorrectly identified as pests or diseases; false negatives (<italic>FN</italic>) correspond to missed detections of existing borer holes or lesions, particularly for small-scale targets. The main formulas are presented in <xref ref-type="disp-formula" rid="eq26">Equations 25</xref>-<xref ref-type="disp-formula" rid="eq29">28</xref>.</p>
<disp-formula id="eq26"><label>(25)</label>
<mml:math display="block" id="M26"><mml:mrow><mml:mi>P</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq27"><label>(26)</label>
<mml:math display="block" id="M27"><mml:mrow><mml:mi>R</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq28"><label>(27)</label>
<mml:math display="block" id="M28"><mml:mrow><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:mrow><mml:msubsup><mml:mo>&#x222b;</mml:mo><mml:mn>0</mml:mn><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mrow><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>R</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext>&#xa0;</mml:mtext><mml:mi>d</mml:mi><mml:mi>R</mml:mi></mml:mrow></mml:mrow></mml:mstyle></mml:mrow></mml:math>
</disp-formula>
<p>Here, <italic>P</italic>(<italic>R</italic>) represents the precision-recall curve with recall as the horizontal axis. The <italic>mAP</italic> is then obtained by averaging over all categories:</p>
<disp-formula id="eq29"><label>(28)</label>
<mml:math display="block" id="M29"><mml:mrow><mml:mi>m</mml:mi><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>N</mml:mi></mml:mfrac><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:munderover><mml:mi>A</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math>
</disp-formula>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Comparative study</title>
<p>To comprehensively evaluate the performance of various object detection models in the eggplant pest and disease detection task, this study selected mainstream models including YOLOv5n, YOLOv8n, YOLOv10n, YOLOv11n, YOLOv12n, Faster R-CNN, and the RT-DETR r18 variant for experiments on the custom eggplant pest and disease dataset. The comparative experimental results of these models on the custom eggplant dataset and the publicly available PlantDoc dataset are presented in <xref ref-type="table" rid="T2"><bold>Tables&#xa0;2</bold></xref>, <xref ref-type="table" rid="T3"><bold>3</bold></xref>.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Comparison of results on the eggplant dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Model</th>
<th valign="middle" align="left">P/%</th>
<th valign="middle" align="left">R/%</th>
<th valign="middle" align="left">mAP@50/%</th>
<th valign="middle" align="left">mAP@50-95/%</th>
<th valign="middle" align="left">Params</th>
<th valign="middle" align="left">GFLOPs</th>
<th valign="middle" align="center">FPS</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">YOLOv5n</td>
<td valign="middle" align="center">68.1</td>
<td valign="middle" align="center">65.0</td>
<td valign="middle" align="center">70.8</td>
<td valign="middle" align="center">42.1</td>
<td valign="middle" align="center">2.19M</td>
<td valign="middle" align="center">5.8</td>
<td valign="middle" align="center">362.36</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv8n</td>
<td valign="middle" align="center">72.1</td>
<td valign="middle" align="center">67.3</td>
<td valign="middle" align="center">71.7</td>
<td valign="middle" align="center">44.9</td>
<td valign="middle" align="center">2.69M</td>
<td valign="middle" align="center">6.8</td>
<td valign="middle" align="center">328.09</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv9t</td>
<td valign="middle" align="center">73.1</td>
<td valign="middle" align="center">67.7</td>
<td valign="middle" align="center">71.1</td>
<td valign="middle" align="center">46.2</td>
<td valign="middle" align="center">1.77M</td>
<td valign="middle" align="center">6.4</td>
<td valign="middle" align="center">361.91</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv10n</td>
<td valign="middle" align="center">71.2</td>
<td valign="middle" align="center">66.3</td>
<td valign="middle" align="center">69.8</td>
<td valign="middle" align="center">44.1</td>
<td valign="middle" align="center">2.69M</td>
<td valign="middle" align="center">8.2</td>
<td valign="middle" align="center">371.84</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv11n (Baseline)</td>
<td valign="middle" align="center">74.1</td>
<td valign="middle" align="center">69.5</td>
<td valign="middle" align="center">72.7</td>
<td valign="middle" align="center">45.3</td>
<td valign="middle" align="center">2.59M</td>
<td valign="middle" align="center">6.3</td>
<td valign="middle" align="center">325.43</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv12n</td>
<td valign="middle" align="center">75.7</td>
<td valign="middle" align="center">68.6</td>
<td valign="middle" align="center">72.7</td>
<td valign="middle" align="center">45.1</td>
<td valign="middle" align="center">2.50M</td>
<td valign="middle" align="center">5.8</td>
<td valign="middle" align="center">297.67</td>
</tr>
<tr>
<td valign="middle" align="left">Faster-RCNN</td>
<td valign="middle" align="center">68.7</td>
<td valign="middle" align="center">42.8</td>
<td valign="middle" align="center">54.0</td>
<td valign="middle" align="center">22.6</td>
<td valign="middle" align="center">41.34M</td>
<td valign="middle" align="center">78.5</td>
<td valign="middle" align="center">91.3</td>
</tr>
<tr>
<td valign="middle" align="left">RT-DETR r18</td>
<td valign="middle" align="center">58.1</td>
<td valign="middle" align="center">60.6</td>
<td valign="middle" align="center">65.4</td>
<td valign="middle" align="center">42.7</td>
<td valign="middle" align="center">15.78M</td>
<td valign="middle" align="center">46.0</td>
<td valign="middle" align="center">108.7</td>
</tr>
<tr>
<td valign="middle" align="left">DFS-Net (OURS)</td>
<td valign="middle" align="center"><bold>81</bold>.<bold>0</bold>&#x2191;</td>
<td valign="middle" align="center"><bold>78</bold>.<bold>3</bold>&#x2191;</td>
<td valign="middle" align="center"><bold>80</bold>.<bold>5</bold>&#x2191;</td>
<td valign="middle" align="center"><bold>48</bold>.<bold>0</bold>&#x2191;</td>
<td valign="middle" align="center"><bold>1</bold>.<bold>8</bold>&#x2193;</td>
<td valign="middle" align="center"><bold>5</bold>.<bold>4</bold>&#x2193;</td>
<td valign="middle" align="center"><bold>378</bold>.<bold>13</bold>&#x2191;</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Comparison of results on the PlantDoc dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Model</th>
<th valign="middle" align="center">P/%</th>
<th valign="middle" align="center">R/%</th>
<th valign="middle" align="center">mAP@50/%</th>
<th valign="middle" align="center">mAP@50-95/%</th>
<th valign="middle" align="center">Params</th>
<th valign="middle" align="center">GFLOPs</th>
<th valign="middle" align="center">FPS</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">YOLOv5n</td>
<td valign="middle" align="center">50.3</td>
<td valign="middle" align="center">46.2</td>
<td valign="middle" align="center">51.4</td>
<td valign="middle" align="center">38.7</td>
<td valign="middle" align="center">2.5M</td>
<td valign="middle" align="center">7.9</td>
<td valign="middle" align="center">270.27</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv8n</td>
<td valign="middle" align="center">41.3</td>
<td valign="middle" align="center">51.0</td>
<td valign="middle" align="center">49.8</td>
<td valign="middle" align="center">38.7</td>
<td valign="middle" align="center">3.01M</td>
<td valign="middle" align="center">8.1</td>
<td valign="middle" align="center">250.00</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv9t</td>
<td valign="middle" align="center">38.4</td>
<td valign="middle" align="center">50.2</td>
<td valign="middle" align="center">47.5</td>
<td valign="middle" align="center">37.4</td>
<td valign="middle" align="center">1.97M</td>
<td valign="middle" align="center">7.6</td>
<td valign="middle" align="center">232.55</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv10n</td>
<td valign="middle" align="center">48.0</td>
<td valign="middle" align="center">49.6</td>
<td valign="middle" align="center">50.3</td>
<td valign="middle" align="center">39.1</td>
<td valign="middle" align="center">2.27M</td>
<td valign="middle" align="center">6.6</td>
<td valign="middle" align="center">333.33</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv11n (Baseline)</td>
<td valign="middle" align="center">48.4</td>
<td valign="middle" align="center">50.5</td>
<td valign="middle" align="center">52.3</td>
<td valign="middle" align="center">39.0</td>
<td valign="middle" align="center">2.58M</td>
<td valign="middle" align="center">6.3</td>
<td valign="middle" align="center">294.12</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv12n</td>
<td valign="middle" align="center">39.6</td>
<td valign="middle" align="center">48.4</td>
<td valign="middle" align="center">49.6</td>
<td valign="middle" align="center">37.3</td>
<td valign="middle" align="center">2.53M</td>
<td valign="middle" align="center">5.9</td>
<td valign="middle" align="center">256.41</td>
</tr>
<tr>
<td valign="middle" align="left">Faster-RCNN</td>
<td valign="middle" align="center">42.5</td>
<td valign="middle" align="center">42.7</td>
<td valign="middle" align="center">51.5</td>
<td valign="middle" align="center">32.3</td>
<td valign="middle" align="center">48.64M</td>
<td valign="middle" align="center">216.31</td>
<td valign="middle" align="center">24.7</td>
</tr>
<tr>
<td valign="middle" align="left">RT-DETR r18</td>
<td valign="middle" align="center">38.2</td>
<td valign="middle" align="center">50.7</td>
<td valign="middle" align="center">33.4</td>
<td valign="middle" align="center">33.1</td>
<td valign="middle" align="center">22.75M</td>
<td valign="middle" align="center">62.75</td>
<td valign="middle" align="center">18.3</td>
</tr>
<tr>
<td valign="middle" align="left">DFS-Net (OURS)</td>
<td valign="middle" align="center"><bold>52</bold>.<bold>5</bold>&#x2191;</td>
<td valign="middle" align="center"><bold>50</bold>.<bold>1</bold>&#x2191;</td>
<td valign="middle" align="center"><bold>51</bold>.<bold>8</bold>&#x2191;</td>
<td valign="middle" align="center"><bold>39</bold>.<bold>2</bold>&#x2191;</td>
<td valign="middle" align="center"><bold>2</bold>.<bold>42</bold>&#x2193;</td>
<td valign="middle" align="center"><bold>5</bold>.<bold>9</bold>&#x2193;</td>
<td valign="middle" align="center"><bold>319</bold>.<bold>8</bold>&#x2191;</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Overall, DFS-Net shows stable and competitive performance on both the Eggplant and PlantDoc datasets. On the Eggplant dataset, the proposed method achieves favorable mAP results with fewer parameters and lower computational cost, indicating that the lightweight design contributes to improved efficiency without sacrificing accuracy. On the more challenging PlantDoc dataset, DFS-Net maintains comparable or slightly improved accuracy relative to the baseline, while preserving advantages in inference speed and model compactness. These results suggest that DFS-Net offers a reasonable balance between detection performance and computational efficiency for disease and pest detection in agricultural applications.</p>
</sec>
<sec id="s4_5">
<label>4.5</label>
<title>Ablation study</title>
<p>To evaluate the contribution of each module to detection performance, YOLOv11n was used as the baseline model, and the PConv, C3K2-MSDA, CSP-MSLA, and SDDH modules were progressively incorporated to ultimately construct the complete DFSNet (OURS) model. The experiments were conducted on the custom eggplant pest and disease dataset, and the evaluation metrics described in Section 4.1.2 were employed. The results of the ablation study are presented in <xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref>.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Ablation study results of different modules.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">PConv</th>
<th valign="middle" align="center">C3K2-MSDA</th>
<th valign="middle" align="center">CSP-MSLA</th>
<th valign="middle" align="center">SDDH</th>
<th valign="middle" align="center">P/%</th>
<th valign="middle" align="center">R/%</th>
<th valign="middle" align="center">mAP@50/%</th>
<th valign="middle" align="center">mAP@50-95/%</th>
<th valign="middle" align="center">Params</th>
<th valign="middle" align="center">GFLOPs</th>
<th valign="middle" align="center">FPS</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">Baseline</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">74.1</td>
<td valign="middle" align="center">69.5</td>
<td valign="middle" align="center">72.7</td>
<td valign="middle" align="center">45.3</td>
<td valign="middle" align="center">2.59M</td>
<td valign="middle" align="center">6.3</td>
<td valign="middle" align="center">325.43</td>
</tr>
<tr>
<td valign="middle" align="left"/>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">78.6</td>
<td valign="middle" align="center">65.3</td>
<td valign="middle" align="center">71.4</td>
<td valign="middle" align="center">44.4</td>
<td valign="middle" align="center">2.54M</td>
<td valign="middle" align="center">6.2</td>
<td valign="middle" align="center">351.51</td>
</tr>
<tr>
<td valign="middle" align="left"/>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">76.1</td>
<td valign="middle" align="center">67.8</td>
<td valign="middle" align="center">72.8</td>
<td valign="middle" align="center">45.0</td>
<td valign="middle" align="center">2.56M</td>
<td valign="middle" align="center">6.3</td>
<td valign="middle" align="center">355.86</td>
</tr>
<tr>
<td valign="middle" align="left"/>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">79.7</td>
<td valign="middle" align="center">66.6</td>
<td valign="middle" align="center">72.0</td>
<td valign="middle" align="center">44.5</td>
<td valign="middle" align="center">2.54M</td>
<td valign="middle" align="center">6.3</td>
<td valign="middle" align="center">379.56</td>
</tr>
<tr>
<td valign="middle" align="left"/>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">82.6</td>
<td valign="middle" align="center">79.0</td>
<td valign="middle" align="center">81.0</td>
<td valign="middle" align="center">48.0</td>
<td valign="middle" align="center">1.8M</td>
<td valign="middle" align="center">5.4</td>
<td valign="middle" align="center">332.00</td>
</tr>
<tr>
<td valign="middle" align="left"/>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">80.0</td>
<td valign="middle" align="center">71.1</td>
<td valign="middle" align="center">74.8</td>
<td valign="middle" align="center">46.8</td>
<td valign="middle" align="center">2.56M</td>
<td valign="middle" align="center">6.25</td>
<td valign="middle" align="center">338.0</td>
</tr>
<tr>
<td valign="middle" align="left"/>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2013;</td>
<td valign="middle" align="center">81.6</td>
<td valign="middle" align="center">72.3</td>
<td valign="middle" align="center">72.6</td>
<td valign="middle" align="center">47.9</td>
<td valign="middle" align="center">2.58M</td>
<td valign="middle" align="center">6.45</td>
<td valign="middle" align="center">332.0</td>
</tr>
<tr>
<td valign="middle" align="center">DFSNet (OURS)</td>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im32"><mml:mrow><mml:mn>81.0</mml:mn><mml:mo>&#x2191;</mml:mo></mml:mrow></mml:math></inline-formula></td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im33"><mml:mrow><mml:mn>78.3</mml:mn><mml:mo>&#x2191;</mml:mo></mml:mrow></mml:math></inline-formula></td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im34"><mml:mrow><mml:mn>80.5</mml:mn><mml:mo>&#x2191;</mml:mo></mml:mrow></mml:math></inline-formula></td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im35"><mml:mrow><mml:mn>48.0</mml:mn><mml:mo>&#x2191;</mml:mo></mml:mrow></mml:math></inline-formula></td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im36"><mml:mrow><mml:mn>1.8</mml:mn><mml:mo>&#x2193;</mml:mo></mml:mrow></mml:math></inline-formula></td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im37"><mml:mrow><mml:mn>5.4</mml:mn><mml:mo>&#x2193;</mml:mo></mml:mrow></mml:math></inline-formula></td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im38"><mml:mrow><mml:mn>378.13</mml:mn><mml:mo>&#x2191;</mml:mo></mml:mrow></mml:math></inline-formula></td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The ablation study results indicate that the contributions of different improvement modules within the network exhibit clear hierarchical and complementary effects. The introduction of PConv in the lower backbone layers stabilizes the preservation of fine-grained features, while the integration of C3K2 and MSDA in the middle and higher layers enhances the network&#x2019;s capability to represent multi-scale pest and disease targets. On this basis, the incorporation of CSP-MSLA into the Neck stage facilitates more comprehensive feature fusion, improving the information utilization efficiency across different scales. Finally, with the introduction of the SDDH loss function, the model demonstrates better adaptability in target matching and bounding box regression, particularly reflected in improvements in recall and overall detection stability. These results suggest that the modules do not operate in isolation but collaboratively achieve an optimal balance between performance and efficiency.</p>
<p>To further evaluate the effect of each module on the detection performance of different target categories, a category-level comparison of mAP@50 for four detection classes (FruitRot, FruitBorer, Healthy, MelonThrips) was conducted, as presented in <xref ref-type="table" rid="T5"><bold>Table&#xa0;5</bold></xref>.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Category-level mAP@50 results for different ablation blocks.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Ablation Block</th>
<th valign="middle" align="left">PConv</th>
<th valign="middle" align="left">C3K2-MSDA</th>
<th valign="middle" align="left">CSP-MSLA</th>
<th valign="middle" align="left">SDDH</th>
<th valign="middle" align="left">FruitRot</th>
<th valign="middle" align="left">FruitBorer</th>
<th valign="middle" align="left">Healthy</th>
<th valign="middle" align="left">MelonThrips</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left"/>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left">&#x2013;</td>
<td valign="middle" align="left">&#x2013;</td>
<td valign="middle" align="left">&#x2013;</td>
<td valign="middle" align="left">72.8%</td>
<td valign="middle" align="left">60.5%</td>
<td valign="middle" align="left">92.6%</td>
<td valign="middle" align="left">49.7%</td>
</tr>
<tr>
<td valign="middle" align="left"/>
<td valign="middle" align="left">&#x2013;</td>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left">&#x2013;</td>
<td valign="middle" align="left">&#x2013;</td>
<td valign="middle" align="left">75.4%</td>
<td valign="middle" align="left">63.2%</td>
<td valign="middle" align="left">93.1%</td>
<td valign="middle" align="left">54.6%</td>
</tr>
<tr>
<td valign="middle" align="left"/>
<td valign="middle" align="left">&#x2013;</td>
<td valign="middle" align="left">&#x2013;</td>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left">&#x2013;</td>
<td valign="middle" align="left">74.6%</td>
<td valign="middle" align="left">62.4%</td>
<td valign="middle" align="left">92.4%</td>
<td valign="middle" align="left">53.8%</td>
</tr>
<tr>
<td valign="middle" align="left"/>
<td valign="middle" align="left">&#x2013;</td>
<td valign="middle" align="left">&#x2013;</td>
<td valign="middle" align="left">&#x2013;</td>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left">75.1%</td>
<td valign="middle" align="left">63.9%</td>
<td valign="middle" align="left">92.8%</td>
<td valign="middle" align="left">55.2%</td>
</tr>
<tr>
<td valign="middle" align="left"/>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left">&#x2013;</td>
<td valign="middle" align="left">&#x2013;</td>
<td valign="middle" align="left">77.2%</td>
<td valign="middle" align="left">65.1%</td>
<td valign="middle" align="left">93.4%</td>
<td valign="middle" align="left">57.8%</td>
</tr>
<tr>
<td valign="middle" align="left"/>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left">&#x2013;</td>
<td valign="middle" align="left">78.6%</td>
<td valign="middle" align="left">67.0%</td>
<td valign="middle" align="left">93.8%</td>
<td valign="middle" align="left">59.9%</td>
</tr>
<tr>
<td valign="middle" align="left">DFS-Net (Ours)</td>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left">&#x2713;</td>
<td valign="middle" align="left"><bold>87</bold>.<bold>3%</bold>&#x2191;</td>
<td valign="middle" align="left"><bold>76</bold>.<bold>4%</bold>&#x2191;</td>
<td valign="middle" align="left"><bold>94</bold>.<bold>2%</bold>&#x2191;</td>
<td valign="middle" align="left"><bold>65</bold>.<bold>1%</bold>&#x2191;</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The category-level ablation results indicate that the Healthy class exhibits relatively stable performance across different configurations, whereas the FruitRot class shows a consistent improvement trend with the incorporation of multi-scale feature modeling. For small-scale and morphologically complex classes such as FruitBorer and MelonThrips, the collaborative effect of PConv and the multi-scale attention modules significantly enhances feature representation. With the further introduction of the SDDH loss function, the matching and regression performance of these difficult-to-detect classes is improved, leading to overall performance gains that are consistent with the conclusions drawn from the general ablation study.</p>
</sec>
<sec id="s4_6">
<label>4.6</label>
<title>Visualization</title>
<sec id="s4_6_1">
<label>4.6.1</label>
<title>PConv compared to Conv</title>
<p>A representative eggplant image containing a FruitBorer hole was selected, and shallow feature extraction at the P1 and P2 layers of the backbone network was performed using both PConv and standard Conv. The comparative results are presented in <xref ref-type="fig" rid="f10"><bold>Figure&#xa0;10</bold></xref>. The above comparison indicates that PConv, by employing multi-directional asymmetric convolutional kernels to model local directional structural information in parallel, enables the network to capture more discriminative edge and texture features at shallow layers. This direction-sensitive feature extraction approach facilitates the preservation of critical fine-grained details during subsequent downsampling and multi-scale feature fusion, providing a clear advantage for small-scale targets with prominent edge features, such as FruitBorer holes. In contrast, standard convolution primarily emphasizes the overall local texture distribution at shallow layers, exhibiting limited capability in distinguishing directional structures and fine edges, which may lead to progressive attenuation of small target features under complex natural backgrounds.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Comparison of shallow feature responses between PConv and standard Conv.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1775987-g010.tif">
<alt-text content-type="machine-generated">Input image of a leaf with red outlined squares highlighting areas is shown on the left. On the right, there are four black and white processed images divided into two layers, labeled “First layer” and “Second layer,” with two methods, “PConv” and “Conv,” showcasing different views at resolutions of 256 by 256 and 128 by 128, each highlighting similar regions with red squares.</alt-text>
</graphic></fig>
</sec>
<sec id="s4_6_2">
<label>4.6.2</label>
<title>Visualization of the model&#x2019;s feature localization capability</title>
<p><xref ref-type="fig" rid="f11"><bold>Figure&#xa0;11</bold></xref> presents heatmap comparisons between the proposed DFSNet model and the baseline model on four representative images. These visualizations intuitively reveal the key image regions that the models focus on when detecting different areas or shapes of eggplant fruit diseases and pests. As shown in <xref ref-type="fig" rid="f11"><bold>Figure&#xa0;11</bold></xref>, distinct activation patterns can be observed across different categories in the Grad-CAM visualizations. For fruit rot samples, high-response regions are mainly concentrated on the diseased areas and show good spatial consistency with the ground-truth annotations and detection results. For melon thrips, the activation exhibits a clear vertically elongated pattern, which is consistent with the characteristic damage morphology. In healthy samples, no localized abnormal activation is observed, and the responses are primarily distributed over the fruit body. For fruit borer samples, the model produces concentrated activations around the infestation regions. Overall, the heatmap results indicate that the model is able to attend to disease- and pest-related regions while maintaining low responses to background areas.</p>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>Comparative visualization of detection results and Grad-CAM heatmaps for eggplant fruit diseases and pests.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1775987-g011.tif">
<alt-text content-type="machine-generated">Grid layout of eggplants showing four conditions: Fruit Rot, Melon Thrips, Healthy, and Fruit Borer. Each condition has four columns: original image, ground truth with green boxes, detection results with blue boxes and classification labels, and Grad-CAM heatmaps indicating focus areas for each classification.</alt-text>
</graphic></fig>
</sec>
<sec id="s4_6_3">
<label>4.6.3</label>
<title>Qualitative comparison on small fruit rot detection</title>
<p><xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12</bold></xref> shows that DFSNet (OURS) exhibits the best performance on small Fruit Rot detection, providing more accurate localization and higher confidence than other compared models. YOLOv5n&#x2013;YOLOv12n, Faster R-CNN, and RT-DETR r18 show limited robustness, with low confidence or imprecise bounding boxes under complex backgrounds. These results indicate that DFSNet is more effective in capturing fine-grained features of small disease regions.</p>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Qualitative detection results.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1775987-g012.tif">
<alt-text content-type="machine-generated">Comparison of eggplant fruit rot detection using various methods. Top row: Original image, ground truth, Yolov5n, Yolov8n. Middle row: Yolov9t, Yolov10n, Yolov12n, Faster RCNN. Bottom row: RT DETR r18, Yolov11n (baseline), DFSNet (OURS). Each method highlights and labels affected areas with various confidence scores.</alt-text>
</graphic></fig>
</sec>
<sec id="s4_6_4">
<label>4.6.4</label>
<title>Visualization of detection results under complex environments</title>
<p>To intuitively compare the performance of different detection models in identifying eggplant fruit diseases and pests under complex greenhouse conditions, representative samples were selected, and the detection results of multiple mainstream object detection models were visualized. As shown in the <xref ref-type="fig" rid="f13"><bold>Figure&#xa0;13</bold></xref>, the detection outputs of YOLOv5n, YOLOv8n, YOLOv9t, YOLOv10n, YOLOv12n, Faster R-CNN, RT-DETR-r18, YOLOv11n (Baseline), and the proposed DFSNet are presented for the same scene. Comparative analysis of bounding box positions, class predictions, and confidence distributions allows for an intuitive assessment of each model&#x2019;s differences in target localization accuracy, class discrimination capability, and adaptability to complex background interference.</p>
<fig id="f13" position="float">
<label>Figure&#xa0;13</label>
<caption>
<p>Comparative performance of different models in complex environments.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1775987-g013.tif">
<alt-text content-type="machine-generated">Comparison of eggplant disease detection using various models. Twelve images show the original image, ground truth, and results from different YOLO versions, Faster RCNN, RT DETR r18, and DFSNet (OURS). Boxes highlight fruit rot and melon thrips, labeled with probability scores for disease detection performance.</alt-text>
</graphic></fig>
<p>The visual comparison of detection results reveals notable differences in the localization and discrimination capabilities of various models under complex backgrounds for eggplant fruit diseases and pests. Some comparative models exhibit overlapping bounding boxes, low confidence scores, or class confusion over fruit surface disease regions, particularly for small-scale targets such as MelonThrips, which are susceptible to occlusion by leaves and interference from background textures. In contrast, DFSNet (OURS) achieves more accurate localization of pest and disease regions, with detection boxes closely aligned with the actual distribution of lesions, while significantly reducing false positives and redundant boxes. Overall, the improved model demonstrates more reliable performance in both target localization stability and class discrimination accuracy, consistent with the results of the previous experiments and ablation analyses, thereby validating the effectiveness and practical applicability of the proposed method in real-world pest and disease detection scenarios.</p>
</sec>
</sec>
</sec>
<sec id="s5" sec-type="conclusions">
<label>5</label>
<title>Conclusion</title>
<p>This study addresses the task of detecting eggplant fruit diseases and pests under greenhouse conditions, focusing on challenges such as small target scales, diverse morphologies, complex backgrounds, and limited computational resources on edge devices. We propose a lightweight and efficient real-time detection approach. By specifically improving the baseline network architecture, PConv, C3K2-MSDA, and CSP-MSLA modules were incorporated into the backbone and neck structures, and combined with the improved SDDH loss function, resulting in the DFSNet model tailored for complex agricultural scenarios. Experimental results demonstrate that the proposed method achieves a favorable balance between detection accuracy and inference efficiency on the eggplant fruit disease and pest dataset. Compared with various mainstream detection models, DFSNet exhibits superior performance in Precision, Recall, and mAP metrics, while maintaining low parameter counts and computational complexity, satisfying the requirements for real-time detection and deployment in practical natural environments. Ablation studies and visualization analyses further validate the complementary roles of the proposed modules in feature modeling and target discrimination, particularly providing more stable detection for small-scale and morphologically irregular disease and pest targets.</p>
<p>Despite these achievements, there remains room for improvement. First, in scenarios with severe occlusion or significant illumination variations, the model&#x2019;s recognition of weakly textured lesions could be further enhanced. Second, the current study is primarily validated on a single crop dataset, and the generalization capability of the model across different crops and environmental conditions requires further evaluation.</p>
<p>Future work will explore the integration of more sophisticated cross-scale feature interaction mechanisms or temporal information to enhance model adaptability in complex dynamic scenarios. Additionally, techniques such as knowledge distillation or self-supervised learning could be employed to further improve the detection performance of lightweight models under small-sample conditions. Long-term operational stability and practical deployment of the model on agricultural robots or embedded devices will also be investigated to provide more reliable technical support for intelligent pest and disease monitoring in agriculture.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/Supplementary Material. Further inquiries can be directed to the corresponding author.</p></sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>HS: Methodology, Conceptualization, Investigation, Writing &#x2013; original draft, Formal analysis, Data curation. WF: Data curation, Methodology, Writing &#x2013; review &amp; editing. JZ: Data curation, Investigation, Writing &#x2013; review &amp; editing. MF: Validation, Writing &#x2013; review &amp; editing, Software. FW: Data curation, Writing &#x2013; review &amp; editing, Investigation. RF: Writing &#x2013; review &amp; editing, Supervision, Formal analysis, Funding acquisition.</p></sec>
<ack>
<title>Acknowledgments</title>
<p>The authors wish to acknowledge the contributions of all participants in this study. The authors would like to thank the open-source community of Ultralytics and the agricultural research institutions that provided valuable data and technical support for this work.</p>
</ack>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s10" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s11" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Agarwal</surname> <given-names>M.</given-names></name>
<name><surname>Gupta</surname> <given-names>S. K.</given-names></name>
<name><surname>Biswas</surname> <given-names>K. K.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Development of efficient cnn model for tomato crop disease identification</article-title>. <source>Sustain. Computing: Inf. Syst.</source> <volume>28</volume>, <elocation-id>100407</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.suscom.2020.100407</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ashurov</surname> <given-names>A. Y.</given-names></name>
<name><surname>Al-Gaashani</surname> <given-names>M. S. A.</given-names></name>
<name><surname>Samee</surname> <given-names>N. A.</given-names></name>
<name><surname>Alkanhel</surname> <given-names>R.</given-names></name>
<name><surname>Atteia</surname> <given-names>G.</given-names></name>
<name><surname>Abdallah</surname> <given-names>H. A.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>Enhancing plant disease detection through deep learning: a depthwise cnn with squeeze and excitation integration and residual skip connections</article-title>. <source>Front. Plant Sci.</source> <volume>15</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2024.1505857</pub-id>, PMID: <pub-id pub-id-type="pmid">39925367</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Finder</surname> <given-names>S. E.</given-names></name>
<name><surname>Amoyal</surname> <given-names>R.</given-names></name>
<name><surname>Treister</surname> <given-names>E.</given-names></name>
<name><surname>Freifeld</surname> <given-names>O.</given-names></name>
</person-group> (<year>2024</year>). &#x201c;
<article-title>Wavelet convolutions for large receptive fields</article-title>,&#x201d; in <conf-name>European Conference on Computer Vision</conf-name>. <fpage>363</fpage>&#x2013;<lpage>380</lpage> (<conf-sponsor>Springer</conf-sponsor>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2407.05848</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Fu</surname> <given-names>R.</given-names></name>
<name><surname>Wang</surname> <given-names>S.</given-names></name>
<name><surname>Dong</surname> <given-names>M.</given-names></name>
<name><surname>Sun</surname> <given-names>H.</given-names></name>
<name><surname>Al-Absi</surname> <given-names>M.</given-names></name>
<name><surname>Zhang</surname> <given-names>K.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>Pest detection in dynamic environments: An adaptive continual test-time domain adaptation strategy</article-title>. <source>Plant Methods</source> <volume>21</volume>, <fpage>53</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s13007-025-01371-y</pub-id>, PMID: <pub-id pub-id-type="pmid">40270032</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Han</surname> <given-names>K.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Tian</surname> <given-names>Q.</given-names></name>
<name><surname>Guo</surname> <given-names>J.</given-names></name>
<name><surname>Xu</surname> <given-names>C.</given-names></name>
<name><surname>Xu</surname> <given-names>C.</given-names></name>
</person-group> (<year>2020</year>). &#x201c;
<article-title>Ghostnet: More features from cheap operations</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>. <fpage>1580</fpage>&#x2013;<lpage>1589</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR42600.2020.00165</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Howard</surname> <given-names>A.</given-names></name>
<name><surname>Sandler</surname> <given-names>M.</given-names></name>
<name><surname>Chu</surname> <given-names>G.</given-names></name>
<name><surname>Chen</surname> <given-names>L.-C.</given-names></name>
<name><surname>Chen</surname> <given-names>B.</given-names></name>
<name><surname>Tan</surname> <given-names>M.</given-names></name>
<etal/>
</person-group>. (<year>2019</year>). &#x201c;
<article-title>Searching for mobilenetv3</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision</conf-name>. <fpage>1314</fpage>&#x2013;<lpage>1324</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2019.00140</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Howard</surname> <given-names>A. G.</given-names></name>
<name><surname>Zhu</surname> <given-names>M.</given-names></name>
<name><surname>Chen</surname> <given-names>B.</given-names></name>
<name><surname>Kalenichenko</surname> <given-names>D.</given-names></name>
<name><surname>Wang</surname> <given-names>W.</given-names></name>
<name><surname>Weyand</surname> <given-names>T.</given-names></name>
<etal/>
</person-group>. (<year>2017</year>). 
<article-title>Mobilenets: Efficient convolutional neural networks for mobile vision applications</article-title>. <source>arXiv</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1704.04861</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Huang</surname> <given-names>H.</given-names></name>
<name><surname>He</surname> <given-names>R.</given-names></name>
<name><surname>Sun</surname> <given-names>Z.</given-names></name>
<name><surname>Tan</surname> <given-names>T.</given-names></name>
</person-group> (<year>2017</year>). &#x201c;
<article-title>Wavelet-srnet: A wavelet-based cnn for multi-scale face super resolution</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE International Conference on Computer Vision</conf-name>. <fpage>1689</fpage>&#x2013;<lpage>1697</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2017.187</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Kellab</surname> <given-names>R.</given-names></name>
<name><surname>Boulkenafet</surname> <given-names>F.</given-names></name>
<name><surname>Amokrane</surname> <given-names>S.</given-names></name>
<name><surname>Benmakhlouf</surname> <given-names>Z.</given-names></name>
<name><surname>Bensouici</surname> <given-names>C.</given-names></name>
<name><surname>Bounamous</surname> <given-names>A.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>Chemical profiling and <italic>in vitro</italic> evaluation of the antioxidant, anti-inflammatory, and antibacterial effects of Algerian <italic>Solanum melongena</italic> L.</article-title>. <source>Indian J. Pharm. Educ. Res.</source> <volume>59</volume>, <fpage>338</fpage>&#x2013;<lpage>350</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.5530/ijper.20250132</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>Q.</given-names></name>
<name><surname>Shen</surname> <given-names>L.</given-names></name>
<name><surname>Guo</surname> <given-names>S.</given-names></name>
<name><surname>Lai</surname> <given-names>Z.</given-names></name>
</person-group> (<year>2020</year>). &#x201c;
<article-title>Wavelet integrated cnns for noise-robust image classification</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>. <fpage>7245</fpage>&#x2013;<lpage>7254</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR42600.2020.00727</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>H.</given-names></name>
<name><surname>Shi</surname> <given-names>H.</given-names></name>
<name><surname>Du</surname> <given-names>A.</given-names></name>
<name><surname>Mao</surname> <given-names>Y.</given-names></name>
<name><surname>Fan</surname> <given-names>K.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). 
<article-title>Symptom recognition of disease and insect damage based on mask r-cnn, wavelet transform, and f-rnet</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.922797</pub-id>, PMID: <pub-id pub-id-type="pmid">35937317</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>X.</given-names></name>
<name><surname>Sun</surname> <given-names>W.</given-names></name>
<name><surname>Ji</surname> <given-names>Y.</given-names></name>
<name><surname>Huang</surname> <given-names>W.</given-names></name>
</person-group> (<year>2025</year>a). 
<article-title>A joint detection and tracking paradigm based on reinforcement learning for compact hfswr</article-title>. <source>IEEE J. Selected Topics Appl. Earth Observations Remote Sens.</source> <volume>18</volume>, <fpage>1995</fpage>&#x2013;<lpage>2009</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/JSTARS.2024.3504813</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>X.</given-names></name>
<name><surname>Sun</surname> <given-names>W.</given-names></name>
<name><surname>Ji</surname> <given-names>Y.</given-names></name>
<name><surname>Huang</surname> <given-names>W.</given-names></name>
</person-group> (<year>2025</year>b). 
<article-title>S2g-gcn: A plot classification network integrating spectrum-to-graph modeling and graph convolutional network for compact hfswr</article-title>. <source>IEEE Geosci. Remote Sens. Lett.</source> <volume>22</volume>, <fpage>1</fpage>&#x2013;<lpage>5</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/LGRS.2025.3623931</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>Z.</given-names></name>
<name><surname>Hao</surname> <given-names>Z.</given-names></name>
<name><surname>Han</surname> <given-names>K.</given-names></name>
<name><surname>Tang</surname> <given-names>Y.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Ghostnetv3: Exploring the training strategies for compact models</article-title>. <source>arXiv</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2404.11202</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>P.</given-names></name>
<name><surname>Zhang</surname> <given-names>H.</given-names></name>
<name><surname>Zhang</surname> <given-names>K.</given-names></name>
<name><surname>Lin</surname> <given-names>L.</given-names></name>
<name><surname>Zuo</surname> <given-names>W.</given-names></name>
</person-group> (<year>2018</year>). &#x201c;
<article-title>Multi-level wavelet-cnn for image restoration</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops</conf-name>. <fpage>773</fpage>&#x2013;<lpage>782</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPRW.2018.00121</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Pan</surname> <given-names>C.</given-names></name>
<name><surname>Wang</surname> <given-names>S.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Liu</surname> <given-names>C.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Ssd-yolo: A lightweight network for rice leaf disease detection</article-title>. <source>Front. Plant Sci.</source> <volume>16</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2025.1643096</pub-id>, PMID: <pub-id pub-id-type="pmid">40901551</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Panchananam</surname> <given-names>L. S.</given-names></name>
<name><surname>Chandaliya</surname> <given-names>P. K.</given-names></name>
<name><surname>Akhtar</surname> <given-names>Z.</given-names></name>
<name><surname>Upla</surname> <given-names>K.</given-names></name>
<name><surname>Ramachandra</surname> <given-names>R.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Waveletfusion: Enhancing plant leaf disease classification with multi-scale feature extraction and explainable ai</article-title>. <source>Expert Syst. Appl.</source> <volume>285</volume>, <elocation-id>127947</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.eswa.2025.127947</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Qiu</surname> <given-names>C.</given-names></name>
<name><surname>Yue</surname> <given-names>T.</given-names></name>
<name><surname>Hu</surname> <given-names>X.</given-names></name>
</person-group> (<year>2024</year>). &#x201c;
<article-title>Reconstruction-free cascaded adaptive compressive sensing</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>. <fpage>2620</fpage>&#x2013;<lpage>2630</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52733.2024.00253</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Salka</surname> <given-names>T. D.</given-names></name>
<name><surname>Hanafi</surname> <given-names>M. B.</given-names></name>
<name><surname>Rahman</surname> <given-names>S. M. S. A. A.</given-names></name>
<name><surname>Zulperi</surname> <given-names>D. B. M.</given-names></name>
<name><surname>Omar</surname> <given-names>Z.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Plant leaf disease detection and classification using convolution neural networks model: A review</article-title>. <source>Artif. Intell. Rev.</source> <volume>58</volume>, <fpage>322</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10462-025-11234-6</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Sandler</surname> <given-names>M.</given-names></name>
<name><surname>Howard</surname> <given-names>A.</given-names></name>
<name><surname>Zhu</surname> <given-names>M.</given-names></name>
<name><surname>Zhmoginov</surname> <given-names>A.</given-names></name>
<name><surname>Chen</surname> <given-names>L.-C.</given-names></name>
</person-group> (<year>2018</year>). &#x201c;
<article-title>Mobilenetv2: Inverted residuals and linear bottlenecks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</conf-name>. <fpage>4510</fpage>&#x2013;<lpage>4520</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2018.00474</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Shafik</surname> <given-names>W.</given-names></name>
<name><surname>Tufail</surname> <given-names>A.</given-names></name>
<name><surname>De Silva</surname> <given-names>L. C.</given-names></name>
<name><surname>Haji Mohd Apong</surname> <given-names>R. A.</given-names></name>
<name><surname>Kim</surname> <given-names>K.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Deep learning technique for plant disease classification and pest detection and model explainability elevating agricultural sustainability</article-title>. <source>BMC Plant Biol.</source> <volume>25</volume>, <fpage>1491</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s12870-025-07377-x</pub-id>, PMID: <pub-id pub-id-type="pmid">41184784</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Song</surname> <given-names>W.</given-names></name>
<name><surname>Hao</surname> <given-names>L.</given-names></name>
<name><surname>Hao</surname> <given-names>G.</given-names></name>
<name><surname>Hao</surname> <given-names>Q.</given-names></name>
<name><surname>Xu</surname> <given-names>Y.</given-names></name>
<name><surname>Cui</surname> <given-names>L.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Deformable object detection network for lightweight cucumber leaf disease detection</article-title>. <source>Proc. CCF Conf. Comput. Supported Cooperative Work Soc. Computing</source>. <volume>2344</volume>, <fpage>255</fpage>&#x2013;<lpage>265</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-981-96-2376-1_19</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Spisni</surname> <given-names>E.</given-names></name>
<name><surname>Valerii</surname> <given-names>M. C.</given-names></name>
<name><surname>De Fazio</surname> <given-names>L.</given-names></name>
<name><surname>Rotondo</surname> <given-names>E.</given-names></name>
<name><surname>Di Natale</surname> <given-names>M.</given-names></name>
<name><surname>Giovanardi</surname> <given-names>E.</given-names></name>
<etal/>
</person-group>. (<year>2020</year>). 
<article-title>A khorasan wheat-based diet improves systemic inflammatory profile in semi-professional basketball players: A randomized crossover pilot study</article-title>. <source>J. Sci. Food Agric.</source> <volume>100</volume>, <fpage>4101</fpage>&#x2013;<lpage>4107</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/jsfa.9947</pub-id>, PMID: <pub-id pub-id-type="pmid">31347165</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Sun</surname> <given-names>H.</given-names></name>
<name><surname>Li</surname> <given-names>X.</given-names></name>
<name><surname>Li</surname> <given-names>X.</given-names></name>
<name><surname>Wang</surname> <given-names>X.</given-names></name>
<name><surname>Cheng</surname> <given-names>Z.</given-names></name>
<name><surname>Al-Absi</surname> <given-names>M. A.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>A multi-scale detection model for tomato leaf diseases with small target detection head</article-title>. <source>Front. Plant Sci.</source> <volume>16</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2025.1598534</pub-id>, PMID: <pub-id pub-id-type="pmid">41036394</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Tang</surname> <given-names>Y.</given-names></name>
<name><surname>Han</surname> <given-names>K.</given-names></name>
<name><surname>Guo</surname> <given-names>J.</given-names></name>
<name><surname>Xu</surname> <given-names>C.</given-names></name>
<name><surname>Xu</surname> <given-names>C.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Ghostnetv2: Enhance cheap operation with long-range attention</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>35</volume>, <fpage>9969</fpage>&#x2013;<lpage>9982</lpage>.
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>H.</given-names></name>
<name><surname>Frery</surname> <given-names>A. C.</given-names></name>
<name><surname>Li</surname> <given-names>M.</given-names></name>
<name><surname>Ren</surname> <given-names>P.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Underwater image enhancement via histogram similarity-oriented color compensation complemented by multiple attribute adjustment</article-title>. <source>Intelligent Mar. Technol. Syst.</source> <volume>1</volume>, <fpage>12</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s44295-023-00015-y</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>H.</given-names></name>
<name><surname>Zhang</surname> <given-names>W.</given-names></name>
<name><surname>Xu</surname> <given-names>Y.</given-names></name>
<name><surname>Li</surname> <given-names>H.</given-names></name>
<name><surname>Ren</surname> <given-names>P.</given-names></name>
</person-group> (<year>2026</year>). 
<article-title>Watercyclediffusion: Visual&#x2013;textual fusion empowered underwater image enhancement</article-title>. <source>Inf. Fusion</source> <volume>127</volume>, <elocation-id>103693</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.inffus.2025.103693</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wu</surname> <given-names>L.</given-names></name>
<name><surname>Zheng</surname> <given-names>Z.</given-names></name>
<name><surname>Qi</surname> <given-names>L.</given-names></name>
<name><surname>Ma</surname> <given-names>X.</given-names></name>
<name><surname>Liang</surname> <given-names>Z.</given-names></name>
<name><surname>Chen</surname> <given-names>G.</given-names></name>
</person-group> (<year>2014</year>). 
<article-title>Field detection method of rice leaf blast lesions based on image processing</article-title>. <source>Res. Agric. Mechanization</source>. <volume>1</volume>, <fpage>32</fpage>&#x2013;<lpage>35</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3969/j.issn.1003-188X.2014.09.007</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>J.</given-names></name>
<name><surname>Liu</surname> <given-names>S.</given-names></name>
<name><surname>Wu</surname> <given-names>J.</given-names></name>
<name><surname>Su</surname> <given-names>X.</given-names></name>
<name><surname>Hai</surname> <given-names>N.</given-names></name>
<name><surname>Huang</surname> <given-names>X.</given-names></name>
</person-group> (<year>2025</year>). &#x201c;
<article-title>Pinwheel-shaped convolution and scale-based dynamic loss for infrared small target detection</article-title>,&#x201d; in <conf-name>Proceedings of the AAAI Conference on Artificial Intelligence</conf-name>, Vol. <volume>39</volume>. <fpage>9202</fpage>&#x2013;<lpage>9210</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2412.16986</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>X.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Pg-detr: A lightweight and efficient detection transformer for early stage pomegranate fruit detection</article-title>. <source>IEEE Access</source>. <volume>13</volume>, <fpage>155547</fpage>&#x2013;<lpage>155559</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2025.3605887</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>W.</given-names></name>
<name><surname>Cong</surname> <given-names>M.</given-names></name>
<name><surname>Wang</surname> <given-names>L.</given-names></name>
</person-group> (<year>2003</year>). &#x201c;
<article-title>Algorithms for optical weak small targets detection and tracking</article-title>,&#x201d; in <conf-name>Proceedings of the 2003 International Conference on Neural Networks and Signal Processing</conf-name>, Vol. <volume>1</volume>. <fpage>643</fpage>&#x2013;<lpage>647</lpage> (<conf-sponsor>IEEE</conf-sponsor>).
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>Y.</given-names></name>
<name><surname>Song</surname> <given-names>J.</given-names></name>
<name><surname>Yu</surname> <given-names>X.</given-names></name>
<name><surname>Ji</surname> <given-names>X.</given-names></name>
</person-group> (<year>2025</year>b). 
<article-title>Wmc-rtdetr: A lightweight tea disease detection model</article-title>. <source>Front. Plant Sci.</source> <volume>16</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2025.1574920</pub-id>, PMID: <pub-id pub-id-type="pmid">40395277</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>W.</given-names></name>
<name><surname>Sun</surname> <given-names>X.</given-names></name>
<name><surname>Zhou</surname> <given-names>L.</given-names></name>
<name><surname>Xie</surname> <given-names>X.</given-names></name>
<name><surname>Zhao</surname> <given-names>W.</given-names></name>
<name><surname>Liang</surname> <given-names>Z.</given-names></name>
<etal/>
</person-group>. (<year>2023</year>). 
<article-title>Dual-branch collaborative learning network for crop disease identification</article-title>. <source>Front. Plant Sci.</source> <volume>14</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2023.1117478</pub-id>, PMID: <pub-id pub-id-type="pmid">36844059</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>C.</given-names></name>
<name><surname>Zhang</surname> <given-names>T.</given-names></name>
<name><surname>Shang</surname> <given-names>G.</given-names></name>
</person-group> (<year>2025</year>a). 
<article-title>Mavm-unet: Multiscale aggregated vision mamba u-net for field rice pest detection</article-title>. <source>Front. Plant Sci.</source> <volume>16</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2025.1635310</pub-id>, PMID: <pub-id pub-id-type="pmid">40880872</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Zhao</surname> <given-names>J.-X.</given-names></name>
<name><surname>Liu</surname> <given-names>J.-J.</given-names></name>
<name><surname>Fan</surname> <given-names>D.-P.</given-names></name>
<name><surname>Cao</surname> <given-names>Y.</given-names></name>
<name><surname>Yang</surname> <given-names>J.</given-names></name>
<name><surname>Cheng</surname> <given-names>M.-M.</given-names></name>
</person-group> (<year>2019</year>). &#x201c;
<article-title>Egnet: Edge guidance network for salient object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision</conf-name>. <fpage>8779</fpage>&#x2013;<lpage>8788</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2019.00887</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Zheng</surname> <given-names>J.</given-names></name>
<name><surname>Yang</surname> <given-names>Q.</given-names></name>
</person-group> (<year>2024</year>). &#x201c;
<article-title>Contextual boundary aware network for salient object detection</article-title>,&#x201d; in <conf-name>Proceedings of the 2024 7th International Conference on Image and Graphics Processing</conf-name>. <fpage>19</fpage>&#x2013;<lpage>24</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1145/3647649.3647653</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/777389">Xiao Ming Zhang</ext-link>, Yunnan Agricultural University, China</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2863105">Hao Wang</ext-link>, Laoshan National Laboratory, China</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3065544">Yulong Nan</ext-link>, Yancheng Institute of Technology, China</p></fn>
</fn-group>
</back>
</article>