<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2025.1663813</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Fast real-time detection and counting of thrips in greenhouses with multi-level feature attention and fusion</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>He</surname>
<given-names>Zhangzhang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3163144/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Chen</surname>
<given-names>Xinyue</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Gao</surname>
<given-names>Ying</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Yu</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Guo</surname>
<given-names>Yuheng</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhai</surname>
<given-names>Tong</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wei</surname>
<given-names>Xiaochen</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Huan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhu</surname>
<given-names>Haipeng</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Fu</surname>
<given-names>Yongkun</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Zhang</surname>
<given-names>Zhiliang</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2974896/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>College of Food and Biology, Jingchu University of Technology</institution>, <addr-line>Jingmen, Hubei</addr-line>,&#xa0;<country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>School of Computer Science, Yangtze University</institution>, <addr-line>Jingzhou</addr-line>,&#xa0;<country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Ministry of Agriculture and Rural Affairs of the People&#x2019;s Republic of China (MARA) Key Laboratory of Sustainable Crop Production in the Middle Reaches of the Yangtze River (Co-Construction by Ministry and Province), College of Agriculture, Yangtze University</institution>, <addr-line>Jingzhou</addr-line>,&#xa0;<country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1858294/overview">Bimlesh Kumar</ext-link>, Indian Institute of Technology Guwahati, India</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3136373/overview">Qing Dong</ext-link>, Northeastern University, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3149378/overview">Wenchao Xiang</ext-link>, Hebei University of Technology, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Zhiliang Zhang, <email xlink:href="mailto:zzl.st@yangtzeu.edu.cn">zzl.st@yangtzeu.edu.cn</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>21</day>
<month>08</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>16</volume>
<elocation-id>1663813</elocation-id>
<history>
<date date-type="received">
<day>11</day>
<month>07</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>06</day>
<month>08</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 He, Chen, Gao, Zhang, Guo, Zhai, Wei, Li, Zhu, Fu and Zhang.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>He, Chen, Gao, Zhang, Guo, Zhai, Wei, Li, Zhu, Fu and Zhang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Thrips can damage over 200 species across 62 plant families, causing significant economic losses worldwide. Their tiny size, rapid reproduction, and wide host range make them prone to outbreaks, necessitating precise and efficient population monitoring methods. Existing intelligent counting methods lack effective solutions for tiny pests like thrips. In this work, we propose the Thrip Counting and Detection Network (TCD-Net). TCD-Net is a fully convolutional network consisting of a backbone network, a feature pyramid, and an output head. First, we propose a lightweight backbone network, PartialNeXt, which optimizes convolution layers through Partial Convolution (PConv), ensuring both network performance and reduced complexity. Next, we design a lightweight channel-spatial hybrid attention mechanism to further refine multi-scale features, enhancing the model&#x2019;s ability to extract global and local features with minimal computational cost. Finally, we introduce the Adaptive Feature Mixer Feature Pyramid Network (AFM-FPN), where the Adaptive Feature Mixer (AFM) replaces the traditional element-wise addition at the P level, enhancing the model&#x2019;s ability to select and retain thrips features, improving detection performance for extremely small objects. The model is trained with the Object Counting Loss (OC Loss) specifically designed for the detection of tiny pests, allowing the network to predict a small spot region for each thrips, enabling real-time and precise counting and detection. We collected a dataset containing over 47K thrips annotations to evaluate the model&#x2019;s performance. The results show that TCD-Net achieves an F1 score of 85.67%, with a counting result correlation of 75.50%. The model size is only 21.13M, with a computational cost of 114.36 GFLOPs. Compared to existing methods, TCD-Net achieves higher thrips counting and detection accuracy with lower computational complexity. 
The dataset is publicly available at <ext-link ext-link-type="uri" xlink:href="http://www.github.com/ZZL0897/thrip_leaf_dataset">github.com/ZZL0897/thrip_leaf_dataset</ext-link>.</p>
</abstract>
<kwd-group>
<kwd>thrip</kwd>
<kwd>pest counting</kwd>
<kwd>pest detection</kwd>
<kwd>precision agriculture</kwd>
<kwd>lightweight network</kwd>
</kwd-group>
<counts>
<fig-count count="14"/>
<table-count count="10"/>
<equation-count count="12"/>
<ref-count count="49"/>
<page-count count="20"/>
<word-count count="9222"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Sustainable and Intelligent Phytoprotection</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Thrips belong to the order Thysanoptera and the family Thripidae. These insects are small in size, reproduce rapidly, and have a body length of less than 2mm. They are typically yellow, brown, or black in color. The eggs vary in shape, including kidney-shaped, round, and oval, with colors ranging from colorless to white and yellow (<xref ref-type="bibr" rid="B42">Zhang, 2011</xref>; <xref ref-type="bibr" rid="B39">Wu et&#xa0;al., 2018</xref>). Thrips exhibit diverse feeding habits, predominantly phytophagous. They can damage over 200 crop species from 62 families, including Cucurbitaceae, Fabaceae, Brassicaceae, and Solanaceae (<xref ref-type="bibr" rid="B20">Kirk et&#xa0;al., 2021</xref>). Thrips inflict significant economic losses worldwide. Controlling thrips is challenging for three main reasons: 1) Their small size and strong concealment tendencies, as they prefer to hide in flowers, tender tips, and the undersides of leaves, making detection difficult. 2) Their short life cycle and rapid reproduction, which contribute to the rapid development of resistance to chemical pesticides, leading to outbreaks. 3) Their broad host range, strong dispersal ability, and excellent ecological adaptability, enabling severe damage to various crops (<xref ref-type="bibr" rid="B31">Steenbergen et&#xa0;al., 2018</xref>). Therefore, it is crucial to accurately detect and count thrips.</p>
<p>Traditional manual counting methods for pests are time-consuming and labor-intensive, while computer vision and deep learning-based intelligent detection technologies can significantly improve monitoring efficiency (<xref ref-type="bibr" rid="B46">Zhang et&#xa0;al., 2020b</xref>, <xref ref-type="bibr" rid="B48">2024a</xref>, <xref ref-type="bibr" rid="B47">2024b</xref>; <xref ref-type="bibr" rid="B26">Liu et&#xa0;al., 2025</xref>; <xref ref-type="bibr" rid="B44">Zhang et&#xa0;al., 2025</xref>). Current research on intelligent pest detection and counting mainly focuses on improvements to object detection algorithms. Key improvements include optimizing feature extraction backbones, enhancing the Feature Pyramid Network (FPN), improving the Region Proposal Network (RPN), and optimizing anchor generation and selection mechanisms to better suit pest counting and detection tasks (<xref ref-type="bibr" rid="B17">Jiao et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B10">Dong et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B25">Liu et&#xa0;al., 2021a</xref>; <xref ref-type="bibr" rid="B34">Wang et&#xa0;al., 2021b</xref>; <xref ref-type="bibr" rid="B19">Jiao et&#xa0;al., 2022b</xref>; <xref ref-type="bibr" rid="B33">Wang et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B9">Dong et&#xa0;al., 2024b</xref>). For instance, <xref ref-type="bibr" rid="B35">Wang et&#xa0;al. (2021a)</xref> and <xref ref-type="bibr" rid="B18">Jiao et&#xa0;al. (2022a)</xref> both made improvements to R-CNN by incorporating attention mechanisms into the network, enriching the features extracted to enhance detection performance. <xref ref-type="bibr" rid="B8">Dong et&#xa0;al. (2024a)</xref> made comprehensive improvements to the YOLO model, effectively enhancing the model&#x2019;s feature attention capabilities and multi-scale feature extraction, increasing accuracy while reducing model parameters. 
These studies demonstrated the strong benchmark performance of object detection in pest counting and detection tasks. They have made effective improvements to address challenges such as small pest size and complex backgrounds, promoting the application of object detection methods in agricultural pest detection expert systems.</p>
<p>However, detecting and counting extremely small pests like thrips and planthoppers still poses challenges. Small object detection has consistently posed a challenge for object detectors, often resulting in False Negatives (FNs) and False Positives (FPs). The limited features and low signal-to-noise ratio of extremely small pests hinder object detectors from extracting sufficient features or accurately locating the anchors (<xref ref-type="bibr" rid="B41">Zhan et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B9">Dong et&#xa0;al., 2024b</xref>; <xref ref-type="bibr" rid="B48">Zhang et&#xa0;al., 2024a</xref>). Some scholars have explored solutions to these challenges. <xref ref-type="bibr" rid="B16">He et&#xa0;al. (2020)</xref> and <xref ref-type="bibr" rid="B21">Lee et&#xa0;al. (2020)</xref> both used Faster R-CNN for intelligent detection of brown planthoppers and tea thrips, respectively. <xref ref-type="bibr" rid="B35">Wang et&#xa0;al. (2021a)</xref> and <xref ref-type="bibr" rid="B34">Wang et&#xa0;al. (2021b)</xref> improved RPN and incorporated feature attention mechanisms to enhance detection performance for small pests. <xref ref-type="bibr" rid="B7">De Cesaro et&#xa0;al. (2022)</xref> utilized Mask R-CNN for counting aphids and parasitic wasps, achieving approximately 80% result correlation. <xref ref-type="bibr" rid="B22">Li et&#xa0;al. (2022)</xref> proposed a two-stage detection method for whiteflies and thrips, initially locating pests using spectral features, followed by recognition using Support Vector Machines (SVM). <xref ref-type="bibr" rid="B33">Wang et&#xa0;al. (2023)</xref> developed an anchor-free framework and a dynamic detection head, achieving competitive results on two multi-class small-object pest datasets. <xref ref-type="bibr" rid="B9">Dong et&#xa0;al. (2024b)</xref> designed multi-scale feature aggregation and dynamic perception modules, achieving optimal detection performance. <xref ref-type="bibr" rid="B40">Yang et&#xa0;al. 
(2024)</xref> introduced a super-resolution module and multi-level feature fusion in YOLOv8, achieving a 57% mAP for detecting extremely small pests. <xref ref-type="bibr" rid="B48">Zhang et&#xa0;al. (2024a)</xref> proposed an innovative rice planthopper detection method based on a fully convolutional architecture and object counting loss, achieving an F1 score of 92.36%. <xref ref-type="bibr" rid="B1">Banerjee et&#xa0;al. (2024)</xref> and <xref ref-type="bibr" rid="B38">Wu et&#xa0;al. (2024)</xref> designed IoT-based thrips pest monitoring systems, which effectively improved monitoring efficiency for thrips populations in their experimental environments.</p>
<p>The aforementioned studies provide innovative research ideas and improvement pathways for counting and detecting extremely small pests. However, research on intelligent counting methods for thrips remains limited. Existing methods for precise counting and detection of thrips still have significant room for improvement in detection accuracy and model runtime efficiency. Therefore, this paper focuses on thrips as the research subject, collects thrips infestation data from <italic>Spathiphyllum floribundum &#x2018;Clevelandii&#x2019;</italic> cultivated in greenhouses, and proposes a new real-time counting and detection algorithm for thrips, offering an efficient and reliable intelligent method for monitoring small pests in greenhouses. The main contributions of this paper are as follows:</p>
<list list-type="order">
<list-item>
<p>Thrip Counting and Detection Network (TCD-Net). A fully convolutional network based on a multi-level attention mechanism and feature adaptive fusion is built. The Object Counting Loss (OC Loss), designed for extremely small pests, is used to train the network, enabling real-time and accurate detection and counting of thrips in greenhouses.</p>
</list-item>
<list-item>
<p>Optimized backbone network and feature attention mechanism. The PartialNeXt backbone network is proposed, in which the convolution layers of ConvNeXtV2 are optimized using Partial Convolution (PConv), improving the network&#x2019;s computational efficiency and feature reuse capability. Then, a channel-spatial hybrid attention (HA) mechanism that balances performance and efficiency is designed to enhance detection stability.</p>
</list-item>
<list-item>
<p>Multi-scale feature adaptive fusion. The Adaptive Feature Mixer Feature Pyramid Network (AFM-FPN) is proposed, using the Adaptive Feature Mixer (AFM) for adaptive fusion of P-level multi-scale features, enhancing the model&#x2019;s ability to select and retain thrips features, thereby improving detection accuracy for extremely small objects.</p>
</list-item>
<list-item>
<p>We collect a thrips dataset consisting of 5,618 images and 47,726 annotations. Extensive experiments and comparisons are conducted on this dataset to verify the superiority of TCD-Net in detection accuracy and computational efficiency.</p>
</list-item>
</list>
</sec>
<sec id="s2">
<label>2</label>
<title>Materials</title>
<sec id="s2_1">
<label>2.1</label>
<title>Data acquisition</title>
<p>Our team collected the dataset from July to September 2024 in the Plant Growth Chamber at Jingchu Sci-tech Park, Jingchu University of Technology, using potted <italic>Spathiphyllum floribundum &#x2018;Clevelandii&#x2019;</italic>. The temperature in the growth chamber was 25&#xb0;C, with humidity levels ranging from 50% to 70%, and light intensity was 10,000 lux. The thrips species identified on the infected leaves was <italic>Megalurothrips usitatus</italic>. Data collection was carried out by six plant protection students. They randomly took 2&#x2013;3 images of thrips on the leaves at different time intervals using smartphones, keeping only the clearest image at each location. The shooting environment is shown in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Plant greenhouse.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1663813-g001.tif">
<alt-text content-type="machine-generated">Shelves filled with green potted plants are displayed. The plants have broad, vibrant leaves and are arranged neatly in rows on metal racks, creating a lush, organized appearance in an indoor setting.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Dataset</title>
<p>After data collection was completed, a total of 5,618 images were selected to form the dataset, and all images were resized to a resolution of 1280&#xd7;1280. The thrips annotations were performed collaboratively by six photographers, followed by a second round of verification to ensure annotation accuracy. The annotation tool used was Labelme, with the initial annotation results in JSON format. Subsequently, we converted the annotation results to COCO and YOLO formats for easy comparison with other methods. The dataset contains a total of 47,726 thrips annotations. The dataset was split into training, validation, and test sets in a 6:2:2 ratio, and specific statistics are shown in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>. The dataset is publicly available at <ext-link ext-link-type="uri" xlink:href="http://www.github.com/ZZL0897/thrip_leaf_dataset">github.com/ZZL0897/thrip_leaf_dataset</ext-link>.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Dataset information.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" colspan="2" align="center">Train</th>
<th valign="middle" colspan="2" align="center">Validation</th>
<th valign="middle" colspan="2" align="center">Test</th>
<th valign="middle" colspan="2" align="center">Statistics</th>
</tr>
<tr>
<th valign="middle" align="center">Images</th>
<th valign="middle" align="center">Annotations</th>
<th valign="middle" align="center">Images</th>
<th valign="middle" align="center">Annotations</th>
<th valign="middle" align="center">Images</th>
<th valign="middle" align="center">Annotations</th>
<th valign="middle" align="center">Avg. num</th>
<th valign="middle" align="center">Avg. bbox area</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center" style="">3370</td>
<td valign="middle" align="center" style="">28934</td>
<td valign="middle" align="center" style="">1124</td>
<td valign="middle" align="center" style="">9407</td>
<td valign="middle" align="center" style="">1124</td>
<td valign="middle" align="center" style="">9385</td>
<td valign="middle" align="center" style="">8.5</td>
<td valign="middle" align="center" style="">176px</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>It is worth noting that the average pixel area of the thrips bounding boxes in the images is only 176px, with widths ranging from 2px to 54px and heights ranging from 2px to 56px. The ratio of the average pixel area of the bounding boxes to the image pixel area is only 0.011%, which highlights the fact that thrips are extremely small targets in the images, making accurate detection a significant challenge.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Proposed method</title>
<sec id="s3_1">
<label>3.1</label>
<title>Network construction</title>
<p>The overall structure of TCD-Net is shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>. Its modular design is similar to that of a typical object detection network. The backbone network extracts rich multi-scale feature information from the input image, with attention mechanisms further enhancing the feature representation. These multi-scale features are fed into the FPN to improve the network&#x2019;s performance in detecting small objects (<xref ref-type="bibr" rid="B23">Lin et&#xa0;al., 2017</xref>). Finally, the output head generates the final predictions. Unlike traditional object detection methods, this network is fully convolutional. The output head consists of four 1&#xd7;1 convolutions, which reduce the output channel count of the FPN to 1, and interpolate it back to the input size, ultimately combining the results into a single prediction output.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>The overall structure of TCD-Net. The network architecture consists of four components. The backbone network extracts fundamental image features and outputs four sets of multi-scale feature maps. These four feature maps are then fed into the Hybrid Attention (HA) for further refinement, followed by adaptive feature fusion through the AFM-FPN. Finally, four 1&#xd7;1 convolutional layers serve as the output heads to generate the prediction results.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1663813-g002.tif">
<alt-text content-type="machine-generated">Diagram of a neural network architecture. Arrows indicate flow from &#x201c;Input&#x201d; to &#x201c;PartialNeXt_Nano&#x201d; as the Backbone. Hybrid attention (HA) modules follow, leading to the &#x201c;Adaptive Feature Mixer Feature Pyramid Network&#x201d; as the Neck. The final stage is the Head, culminating in &#x201c;Output.&#x201d;</alt-text>
</graphic>
</fig>
<p>A regular fully convolutional network cannot count and localize tiny objects. We address this by using a specially designed loss function during training, allowing the network to accept object detection labels and enabling the counting and detection of small pests in images. The implementation process will be detailed in Sections 3.2 and 3.3.</p>
<sec id="s3_1_1">
<label>3.1.1</label>
<title>Feature extraction backbone</title>
<p>The choice of feature extraction backbone plays a crucial role in the performance of the model. We improve ConvNeXtV2 and propose PartialNeXt, which offers higher computational efficiency and better feature extraction performance. The introduction of ConvNeXtV2 has elevated the convolutional neural network model to new heights in both computational efficiency and model performance (<xref ref-type="bibr" rid="B36">Woo et&#xa0;al., 2023</xref>). However, its key feature extraction convolution layer uses a 7&#xd7;7 Depthwise Convolution (DWConv). Although DWConv reduces the model&#x2019;s parameter count and computational load, the increased memory access frequency and insufficient hardware optimization actually reduce the computational speed. Therefore, we replace the DWConv in ConvNeXtV2 with Partial Convolution (PConv) to enhance the model&#x2019;s computational speed. The core idea of PConv is that there is significant redundancy in the massive feature maps of the model. PConv performs traditional convolution operations only on a small portion of the feature map, while the remaining majority of the feature map is directly passed to the next layer. This achieves a balance between model efficiency and performance (<xref ref-type="bibr" rid="B5">Chen et&#xa0;al., 2023</xref>). The operation process of PConv is shown in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Partial convolution. PConv only performs traditional convolution operations on a small portion of the feature map, and the rest is directly passed to the next layer. This reduces computational redundancy and memory access frequency, and with the use of traditional convolutions, it benefits from better hardware support, improving computation speed.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1663813-g003.tif">
<alt-text content-type="machine-generated">Diagram illustrating a neural network operation with an input block leading to an output block. A direct identity mapping arrow connects input to output. A separate path shows a convolutional operation (Conv2D) using small yellow convolution kernels. Labels define symbols: an arrow for Identity, an asterisk for Conv2D, and colored blocks for Conv kernels.</alt-text>
</graphic>
</fig>
<p>The structural parameters of the backbone network refer to the Nano version of ConvNeXtV2, which offers good feature extraction ability while maintaining low parameter and computation counts. The overall structure of PartialNeXt is shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4A</bold>
</xref>.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>
<bold>(A)</bold> The overall structure of PartialNeXt, whose layers and channels are designed according to ConvNeXtV2 Nano; <bold>(B)</bold> The structure of the Downsample layer; <bold>(C)</bold> The structure of the PartialNeXt Block, whose key improvement is the use of partial convolution to optimize feature extraction.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1663813-g004.tif">
<alt-text content-type="machine-generated">Diagram illustrating the PartialNeXT architecture. Panel A shows the flow from input through Conv2d, Layer Norm, and several PartialNeXT Blocks with dimensions 80, 160, 320, and 640 interspersed with downsampling steps. Panel B details a downsampling step involving Layer Norm and Conv2d with kernel size 2 and stride 2. Panel C elaborates the PartialNeXT Block, starting with Partial Conv2d, followed by Layer Norm, Conv2d, GELU activation, GRN, and Conv2d again, ending with a summation operation.</alt-text>
</graphic>
</fig>
<p>The network structure of PartialNeXt adopts a hierarchical design, divided into four stages. Each stage contains a downsampling layer, with the number of blocks and channels in each stage consistent with ConvNeXtV2 Nano. The stages, from shallow to deep, contain [2, 2, 8, 2] PartialNeXt Blocks with corresponding channel counts of [80, 160, 320, 640]. Multi-scale features are crucial for object detection tasks, and these four stages can extract features at four different scales, C2 to C5, for subsequent feature fusion. The structure of the Downsample layer is shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4B</bold>
</xref>; this layer is responsible for reducing the resolution of feature maps and expanding the channel count. At the beginning of each stage, a convolution layer with a kernel size of 2 and a stride of 2 reduces the resolution of the feature map by half while doubling the number of channels. Layer Normalization is applied to ensure stable feature distribution, enhancing model training efficiency. The structure of the PartialNeXt Block is shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4C</bold>
</xref>. Each Block starts with PConv, which is the most critical improvement, with a kernel size of 7. We use the default parameters from the PConv paper, where the ratio of the feature map for feature extraction to the feature map for direct forward is 1:3. A 1&#xd7;1 convolution is used for cross-channel information fusion, while the other modules follow the ConvNeXtV2 design.</p>
</sec>
<sec id="s3_1_2">
<label>3.1.2</label>
<title>Hybrid attention</title>
<p>Although the model employs a fully convolutional architecture, its objective is to achieve accurate counting and localization of tiny thrips rather than pursuing precise contour segmentation. Therefore, we introduce a lightweight hybrid channel-spatial attention mechanism. This mechanism focuses on enhancing the detection accuracy for small targets while introducing only minimal additional computational overhead. After the backbone network outputs four multi-scale features (C2&#x2013;C5), all are fed into the HA module for feature extraction.</p>
<p>For the input feature <italic>f</italic>, we first compute its channel attention, then calculate its spatial attention, and finally add its residual, as shown in <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>. Below, we will detail the channel attention and spatial attention mechanisms.</p>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>f</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>f</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Most channel attention mechanisms apply global average pooling to the feature map, which captures only single-channel information. This offers limited improvements for detecting small objects, as global pooling tends to weaken the features of tiny targets. In our channel attention mechanism, we combine both local and global features to enhance performance on small objects while keeping computational overhead minimal (<xref ref-type="bibr" rid="B32">Wan et&#xa0;al., 2023</xref>). As shown in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>, the input feature map undergoes adaptive average pooling to produce a local pooling result of size <italic>ls</italic> (the local size), followed by global average pooling applied to the local result to obtain the global pooling result. Local pooling emphasizes local region features, while global pooling captures the distribution characteristics of the entire feature map. Both local and global pooling results are passed through a 1D convolution to extract features and compute attention. The global attention is interpolated to the size of the local attention and fused by element-wise addition. Finally, the fused result is interpolated to the input size and multiplied with the input feature map to generate the final channel attention map.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Mixed local channel attention. Integrating local and global features by using average pooling of different sizes in channel attention.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1663813-g005.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a data processing pipeline. It begins with an input represented as a three-dimensional block labeled \((H, W, C)\). The process involves several transformations: LAP, GAP, Conv1d with \(k=3\), reshaping, and interpolation, resulting in a final output block. The data undergoes dimensional changes, eventually combining blocks through interpolation, and ends with an output. Arrows indicate the direction and types of operations applied throughout the pipeline.</alt-text>
</graphic>
</fig>
<p>The implementation of spatial attention is straightforward. We adopt the spatial attention module from the Convolutional Block Attention Module (CBAM) (<xref ref-type="bibr" rid="B37">Woo et&#xa0;al., 2018</xref>), which incurs minimal computational overhead, as shown in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>. First, we extract distribution information of the spatial features by performing average and max pooling along the channel dimension. Then, a 2D convolution is applied to compute spatial attention.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Spatial attention.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1663813-g006.tif">
<alt-text content-type="machine-generated">Flowchart depicting a neural network process with four stages. The first stage shows an input cube with dimensions \(H, W, C\). It transitions to a rectangle labeled \(H, W, 2\) through MaxPool and AvgPool operations. The second transition applies a Conv2d with kernel size three to produce \(H, W, 1\). The final stage applies a Sigmoid function, retaining dimensions \(H, W, 1\).</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3_1_3">
<label>3.1.3</label>
<title>Adaptive feature mixer feature pyramid network</title>
<p>Feature Pyramid Networks (FPN) have become a standard paradigm for small object detection tasks, as they enhance small object feature information (<xref ref-type="bibr" rid="B23">Lin et&#xa0;al., 2017</xref>). Traditional FPNs fuse features through sampling and element-wise addition. However, this fusion method is not conducive to the flow of information between multi-scale feature maps. The element-wise addition could lead to the accumulation of abnormal feature information or cause the weakening of important features (<xref ref-type="bibr" rid="B6">Dai et&#xa0;al., 2021</xref>). To address this issue, we propose the Adaptive Feature Mixer Feature Pyramid Network (AFM-FPN). AFM-FPN uses an Adaptive Feature Mixer (AFM) module to perform adaptive weighted fusion of features, as shown in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Adaptive feature mixer feature pyramid network. The traditional element-wise addition of P-level features is replaced by the AFM module, which fuses the features adaptively to enhance performance.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1663813-g007.tif">
<alt-text content-type="machine-generated">Diagram illustrating the AFM-FPN Neck. It involves stages C2 to C5 connected through identity mappings. Feature maps F2 to F5 undergo adaptive feature mixing and 1x1 convolution with 256 channels to produce outputs P2 to P5. These outputs feed into prediction heads, which aggregate using element-wise summation in the output head section. Arrows and symbols denote processing steps.</alt-text>
</graphic>
</fig>
<p>The AFM module is divided into two branches: spatial feature extraction and channel feature extraction. It assigns fusion weights on a pixel-by-pixel basis for the two features to be fused, as shown in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>. The two features to be fused are then added element-wise. Two 1&#xd7;1 convolutions are used to obtain spatial feature weights with size (h, w, d). Global average pooling is applied to compress the spatial size of the feature map to 1&#xd7;1, and a Feed Forward Network (FFN) is used to encode the channel feature weights. The channel feature weights are broadcasted and added element-wise with the spatial feature weights, followed by activation with the <italic>Sigmoid</italic> function to obtain the adaptive fusion weight <italic>W</italic>, with size (h, w, d). The features <italic>f<sub>1</sub>
</italic> and <italic>f<sub>2</sub>
</italic> are then weighted and fused using <italic>W</italic>, as shown in <xref ref-type="disp-formula" rid="eq2">Equation 2</xref>.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Adaptive feature mixer. By extracting the spatial and channel features of the input features, fusion weights are assigned pixel-wise for the two features to be fused.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1663813-g008.tif">
<alt-text content-type="machine-generated">Diagram depicting a neural network module with dimensional operations. A feature map \( f_1 + f_2 \) undergoes a 1x1 Conv2D and ReLU, reducing depth from \( d \) to \( d/4 \), then back to \( d \). The result is combined using a sigmoid function to produce \( W_{gate} \). Global average pooling and feed-forward networks are also illustrated.</alt-text>
</graphic>
</fig>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Loss function</title>
<p>A pure fully convolutional network predicts the target&#x2019;s mask during training to enable precise segmentation of target pixels, but it lacks the capability for counting and detecting targets. <xref ref-type="bibr" rid="B48">Zhang et&#xa0;al. (2024a)</xref> observed that existing object detection methods struggle to count tiny pests, as the model struggles to learn the precise location and contours of the target due to missing features, leading to poor performance. They proposed RPH-Counter, using Object Counting Loss (OC Loss) to train the fully convolutional network and incorporating a self-attention mechanism to enhance the model&#x2019;s feature extraction capability, achieving precise detection of field planthoppers. Thrips are even smaller than planthoppers, presenting a greater challenge to model performance. Therefore, we further optimized the model and used OC Loss to train the fully convolutional network to enhance the detection performance for thrips. Our method uses object-level annotations similar to object detection, and the training process of the network model is shown in <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Training process of TCD-Net.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1663813-g009.tif">
<alt-text content-type="machine-generated">Diagram showing a training process workflow. It begins with &#x201c;1. Input Data&#x201d; leading to &#x201c;2. FCN Model,&#x201d; then &#x201c;3. Output.&#x201d; &#x201c;4. Object Counting Loss&#x201d; loop links output back to the model with &#x201c;5. Backpropagation.&#x201d; Outer arrows indicate a continuous cycle.</alt-text>
</graphic>
</fig>
<p>The OC Loss optimizes the model&#x2019;s prediction of object centers by focusing on the center points, restricting the model&#x2019;s prediction range for each object according to the annotated bounding box, and continuously constraining false positives during training, as shown in <xref ref-type="disp-formula" rid="eq3">Equation 3</xref>. The three sub-goals are optimized together during training, extending the original semantic segmentation capability of the fully convolutional network to include object detection and counting.</p>
<disp-formula id="eq3">
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mrow>
<mml:mi>O</mml:mi>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>B</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:munder>
<mml:mrow>
<mml:munder>
<mml:mrow>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>&#xfe38;</mml:mo>
</mml:munder>
</mml:mrow>
<mml:mrow>
<mml:mtext>Localization</mml:mtext>
<mml:mo>&#xa0;</mml:mo>
<mml:mtext>loss</mml:mtext>
</mml:mrow>
</mml:munder>
<mml:mo>+</mml:mo>
<mml:munder>
<mml:mrow>
<mml:munder>
<mml:mrow>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mi>B</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>B</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>&#xfe38;</mml:mo>
</mml:munder>
</mml:mrow>
<mml:mrow>
<mml:mtext>Boundary</mml:mtext>
<mml:mo>&#xa0;</mml:mo>
<mml:mtext>loss</mml:mtext>
</mml:mrow>
</mml:munder>
<mml:mo>+</mml:mo>
<mml:munder>
<mml:mrow>
<mml:munder>
<mml:mrow>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mi>F</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>&#xfe38;</mml:mo>
</mml:munder>
</mml:mrow>
<mml:mrow>
<mml:mtext>False</mml:mtext>
<mml:mo>&#xa0;</mml:mo>
<mml:mtext>positive</mml:mtext>
<mml:mo>&#xa0;</mml:mo>
<mml:mtext>loss</mml:mtext>
</mml:mrow>
</mml:munder>
</mml:mrow>
</mml:math>
</disp-formula>
<p>After forward propagation, the model generates a prediction matrix <italic>P</italic>, which has the same size as the input image. For each pixel <italic>i</italic>, the raw output value is denoted as <inline-formula>
<mml:math display="inline" id="im1">
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. To convert this into a probability score, the <italic>Sigmoid</italic> activation function is applied to the model&#x2019;s output. Let <inline-formula>
<mml:math display="inline" id="im2">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo stretchy="false">/</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>+</mml:mo>
<mml:mtext>exp</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> be the <italic>Sigmoid</italic> probability of a thrips at pixel <italic>i</italic>; the closer the value is to 1, the higher the likelihood that the position corresponds to a thrips.</p>
<p>Two ground-truth matrices, <italic>T<sub>L</sub>
</italic> and <italic>T<sub>B</sub>
</italic>, are defined, both matching the size of the input image. Matrix <italic>T<sub>L</sub>
</italic> stores the center locations of pests, assigning a value of 1 to the exact center of each pest and 0 to all other pixels. This serves as a precise localization target during training. On the other hand, <italic>T<sub>B</sub>
</italic> represents the object boundaries, assigning a value of 0 to pixels within the annotated bounding boxes and 1 to all other regions. This matrix is designed to guide the model in distinguishing pest boundaries from their surrounding areas. In the following sections, we will provide a comprehensive breakdown of the three sub-loss functions, each tailored to address specific aspects of the training objective.</p>
<sec id="s3_2_1">
<label>3.2.1</label>
<title>Localization loss</title>
<p>
<xref ref-type="bibr" rid="B2">Bearman et&#xa0;al. (2016)</xref> proposed a point-supervised semantic segmentation loss function that only requires point-level annotations to achieve approximate object contour segmentation. We applied and integrated this loss function into the Localization loss component of the OC Loss, enabling the model to accurately localize objects. The Localization loss optimizes the model to predict a region around each object&#x2019;s center, granting the model localization capabilities, as shown in <xref ref-type="disp-formula" rid="eq4">Equation 4</xref>.</p>
<disp-formula id="eq4">
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi>&#x2124;</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mtext>log</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Based on the object bounding box annotations, we first compute the coordinates of each object&#x2019;s center point and generate the ground truth matrix <italic>T<sub>L</sub>
</italic> for the object center points. The target center point label is 1. Let <inline-formula>
<mml:math display="inline" id="im3">
<mml:mrow>
<mml:msub>
<mml:mi>&#x2124;</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> be the set of coordinates in <italic>T<sub>L</sub>
</italic> where the label is 1. For these coordinates with label value of 1, let the predicted value of the corresponding position in the model prediction result <italic>P</italic> be <inline-formula>
<mml:math display="inline" id="im4">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. We aim to ensure that the model&#x2019;s output value at these positions is close to 1. This optimization objective ensures that the model can accurately localize each thrips. To provide more comprehensive training, we introduce a dynamic parameter <inline-formula>
<mml:math display="inline" id="im5">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where the contribution to the loss increases with the number of targets in the image.</p>
</sec>
<sec id="s3_2_2">
<label>3.2.2</label>
<title>Boundary loss</title>
<p>Localization loss only optimizes the model&#x2019;s prediction of each object&#x2019;s center region but does not provide guidance or constraints on the predicted region&#x2019;s boundaries, which can lead to model &#x201c;laziness&#x201d;, resulting in a lack of constraint on the predicted region. Boundary loss constrains the model&#x2019;s predicted range using the boundary information from the annotated bounding boxes, ensuring that the model predicts a small region around each thrips center. We pre-load a matrix <italic>T<sub>B</sub>
</italic> containing the boundary information for all targets in the Dataloader. In this matrix, the value of the element corresponding to the target bounding box position is 1, and only these positions hold a value of 1. <italic>T<sub>B</sub>
</italic> can indicate the boundary coordinates of each target.</p>
<p>Let <inline-formula>
<mml:math display="inline" id="im6">
<mml:mrow>
<mml:msub>
<mml:mi>&#x2124;</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>B</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> be the set of coordinates in <italic>T<sub>B</sub>
</italic> where the label is 1. For these boundary coordinates, let the predicted value of the corresponding position in the model prediction result <italic>P</italic> be <inline-formula>
<mml:math display="inline" id="im7">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. We aim to ensure that the model&#x2019;s output at these positions is close to 0. Boundary loss is formulated as in <xref ref-type="disp-formula" rid="eq5">Equation 5</xref>.</p>
<disp-formula id="eq5">
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mi>B</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>B</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi>&#x2124;</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>B</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mi>B</mml:mi>
</mml:msub>
<mml:mtext>log</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>This optimization objective constrains the model&#x2019;s predicted range, ensuring that the center of the predicted region for each object is accurate. Similarly, we introduce a dynamic parameter <inline-formula>
<mml:math display="inline" id="im8">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mi>B</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. When the image contains more targets, the targets may be closer to each other. Therefore, we assign higher weight to Boundary Loss to ensure that each target remains independently detected.</p>
</sec>
<sec id="s3_2_3">
<label>3.2.3</label>
<title>False positive loss</title>
<p>Localization loss and Boundary loss contribute only to the model&#x2019;s prediction of positive samples, without encouraging the model to learn the characteristics of negative samples. Therefore, we also incorporate False Positive Loss to train the model&#x2019;s ability to detect negative samples. The procedure for this is as follows: during training, we identify regions that the model incorrectly predicts as positive samples and encourage the model to predict these regions as background, as described in <xref ref-type="disp-formula" rid="eq6">Equation 6</xref>.</p>
<disp-formula id="eq6">
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mi>F</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi>&#x2124;</mml:mi>
<mml:mi>F</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mtext>log</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>The process for calculating erroneous prediction regions is as follows: First, we use a connected component labeling algorithm to assign unique labels to each independent region in the predicted result <italic>P</italic>. Then, we element-wise multiply <italic>P</italic> with the ground truth center point matrix <italic>T<sub>L</sub>
</italic> to obtain the prediction regions that contain ground truth target centers. Finally, the remaining regions are identified as erroneous predictions. Let <inline-formula>
<mml:math display="inline" id="im9">
<mml:mrow>
<mml:msub>
<mml:mi>&#x2124;</mml:mi>
<mml:mi>F</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> be the set of coordinates in <italic>P</italic> corresponding to these erroneous regions. We aim for the model&#x2019;s output at these positions to be close to 0.</p>
</sec>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Thrip counting and detection</title>
<sec id="s3_3_1">
<label>3.3.1</label>
<title>Thrip counting</title>
<p>The counting of thrips is achieved by calculating the number of independent regions in the model&#x2019;s prediction <italic>P</italic>. This is done using a connected component labeling algorithm (<xref ref-type="bibr" rid="B14">He et&#xa0;al., 2017</xref>), specifically implemented using the <italic>label</italic> method from the SciPy library.</p>
</sec>
<sec id="s3_3_2">
<label>3.3.2</label>
<title>Thrip detection</title>
<p>Thrip localization and detection results are obtained by calculating the centroid coordinates of each independent region. First, we extract the set of non-zero labels from the labeled matrix, excluding the background. Then, we construct a 2D coordinate matrix with the same dimensions as the input, where each pixel&#x2019;s row and column indices are recorded. Both the labeled matrix and coordinate matrix are flattened into 1D arrays for vectorized computation. Histogram statistics are used to count the number of pixels for each label, and weighted accumulation is performed on the row and column coordinates to obtain the total vertical and horizontal coordinates for the pixels in each connected region. Finally, the centroid coordinates are computed for each label using the centroid calculation formula.</p>
</sec>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experimental results</title>
<sec id="s4_1">
<label>4.1</label>
<title>Implementation details</title>
<p>The hardware used for model training and inference consists of an Intel Core i9-12900K CPU with 64&#xa0;GB of memory and an NVIDIA RTX 4090 GPU. The operating system is Ubuntu 22.04.1 LTS, with CUDA version 12.1. The model is built on Python 3.9 and PyTorch 2.1.2.</p>
<sec id="s4_1_1">
<label>4.1.1</label>
<title>Model details</title>
<p>In PartialNeXt, the ratio between feature maps processed by PConv for feature extraction and those directly bypassed is 1:3, with a kernel size of 7. The downsampling rates for C2&#x2013;C5 feature maps are 4&#xd7;, 8&#xd7;, 16&#xd7;, and 32&#xd7;, with channel counts of 80, 160, 320, and 640, respectively. When calculating the mixed attention for C2&#x2013;C5, the local size for each layer is 32, 16, 8, and 4, respectively. The kernel size for the Conv1D in channel attention is 3, while the kernel size for Conv2D in spatial attention is also 3. All feature maps are adjusted to 256 channels in the FPN, outputting four multi-scale features with 256 channels. Finally, four 1&#xd7;1 convolutions reduce the channel count of the four multi-scale features to 1; the resulting maps are then resampled back to the input size and merged, with the <italic>Sigmoid</italic> activation function applied, resulting in the final prediction.</p>
</sec>
<sec id="s4_1_2">
<label>4.1.2</label>
<title>Details of the methods used for comparison</title>
<p>We compare TCD-Net with existing methods, including one-stage detectors: YOLOv8 and YOLOv11 (<xref ref-type="bibr" rid="B29">Sharma et&#xa0;al., 2024</xref>). Two-stage detectors include Faster R-CNN (<xref ref-type="bibr" rid="B28">Ren et&#xa0;al., 2015</xref>), Cascade R-CNN (<xref ref-type="bibr" rid="B4">Cai and Vasconcelos, 2018</xref>) and Dynamic R-CNN (<xref ref-type="bibr" rid="B43">Zhang et&#xa0;al., 2020a</xref>). DETR-based detectors include Deformable DETR (<xref ref-type="bibr" rid="B49">Zhu et&#xa0;al., 2020</xref>) and DDQ-DETR (<xref ref-type="bibr" rid="B45">Zhang et&#xa0;al., 2023</xref>). We also compare with the recently proposed RPH-Counter (<xref ref-type="bibr" rid="B48">Zhang et&#xa0;al., 2024a</xref>). YOLO is implemented using the official open-source code, with the Large version of the model. The two-stage detectors and DETR-based detectors are implemented using the MMDetection framework, with the backbone network using ResNet50 pre-trained on ImageNet 1K. For anchor-based detectors, the anchor generation size is adapted to the target size of the thrips dataset.</p>
</sec>
<sec id="s4_1_3">
<label>4.1.3</label>
<title>Training details</title>
<p>During training, random flipping is used for data augmentation. The batch size is set to 1, and the Adam optimizer is used with a learning rate of 1e-5 and weight decay of 1e-4. All methods are trained for 100 epochs.</p>
</sec>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Evaluation metrics</title>
<sec id="s4_2_1">
<label>4.2.1</label>
<title>Detection accuracy</title>
<p>The model&#x2019;s localization accuracy can be evaluated by checking whether the predicted region&#x2019;s centroid lies within the ground truth bounding box. Object detection methods determine this by calculating the center point of the predicted box. The criteria for TP, FP, and FN are shown in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Criteria for determining TP, FP, and FN.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Flag</th>
<th valign="middle" align="left">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left" style="">True positive (TP)</td>
<td valign="middle" align="left" style="">The centroid of the predicted region lies within the ground truth bounding box</td>
</tr>
<tr>
<td valign="middle" align="left" style="">False positive (FP)</td>
<td valign="middle" align="left" style="">The centroid of the predicted region does not lie within any ground truth bounding box</td>
</tr>
<tr>
<td valign="middle" align="left" style="">False negative (FN)</td>
<td valign="middle" align="left" style="">No predicted region has its centroid within the ground truth bounding box</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The model&#x2019;s detection accuracy is evaluated using Precision, Recall, and F1 score, as shown in <xref ref-type="disp-formula" rid="eq7">Equations 7</xref>-<xref ref-type="disp-formula" rid="eq9">9</xref>. Our method uses a confidence threshold of 0.5, while the confidence threshold for object detection methods is determined by finding the value corresponding to the highest F1 score on the Precision-Recall curve.</p>
<disp-formula id="eq7">
<label>(7)</label>
<mml:math display="block" id="M7">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq8">
<label>(8)</label>
<mml:math display="block" id="M8">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq9">
<label>(9)</label>
<mml:math display="block" id="M9">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
<sec id="s4_2_2">
<label>4.2.2</label>
<title>Counting error</title>
<p>The algorithm&#x2019;s stability is evaluated using the Mean Absolute Error (MAE) and Root Mean Squared Error (RMSE). Let <inline-formula>
<mml:math display="inline" id="im10">
<mml:mrow>
<mml:msubsup>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im11">
<mml:mrow>
<mml:msubsup>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> represent the ground truth and predicted numbers of targets in the <italic>i-th</italic> image, respectively, and let <italic>N</italic> be the number of images. The calculations are shown in <xref ref-type="disp-formula" rid="eq10">Equations 10</xref> and <xref ref-type="disp-formula" rid="eq11">11</xref>.</p>
<disp-formula id="eq10">
<label>(10)</label>
<mml:math display="block" id="M10">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mi>N</mml:mi>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mrow>
<mml:mo>|</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo>|</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq11">
<label>(11)</label>
<mml:math display="block" id="M11">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>M</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>=</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mi>N</mml:mi>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo>|</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo>|</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:math>
</disp-formula>
<p>R-squared (R<sup>2</sup>) evaluates the similarity between the algorithm&#x2019;s counting results and the actual results, as shown in <xref ref-type="disp-formula" rid="eq12">Equation 12</xref>, where <inline-formula>
<mml:math display="inline" id="im12">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msup>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="true">&#xaf;</mml:mo>
</mml:mover>
<mml:mo>=</mml:mo>
<mml:mi>A</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>g</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mstyle>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. The R<sup>2</sup> value ranges from 0 to 1, with higher values indicating that the algorithm more accurately reflects the pest situation.</p>
<disp-formula id="eq12">
<label>(12)</label>
<mml:math display="block" id="M12">
<mml:mrow>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
<mml:mo>&#x200b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:mover accent="true">
<mml:mrow>
<mml:msup>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="true">&#xaf;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
<mml:mo>&#x200b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:mover accent="true">
<mml:mrow>
<mml:msup>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="true">&#xaf;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Training results</title>
<p>We visualized the reduction in loss during training, as well as the changes in counting error and accuracy on the validation set, as shown in <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10</bold>
</xref>. First, the model&#x2019;s training loss steadily decreased, with all sub-loss functions being well optimized. Meanwhile, in each evaluation cycle, the counting error on the validation set generally showed a decreasing trend, while the F1 score showed an increasing trend. This indicates that, after training, the model successfully achieved the objective of detecting and counting thrips in the images.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Visualization results of the training process.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1663813-g010.tif">
<alt-text content-type="machine-generated">Three line charts showing model training and validation metrics over 100 epochs. The first chart shows the training losses: total, localization, boundary, and false positive, all decreasing. The second chart displays validation metrics: F1 score, precision, and recall, initially increasing and stabilizing around 0.8 to 0.9. The third chart shows validation MAE and RMSE, both decreasing and stabilizing. Each chart includes a legend for clarity.</alt-text>
</graphic>
</fig>
<p>We further visualized the model&#x2019;s prediction, presented in the form of heatmaps, as shown in <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11</bold>
</xref>. After sufficient training, the model demonstrated the ability to detect thrips while being insensitive to the background. For each individual thrips, the model predicts a small spot area, and the predicted range is confined within the insect&#x2019;s body size. Subsequently, the number of independent regions can be calculated using a connected component labeling algorithm, and by calculating the centroid of each region, precise detection and counting of thrips can be achieved.</p>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>Visualization of model output.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1663813-g011.tif">
<alt-text content-type="machine-generated">Nine close-up images of blue-tinted leaves with small red specks scattered across their surfaces. Each leaf displays a distinct vein pattern, highlighting the texture and variation in speck placement.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Quantitative analysis</title>
<p>We compared TCD-Net with some existing methods widely used for pest counting. First, we compared the detection performance of these models, and the results are shown in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>. TCD-Net significantly outperforms the one-stage detectors, with higher Precision and Recall. YOLOv8l and YOLOv11l show relatively weaker performance, with lower F1 scores and Recall rates compared to other methods, likely due to their inadequate small object detection performance. The two-stage detectors performed relatively better in the thrips detection task. Compared to the one-stage detectors, the two-stage detectors showed a significant improvement in Recall. However, their drawback lies in lower Precision, which leads to more false positives, resulting in suboptimal F1 scores. Deformable DETR achieved higher detection performance, with a primary advantage in Precision. However, due to the global attention mechanism of the Transformer, the sparse features of small objects are prone to being overwhelmed by the background when calculated on high-dimensional feature maps. Additionally, the one-to-one matching (O2O) of predicted boxes in DETR results in far fewer positive samples than the one-to-many matching (O2M) in traditional detectors, which may reduce performance in small object detection tasks (<xref ref-type="bibr" rid="B30">Shihua et&#xa0;al., 2025</xref>). Therefore, current DETR-based detectors still face significant limitations in detecting extremely small objects, with lower Recall in thrips detection leading to many missed detections. TCD-Net demonstrated the best overall performance on both the validation and test sets, with an F1 score significantly higher than those of the other methods. Moreover, it achieved a good balance between Precision and Recall.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Comparison of detection accuracy with existing methods.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="center">Method</th>
<th valign="middle" colspan="3" align="center">Val</th>
<th valign="middle" colspan="3" align="center">Test</th>
</tr>
<tr>
<th valign="middle" align="center">F1</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">F1</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">Recall</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center" style="">TCD-Net</td>
<td valign="middle" align="center" style="">86.20%</td>
<td valign="middle" align="center" style="">85.30%</td>
<td valign="middle" align="center" style="">87.12%</td>
<td valign="middle" align="center" style="">85.67%</td>
<td valign="middle" align="center" style="">85.18%</td>
<td valign="middle" align="center" style="">86.17%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">RPH-Counter</td>
<td valign="middle" align="center" style="">83.14%</td>
<td valign="middle" align="center" style="">83.35%</td>
<td valign="middle" align="center" style="">82.93%</td>
<td valign="middle" align="center" style="">82.98%</td>
<td valign="middle" align="center" style="">82.67%</td>
<td valign="middle" align="center" style="">83.29%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">Faster R-CNN</td>
<td valign="middle" align="center" style="">80.68%</td>
<td valign="middle" align="center" style="">78.41%</td>
<td valign="middle" align="center" style="">83.10%</td>
<td valign="middle" align="center" style="">80.91%</td>
<td valign="middle" align="center" style="">79.31%</td>
<td valign="middle" align="center" style="">82.58%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">Cascade R-CNN</td>
<td valign="middle" align="center" style="">81.05%</td>
<td valign="middle" align="center" style="">78.66%</td>
<td valign="middle" align="center" style="">83.59%</td>
<td valign="middle" align="center" style="">81.11%</td>
<td valign="middle" align="center" style="">79.80%</td>
<td valign="middle" align="center" style="">82.46%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">Dynamic R-CNN</td>
<td valign="middle" align="center" style="">81.29%</td>
<td valign="middle" align="center" style="">81.69%</td>
<td valign="middle" align="center" style="">80.88%</td>
<td valign="middle" align="center" style="">81.23%</td>
<td valign="middle" align="center" style="">82.63%</td>
<td valign="middle" align="center" style="">79.89%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">Deformable DETR</td>
<td valign="middle" align="center" style="">82.67%</td>
<td valign="middle" align="center" style="">86.22%</td>
<td valign="middle" align="center" style="">79.40%</td>
<td valign="middle" align="center" style="">82.43%</td>
<td valign="middle" align="center" style="">86.09%</td>
<td valign="middle" align="center" style="">79.07%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">DDQ-DETR</td>
<td valign="middle" align="center" style="">82.97%</td>
<td valign="middle" align="center" style="">85.02%</td>
<td valign="middle" align="center" style="">81.02%</td>
<td valign="middle" align="center" style="">82.28%</td>
<td valign="middle" align="center" style="">84.13%</td>
<td valign="middle" align="center" style="">80.52%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">YOLOv8l</td>
<td valign="middle" align="center" style="">77.52%</td>
<td valign="middle" align="center" style="">76.35%</td>
<td valign="middle" align="center" style="">78.73%</td>
<td valign="middle" align="center" style="">77.04%</td>
<td valign="middle" align="center" style="">75.90%</td>
<td valign="middle" align="center" style="">78.22%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">YOLOv11l</td>
<td valign="middle" align="center" style="">76.89%</td>
<td valign="middle" align="center" style="">75.83%</td>
<td valign="middle" align="center" style="">77.98%</td>
<td valign="middle" align="center" style="">76.48%</td>
<td valign="middle" align="center" style="">75.56%</td>
<td valign="middle" align="center" style="">77.43%</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>We further compared the counting accuracy of these methods, and the results are shown in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>. TCD-Net once again demonstrates its advantage, with the lowest MAE and RMSE and the highest R<sup>2</sup> value, indicating that its counting results are the closest to the actual values, with the best stability. Considering the detection accuracy results, models with higher detection performance also show higher counting accuracy, reflecting a more accurate assessment of pest conditions.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Comparison of counting accuracy with existing methods.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="center">Method</th>
<th valign="middle" colspan="3" align="center">Val</th>
<th valign="middle" colspan="3" align="center">Test</th>
</tr>
<tr>
<th valign="middle" align="center">MAE</th>
<th valign="middle" align="center">RMSE</th>
<th valign="middle" align="center">R<sup>2</sup>
</th>
<th valign="middle" align="center">MAE</th>
<th valign="middle" align="center">RMSE</th>
<th valign="middle" align="center">R<sup>2</sup>
</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center" style="">TCD-Net</td>
<td valign="middle" align="center" style="">1.43</td>
<td valign="middle" align="center" style="">2.43</td>
<td valign="middle" align="center" style="">76.80%</td>
<td valign="middle" align="center" style="">1.49</td>
<td valign="middle" align="center" style="">2.48</td>
<td valign="middle" align="center" style="">75.50%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">RPH-Counter</td>
<td valign="middle" align="center" style="">1.69</td>
<td valign="middle" align="center" style="">2.66</td>
<td valign="middle" align="center" style="">65.62%</td>
<td valign="middle" align="center" style="">1.73</td>
<td valign="middle" align="center" style="">2.75</td>
<td valign="middle" align="center" style="">65.41%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">Faster R-CNN</td>
<td valign="middle" align="center" style="">2.12</td>
<td valign="middle" align="center" style="">3.14</td>
<td valign="middle" align="center" style="">62.57%</td>
<td valign="middle" align="center" style="">2.13</td>
<td valign="middle" align="center" style="">3.17</td>
<td valign="middle" align="center" style="">62.23%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">Cascade R-CNN</td>
<td valign="middle" align="center" style="">2.09</td>
<td valign="middle" align="center" style="">3.08</td>
<td valign="middle" align="center" style="">64.59%</td>
<td valign="middle" align="center" style="">2.09</td>
<td valign="middle" align="center" style="">3.06</td>
<td valign="middle" align="center" style="">62.85%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">Dynamic R-CNN</td>
<td valign="middle" align="center" style="">1.96</td>
<td valign="middle" align="center" style="">2.99</td>
<td valign="middle" align="center" style="">63.54%</td>
<td valign="middle" align="center" style="">2.02</td>
<td valign="middle" align="center" style="">3.10</td>
<td valign="middle" align="center" style="">60.51%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">Deformable DETR</td>
<td valign="middle" align="center" style="">1.99</td>
<td valign="middle" align="center" style="">2.87</td>
<td valign="middle" align="center" style="">65.24%</td>
<td valign="middle" align="center" style="">2.01</td>
<td valign="middle" align="center" style="">2.99</td>
<td valign="middle" align="center" style="">64.78%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">DDQ-DETR</td>
<td valign="middle" align="center" style="">1.88</td>
<td valign="middle" align="center" style="">2.85</td>
<td valign="middle" align="center" style="">66.13%</td>
<td valign="middle" align="center" style="">2.03</td>
<td valign="middle" align="center" style="">3.01</td>
<td valign="middle" align="center" style="">64.63%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">YOLOv8l</td>
<td valign="middle" align="center" style="">2.14</td>
<td valign="middle" align="center" style="">3.22</td>
<td valign="middle" align="center" style="">60.67%</td>
<td valign="middle" align="center" style="">2.15</td>
<td valign="middle" align="center" style="">3.26</td>
<td valign="middle" align="center" style="">59.73%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">YOLOv11l</td>
<td valign="middle" align="center" style="">2.19</td>
<td valign="middle" align="center" style="">3.29</td>
<td valign="middle" align="center" style="">59.92%</td>
<td valign="middle" align="center" style="">2.21</td>
<td valign="middle" align="center" style="">3.34</td>
<td valign="middle" align="center" style="">58.61%</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Finally, we compared the computational complexity of these methods. The comparison was based on four aspects: model parameter count, computational load, training speed, and inference speed, with the results shown in <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref>. When comparing with the one-stage detection models, YOLOv8 and YOLOv11, TCD-Net has lower theoretical parameter count and computational load. It also has slightly faster training and inference speeds than YOLO, while achieving significantly better detection performance. For more complex models, such as Deformable DETR and RPH-Counter, the detection performance of these models is slightly lower than that of TCD-Net, but their computational complexity is significantly higher, especially Deformable DETR, which fails to meet real-time inference speeds. In comparison with RPH-Counter, TCD-Net&#x2019;s computational load is less than half, and its inference speed is approximately 1.5 times faster. In summary, TCD-Net not only achieves higher detection and counting accuracy but also maintains a relatively low computational load, with its inference speed surpassing the real-time detection requirement.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Comparison of model complexity with existing methods.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Method</th>
<th valign="middle" align="center">Params (M)</th>
<th valign="middle" align="center">FLOPs (G)</th>
<th valign="middle" align="center">Training speed (it/s)</th>
<th valign="middle" align="center">Inference FPS on GPU</th>
<th valign="middle" align="center">Inference FPS on CPU</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center" style="">TCD-Net</td>
<td valign="middle" align="center" style="">21.13</td>
<td valign="middle" align="center" style="">114.36</td>
<td valign="middle" align="center" style="">20.76</td>
<td valign="middle" align="center" style="">91.66</td>
<td valign="middle" align="center" style="">1.67</td>
</tr>
<tr>
<td valign="middle" align="center" style="">RPH-Counter</td>
<td valign="middle" align="center" style="">36.37</td>
<td valign="middle" align="center" style="">247.58</td>
<td valign="middle" align="center" style="">14.06</td>
<td valign="middle" align="center" style="">62.66</td>
<td valign="middle" align="center" style="">0.94</td>
</tr>
<tr>
<td valign="middle" align="center" style="">Faster R-CNN</td>
<td valign="middle" align="center" style="">41.35</td>
<td valign="middle" align="center" style="">322.42</td>
<td valign="middle" align="center" style="">13.21</td>
<td valign="middle" align="center" style="">38.76</td>
<td valign="middle" align="center" style="">0.16</td>
</tr>
<tr>
<td valign="middle" align="center" style="">Cascade R-CNN</td>
<td valign="middle" align="center" style="">69.16</td>
<td valign="middle" align="center" style="">350.22</td>
<td valign="middle" align="center" style="">11.52</td>
<td valign="middle" align="center" style="">34.36</td>
<td valign="middle" align="center" style="">0.16</td>
</tr>
<tr>
<td valign="middle" align="center" style="">Dynamic R-CNN</td>
<td valign="middle" align="center" style="">41.75</td>
<td valign="middle" align="center" style="">323.60</td>
<td valign="middle" align="center" style="">12.92</td>
<td valign="middle" align="center" style="">38.46</td>
<td valign="middle" align="center" style="">0.17</td>
</tr>
<tr>
<td valign="middle" align="center" style="">Deformable DETR</td>
<td valign="middle" align="center" style="">41.21</td>
<td valign="middle" align="center" style="">319.21</td>
<td valign="middle" align="center" style="">3.45</td>
<td valign="middle" align="center" style="">14.86</td>
<td valign="middle" align="center" style="">0.24</td>
</tr>
<tr>
<td valign="middle" align="center" style="">DDQ-DETR</td>
<td valign="middle" align="center" style="">48.31</td>
<td valign="middle" align="center" style="">437.31</td>
<td valign="middle" align="center" style="">3.09</td>
<td valign="middle" align="center" style="">12.95</td>
<td valign="middle" align="center" style="">0.15</td>
</tr>
<tr>
<td valign="middle" align="center" style="">YOLOv8l</td>
<td valign="middle" align="center" style="">43.63</td>
<td valign="middle" align="center" style="">275.32</td>
<td valign="middle" align="center" style="">15.62</td>
<td valign="middle" align="center" style="">52.44</td>
<td valign="middle" align="center" style="">0.53</td>
</tr>
<tr>
<td valign="middle" align="center" style="">YOLOv11l</td>
<td valign="middle" align="center" style="">25.31</td>
<td valign="middle" align="center" style="">147.46</td>
<td valign="middle" align="center" style="">17.04</td>
<td valign="middle" align="center" style="">74.29</td>
<td valign="middle" align="center" style="">0.75</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_5">
<label>4.5</label>
<title>Visualization</title>
<p>We visualized the detection and counting results of each method for a more intuitive comparison, as shown in <xref ref-type="fig" rid="f12">
<bold>Figure&#xa0;12</bold>
</xref>. Upon observing the detection results of Faster R-CNN and Cascade R-CNN, it is evident that they suffer from insufficient detection precision, with many FPs present. YOLOv11l&#x2019;s detection results also include noticeable FN and FP, leading to higher counting discrepancies in some cases. The detection results of Deformable DETR contain fewer FN and FP compared to one-stage and two-stage detectors, but due to its lower recall rate, its counts are lower than the actual number of targets. Compared to existing methods, TCD-Net has fewer FN and FP, and its counting results are closer to the actual numbers. However, in some cases, the object detection methods may exhibit significant missed detections and false detections, as shown in <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Figure S1</bold>
</xref>. This is primarily due to the small proportion of thrips&#x2019; features, making accurate identification difficult. The visualized results align with the quantitative analysis, further confirming the comprehensive advantage of TCD-Net in the thrips detection and counting task.</p>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Visual comparison of prediction results with existing methods.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1663813-g012.tif">
<alt-text content-type="machine-generated">Comparison of leaf images across different algorithms: GT (Ground Truth), TCD-Net, Faster R-CNN, Cascade R-CNN, Deformable DETR, and YOLOv11. Each row shows predictions on leaves, marked with labeled dots indicating true positives in green, false positives in red, and false negatives in yellow.</alt-text>
</graphic>
</fig>
<p>Finally, as shown in <xref ref-type="fig" rid="f13">
<bold>Figure&#xa0;13</bold>
</xref>, we present a set of detection and counting results from TCD-Net. TCD-Net demonstrates high stability, with only a small number of FN and FP in the detection results, providing strong algorithmic support for the intelligent monitoring and management of thrips.</p>
<fig id="f13" position="float">
<label>Figure&#xa0;13</label>
<caption>
<p>Visualization of the prediction results of TCD-Net.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1663813-g013.tif">
<alt-text content-type="machine-generated">Grid of leaf images labeled with predicted numbers, each marked with colored dots. Green denotes true positive, red denotes false positive, and yellow denotes false negative. Labels include predictions like Pred:22, Pred:6, and Pred:18, indicating model assessments.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4_6">
<label>4.6</label>
<title>Comparative analysis</title>
<sec id="s4_6_1">
<label>4.6.1</label>
<title>Comparison of backbone</title>
<p>First, we compared PartialNeXt with several existing backbone networks, without using any attention mechanisms in the network. The features from four levels of the backbone network were input into the vanilla FPN for feature fusion. The performance comparison results are shown in <xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref>. When using PartialNeXt, the model outperforms several existing backbone networks in terms of F1 score, RMSE, and R<sup>2</sup> on both the validation and test sets. Compared to ConvNeXtV2-Nano, after applying PConv, the model&#x2019;s performance significantly improves, demonstrating that using PConv is a better choice than DWConv. When compared to classic backbone networks such as ResNet-50 (<xref ref-type="bibr" rid="B15">He et&#xa0;al., 2016</xref>), Swin Transformer-Tiny (<xref ref-type="bibr" rid="B24">Liu et&#xa0;al., 2021b</xref>), and FasterNet-S (<xref ref-type="bibr" rid="B5">Chen et&#xa0;al., 2023</xref>), PartialNeXt, through lightweight design and PConv optimization, is able to extract richer features. At the same time, PartialNeXt maintains a high degree of lightweight efficiency, as shown in <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table S1</bold>
</xref>. Compared to the larger backbone network Swin Transformer-Tiny, PartialNeXt achieves higher performance and a 4.5&#xd7; faster inference speed. In comparison with ConvNeXtV2-Nano, PartialNeXt delivers significantly higher performance with minimal efficiency loss.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Performance comparison of backbone networks.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="center">Backbone</th>
<th valign="middle" colspan="3" align="center">Val</th>
<th valign="middle" colspan="3" align="center">Test</th>
</tr>
<tr>
<th valign="middle" align="center">F1</th>
<th valign="middle" align="center">RMSE</th>
<th valign="middle" align="center">R<sup>2</sup>
</th>
<th valign="middle" align="center">F1</th>
<th valign="middle" align="center">RMSE</th>
<th valign="middle" align="center">R<sup>2</sup>
</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center" style="">ResNet-50</td>
<td valign="middle" align="center" style="">82.66%</td>
<td valign="middle" align="center" style="">2.77</td>
<td valign="middle" align="center" style="">65.14%</td>
<td valign="middle" align="center" style="">81.91%</td>
<td valign="middle" align="center" style="">3.01</td>
<td valign="middle" align="center" style="">63.79%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">Swin Transformer-Tiny</td>
<td valign="middle" align="center" style="">82.99%</td>
<td valign="middle" align="center" style="">2.75</td>
<td valign="middle" align="center" style="">65.21%</td>
<td valign="middle" align="center" style="">82.57%</td>
<td valign="middle" align="center" style="">2.99</td>
<td valign="middle" align="center" style="">64.21%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">ConvNeXt-Tiny</td>
<td valign="middle" align="center" style="">82.84%</td>
<td valign="middle" align="center" style="">2.78</td>
<td valign="middle" align="center" style="">65.09%</td>
<td valign="middle" align="center" style="">82.20%</td>
<td valign="middle" align="center" style="">2.96</td>
<td valign="middle" align="center" style="">63.88%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">ConvNeXtV2-Nano</td>
<td valign="middle" align="center" style="">82.91%</td>
<td valign="middle" align="center" style="">2.77</td>
<td valign="middle" align="center" style="">64.99%</td>
<td valign="middle" align="center" style="">82.21%</td>
<td valign="middle" align="center" style="">2.94</td>
<td valign="middle" align="center" style="">62.82%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">FasterNet-S</td>
<td valign="middle" align="center" style="">82.58%</td>
<td valign="middle" align="center" style="">2.88</td>
<td valign="middle" align="center" style="">66.42%</td>
<td valign="middle" align="center" style="">81.80%</td>
<td valign="middle" align="center" style="">3.07</td>
<td valign="middle" align="center" style="">59.99%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">PartialNeXt</td>
<td valign="middle" align="center" style="">83.69%</td>
<td valign="middle" align="center" style="">2.79</td>
<td valign="middle" align="center" style="">71.25%</td>
<td valign="middle" align="center" style="">83.56%</td>
<td valign="middle" align="center" style="">2.87</td>
<td valign="middle" align="center" style="">65.87%</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_6_2">
<label>4.6.2</label>
<title>Comparison of attention mechanism</title>
<p>Next, we fixed the backbone network as PartialNeXt and used the vanilla FPN for feature fusion. We compared the performance and efficiency of different attention mechanisms. The performance comparison results are shown in <xref ref-type="table" rid="T7">
<bold>Table&#xa0;7</bold>
</xref>. When using our proposed HA to process the multi-level feature maps of the backbone network, model performance improves, especially in R<sup>2</sup>, which shows a notable enhancement. This indicates that, after using HA, the model&#x2019;s output becomes more stable. Meanwhile, as shown in <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table S2</bold>
</xref>, the computational cost of HA is lower than that of CBAM, and the inference speed is only slightly lower than that of MLCA, achieving a good balance between model performance and efficiency.</p>
<table-wrap id="T7" position="float">
<label>Table&#xa0;7</label>
<caption>
<p>Performance comparison of attention mechanisms.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="center">Attention</th>
<th valign="middle" colspan="3" align="center">Val</th>
<th valign="middle" colspan="3" align="center">Test</th>
</tr>
<tr>
<th valign="middle" align="center">F1</th>
<th valign="middle" align="center">RMSE</th>
<th valign="middle" align="center">R<sup>2</sup>
</th>
<th valign="middle" align="center">F1</th>
<th valign="middle" align="center">RMSE</th>
<th valign="middle" align="center">R<sup>2</sup>
</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center" style="">-</td>
<td valign="middle" align="center" style="">83.69%</td>
<td valign="middle" align="center" style="">2.79</td>
<td valign="middle" align="center" style="">71.25%</td>
<td valign="middle" align="center" style="">83.56%</td>
<td valign="middle" align="center" style="">2.87</td>
<td valign="middle" align="center" style="">65.87%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">CBAM</td>
<td valign="middle" align="center" style="">85.12%</td>
<td valign="middle" align="center" style="">2.67</td>
<td valign="middle" align="center" style="">72.44%</td>
<td valign="middle" align="center" style="">84.20%</td>
<td valign="middle" align="center" style="">2.68</td>
<td valign="middle" align="center" style="">67.17%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">MLCA</td>
<td valign="middle" align="center" style="">84.81%</td>
<td valign="middle" align="center" style="">2.64</td>
<td valign="middle" align="center" style="">71.46%</td>
<td valign="middle" align="center" style="">84.37%</td>
<td valign="middle" align="center" style="">2.58</td>
<td valign="middle" align="center" style="">68.99%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">HA</td>
<td valign="middle" align="center" style="">85.54%</td>
<td valign="middle" align="center" style="">2.46</td>
<td valign="middle" align="center" style="">75.34%</td>
<td valign="middle" align="center" style="">84.94%</td>
<td valign="middle" align="center" style="">2.61</td>
<td valign="middle" align="center" style="">69.04%</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_6_3">
<label>4.6.3</label>
<title>Comparison of FPN</title>
<p>Finally, we compared the performance and efficiency of different FPNs. With the backbone network fixed as PartialNeXt and no attention mechanism, the performance comparison results are shown in <xref ref-type="table" rid="T8">
<bold>Table&#xa0;8</bold>
</xref>. Using our proposed AFM-FPN further enhanced the model&#x2019;s feature fusion mechanism, improving the model&#x2019;s detection and counting performance for thrips. At the same time, as shown in <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table S3</bold>
</xref>, the model with AFM-FPN has fewer parameters and a lower computational cost, balancing model performance and efficiency effectively.</p>
<table-wrap id="T8" position="float">
<label>Table&#xa0;8</label>
<caption>
<p>Performance comparison of FPNs.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="center">FPN</th>
<th valign="middle" colspan="3" align="center">Val</th>
<th valign="middle" colspan="3" align="center">Test</th>
</tr>
<tr>
<th valign="middle" align="center">F1</th>
<th valign="middle" align="center">RMSE</th>
<th valign="middle" align="center">R<sup>2</sup>
</th>
<th valign="middle" align="center">F1</th>
<th valign="middle" align="center">RMSE</th>
<th valign="middle" align="center">R<sup>2</sup>
</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center" style="">Vanilla FPN</td>
<td valign="middle" align="center" style="">83.69%</td>
<td valign="middle" align="center" style="">2.79</td>
<td valign="middle" align="center" style="">71.25%</td>
<td valign="middle" align="center" style="">83.56%</td>
<td valign="middle" align="center" style="">2.87</td>
<td valign="middle" align="center" style="">65.87%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">PAFPN</td>
<td valign="middle" align="center" style="">85.05%</td>
<td valign="middle" align="center" style="">2.48</td>
<td valign="middle" align="center" style="">72.27%</td>
<td valign="middle" align="center" style="">84.58%</td>
<td valign="middle" align="center" style="">2.62</td>
<td valign="middle" align="center" style="">70.01%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">AFM-FPN</td>
<td valign="middle" align="center" style="">85.93%</td>
<td valign="middle" align="center" style="">2.45</td>
<td valign="middle" align="center" style="">75.75%</td>
<td valign="middle" align="center" style="">85.35%</td>
<td valign="middle" align="center" style="">2.55</td>
<td valign="middle" align="center" style="">71.03%</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s4_7">
<label>4.7</label>
<title>Ablation study</title>
<sec id="s4_7_1">
<label>4.7.1</label>
<title>Loss ablation</title>
<p>We conducted an ablation study on the components of the loss function, and the visualization results are shown in <xref ref-type="fig" rid="f14">
<bold>Figure&#xa0;14</bold>
</xref>. When only &#x2112;<italic>
<sub>L</sub>
</italic> is used, the model exhibits &#x201c;laziness,&#x201d; predicting the entire image as the foreground to include all thrips. When &#x2112;<italic>
<sub>L</sub>
</italic>+&#x2112;<italic>
<sub>B</sub>
</italic> is used, the lack of constraints on false positives leads to a large number of false positive predictions. When &#x2112;<italic>
<sub>L</sub>
</italic>+&#x2112;<italic>
<sub>F</sub>
</italic> is used, the model predicts a larger spot for each thrips, but due to the absence of constraints on prediction boundaries, the model is unable to separate thrips that are close together. When the complete loss function is used, the model predicts a smaller spot for each thrips, with individuals well separated, and false positives are constrained, achieving precise detection and counting of thrips.</p>
<fig id="f14" position="float">
<label>Figure&#xa0;14</label>
<caption>
<p>Visual comparison of loss ablation.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1663813-g014.tif">
<alt-text content-type="machine-generated">Grid of leaf images showing different detection methods for small objects. Rows display varied lighting conditions: normal, blue filtration, and combinations of luminance and edge detection. Annotations appear in green (true positive), red (false positive), and blue (multiple objects). Each column applies a different detection technique, each labeled beneath: Original, L&#x2097;, L&#x2097;+L&#x1d66;, L&#x2097;+L&#x209a;, and L&#x2097;+L&#x1d66;+L&#x209a;.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4_7_2">
<label>4.7.2</label>
<title>Network module ablation</title>
<p>We conducted ablation experiments on the three key improvements we proposed to validate their effectiveness, and the model performance comparison results are shown in <xref ref-type="table" rid="T9">
<bold>Table&#xa0;9</bold>
</xref>. Each of the three proposed improvement modules effectively enhances the model&#x2019;s performance, and when combined, they exhibit significant synergistic effects. First, when used individually, each of these modules improves the evaluation metrics, confirming the independent effectiveness of each module. Then, combining two modules further enhances performance, with PConv+AFM-FPN performing the best, showing an improvement of 8.21 percentage points in R<sup>2</sup> on the test set over the baseline. Finally, when all three improvements are combined, the model achieves optimal performance, with the test set F1 reaching 85.67%, RMSE reduced by 15.6% relative to the baseline, and R<sup>2</sup> increased by 12.68 percentage points. These results significantly outperform the baseline and any combination of submodules, demonstrating the rationality and necessity of the multi-module collaborative design.</p>
<table-wrap id="T9" position="float">
<label>Table&#xa0;9</label>
<caption>
<p>Ablation study on model performance.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="center">PConv</th>
<th valign="middle" rowspan="2" align="center">HA</th>
<th valign="middle" rowspan="2" align="center">AFM-FPN</th>
<th valign="middle" colspan="3" align="center">Val</th>
<th valign="middle" colspan="3" align="center">Test</th>
</tr>
<tr>
<th valign="middle" align="center">F1</th>
<th valign="middle" align="center">RMSE</th>
<th valign="middle" align="center">R<sup>2</sup></th>
<th valign="middle" align="center">F1</th>
<th valign="middle" align="center">RMSE</th>
<th valign="middle" align="center">R<sup>2</sup></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style="">82.91%</td>
<td valign="middle" align="center" style="">2.77</td>
<td valign="middle" align="center" style="">64.99%</td>
<td valign="middle" align="center" style="">82.21%</td>
<td valign="middle" align="center" style="">2.94</td>
<td valign="middle" align="center" style="">62.82%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style="">83.69%</td>
<td valign="middle" align="center" style="">2.79</td>
<td valign="middle" align="center" style="">71.25%</td>
<td valign="middle" align="center" style="">83.56%</td>
<td valign="middle" align="center" style="">2.87</td>
<td valign="middle" align="center" style="">65.87%</td>
</tr>
<tr>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style="">83.20%</td>
<td valign="middle" align="center" style="">2.73</td>
<td valign="middle" align="center" style="">66.97%</td>
<td valign="middle" align="center" style="">82.66%</td>
<td valign="middle" align="center" style="">2.83</td>
<td valign="middle" align="center" style="">64.55%</td>
</tr>
<tr>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style="">83.68%</td>
<td valign="middle" align="center" style="">2.66</td>
<td valign="middle" align="center" style="">68.49%</td>
<td valign="middle" align="center" style="">83.07%</td>
<td valign="middle" align="center" style="">2.90</td>
<td valign="middle" align="center" style="">65.88%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style="">85.54%</td>
<td valign="middle" align="center" style="">2.46</td>
<td valign="middle" align="center" style="">75.34%</td>
<td valign="middle" align="center" style="">84.94%</td>
<td valign="middle" align="center" style="">2.61</td>
<td valign="middle" align="center" style="">69.04%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style="">85.93%</td>
<td valign="middle" align="center" style="">2.45</td>
<td valign="middle" align="center" style="">75.75%</td>
<td valign="middle" align="center" style="">85.35%</td>
<td valign="middle" align="center" style="">2.55</td>
<td valign="middle" align="center" style="">71.03%</td>
</tr>
<tr>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style="">84.39%</td>
<td valign="middle" align="center" style="">2.55</td>
<td valign="middle" align="center" style="">72.70%</td>
<td valign="middle" align="center" style="">83.60%</td>
<td valign="middle" align="center" style="">2.72</td>
<td valign="middle" align="center" style="">67.36%</td>
</tr>
<tr>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style="">86.20%</td>
<td valign="middle" align="center" style="">2.43</td>
<td valign="middle" align="center" style="">76.80%</td>
<td valign="middle" align="center" style="">85.67%</td>
<td valign="middle" align="center" style="">2.48</td>
<td valign="middle" align="center" style="">75.50%</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As shown in <xref ref-type="table" rid="T10">
<bold>Table&#xa0;10</bold>
</xref>, we further investigated the impact of these improvements on the model&#x2019;s computational efficiency. First, PConv, due to the use of partial vanilla convolution, results in a noticeable increase in parameters and computational load (+5M Params, +25.96G FLOPs), but has minimal impact on training speed. HA, with almost no increase in parameters and computation, slightly reduces the inference speed, indicating that the computational overhead of its attention mechanism is manageable. The multi-scale fusion structure introduced by AFM-FPN also only slightly increases the computational burden (+0.2M Params, +4.49G FLOPs), while maintaining high training and inference efficiency. When combining the modules, the inference speed drops to 91.66 it/s but still exceeds the real-time requirements. Overall, the modules achieve a good balance between computational cost and performance improvement.</p>
<table-wrap id="T10" position="float">
<label>Table&#xa0;10</label>
<caption>
<p>Ablation study on model efficiency.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">PConv</th>
<th valign="middle" align="center">HA</th>
<th valign="middle" align="center">AFM-FPN</th>
<th valign="middle" align="center">Params (M)</th>
<th valign="middle" align="center">FLOPs (G)</th>
<th valign="middle" align="center">Training speed (it/s)</th>
<th valign="middle" align="center">Inference speed (it/s)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style="">15.93</td>
<td valign="middle" align="center" style="">83.89</td>
<td valign="middle" align="center" style="">21.69</td>
<td valign="middle" align="center" style="">113.02</td>
</tr>
<tr>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style="">20.93</td>
<td valign="middle" align="center" style="">109.85</td>
<td valign="middle" align="center" style="">21.32</td>
<td valign="middle" align="center" style="">106.07</td>
</tr>
<tr>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style="">15.93</td>
<td valign="middle" align="center" style="">83.91</td>
<td valign="middle" align="center" style="">21.35</td>
<td valign="middle" align="center" style="">107.08</td>
</tr>
<tr>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style="">16.13</td>
<td valign="middle" align="center" style="">88.38</td>
<td valign="middle" align="center" style="">21.44</td>
<td valign="middle" align="center" style="">108.11</td>
</tr>
<tr>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style="">20.93</td>
<td valign="middle" align="center" style="">109.87</td>
<td valign="middle" align="center" style="">21.13</td>
<td valign="middle" align="center" style="">100.56</td>
</tr>
<tr>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style="">21.13</td>
<td valign="middle" align="center" style="">114.34</td>
<td valign="middle" align="center" style="">20.99</td>
<td valign="middle" align="center" style="">96.31</td>
</tr>
<tr>
<td valign="middle" align="center" style=""/>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style="">16.13</td>
<td valign="middle" align="center" style="">88.40</td>
<td valign="middle" align="center" style="">21.18</td>
<td valign="middle" align="center" style="">97.54</td>
</tr>
<tr>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style="">&#x2713;</td>
<td valign="middle" align="center" style="">21.13</td>
<td valign="middle" align="center" style="">114.36</td>
<td valign="middle" align="center" style="">20.76</td>
<td valign="middle" align="center" style="">91.66</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
</sec>
<sec id="s5" sec-type="discussion">
<label>5</label>
<title>Discussion</title>
<p>We identified the performance shortcomings of existing methods in thrips detection and made key improvements to address these issues. The main advantages of TCD-Net include: 1) State-of-the-art optimization: TCD-Net follows the latest neural network optimization approaches, improving the model&#x2019;s performance through enhancements in feature extraction, attention mechanisms, and multi-scale feature fusion. 2) Model efficiency: While optimizing the model, we ensure its efficiency by using methods with low parameter and computational requirements, rather than merely stacking modules, achieving a balance between performance and efficiency. 3) Specialized loss function: We use a loss function tailored for small object pest detection, avoiding the issue in traditional object detection methods where it is difficult to predict and match precise small target bounding boxes, ensuring the model&#x2019;s baseline performance.</p>
<p>However, this work still faces some limitations. First, regarding the dataset, we have collected a thrips dataset with over 47K annotations in a greenhouse, and the public release of this dataset can contribute to the field of extremely small pest detection. Although TCD-Net has shown good performance in our environment, greenhouse and field conditions are nearly infinitely complex, and the diversity and scale of the dataset still require further development. While data collection and annotation took considerable time and incurred high labor costs, it remains crucial to gather richer datasets in future work. New data augmentation techniques can be explored, such as using generative models like GANs and diffusion models to synthesize new data, which can be combined with the original dataset, reducing annotation costs and increasing data richness (<xref ref-type="bibr" rid="B27">Lu et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B47">Zhang et&#xa0;al., 2024b</xref>). Furthermore, unsupervised and weakly supervised methods can be explored for model training to reduce the need for large annotated datasets and enhance model generalization (<xref ref-type="bibr" rid="B3">Bollis et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B13">Han et&#xa0;al., 2025</xref>).</p>
<p>Regarding method optimization, further development of the model&#x2019;s attention and feature fusion mechanisms is an ongoing direction that requires continued exploration. At the same time, model efficiency must be considered to ensure feasibility in practical deployment. The loss function also needs further development. While it has been successful for small pest counting, its current support for large-scale, multi-class tasks is limited. Future work could focus on optimizing the localization loss part of the loss function to enhance its multi-class support capabilities. Another potential avenue is the development of hybrid or multi-branch networks to improve support for large-scale pest detection. For example, using a hybrid machine learning and deep learning structure could enhance model performance, or employing a combined density estimation and object detection network could simultaneously improve pest detection and counting accuracy (<xref ref-type="bibr" rid="B11">Gao et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B12">Han et&#xa0;al., 2024</xref>).</p>
</sec>
<sec id="s6" sec-type="conclusions">
<label>6</label>
<title>Conclusions</title>
<p>This paper presents an efficient model for thrips counting and detection, capable of performing real-time, accurate counting and detection of thrips on the leaves of <italic>Spathiphyllum floribundum</italic> &#x2018;Clevelandii&#x2019; in greenhouses. TCD-Net is a unique fully convolutional network structure, which utilizes our designed efficient PartialNeXt as the backbone network, combined with lightweight Hybrid Attention and AFM-FPN to extract and fuse rich thrips features. By predicting a small region for each thrips, TCD-Net achieves precise counting and detection. Experiments were conducted on a dataset containing over 47K thrips annotations, and the results demonstrate that TCD-Net provides highly accurate counting and detection performance, while maintaining low model complexity and an inference speed that far exceeds real-time requirements. On the test set, TCD-Net achieved an F1 score of 85.67% and a counting result correlation of 75.50%, outperforming existing methods in both counting and detection accuracy. Additionally, the model size (21.13M parameters) and theoretical computational load (114.36 GFLOPs) are less than half those of two-stage object detection methods, while the inference speed (91.66 it/s) is more than twice as fast as that of two-stage object detection methods. In summary, TCD-Net achieves higher thrips counting and detection accuracy with lower computational complexity, demonstrating its potential for detecting extremely small pests. Future optimization directions include further improving model training and inference speeds, integrating with patrol robots for intelligent pest monitoring in greenhouses, and expanding its application to other types of pests, contributing to intelligent pest reporting systems.</p>
</sec>
</body>
<back>
<sec id="s7" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Material</bold>
</xref>. Further inquiries can be directed to the corresponding author/s.</p>
</sec>
<sec id="s8" sec-type="ethics-statement">
<title>Ethics statement</title>
<p>The manuscript presents research on animals that do not require ethical approval for their study.</p>
</sec>
<sec id="s9" sec-type="author-contributions">
<title>Author contributions</title>
<p>ZH: Writing &#x2013; original draft, Writing &#x2013; review &amp; editing, Conceptualization, Investigation. XC: Data curation, Investigation, Validation, Writing &#x2013; review &amp; editing. YiG: Data curation, Investigation, Writing &#x2013; review &amp; editing, Funding acquisition. YZ: Data curation, Investigation, Visualization, Writing &#x2013; review &amp; editing. YuG: Investigation, Software, Writing &#x2013; review &amp; editing. TZ: Data curation, Investigation, Writing &#x2013; review &amp; editing. XW: Data curation, Software, Writing &#x2013; review &amp; editing. HL: Data&#xa0;curation, Investigation, Writing &#x2013; review &amp; editing. HZ: Data curation, Investigation, Writing &#x2013; review &amp; editing. YF: Data&#xa0;curation, Investigation, Writing &#x2013; review &amp; editing. ZZ: Writing &#x2013; original draft, Writing &#x2013; review &amp; editing, Investigation, Methodology, Software.</p>
</sec>
<sec id="s10" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. This work was supported by Young and Middle-aged Leading Science and Technology Innovation Team by the Education Department of Hubei Province (No. T2023032) and the Scientific Research Project of Jingchu University of Technology (No. ZD202304).</p>
</sec>
<sec id="s11" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s12" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec id="s13" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s14" sec-type="supplementary-material">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fpls.2025.1663813/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fpls.2025.1663813/full#supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.docx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Banerjee</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Sarda</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Khandelwal</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Gajarushi</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Gawande</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Velmurugan</surname> <given-names>R.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). &#x201c;<article-title>IoT-based sensing system for thrips pest and disease management in&#xa0;onion crop</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Applied Sensing Conference (APSCON)</conf-name>. <publisher-loc>Goa,&#xa0;India</publisher-loc>: <publisher-name>IEEE</publisher-name>.
</citation></ref>
<ref id="B2">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Bearman</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Russakovsky</surname> <given-names>O.</given-names>
</name>
<name>
<surname>Ferrari</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Fei-Fei</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>What&#x2019;s the point:&#xa0;semantic segmentation with point supervision</article-title>,&#x201d; in <conf-name>Proceedings of the European Conference on Computer Vision (ECCV)</conf-name>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>. <fpage>549</fpage>&#x2013;<lpage>565</lpage>.</citation></ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bollis</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Maia</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Pedrini</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Avila</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Weakly supervised attention-based models using activation maps for citrus mite and insect pest classification</article-title>. <source>Comput. Electron. Agric.</source> <volume>195</volume>, <elocation-id>106839</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2022.106839</pub-id>
</citation></ref>
<ref id="B4">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Cai</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Vasconcelos</surname> <given-names>N.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Cascade R-CNN: delving into high quality object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Salt Lake City, UT, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>. <fpage>6154</fpage>&#x2013;<lpage>6162</lpage>.</citation></ref>
<ref id="B5">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Kao</surname> <given-names>S.-H.</given-names>
</name>
<name>
<surname>He</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhuo</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Wen</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>C.-H.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). &#x201c;<article-title>Run, don&#x2019;t walk: chasing higher FLOPS for faster neural networks</article-title>,&#x201d; in <conf-name>IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Vancouver, BC, Canada</publisher-loc>: <publisher-name>IEEE</publisher-name>. <fpage>12021</fpage>&#x2013;<lpage>12031</lpage>.</citation></ref>
<ref id="B6">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Dai</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Gieseke</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Oehmcke</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Barnard</surname> <given-names>K.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Attentional feature fusion</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)</conf-name>. <publisher-loc>Waikoloa, HI, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>. <fpage>3560</fpage>&#x2013;<lpage>3569</lpage>.</citation></ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>De Cesaro</surname> <given-names>T.</given-names>
<suffix>Jr.</suffix>
</name>
<name>
<surname>Rieder</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Di Dom&#xea;nico</surname> <given-names>J. R.</given-names>
</name>
<name>
<surname>Lau</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>InsectCV: A system for insect detection in the lab from trap images</article-title>. <source>Ecol. Inf.</source> <volume>67</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ecoinf.2021.101516</pub-id>
</citation></ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dong</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Han</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Cai</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2024</year>a). <article-title>PestLite: A novel YOLO-based deep learning technique for crop pest detection</article-title>. <source>Agriculture</source> <volume>14</volume>, <fpage>228</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agriculture14020228</pub-id>
</citation></ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dong</surname> <given-names>S. F.</given-names>
</name>
<name>
<surname>Teng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Jiao</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Du</surname> <given-names>J. M.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>R. J.</given-names>
</name>
</person-group> (<year>2024</year>b). <article-title>ESA-Net: An efficient scale-aware network for small crop pest detection</article-title>. <source>Expert Syst. Appl.</source> <volume>236</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.eswa.2023.121308</pub-id>
</citation></ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dong</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Jiao</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Du</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>CRA-Net: A channel recalibration feature pyramid network for detecting small pests</article-title>. <source>Comput. Electron. Agric.</source> <volume>191</volume>, <elocation-id>106518</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2021.106518</pub-id>
</citation></ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gao</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Xue</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Lennox</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Stevens</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Developing a hybrid convolutional neural network for automatic aphid counting in sugar beet fields</article-title>. <source>Comput. Electron. Agric.</source> <volume>220</volume>, <elocation-id>108910</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.108910</pub-id>
</citation></ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Han</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>BED-YOLO: an enhanced YOLOv8 for high-precision real-time bearing defect detection</article-title>. <source>IEEE Trans. Instrumentation Measurement</source> <volume>73</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TIM.2024.3472791</pub-id>
</citation></ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Han</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Jia</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>F.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Weakly supervised camouflaged object detection as Progressive Perception Learning</article-title>. <source>Knowledge-Based Syst.</source> <volume>325</volume>, <elocation-id>113993</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.knosys.2025.113993</pub-id>
</citation></ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Yao</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Chao</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>The connected-component labeling problem: A review of state-of-the-art algorithms</article-title>. <source>Pattern Recognition</source> <volume>70</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.patcog.2017.04.018</pub-id>
</citation></ref>
<ref id="B15">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Deep residual learning for image recognition</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Las Vegas, NV, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>. <fpage>770</fpage>&#x2013;<lpage>778</lpage>.</citation></ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Tian</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Brown rice planthopper (<italic>Nilaparvata lugens</italic> Stal) detection based on deep learning</article-title>. <source>Precis. Agric.</source> <volume>21</volume>, <fpage>1385</fpage>&#x2013;<lpage>1402</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11119-020-09726-2</pub-id>
</citation></ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiao</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>S. F.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>S. Y.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>C. J.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>H. Q.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>AF-RCNN: An anchor-free convolutional neural network for multi-categories agricultural pest detection</article-title>. <source>Comput. Electron. Agric.</source> <volume>174</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2020.105522</pub-id>
</citation></ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiao</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Du</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>H.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>a). <article-title>Global context-aware-based deformable residual network module for precise pest recognition and detection</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.895944</pub-id>, PMID: <pub-id pub-id-type="pmid">35720529</pub-id></citation></ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiao</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>C. J.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Du</surname> <given-names>J. M.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>b). <article-title>Adaptive feature fusion pyramid network for multi-classes agricultural pest detection</article-title>. <source>Comput. Electron. Agric.</source> <volume>195</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2022.106827</pub-id>
</citation></ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kirk</surname> <given-names>W. D. J.</given-names>
</name>
<name>
<surname>de Kogel</surname> <given-names>W. J.</given-names>
</name>
<name>
<surname>Koschier</surname> <given-names>E. H.</given-names>
</name>
<name>
<surname>Teulon</surname> <given-names>D. A. J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Semiochemicals for thrips and their use in pest management</article-title>. <source>Annu. Rev. Entomology</source> <volume>66</volume>, <fpage>101</fpage>&#x2013;<lpage>119</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1146/annurev-ento-022020-081531</pub-id>, PMID: <pub-id pub-id-type="pmid">33417819</pub-id></citation></ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname> <given-names>S. H.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>S. R.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>S. F.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Identification of tea foliar diseases and pest damage under practical field conditions using a convolutional neural network</article-title>. <source>Plant Pathol.</source> <volume>69</volume>, <fpage>1731</fpage>&#x2013;<lpage>1739</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1111/ppa.13251</pub-id>
</citation></ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>W. Y.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>Z. K.</given-names>
</name>
<name>
<surname>Lv</surname> <given-names>J. W.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>T. F.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>C. H.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Detection of small-sized insects in sticky trapping images using spectral residual model and machine learning</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.915543</pub-id>, PMID: <pub-id pub-id-type="pmid">35837447</pub-id></citation></ref>
<ref id="B23">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>T.-Y.</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Hariharan</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Belongie</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Feature pyramid networks for object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Honolulu, HI, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>.</citation></ref>
<ref id="B24">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>Y. T.</given-names>
</name>
<name>
<surname>Cao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>Y. X.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>b). &#x201c;<article-title>Swin transformer: hierarchical vision transformer using shifted windows</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name>. <publisher-loc>Montreal, QC, Canada</publisher-loc>: <publisher-name>IEEE</publisher-name>. <fpage>9992</fpage>&#x2013;<lpage>10002</lpage>.</citation></ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Sudirman</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>a). <article-title>Deep learning based automatic multiclass wild pest monitoring approach using hybrid global and local activated features</article-title>. <source>IEEE Trans. Ind. Inf.</source> <volume>17</volume>, <fpage>7589</fpage>&#x2013;<lpage>7598</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TII.2020.2995208</pub-id>
</citation></ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>J. Q.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>C. Y.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>Y. J.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>B. C.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>G. H.</given-names>
</name>
<name>
<surname>Xiong</surname> <given-names>Y. L.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>RicePest-DETR: A transformer-based model for accurately identifying small rice pest by end-to-end detection mechanism</article-title>. <source>Comput. Electron. Agric.</source> <volume>235</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2025.110373</pub-id>
</citation></ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Olaniyi</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Generative adversarial networks (GANs) for image augmentation in agriculture: A systematic review</article-title>. <source>Comput. Electron. Agric.</source> <volume>200</volume>, <elocation-id>107208</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2022.107208</pub-id>
</citation></ref>
<ref id="B28">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks</article-title>,&#x201d; in <source>Proceedings of the Advances in Neural Information Processing Systems (NeurIPS)</source>. Eds. <person-group person-group-type="editor">
<name>
<surname>Cortes</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Lawrence</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Sugiyama</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Garnett</surname> <given-names>R.</given-names>
</name>
</person-group>. <publisher-loc>Cambridge, MA, United States</publisher-loc>: <publisher-name>Curran Associates, Inc.</publisher-name>, PMID: <pub-id pub-id-type="pmid">27295650</pub-id></citation></ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sharma</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Kumar</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Longchamps</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Comparative performance of YOLOv8, YOLOv9, YOLOv10, YOLOv11 and Faster R-CNN models for detection of multiple weed species</article-title>. <source>Smart Agric. Technol.</source> <volume>9</volume>, <elocation-id>100648</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.atech.2024.100648</pub-id>
</citation></ref>
<ref id="B30">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Shihua</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhichao</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Xiaodong</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Yongjun</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Xiao</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Xi</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2025</year>). &#x201c;<article-title>DEIM: DETR with improved matching for fast convergence</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>. <publisher-loc>Nashville, TN, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>.</citation></ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Steenbergen</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Abd-el-Haliem</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Bleeker</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Dicke</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Escobar-Bravo</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Cheng</surname> <given-names>G.</given-names>
</name>
<etal/>
</person-group>. (<year>2018</year>). <article-title>Thrips advisor: exploiting thrips-induced defences to combat pests on crops</article-title>. <source>J. Exp. Bot.</source> <volume>69</volume>, <fpage>1837</fpage>&#x2013;<lpage>1848</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/jxb/ery060</pub-id>, PMID: <pub-id pub-id-type="pmid">29490080</pub-id></citation></ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wan</surname> <given-names>D. H.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>R. S.</given-names>
</name>
<name>
<surname>Shen</surname> <given-names>S. Y.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Lang</surname> <given-names>X. L.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>Z. J.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Mixed local channel attention for object detection</article-title>. <source>Eng. Appl. Artif. Intell.</source> <volume>123</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.engappai.2023.106442</pub-id>
</citation></ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>R. J.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>S. F.</given-names>
</name>
<name>
<surname>Jiao</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Du</surname> <given-names>J. M.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>Z. L.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>S. J.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>OSAF-Net: A one-stage anchor-free detector for small-target crop pest detection</article-title>. <source>Appl. Intell.</source> <volume>53</volume>, <fpage>24895</fpage>&#x2013;<lpage>24907</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10489-023-04862-4</pub-id>
</citation></ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Jiao</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Du</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2021</year>b). <article-title>S-RPN: Sampling-balanced region proposal network for small crop pest detection</article-title>. <source>Comput. Electron. Agric.</source> <volume>187</volume>, <elocation-id>106290</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2021.106290</pub-id>
</citation></ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2021</year>a). <article-title>Convolutional neural network based automatic pest monitoring system using hand-held mobile image analysis towards non-site-specific wild environment</article-title>. <source>Comput. Electron. Agric.</source> <volume>187</volume>, <elocation-id>106268</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2021.106268</pub-id>
</citation></ref>
<ref id="B36">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Woo</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Debnath</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>R. H.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>X. L.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Kweon</surname> <given-names>I. S.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). &#x201c;<article-title>ConvNeXt V2: co-designing and scaling ConvNets with masked autoencoders</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Vancouver, BC, Canada</publisher-loc>: <publisher-name>IEEE</publisher-name>. <fpage>16133</fpage>&#x2013;<lpage>16142</lpage>.</citation></ref>
<ref id="B37">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Woo</surname> <given-names>S. H.</given-names>
</name>
<name>
<surname>Park</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>J. Y.</given-names>
</name>
<name>
<surname>Kweon</surname> <given-names>I. S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>CBAM: convolutional block attention module</article-title>,&#x201d; in <conf-name>Proceedings of the European Conference on Computer Vision (ECCV)</conf-name>. <publisher-loc>Switzerland</publisher-loc>: <publisher-name>Springer, Cham</publisher-name>. <fpage>3</fpage>&#x2013;<lpage>19</lpage>.</citation></ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>G. H.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>N. F.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>Y. H.</given-names>
</name>
<name>
<surname>Chan</surname> <given-names>P. C.</given-names>
</name>
<name>
<surname>Bhat</surname> <given-names>S. A.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Thrips detection and monitoring system in mango greenhouses based on deep learning image recognition</article-title>. <source>IEEE Trans. Instrumentation Measurement</source> <volume>73</volume>, <fpage>1</fpage>&#x2013;<lpage>18</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/tim.2024.3403191</pub-id>
</citation></ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>S. Y.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>L. D.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X. R.</given-names>
</name>
<name>
<surname>Xing</surname> <given-names>Z. L.</given-names>
</name>
<name>
<surname>Lei</surname> <given-names>Z. R.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>Y. L.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>A decade of a thrips invasion in China: lessons learned</article-title>. <source>Ecotoxicology</source> <volume>27</volume>, <fpage>1032</fpage>&#x2013;<lpage>1038</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10646-017-1864-6</pub-id>, PMID: <pub-id pub-id-type="pmid">29027089</pub-id></citation></ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Jia</surname> <given-names>Z.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>SRNet-YOLO: A model for detecting tiny and very tiny pests in cotton fields based on super-resolution reconstruction</article-title>. <source>Front. Plant Sci.</source> <volume>15</volume>, <elocation-id>1416940</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2024.1416940</pub-id>, PMID: <pub-id pub-id-type="pmid">39184581</pub-id></citation></ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhan</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>M.</given-names>
</name>
<name>
<surname>She</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>An improved Yolov5 real-time detection method for small objects captured by UAV</article-title>. <source>Soft Computing</source> <volume>26</volume>, <fpage>361</fpage>&#x2013;<lpage>373</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s00500-021-06407-8</pub-id>
</citation></ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>Z.-Q.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Animal biodiversity: an outline of higher-level classification and survey of taxonomic richness (Addenda 2013)</article-title>. <source>Zootaxa</source> <volume>3703</volume>, <fpage>1</fpage>&#x2013;<lpage>82</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.11646/zootaxa.3703.1.1</pub-id>, PMID: <pub-id pub-id-type="pmid">26146682</pub-id></citation></ref>
<ref id="B43">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Chang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2020</year>a). &#x201c;<article-title>Dynamic R-CNN: towards high quality object detection via dynamic training</article-title>,&#x201d; in <source><italic>Computer Vision &#x2013; ECCV 2020</italic></source>. Eds. <person-group person-group-type="editor">
<name>
<surname>Vedaldi</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Bischof</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Brox</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Frahm</surname> <given-names>J.-M.</given-names>
</name>
</person-group> (<publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>260</fpage>&#x2013;<lpage>275</lpage>.</citation></ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Ke</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>K.</given-names>
</name>
<etal/>
</person-group>. (<year>2025</year>). <article-title>Towards accurate field counting of small pests with visual prompts</article-title>. <source>Comput. Electron. Agric.</source> <volume>237</volume>, <elocation-id>110635</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2025.110635</pub-id>
</citation></ref>
<ref id="B45">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Pang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lyu</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>W.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). &#x201c;<article-title>Dense distinct query for end-to-end object detection</article-title>,&#x201d; in <conf-name>IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Vancouver, BC, Canada</publisher-loc>: <publisher-name>IEEE</publisher-name>. <fpage>7329</fpage>&#x2013;<lpage>7338</lpage>.</citation></ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhan</surname> <given-names>W.</given-names>
</name>
<name>
<surname>He</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2020</year>b). <article-title>Application of spatio-temporal context and convolution neural network (CNN) in grooming behavior of Bactrocera minax (Diptera: Trypetidae) detection and statistics</article-title>. <source>Insects</source> <volume>11</volume>, <elocation-id>565</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/insects11090565</pub-id>, PMID: <pub-id pub-id-type="pmid">32846918</pub-id></citation></ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhan</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Peng</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>b). <article-title>Mask-guided dual-perception generative adversarial network for synthesizing complex maize diseased leaves to augment datasets</article-title>. <source>Eng. Appl. Artif. Intell.</source> <volume>136</volume>, <elocation-id>108875</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.engappai.2024.108875</pub-id>
</citation></ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhan</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>He</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>a). <article-title>RPH-Counter: Field detection and counting of rice planthoppers using a fully convolutional network with object-level supervision</article-title>. <source>Comput. Electron. Agric.</source> <volume>225</volume>, <elocation-id>109242</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.109242</pub-id>
</citation></ref>
<ref id="B49">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Su</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Dai</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Deformable DETR: deformable transformers for end-to-end object detection</article-title>,&#x201d; in <conf-name>Proceedings of the International Conference on Learning Representations (ICLR)</conf-name>. <publisher-loc>Austria</publisher-loc>: <publisher-name>OpenReview.net</publisher-name>.
</citation></ref>
</ref-list>
</back>
</article>