<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2024.1393138</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Small target tea bud detection based on improved YOLOv5 in complex background</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Wang</surname>
<given-names>Mengjie</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2352114"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Li</surname>
<given-names>Yang</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Meng</surname>
<given-names>Hewei</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Chen</surname>
<given-names>Zhiwei</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2702654"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Gui</surname>
<given-names>Zhiyong</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Li</surname>
<given-names>Yaping</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Dong</surname>
<given-names>Chunwang</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2325329"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>College of Mechanical and Electrical Engineering, Shihezi University</institution>, <addr-line>Shihezi</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Key Laboratory of Tea Quality and Safety Control, Ministry of Agriculture and Rural Affairs, Tea Research Institute, Chinese Academy of Agricultural Sciences</institution>, <addr-line>Hangzhou</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Tea Research Institute of Shandong Academy of Agricultural Sciences</institution>, <addr-line>Jinan</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Parvathaneni Naga Srinivasu, Prasad V. Potluri Siddhartha Institute of Technology, India</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Kaushik Kumar Panigrahi, Orissa University of Agriculture and Technology, India</p>
<p>Uddagiri Sirisha, Prasad V. Potluri Siddhartha Institute of Technology, India</p>
<p>Shakeel Ahmed, King Faisal University, Saudi Arabia</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Yaping Li, <email xlink:href="mailto:liyaping425@163.com">liyaping425@163.com</email>; Chunwang Dong, <email xlink:href="mailto:dongchunwang@163.com">dongchunwang@163.com</email>
</p>
</fn>
<fn fn-type="equal" id="fn003">
<p>&#x2020;These authors have contributed equally to this work</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>03</day>
<month>06</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>15</volume>
<elocation-id>1393138</elocation-id>
<history>
<date date-type="received">
<day>29</day>
<month>02</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>13</day>
<month>05</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Wang, Li, Meng, Chen, Gui, Li and Dong</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Wang, Li, Meng, Chen, Gui, Li and Dong</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Tea bud detection is the first step in the precise picking of famous teas. Accurate and fast tea bud detection is crucial for achieving intelligent tea bud picking. However, existing detection methods still exhibit limitations in both detection accuracy and speed due to the intricate background of tea buds and their small size. This study uses YOLOv5 as the initial network and utilizes an attention mechanism to obtain more detailed information about tea buds, reducing false detections and missed detections caused by different sizes of tea buds. Spatial Pyramid Pooling Fast (SPPF) is added in front of the head to better utilize the attention module&#x2019;s ability to fuse information. The lightweight convolutional method Group Shuffle Convolution (GSConv) is introduced to ensure model efficiency without compromising accuracy. The Mean-Positional-Distance Intersection over Union (MPDIoU) can effectively accelerate model convergence and reduce the training time of the model. The experimental results demonstrate that our proposed method achieves precision (P), recall rate (R) and mean average precision (mAP) of 93.38%, 89.68%, and 95.73%, respectively. Compared with the baseline network, our proposed model&#x2019;s P, R, and mAP have been improved by 3.26%, 11.43%, and 7.68%, respectively. Meanwhile, comparative analyses with other deep learning methods using the same dataset underscore the efficacy of our approach in terms of P, R, mAP, and model size. This method can accurately detect the tea bud area and provide theoretical research and technical support for subsequent tea picking.</p>
</abstract>
<kwd-group>
<kwd>object detection</kwd>
<kwd>deep information extraction</kwd>
<kwd>lightweight</kwd>
<kwd>MPDIoU</kwd>
<kwd>YOLOv5</kwd>
<kwd>attention mechanism</kwd>
</kwd-group>
<counts>
<fig-count count="10"/>
<table-count count="4"/>
<equation-count count="15"/>
<ref-count count="25"/>
<page-count count="12"/>
<word-count count="5443"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Technical Advances in Plant Science</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>China is a leading tea-producing country, boasting vast tea cultivation areas and high yields. In 2021, China&#x2019;s tea gardens covered a total area of 3,217 thousand hectares, yielding 3.16 million tons of tea (<xref ref-type="bibr" rid="B18">Statistics Bureau of the People&#x2019;s Republic of China, 2021</xref>). Despite the extensive cultivation area and output, the picking method for famous teas remains primarily manual, which is both time-consuming and costly. In recent years, the scarcity of tea-picking laborers and the shortened tea-picking season have posed challenges to harvesting famous teas. Consequently, within the current trend of artificial intelligence, the automation and intelligent picking of high-quality tea are becoming imperative. Mechanized tea picking in China currently relies on indiscriminate picking using reciprocating cutting, suitable only for low-quality teas and unable to meet the requirements for the precise bud-by-bud picking of famous tea varieties. Therefore, as artificial intelligence continues to develop, hand-picking methods will likely be replaced by intelligent picking. The intelligent picking of high-quality tea has emerged as a recent research hotspot. Key to this endeavor is the recognition of tea buds, and achieving accurate and rapid tea bud detection will drive the intelligent picking and industrial development of famous teas, holding significant practical significance. This study will thus focus on the precise and rapid detection of tea buds to contribute to the advancement of intelligent tea picking methodologies.</p>
<p>Currently, in the agricultural field, two primary target detection methods are utilized: traditional image segmentation and deep learning (<xref ref-type="bibr" rid="B10">LeCun et&#xa0;al., 2015</xref>). Traditional image segmentation methods rely on distinguishing targets such as litchi (<xref ref-type="bibr" rid="B23">Yu et&#xa0;al., 2021</xref>), apples (<xref ref-type="bibr" rid="B13">Li et&#xa0;al., 2021</xref>), and passion fruits (<xref ref-type="bibr" rid="B19">Tu et&#xa0;al., 2018</xref>) from complex backgrounds by leveraging image color, texture, and other features, alongside manually crafted segmentation criteria. For tea bud detection, <xref ref-type="bibr" rid="B24">Zhang et&#xa0;al. (2021)</xref> proposed an enhanced watershed algorithm, yielding favorable results in tea bud recognition. <xref ref-type="bibr" rid="B21">Wu et&#xa0;al. (2015)</xref> discovered that within the Lab color space, the K-means clustering method exhibited the highest tea bud recognition rate at a shooting distance of 5cm. In these studies, segmentation targets were delineated based on disparities in color and shape. While commendable results have been attained, the intricate tea plantation backgrounds depicted in images present a significant challenge influencing segmentation recognition. Diverse growth environments, lighting conditions, and shooting angles can substantially impede tea bud recognition. Consequently, traditional image segmentation methods often struggle to achieve robust detection outcomes in real-world tea plantation settings.</p>
<p>In recent years, with the development and popularization of deep learning techniques, advanced detection technology has found applications across various agricultural fields. In the detection of small targets such as tea buds, <xref ref-type="bibr" rid="B15">Qian et&#xa0;al. (2020)</xref> proposed a semantic segmentation network for tea buds based on TS SegNet, and <xref ref-type="bibr" rid="B6">Hu et&#xa0;al. (2021)</xref> proposed a semantic segmentation method based on DP-NET for segmenting and recognizing tea buds in a natural scene, both of which achieved good segmentation results. Although the semantic segmentation method can segment the target significantly, it is complex, slow and difficult to produce a dataset, which is not suitable for large-scale detection models. Therefore, <xref ref-type="bibr" rid="B25">Zhu et&#xa0;al. (2022)</xref> explored the Faster R-CNN model and VGG16 feature extraction network to detect the category of tea buds, which significantly improved the model&#x2019;s detection effect when removing individual buds. <xref ref-type="bibr" rid="B22">Xu et&#xa0;al. (2022)</xref> proposed a variable-domain two-level fusion network detection and classification method, which combined the fast detection capability of YOLOv3 and the high-precision classification capability of DenseNet201 to achieve 95.71% accuracy in detecting side buds. <xref ref-type="bibr" rid="B4">Gui et&#xa0;al. (2023)</xref> proposed a lightweight tea bud detection model based on YOLOv5_l. Using the Ghost_conv module instead of the original convolution, a floating point operation reduction of 52.402G and a parameter reduction of 22.71M were achieved. <xref ref-type="bibr" rid="B14">Li et&#xa0;al. (2023)</xref> proposed a tea bud detection algorithm based on SE-YOLOv5_m. SENet was introduced into the CNN, and the accuracy reached 91.88% by using weights to filter the key features of each convolutional channel.</p>
<p>Ultimately, although numerous scholars have conducted research on tea bud recognition using deep learning, practical application still faces significant challenges such as low detection accuracy, slow processing speed, and high computational costs. These limitations render existing methods unsuitable for deployment on mobile devices, necessitating further research. During the special period of tea bud picking, it is crucial to recognize tea buds quickly and accurately. In this study, we employ YOLOv5 as the foundational network and integrate lightweight and other modules to enhance model accuracy, reduce computational overhead, and enable rapid detection of tea buds amidst complex backgrounds. The specific improvement method is as follows:</p>
<list list-type="numbered">
<list-item>
<p>A Coordinate Attention (CA) mechanism has been integrated after the C3 module in the backbone network to enhance the network&#x2019;s focus on tea buds.</p>
</list-item>
<list-item>
<p>Spatial Pyramid Pooling Fast (SPPF) is applied to the head to deeply extract the semantic information introduced by the enhanced feature extraction network, overcoming the large amount of low-level semantic information in the shallow network that cannot better utilize the information fusion function of the CA_block.</p>
</list-item>
<list-item>
<p>A cross-stage partial network (VoV_GSCSP) is used to replace the C3 module in the neck network, ensuring that the model is lightweight without affecting accuracy.</p>
</list-item>
<list-item>
<p>The GIoU in the initial network is replaced with the high-precision bounding-box regression loss function Mean-Positional-Distance Intersection over Union (MPDIoU), thereby accelerating model convergence and reducing model training time.</p>
</list-item>
</list>
<p>In the remainder of the paper, the second section outlines the details of image acquisition, data enhancement, and the overall network structure. The third section presents the test results, while the fourth section concludes the paper.</p>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Image acquisition and preprocessing</title>
<p>The tea bud images utilized in this study were obtained from the Shengzhou Tea Base of the Tea Research Institute at the Chinese Academy of Agricultural Sciences (120.825542E, 29.748715N). White tea variety was selected, and the images were captured in March 2022 using Huawei Mate40 and Xiaomi10 smartphones. During the capture, the phones were positioned approximately 0.4 meters away from the tea trees, resulting in a total of 730 images of white tea buds. The images were taken under various conditions, including strong light after rain, cloudy days after rain, and sunny days. The dataset was annotated using Labelimg software. To evaluate the model&#x2019;s training effectiveness, 73 samples were chosen from the original 730 unprocessed images as the test set. To enhance the generalization capability of the target detection model with limited data, the remaining 657 images underwent augmentation techniques such as mirroring, brightness adjustments, rotation within a range of &#xb1;45&#xb0;, and the addition of Gaussian noise. This augmentation resulted in a total of 1314 tea bud images, which were split into training and validation sets at a ratio of 9:1. Specifically, the training set contained 1182 images, while the validation set contained 132 images. <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref> illustrates an example of the initial dataset and the data augmentation process.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Initial dataset and data augmentation examples.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1393138-g001.tif"/>
</fig>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>YOLOv5 algorithm</title>
<p>In recent years, the YOLO series has undergone several iterations, with the release of YOLOv5 (<xref ref-type="bibr" rid="B3">Glenn, 2020</xref>) in 2020 marking significant advancements in both small target detection accuracy and speed. This model excels in extracting deeper semantic information, rendering it highly adaptable to evolving working scenarios (<xref ref-type="bibr" rid="B2">Gao et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B1">Da Costa et&#xa0;al., 2020</xref>), resulting in improved recognition precision and robustness (<xref ref-type="bibr" rid="B9">Khosravi et&#xa0;al., 2021</xref>). Compared to other YOLO series models, YOLOv5 emerges as the optimal choice for real-time detection of tea gardens in unstructured environments due to its simple network architecture, smaller model size, efficient deployment and operation, and the potential for further speed enhancement through lightweight module integration. In the context of picking famous tea, where detection efficiency directly impacts picking efficiency, selecting YOLOv5 with its superior detection efficiency serves as the initial network for tea bud detection in this study.</p>
<p>The size of the YOLOv5 network can be adjusted by modifying its width and depth, resulting in four different versions: YOLOv5_s, YOLOv5_m, YOLOv5_l and YOLOv5_x, with 7.30 &#xd7; 10<sup>6</sup>, 2.14 &#xd7; 10<sup>7</sup>, 4.71 &#xd7; 10<sup>7</sup> and 8.78 &#xd7; 10<sup>7</sup> parameters, respectively. Generally, fewer parameters lead to faster computation but lower precision. Selecting the appropriate variant is key to fully utilizing the power of the YOLOv5 detector. In this study, these four models were trained using the initial dataset without data augmentation, and the results are shown in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>. YOLOv5_m exhibits the highest detection precision, while YOLOv5_x has the highest recall and average precision. Considering both accuracy and effectiveness in tea bud detection, YOLOv5_m was chosen as the initial network for tea detection, and enhancements were made to its structure for improved performance.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Test results of the model.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">P</th>
<th valign="middle" align="center">R</th>
<th valign="middle" align="center">mAP@0.5(%)</th>
<th valign="middle" align="center">FLOPs(G)</th>
<th valign="middle" align="center">Model size(MB)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">YOLOv5_s</td>
<td valign="middle" align="center">80.61</td>
<td valign="middle" align="center">78.61</td>
<td valign="middle" align="center">85.23</td>
<td valign="middle" align="center">
<bold>16.477</bold>
</td>
<td valign="middle" align="center">
<bold>26.95</bold>
</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5_m</td>
<td valign="middle" align="center">
<bold>85.20</bold>
</td>
<td valign="middle" align="center">78.95</td>
<td valign="middle" align="center">87.78</td>
<td valign="middle" align="center">50.598</td>
<td valign="middle" align="center">80.32</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5_l</td>
<td valign="middle" align="center">83.39</td>
<td valign="middle" align="center">81.66</td>
<td valign="middle" align="center">88.58</td>
<td valign="middle" align="center">114.559</td>
<td valign="middle" align="center">177.88</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5_x</td>
<td valign="middle" align="center">82.20</td>
<td valign="middle" align="center">
<bold>84.69</bold>
</td>
<td valign="middle" align="center">
<bold>89.09</bold>
</td>
<td valign="middle" align="center">217.795</td>
<td valign="middle" align="center">332.81</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values are the optimal values in the comparison of the four models.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>The improved YOLOv5 network structure is shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2C</bold>
</xref>. In the backbone network, the CA attention mechanism is integrated after all the C3 modules to enhance the model&#x2019;s focus on target regions and improve its attention towards specific features and contextual details. Additionally, the SPP module is replaced with the SPPF module, enabling the model to effectively capture target information across various scales, thus expanding its perception range and enhancing target detection performance. The backbone module before and after the improvement is shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2A</bold>
</xref>. In the neck, all the C3 modules are replaced with VoV_GSCSP modules, which aims to fully combine the CA attention mechanism in the backbone with the degree of attention to the target, so that the model can better understand the global information and local details of the target in the image, and ensure the accuracy of the model while making the model lightweight. The neck module before and after the improvement is shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2B</bold>
</xref>. In the head, to address the challenge of utilizing deep semantic information from the VoV_GSCSP module in the neck layer, an SPPF module is introduced before the head. This module extracts deep semantic features from the neck, further refining the model&#x2019;s accuracy and performance.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>
<bold>(A)</bold> The backbone network structure before and after improvement, <bold>(B)</bold> the neck network structure before and after improvement, and <bold>(C)</bold> structure of the improved YOLOv5_m model.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1393138-g002.tif"/>
</fig>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Attention mechanism</title>
<p>To better focus on the overall tea bud, this study introduced the CA_block (<xref ref-type="bibr" rid="B5">Hou et&#xa0;al., 2021</xref>) into the C3 module to better extract the deep features of tea buds. Location information is crucial for capturing target structures in visual detection. CA_block is a method of enhancing the interaction and correlation between different channels, which can not only be easily inserted into the core module of a lightweight network, but also capture channel and position information of images, strengthen regions of interest, reduce redundant information, and improve the expression ability of the model to achieve overall attention to tea buds. Its structure is shown in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>. It overcomes the problem of the Squeeze-and-Excitation networks (<xref ref-type="bibr" rid="B7">Hu et&#xa0;al., 2018</xref>) module only focusing on channel confidence while ignoring spatial information.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Coordinate attention structure diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1393138-g003.tif"/>
</fig>
<p>CA_block performs global average pooling of feature maps with input size C * H * W from both X and Y directions by obtaining channel and position information, to obtain remote spatial interaction of position information. The mathematical expressions for the feature maps in both directions are given in <xref ref-type="disp-formula" rid="eq1">
<bold>Equations 1</bold>
</xref>, <xref ref-type="disp-formula" rid="eq2">
<bold>2</bold>
</xref>.</p>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:msubsup>
<mml:mi>Z</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mi>W</mml:mi>
</mml:mfrac>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&lt;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:msubsup>
<mml:mi>Z</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>w</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>w</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mi>H</mml:mi>
</mml:mfrac>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>&lt;</mml:mo>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Here, the variable <inline-formula>
<mml:math display="inline" id="im1">
<mml:mi>c</mml:mi>
</mml:math>
</inline-formula> refers to the channel, and <inline-formula>
<mml:math display="inline" id="im2">
<mml:mi>H</mml:mi>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im3">
<mml:mi>W</mml:mi>
</mml:math>
</inline-formula> represent the height and width of the input feature map. Specifically, <inline-formula>
<mml:math display="inline" id="im4">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> represents the input x in channel <italic>c</italic>.</p>
<p>Next, the obtained feature maps are concatenated, and a 1 &#xd7; 1 convolution module is used to reduce the dimension by 1/r (r is the reduction rate); after batch normalization and activation function processing, <inline-formula>
<mml:math display="inline" id="im5">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is obtained. <inline-formula>
<mml:math display="inline" id="im6">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is then activated by the sigmoid function to obtain an intermediate feature map <italic>f</italic> that encodes spatial information in both horizontal and vertical directions. The intermediate feature map <italic>f</italic> can be described as <xref ref-type="disp-formula" rid="eq3">
<bold>Equation 3</bold>
</xref>.</p>
<disp-formula id="eq3">
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>Z</mml:mi>
<mml:mi>h</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>Z</mml:mi>
<mml:mi>w</mml:mi>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Where <inline-formula>
<mml:math display="inline" id="im7">
<mml:mi>&#x3b4;</mml:mi>
</mml:math>
</inline-formula> denotes the nonlinear activation function.</p>
<p>Subsequently, along the spatial dimension, perform a split operation on <italic>f</italic> to obtain <italic>f<sup>h</sup>
</italic> and <italic>f<sup>w</sup>
</italic>; 1 &#xd7; 1 convolutions are then used to raise the dimensionality, and the feature maps <inline-formula>
<mml:math display="inline" id="im8">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>w</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im9">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>h</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are obtained. Then, the attention weights <italic>g<sup>w</sup>
</italic> and <italic>g<sup>h</sup>
</italic> of the feature maps in the width and height directions are obtained through the sigmoid function. They can be mathematically defined as <xref ref-type="disp-formula" rid="eq4">
<bold>Equations 4</bold>
</xref>, <xref ref-type="disp-formula" rid="eq5">
<bold>5</bold>
</xref>.</p>
<disp-formula id="eq4">
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:msup>
<mml:mi>g</mml:mi>
<mml:mi>h</mml:mi>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>h</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>f</mml:mi>
<mml:mi>h</mml:mi>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq5">
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:msup>
<mml:mi>g</mml:mi>
<mml:mi>w</mml:mi>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>w</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>f</mml:mi>
<mml:mi>w</mml:mi>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Where <italic>&#x3c3;</italic> denotes the sigmoid function.</p>
<p>Finally, the output feature map is obtained by multiplying the input feature map by the attention weights in both the h and w directions. The calculation method is shown in <xref ref-type="disp-formula" rid="eq6">
<bold>Equation 6</bold>
</xref>.</p>
<disp-formula id="eq6">
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>w</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Cross-stage partial network</title>
<p>To improve the detection efficiency of the model and reduce the number of model parameters, this study introduces a lightweight convolution method, Group Shuffle Convolution (GSConv) (<xref ref-type="bibr" rid="B12">Li et&#xa0;al., 2022</xref>), which can decrease the model complexity while maintaining essentially unchanged accuracy. Current lightweight models typically reduce the number of parameters and FLOPs through Depth Separated Convolution (DSC). However, the channel information of DSC input images is separated during the calculation process. When DSC is used alone, it can reduce the feature extraction and fusion capabilities. The convolutional structure of DSC and Standard Convolution (SC) is shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4A</bold>
</xref>. GSConv is a convolutional method that combines SC and DSC through shuffle, which can mix the information generated by SC into DSC, overcoming the problem of lower feature extraction and fusion capability compared to SC when using DSC alone.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Structural Diagram of <bold>(A)</bold> Standard Convolution (SC) and Depth Separated Convolution (DSC), <bold>(B)</bold> Group Shuffle Convolution (GSConv), <bold>(C)</bold> Group Shuffle bottleneck (GSbottleneck) and <bold>(D)</bold> cross-stage partial network (VoV_GSCSP).</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1393138-g004.tif"/>
</fig>
<p>In addition, based on the study of enhancing network learning capabilities such as DenseNet (<xref ref-type="bibr" rid="B8">Huang et&#xa0;al., 2017</xref>), VoVNet (<xref ref-type="bibr" rid="B11">Lee et&#xa0;al., 2019</xref>), and CSPNet (<xref ref-type="bibr" rid="B20">Wang et&#xa0;al., 2020</xref>), GSConv was introduced into the bottleneck to form the Group Shuffle bottleneck (GSbottleneck) module. Finally, a one-time aggregation method was used to apply GSbottleneck to the C3 module, forming a cross-stage partial network VoV_GSCSP, which achieved a reduction in model computation without affecting accuracy. When GSConv is applied in a backbone network, it will increase the depth of the network, increasing the computational complexity of each layer, and thus increasing the computational complexity of the model. After passing through convolutional and pooling layers in the feature map of the neck section, the width and height of the feature map are reduced, while the number of channels increases. When the feature map is transmitted through multiple layers of the network, the feature map of the neck section becomes slender, which can be better transformed into more expressive features. Therefore, this study only applies GSConv to the neck section. <xref ref-type="fig" rid="f4">
<bold>Figures&#xa0;4B&#x2013;D</bold>
</xref> illustrate the structural diagrams of GSConv, GSbottleneck, and VoV_GSCSP, respectively.</p>
</sec>
<sec id="s2_5">
<label>2.5</label>
<title>SPP and SPPF</title>
<p>The Spatial Pyramid Pooling (SPP) layer is instrumental in capturing multi-scale features of the target by focusing on spatial information. Typically integrated into the last layer of a convolutional neural network, the SPP layer divides the input feature map into grids of varying sizes, extracting feature vectors from each grid. This process involves three parallel max-pooling operations and an input branch, followed by concatenating the resulting features to obtain multi-scale representations, which helps the network to better capture the feature information of the target at different scales, and improves the accuracy and robustness of the network.</p>
<p>SPPF is an optimization of the structure based on SPP, which introduces more pooling layers to improve the performance of the model while including the feature extraction and fusion techniques of the SPP module. Notably, SPPF replaces the parallel branches of SPP with serial connections, sequentially outputting feature vectors layer by layer. This modification includes adding downward output branches to the first and second layers of the three maximally pooled serial connections, thereby increasing the module&#x2019;s pooling depth. The structure of SPP and SPPF is shown in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>. In this study, SPPF is mainly used to handle the excessively large variation in object scale between the local and global feature inputs of tea buds. Moreover, SPPF achieves a better balance between performance and speed, which makes it well suited to tasks such as tea bud detection that require both detection accuracy and detection speed. Therefore, this study replaces the SPP in the backbone network with SPPF to enhance the multi-scale feature extraction of the network. Meanwhile, SPPF is applied to the header to deeply extract the deep semantic information introduced by the enhanced feature extraction network, which overcomes the problem that a large amount of low-level semantic information in the shallow network cannot be better fused with the deep semantic information in CA_block.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Structure of SPP and SPPF.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1393138-g005.tif"/>
</fig>
</sec>
<sec id="s2_6">
<label>2.6</label>
<title>Minimum point distance intersection over union</title>
<p>To better locate tea buds, this study introduces a new metric, the high-precision bounding box regression (BBR) loss function MPDIoU (<xref ref-type="bibr" rid="B17">Siliang and Yong, 2023</xref>), which uses the minimum point distance-based intersection over union to measure the similarity between the predicted box and the real box, as shown in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>. MPDIoU is an optimization based on Intersection over Union (IoU), which optimizes the calculation method of the overlapping area between the predicted box and the ground truth. It is used to solve the problem of GIoU failure in the initial network when the predicted box and the ground truth overlap highly. IoU is used to calculate the ratio of the intersection and union of predicted boxes and ground truth boxes, and the calculation formula can be described as <xref ref-type="disp-formula" rid="eq7">
<bold>Equation 7</bold>
</xref>.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Calculation loss factors for MPDIoU.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1393138-g006.tif"/>
</fig>
<disp-formula id="eq7">
<label>(7)</label>
<mml:math display="block" id="M7">
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>&#x2229;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>&#x222a;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>A represents the ground truth box, B represents the predicted box, A&#x2229;B represents the area of the intersection of the two boxes, and A&#x222a;B represents the area of their union.</p>
<p>After obtaining the upper left corner coordinates (x<sub>1</sub>
<sup>prd</sup>,y<sub>1</sub>
<sup>prd</sup>), lower right corner coordinates (x<sub>2</sub>
<sup>prd</sup>,y<sub>2</sub>
<sup>prd</sup>) of the prediction box, the upper left corner coordinates (x<sub>1</sub>
<sup>gt</sup>,y<sub>1</sub>
<sup>gt</sup>), lower right corner coordinates (x<sub>2</sub>
<sup>gt</sup>,y<sub>2</sub>
<sup>gt</sup>) of the real box, and the width (w) and height (h) of the corresponding feature map, the MPDIoU is calculated as in <xref ref-type="disp-formula" rid="eq8">
<bold>Equations 8</bold>
</xref>&#x2013;<xref ref-type="disp-formula" rid="eq11">
<bold>11</bold>
</xref>.</p>
<disp-formula id="eq8">
<label>(8)</label>
<mml:math display="block" id="M8">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mi>d</mml:mi>
<mml:mn>1</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mi>d</mml:mi>
<mml:mn>3</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mi>d</mml:mi>
<mml:mn>2</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mi>d</mml:mi>
<mml:mn>3</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq9">
<label>(9)</label>
<mml:math display="block" id="M9">
<mml:mrow>
<mml:msubsup>
<mml:mi>d</mml:mi>
<mml:mn>1</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq10">
<label>(10)</label>
<mml:math display="block" id="M10">
<mml:mrow>
<mml:msubsup>
<mml:mi>d</mml:mi>
<mml:mn>2</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq11">
<label>(11)</label>
<mml:math display="block" id="M11">
<mml:mrow>
<mml:msubsup>
<mml:mi>d</mml:mi>
<mml:mn>3</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mi>w</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mi>h</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Among them, d<sub>1</sub>
<sup>2</sup>, d<sub>2</sub>
<sup>2</sup>, and d<sub>3</sub>
<sup>2</sup> represent the square of the distance between the upper left corner coordinates of the prediction box and the ground truth box, the square of the distance between the lower right corner coordinates of the prediction box and the ground truth box, and the sum of the squares of the width (w) and height (h) of the corresponding feature map, respectively. The introduction of d<sub>1</sub>
<sup>2</sup>, d<sub>2</sub>
<sup>2</sup>, and d<sub>3</sub>
<sup>2</sup> is aimed at amplifying the differences between the bounding boxes on both sides, in order to better reflect the positional differences between the two boxes when calculating the similarity between the ground truth box and the predicted box.</p>
<p>Finally, the loss function can be expressed as in <xref ref-type="disp-formula" rid="eq12">
<bold>Equation 12</bold>
</xref>.</p>
<disp-formula id="eq12">
<label>(12)</label>
<mml:math display="block" id="M12">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>M</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
<sec id="s2_7">
<label>2.7</label>
<title>Training environment</title>
<p>The deep learning framework used in this study is PyTorch. The experiments were conducted on a Windows 10 machine with an Intel (R) Core (TM) i7-10700 CPU with a clock speed of 2.90 GHz, 32.0 GB of RAM, and an NVIDIA GeForce RTX3090 24 G GPU. The hyperparameters for model training are shown in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Training model hyperparameters.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Method</th>
<th valign="middle" align="center">Batch size</th>
<th valign="middle" align="center">Learning rate</th>
<th valign="middle" align="center">Epochs</th>
<th valign="middle" align="center">Optimizer</th>
<th valign="middle" align="center">Momentum</th>
<th valign="middle" align="center">Weight decay</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">Faster-RCNN</td>
<td valign="middle" align="center">10</td>
<td valign="middle" align="center">1e-3</td>
<td valign="middle" align="center">200</td>
<td valign="middle" align="center">Adam</td>
<td valign="middle" align="center">0.9</td>
<td valign="middle" align="center">0.0005</td>
</tr>
<tr>
<td valign="middle" align="center">SSD</td>
<td valign="middle" align="center">10</td>
<td valign="middle" align="center">1e-3</td>
<td valign="middle" align="center">200</td>
<td valign="middle" align="center">Adam</td>
<td valign="middle" align="center">0.9</td>
<td valign="middle" align="center">0.0005</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv3</td>
<td valign="middle" align="center">10</td>
<td valign="middle" align="center">1e-3</td>
<td valign="middle" align="center">200</td>
<td valign="middle" align="center">Adam</td>
<td valign="middle" align="center">0.9</td>
<td valign="middle" align="center">0.0005</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv4</td>
<td valign="middle" align="center">10</td>
<td valign="middle" align="center">1e-3</td>
<td valign="middle" align="center">200</td>
<td valign="middle" align="center">Adam</td>
<td valign="middle" align="center">0.9</td>
<td valign="middle" align="center">0.0005</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5_m</td>
<td valign="middle" align="center">10</td>
<td valign="middle" align="center">1e-3</td>
<td valign="middle" align="center">200</td>
<td valign="middle" align="center">Adam</td>
<td valign="middle" align="center">0.9</td>
<td valign="middle" align="center">0.0005</td>
</tr>
<tr>
<td valign="middle" align="center">Ours</td>
<td valign="middle" align="center">10</td>
<td valign="middle" align="center">1e-3</td>
<td valign="middle" align="center">200</td>
<td valign="middle" align="center">Adam</td>
<td valign="middle" align="center">0.9</td>
<td valign="middle" align="center">0.0005</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Results and Discussion</title>
<sec id="s3_1">
<label>3.1</label>
<title>Evaluating indicator</title>
<p>The detection model in this study was evaluated using precision (P), recall rate (R), and mean average precision (mAP), where P represents the proportion of accurate predictions in all predicted examples and R represents the proportion of accurate predictions in all true examples. The mAP is a comprehensive accuracy indicator for evaluating the detection model. P, R, and mAP are calculated as in <xref ref-type="disp-formula" rid="eq13">
<bold>Equations 13</bold>
</xref>&#x2013;<xref ref-type="disp-formula" rid="eq15">
<bold>15</bold>
</xref>.</p>
<disp-formula id="eq13">
<label>(13)</label>
<mml:math display="block" id="M13">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq14">
<label>(14)</label>
<mml:math display="block" id="M14">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq15">
<label>(15)</label>
<mml:math display="block" id="M15">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mi>N</mml:mi>
</mml:msubsup>
<mml:msubsup>
<mml:mo>&#x222b;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>R</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>TP: Number of positive samples predicted as positive samples</p>
<p>FP: Number of negative samples predicted as positive samples</p>
<p>FN: Number of positive samples predicted as negative samples</p>
<p>N: Indicates the number of types of buds detected (only one type of tea buds is studied in this paper, so N equals 1)</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Test comparison before and after model improvement</title>
<p>In this paper, the benchmark and improved networks are compared based on the data augmentation set. The test results are presented in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>. The results indicate that, compared to the initial network, the improved network yielded an increase of 3.26%, 11.43%, and 7.68% for P, R, and mAP, respectively. In addition, the number of parameters decreased by 0.319 M, FLOPs decreased by 1.343 G, and model size decreased by 1.21 MB. Therefore, it is evident from the comparison before and after model improvement that the improved model can achieve higher accuracy while reducing the complexity of model calculations. Meanwhile, we utilize the confusion matrix to assess the model&#x2019;s prediction accuracy for tea buds and backgrounds, facilitating a comprehensive evaluation of its performance. <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref> shows the number of correct and incorrect predictions of tea buds on the test set by the model before and after the improvement. The results reveal that the enhanced model increases the number of correctly predicted tea buds by 123 and reduces the misclassification of backgrounds as tea buds by 26, thereby effectively demonstrating the improved detection performance of the model.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Training results of different target detection methods.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">P(%)</th>
<th valign="middle" align="center">R(%)</th>
<th valign="middle" align="center">mAP@0.5(%)</th>
<th valign="middle" align="center">Params(M)</th>
<th valign="middle" align="center">FLOPs(G)</th>
<th valign="middle" align="center">Model size(MB)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">Faster-RCNN</td>
<td valign="middle" align="center">56.02</td>
<td valign="middle" align="center">88.95</td>
<td valign="middle" align="center">81.73</td>
<td valign="middle" align="center">136.689</td>
<td valign="middle" align="center">369.719</td>
<td valign="middle" align="center">521.43</td>
</tr>
<tr>
<td valign="middle" align="center">SSD</td>
<td valign="middle" align="center">82.80</td>
<td valign="middle" align="center">45.79</td>
<td valign="middle" align="center">69.44</td>
<td valign="middle" align="center">23.612</td>
<td valign="middle" align="center">60.756</td>
<td valign="middle" align="center">90.07</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv3</td>
<td valign="middle" align="center">85.82</td>
<td valign="middle" align="center">66.36</td>
<td valign="middle" align="center">82.28</td>
<td valign="middle" align="center">61.524</td>
<td valign="middle" align="center">65.597</td>
<td valign="middle" align="center">234.69</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv4</td>
<td valign="middle" align="center">87.67</td>
<td valign="middle" align="center">61.00</td>
<td valign="middle" align="center">81.21</td>
<td valign="middle" align="center">63.938</td>
<td valign="middle" align="center">59.953</td>
<td valign="middle" align="center">243.90</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5_m</td>
<td valign="middle" align="center">90.12</td>
<td valign="middle" align="center">78.25</td>
<td valign="middle" align="center">88.05</td>
<td valign="middle" align="center">21.056</td>
<td valign="middle" align="center">50.598</td>
<td valign="middle" align="center">80.32</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>Ours</bold>
</td>
<td valign="middle" align="center">
<bold>93.38</bold>
</td>
<td valign="middle" align="center">
<bold>89.68</bold>
</td>
<td valign="middle" align="center">
<bold>95.73</bold>
</td>
<td valign="middle" align="center">
<bold>20.737</bold>
</td>
<td valign="middle" align="center">
<bold>49.255</bold>
</td>
<td valign="middle" align="center">
<bold>79.11</bold>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>The confusion matrix of the model before and after improvement.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1393138-g007.tif"/>
</fig>
<p>Additionally, to statistically assess the performance disparity before and after model improvement, this study conducted an analysis of P, R, and mAP using the analysis of variance (ANOVA) chi-square test. In this test, with the significance level set at 0.05, a probability value lower than this threshold signifies a notable difference in model performance before and after improvement. The probability values of P, R, and mAP before and after model enhancement in this study are 0.000567, 0.002694, and 0.000264, respectively. These values are all below 0.05, indicating a significant difference in model performance before and after the improvement, further confirming that the enhanced model achieves higher detection accuracy.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Grad-CAM visualization</title>
<p>The Grad-CAM (<xref ref-type="bibr" rid="B16">Selvaraju et&#xa0;al., 2017</xref>) heat map is used to visualize the regions of the image that the model attends to when extracting the target features. By highlighting these regions in red, Grad-CAM intuitively shows the regions of interest on the tea bud image. In this study, Grad-CAM is used to examine the degree of attention paid to tea bud features before and after model improvement. As indicated by the red circles in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>, the Grad-CAM red regions of the improved network contract and expand so that the model&#x2019;s attention is more focused on the tea buds. This weakens the initial network&#x2019;s focus on background information, and the improved network also learns the characteristics of the buds that the initial network cannot pay attention to, thus proving that the improved network is more effective in paying attention to tea buds. This also verifies that the improved model improves the precision of the network.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Grad-CAM visualization of the initial network and the improved network.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1393138-g008.tif"/>
</fig>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Ablation test</title>
<p>The results of the ablation test are presented in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>, and the P, R, and mAP of the improved network are significantly improved, while the size of the model is also reduced. Through analysis of the data, the result showed that incorporating the CA_block, SPPF, and VoV_GSCSP module into the network significantly improves the P, R, and mAP of the detection network while reducing the model size. This improvement can be attributed to the integration of depth feature information from the VoV_GSCSP module, CA_block, and SPPF module. This integration fully leverages the overall attention of the network to tea buds, improving the detection accuracy of tea buds. Furthermore, the lightweight convolution method of GSConv in the VoV_GSCSP effectively reduces the size of the model.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Results of the ablation test.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">Data enhancement</th>
<th valign="middle" align="center">CA</th>
<th valign="middle" align="center">SPPF</th>
<th valign="middle" align="center">VoV_GSCSP</th>
<th valign="middle" align="center">MPDIoU</th>
<th valign="middle" align="center">P(%)</th>
<th valign="middle" align="center">R(%)</th>
<th valign="middle" align="center">mAP(%)</th>
<th valign="middle" align="center">Model size (MB)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">YOLOv5_m</td>
<td valign="middle" align="center">
</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">
</td>
<td valign="top" align="center"/>
<td valign="middle" align="center">85.20</td>
<td valign="middle" align="center">78.95</td>
<td valign="middle" align="center">87.78</td>
<td valign="middle" align="center">80.32</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5_m</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="top" align="center"/>
<td valign="middle" align="center">90.12</td>
<td valign="middle" align="center">78.25</td>
<td valign="middle" align="center">88.05</td>
<td valign="middle" align="center">80.32</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5_m</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="top" align="center"/>
<td valign="middle" align="center">91.67</td>
<td valign="middle" align="center">79.52</td>
<td valign="middle" align="center">89.69</td>
<td valign="middle" align="center">80.88</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5_m</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center"/>
<td valign="top" align="center"/>
<td valign="middle" align="center">90.89</td>
<td valign="middle" align="center">79.69</td>
<td valign="middle" align="center">89.74</td>
<td valign="middle" align="center">88.28</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5_m</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="top" align="center"/>
<td valign="middle" align="center">
<bold>95.52</bold>
</td>
<td valign="middle" align="center">86.92</td>
<td valign="middle" align="center">92.86</td>
<td valign="middle" align="center">79.11</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5_m</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="top" align="center">&#x221a;</td>
<td valign="middle" align="center">93.38</td>
<td valign="middle" align="center">
<bold>89.68</bold>
</td>
<td valign="middle" align="center">
<bold>95.73</bold>
</td>
<td valign="middle" align="center">
<bold>79.11</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The symbol "&#x221a;" indicates that the corresponding module is added to the model.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>From <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>, a notable contrast emerges between the training loss and validation loss of GIoU, suggesting inadequate learning of tea bud characteristics and resulting in suboptimal performance on the validation set. Conversely, the disparity between the training loss and validation loss of MPDIoU is comparatively minimal, indicating effective learning of tea bud features and yielding satisfactory detection outcomes. Notably, GIoU achieves convergence after approximately 210 epochs of training, while MPDIoU reaches convergence around 190 epochs, showcasing the efficacy of MPDIoU in accelerating model convergence and reducing training duration.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Training and validation loss functions.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1393138-g009.tif"/>
</fig>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>Comparison of different models</title>
<p>This study compares the improved YOLOv5_m model with YOLOv3, YOLOv4, SSD, and Faster R-CNN under data-enhanced conditions. The test results are presented in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>.</p>
<p>The results demonstrated that the method proposed in this study outperformed other models in terms of P and mAP. <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10A</bold>
</xref> illustrates the detection results of P, R, and mAP for all models. Additionally, the proposed method exhibits lower computational complexity in terms of floating-point operations, number of parameters, and model size. Faster R-CNN, as a two-stage detector, tends to produce a high number of false detections when recognizing small targets like tea buds, leading to lower accuracy. Conversely, the SSD model may experience many missed detections during the detection process, resulting in a lower R-value. Although the YOLO series also encounters some missed detections, its performance is comparatively better than SSD and Faster R-CNN. YOLOv5, which is an improvement of YOLOv3 and YOLOv4, is particularly well-suited for detecting small targets, exhibiting higher detection accuracy and faster speed compared to YOLOv3 and YOLOv4. <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10B</bold>
</xref> displays the detection results of Faster R-CNN, SSD, YOLOv3, YOLOv4, YOLOv5, and the proposed method under varying environmental conditions.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Detection results: <bold>(A)</bold> precision, recall, and average precision curves, and <bold>(B)</bold> examples of test results of the proposed method and the compared models.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1393138-g010.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10B</bold>
</xref> shows that the tea buds are densely packed and numerous in a single picture, which increases the difficulty of detection. The method proposed in this study achieves higher detection precision than the other models. Although some buds are still missed, its results are superior to those of the other models, making it suitable for tea bud detection in real tea plantations.</p>
</sec>
</sec>
<sec id="s4" sec-type="conclusions">
<label>4</label>
<title>Conclusion</title>
<p>Because tea buds are densely distributed and the background environment is complex, existing detection methods struggle to obtain accurate results. To address this issue, we propose an improved YOLOv5_m tea bud detection method to enhance the accuracy and robustness of the detection algorithm. The optimization methods and detection results of this study are as follows:</p>
<list list-type="simple">
<list-item>
<p>(1) The fusion of deep feature information from VoV_GSCSP, CA_block, and SPPF modules enhanced the overall attention of the network to tea buds and improved the detection accuracy of tea buds.</p>
</list-item>
<list-item>
<p>(2) Introducing MPDIoU instead of GIoU has achieved rapid convergence of the model and reduced the training time of the model.</p>
</list-item>
<list-item>
<p>(3) The improved YOLOv5_m model achieved a P of 93.38%, an R of 89.68%, and an mAP of 95.73% while maintaining or slightly reducing its size. These results demonstrate the model&#x2019;s effectiveness in detecting tea buds. Additionally, the model has only 20.737 M parameters, requires 49.255 G floating-point operations, and has a size of 79.11 MB, all of which are superior to the other deep learning methods compared.</p>
</list-item>
</list>
<p>The experimental results show that the improved YOLOv5 model has excellent detection performance, which provides technical and theoretical support for automatic picking of high-quality tea leaves. However, maintaining detection accuracy under complex lighting, color interference, and occlusion by old leaves remains a major challenge. Future research will be devoted to collecting tea bud datasets under different scenarios, optimizing the model structure, and improving the ability to detect tea buds under typical unstructured environments in tea gardens. In addition, this study is limited to single-target detection and does not address tea bud categorization (such as single bud, one-leaf-one-bud, or two-leaf-one-bud) or differentiation between different tea varieties. Future work will focus on classifying tea bud categories while further improving the accuracy of tea bud detection, thus enhancing the practical applicability of the model.</p>
</sec>
<sec id="s5" sec-type="data-availability">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s6" sec-type="author-contributions">
<title>Author contributions</title>
<p>MJW: Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. YL: Conceptualization, Funding acquisition, Methodology, Project administration, Resources, Writing &#x2013; review &amp; editing. HM: Data curation, Methodology, Writing &#x2013; review &amp; editing. ZC: Data curation, Methodology, Software, Writing &#x2013; review &amp; editing. ZG: Conceptualization, Methodology, Software, Writing &#x2013; review &amp; editing. CD: Writing &#x2013; review &amp; editing. YPL: Writing &#x2013; review &amp; editing.</p>
</sec>
</body>
<back>
<sec id="s7" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This work was supported by the Zhejiang Provincial Natural Science Foundation of China under Grant No. LTGN23C130004, the Innovative Program of the Chinese Academy of Agricultural Sciences (Y2024QC24), the Central Public-Interest Scientific Institution Basal Research Fund (1610212021004), and the Key R&amp;D Program of Zhejiang (2022C02052, 2023C02009).</p>
</sec>
<ack>
<title>Acknowledgments</title>
<p>The authors would like to thank the reviewers and members of the editorial team for their comments and contributions.</p>
</ack>
<sec id="s8" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s9" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Da Costa</surname> <given-names>A. Z.</given-names>
</name>
<name>
<surname>Figueroa</surname> <given-names>H. E.</given-names>
</name>
<name>
<surname>Fracarolli</surname> <given-names>J. A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Computer vision based detection of external defects on tomatoes using deep learning</article-title>. <source>Biosyst. Eng.</source> <volume>190</volume>, <fpage>131</fpage>&#x2013;<lpage>144</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2019.12.003</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gao</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Yan</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Lv</surname> <given-names>X.</given-names>
</name>
<name>
<surname>He</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Application of near-infrared hyperspectral imaging with machine learning methods to identify geographical origins of dry narrow-leaved oleaster (Elaeagnus angustifolia) fruits</article-title>. <source>Foods</source> <volume>8</volume>, <fpage>620</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/foods8120620</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Glenn</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <source>yolov5. Git code</source>. Available at: <uri xlink:href="https://github.com/ultralytics/yolov5">https://github.com/ultralytics/yolov5</uri>.</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gui</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>A lightweight tea bud detection model based on YOLOv5</article-title>. <source>Comput. Electron. Agric.</source> <volume>205</volume>, <fpage>107636</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.107636</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hou</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Coordinate attention for efficient mobile network design</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>. <publisher-loc>Nashville, TN, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>. <fpage>13713</fpage>&#x2013;<lpage>13722</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR46437.2021.01350</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wan</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Bao</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Semantic segmentation of tea geometrid in natural scene images using discriminative pyramid network</article-title>. <source>Appl. Soft Computing</source> <volume>113</volume>, <fpage>107984</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.asoc.2021.107984</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Shen</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Squeeze-and-excitation networks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. <publisher-loc>Salt Lake City, UT, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>. <fpage>7132</fpage>&#x2013;<lpage>7141</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2018.00745</pub-id></citation>
</ref> 
<ref id="B8">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Huang</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>van der Maaten</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Weinberger</surname> <given-names>K. Q.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Densely connected convolutional networks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. <publisher-loc>Honolulu, HI, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>. <fpage>4700</fpage>&#x2013;<lpage>4708</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2017.243</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Khosravi</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Saedi</surname> <given-names>S. I.</given-names>
</name>
<name>
<surname>Rezaei</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Real-time recognition of on-branch olive ripening stages by a deep convolutional neural network</article-title>. <source>Scientia Hortic.</source> <volume>287</volume>, <fpage>110252</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.scienta.2021.110252</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>LeCun</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Bengio</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Hinton</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Deep learning</article-title>. <source>Nature</source> <volume>521</volume>, <fpage>436</fpage>&#x2013;<lpage>444</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/nature14539</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lee</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Hwang</surname> <given-names>J. W.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Bae</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Park</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>An energy and GPU-computation efficient backbone network for real-time object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops</conf-name>. <publisher-loc>Long Beach, CA, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPRW47913.2019</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhan</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>Q.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Slim-neck by GSConv: A better design paradigm of detector architectures for autonomous vehicles</article-title>. <source>arXiv preprint</source>. arXiv:2206.02424. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2206.02424</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Jia</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Hou</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A novel green apple segmentation algorithm based on ensemble U-Net under complex orchard environment</article-title>. <source>Comput. Electron. Agric.</source> <volume>180</volume>, <fpage>105900</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2020.105900</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Cheng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>A tea buds counting method based on YOLOv5 and Kalman filter tracking algorithm</article-title>. <source>Plant Phenomics</source> <volume>5</volume>, <fpage>0030</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.34133/plantphenomics.0030</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qian</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Tea sprouts segmentation via improved deep convolutional encoder-decoder network</article-title>. <source>IEICE Trans. Inf. Syst.</source> <volume>103</volume>, <fpage>476</fpage>&#x2013;<lpage>479</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1587/transinf.2019EDL8147</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Selvaraju</surname> <given-names>R. R.</given-names>
</name>
<name>
<surname>Cogswell</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Das</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Vedantam</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Parikh</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Batra</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Grad-cam: Visual explanations from deep networks via gradient-based localization</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE international conference on computer vision</conf-name>. <publisher-loc>Venice, Italy</publisher-loc>: <publisher-name>IEEE</publisher-name>. <fpage>618</fpage>&#x2013;<lpage>626</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2017.74</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Siliang</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Yong</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>MPDIoU: A loss for efficient and accurate bounding box regression</article-title>. <source>arXiv preprint</source>. <volume>arXiv:2307.07662</volume>:<page-range>1&#x2013;13</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2307.07662</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="book">
<person-group person-group-type="author">
<collab>Statistics Bureau of the People&#x2019;s Republic of China</collab>
</person-group>. (<year>2021</year>). <source>&#x201c;China Statistical Yearbook 2021&#x201d;</source>.</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Xue</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Qi</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wan</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Mao</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Detection of passion fruits and maturity classification using Red-Green-Blue Depth images</article-title>. <source>Biosyst. Eng.</source> <volume>175</volume>, <fpage>156</fpage>&#x2013;<lpage>167</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2018.09.004</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C. Y.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H. Y. M.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>Y. H.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>P. Y.</given-names>
</name>
<name>
<surname>Hsieh</surname> <given-names>J. W.</given-names>
</name>
<name>
<surname>Yeh</surname> <given-names>I. H.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>CSPNet: A new backbone that can enhance learning capability of CNN</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops</conf-name>. <publisher-loc>Seattle, WA, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>. <fpage>390</fpage>&#x2013;<lpage>391</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPRW50498.2020.00203</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Gu</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Tea buds image identification based on lab color model and K-means clustering</article-title>. <source>J. Chin. Agric. Mechanization</source> <volume>36</volume>, <fpage>161</fpage>&#x2013;<lpage>164</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.13733/j.jcam.issn.2095-5553.2015.05.040</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Shang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Ding</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Detection and classification of tea buds based on deep learning</article-title>. <source>Comput. Electron. Agric.</source> <volume>192</volume>, <fpage>106547</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2021.106547</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Xiong</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Fang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>X.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>A litchi fruit recognition method in a natural environment using RGB-D images</article-title>. <source>Biosyst. Eng.</source> <volume>204</volume>, <fpage>50</fpage>&#x2013;<lpage>63</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2021.01.015</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Jia</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Method of famous tea sprout identification and segmentation based on improved watershed algorithm</article-title>. <source>Comput. Electron. Agric.</source> <volume>184</volume>, <fpage>106108</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2021.106108</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Meng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Z.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Tea bud detection based on faster R-CNN network</article-title>. <source>Trans. CSAM</source> <volume>53</volume>, <fpage>217</fpage>&#x2013;<lpage>224</lpage>.</citation>
</ref>
</ref-list>
</back>
</article>