<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2023.1132909</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>A longan yield estimation approach based on UAV images and deep learning</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Denghui</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2153819"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Sun</surname>
<given-names>Xiaoxuan</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1626865"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Jia</surname>
<given-names>Yuhang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yao</surname>
<given-names>Zhongwei</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Lin</surname>
<given-names>Peiyi</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Chen</surname>
<given-names>Yingyi</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhou</surname>
<given-names>Haobo</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhou</surname>
<given-names>Zhengqi</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wu</surname>
<given-names>Kaixuan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Shi</surname>
<given-names>Linlin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2153928"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Li</surname>
<given-names>Jun</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1859079"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>College of Engineering, South China Agricultural University</institution>, <addr-line>Guangzhou</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Guangdong Laboratory for Lingnan Modern Agriculture</institution>, <addr-line>Guangzhou</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Key Laboratory of South China Agricultural Plant Molecular Analysis and Genetic Improvement, Guangdong Provincial Key Laboratory of Applied Botany, South China Botanical Garden, Chinese Academy of Sciences</institution>, <addr-line>Guangzhou</addr-line>, <country>China</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>South China National Botanical Garden</institution>, <addr-line>Guangzhou</addr-line>, <country>China</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>University of Chinese Academy of Sciences</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Zhanyou Xu, Agricultural Research Service (USDA), United States</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Shanwen Zhang, Xijing University, China; Yunchao Tang, Zhongkai University of Agriculture and Engineering, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Jun Li, <email xlink:href="mailto:autojunli@scau.edu.cn">autojunli@scau.edu.cn</email>
</p>
</fn>
<fn fn-type="equal" id="fn003">
<p>&#x2020;These authors have contributed equally to this work</p>
</fn>
<fn fn-type="other" id="fn002">
<p>This article was submitted to Technical Advances in Plant Science, a section of the journal Frontiers in Plant Science</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>06</day>
<month>03</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>14</volume>
<elocation-id>1132909</elocation-id>
<history>
<date date-type="received">
<day>28</day>
<month>12</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>17</day>
<month>02</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Li, Sun, Jia, Yao, Lin, Chen, Zhou, Zhou, Wu, Shi and Li</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Li, Sun, Jia, Yao, Lin, Chen, Zhou, Zhou, Wu, Shi and Li</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author (s) and the copyright owner (s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Longan yield estimation is an important practice before longan harvests. Statistical longan yield data can provide an important reference for market pricing and improving harvest efficiency and can directly determine the economic benefits of longan orchards. At present, the statistical work concerning longan yields requires high labor costs. Aiming at the task of longan yield estimation, combined with deep learning and regression analysis technology, this study proposed a method to calculate longan yield in complex natural environments. First, a UAV was used to collect video images of a longan canopy at the mature stage. Second, the CF-YD model and SF-YD model were constructed to identify Cluster_Fruits and Single_Fruits, respectively, realizing the task of automatically identifying the number of targets directly from images. Finally, according to the sample data collected from real orchards, a regression analysis was carried out on the target quantity detected by the model and the real target quantity, and estimation models were constructed for determining the Cluster_Fruits on a single longan tree and the Single_Fruits on a single Cluster_Fruit. Then, an error analysis was conducted on the data obtained from the manual counting process and the estimation model, and the average error rate regarding the number of Cluster_Fruits was 2.66%, while the average error rate regarding the number of Single_Fruits was 2.99%. The results show that the method proposed in this paper is effective at estimating longan yields and can provide guidance for improving the efficiency of longan fruit harvests.</p>
</abstract>
<kwd-group>
<kwd>yield estimation</kwd>
<kwd>UAV image</kwd>
<kwd>convolutional neural network</kwd>
<kwd>image analysis</kwd>
<kwd>regression analysis</kwd>
</kwd-group>
<counts>
<fig-count count="14"/>
<table-count count="4"/>
<equation-count count="6"/>
<ref-count count="50"/>
<page-count count="17"/>
<word-count count="8140"/>
</counts>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Smart orchard systems can effectively evaluate the growth conditions of fruit trees and improve the quality of fruits through digital technology. During the fruit ripening period, accurate statistics regarding the output of each fruit tree and the total output of the whole orchard can not only improve the efficiency of deploying harvesting robots and transportation robots but also guide market pricing and upgrade the fruit yield grade, which is conducive to the maximization of the economic benefits of orchards (<xref ref-type="bibr" rid="B15">He et&#xa0;al., 2022</xref>). Longan is widely studied in the field of smart orchards. At present, the yield estimation methods for longan orchards mainly adopt manual visual investigation. This statistical method is labor-intensive, laborious and time-consuming; is easily influenced by the subjective factors of different investigators; and has low accuracy and efficiency (<xref ref-type="bibr" rid="B28">Marani et&#xa0;al., 2021</xref>). Therefore, to reduce the cost of longan orchard yield estimation and improve the accuracy of yield estimation, it is necessary to develop a system that can automatically estimate longan orchard yields.</p>
<p>Longan fruits are usually clustered and grow on the outside of the canopy (<xref ref-type="bibr" rid="B31">Pham et&#xa0;al., 2015</xref>). The growth characteristics of fruits and their quantitative statistical scheme are shown in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>. In the natural environment, the distribution of longan fruits is complex, and they are easily blocked by leaves and branches, exhibiting different postures in different growing environments. Therefore, accurate longan fruit detection is the difficult part of realizing automatic yield estimation, which directly affects the accuracy and efficiency of orchard yield estimation. In recent years, researchers in related fields have used shape matching, color space transformation, threshold segmentation, multiscale feature fusion, fuzzy clustering and other methods to identify, detect and classify apples, oranges and other fruits (<xref ref-type="bibr" rid="B18">Jaisin et&#xa0;al., 2013</xref>; <xref ref-type="bibr" rid="B45">Xiong et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B50">Zhuang et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B16">He et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B25">Lin et&#xa0;al., 2020</xref>). The traditional machine learning methods used in these studies can only be used for image processing tasks with simple background conditions, and have poor robustness in the face of very complex actual orchard scenes.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Growth characteristics of longan fruits and their quantitative statistical scheme.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1132909-g001.tif"/>
</fig>
<p>With the development of sensor and computer technology, deep learning approaches have been widely developed and applied by researchers, and deep learning exhibits an excellent learning ability in cases involving the extraction of features from complex images. In recent years, with the demand for intelligence in the agricultural field, an increasing number of researchers have used deep learning technology to process collected image data for various tasks (<xref ref-type="bibr" rid="B42">Wang et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B5">da Silva et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B6">de Medeiros et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B48">Zhou et&#xa0;al., 2022</xref>), including fruit recognition (<xref ref-type="bibr" rid="B11">Gao et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B46">Xiong et&#xa0;al., 2020</xref>), classification of plants (<xref ref-type="bibr" rid="B10">Flores et&#xa0;al., 2021</xref>), classification of pests and diseases (<xref ref-type="bibr" rid="B2">Anagnostis et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B35">Singh et&#xa0;al., 2021</xref>), monitoring of crop growth state based on remote sensing (<xref ref-type="bibr" rid="B27">Ma et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B30">Paoletti et&#xa0;al., 2019</xref>), nondestructive testing and grading of fruit (<xref ref-type="bibr" rid="B20">Koirala et&#xa0;al., 2019</xref>), and animal behavior analysis (<xref ref-type="bibr" rid="B29">Norouzzadeh et&#xa0;al., 2018</xref>). To sum up, the deep learning model has stronger feature extraction ability, and it can effectively solve complex nonlinear problems. 
Faced with the problems of complex image backgrounds, uneven light intensities and diverse fruit features in complex orchard scenes, some researchers have applied deep learning technology to target detection tasks in complex scenes, and these models have strong robustness (<xref ref-type="bibr" rid="B1">Alpaydin, 2016</xref>; <xref ref-type="bibr" rid="B23">Liang et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B21">Li et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B47">Zhong et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B44">Wu et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B37">Tang et&#xa0;al., 2023</xref>).</p>
<p>How to quickly obtain high-definition images of orchard scenes is a key issue for improving the efficiency of orchard yield estimation. With the rapid development of unmanned aerial vehicle (UAV) power systems, control systems and sensor technology, it is possible for UAVs to carry various types of sensors to observe the earth. In recent years, UAVs have been equipped with various sensors and used in agriculture, including plant growth state detection and yield estimation (<xref ref-type="bibr" rid="B39">Vanegas et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B38">Tetila et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B49">Zhou et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B8">Feng et&#xa0;al., 2020a</xref>; <xref ref-type="bibr" rid="B9">Feng et&#xa0;al., 2020b</xref>; <xref ref-type="bibr" rid="B36">Sumesh et&#xa0;al., 2021</xref>). Therefore, the researchers on our team use an RGB camera on a UAV to plan the UAV flight path in advance and quickly obtain high-definition image data about the orchard.</p>    <p>In this study, by combining a UAV and deep learning application technology, a fast and accurate longan yield statistics approach is proposed. This approach will help to improve the accuracy and efficiency of the yield statistics of each longan tree in the modern orchard production scene and provide information for the task assignments of fruit picking UAVs and fruit transport aircraft. The main contributions of this research are as follows.</p>
<list list-type="alpha-lower">
<list-item>
<p>A method of collecting canopy images and videos of each fruit tree with a UAV is proposed to accurately and completely obtain canopy image data for each fruit tree.</p>
</list-item>
<list-item>
<p>Two datasets are set up to train and evaluate the performance of different target detection models.</p>
</list-item>
<list-item>
<p>A scheme for counting the numbers of different targets is proposed; this scheme includes a model based on Cluster_Fruit-YOLOv5s_Deepsort (CF-YD) and a model based on Single_Fruit-YOLOv7_Deepsort (SF-YD).</p>
</list-item>
<list-item>
<p>A regression analysis is carried out on the quantities counted by the two models and their real quantities to obtain a fitting equation.</p>
</list-item>
</list>    <p>The main contents of this paper are as follows: Section 2 introduces the materials and methods, Section 3 introduces the model construction process and the statistical strategy for calculating single fruit tree yields, Section 4 introduces the model experiment and results analysis in detail, and Section 5 summarizes the full text.</p>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Overview of the fruit tree yield estimation methods</title>
<p>
<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref> shows the method for quickly and accurately calculating the yield of each fruit tree. First, fruit tree canopy images are collected, and the obtained images are preprocessed as the Cluster_Fruit image dataset. Second, a two-step model is established. The first step is to count the number of Cluster_Fruits on a single fruit tree based on the CF-YD model, and the second step is to count the number of Single_Fruits on each Cluster_Fruit based on the SF-YD model. The results of each step are combined with the corresponding fitting equation to correct the final result.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Solution for longan fruit tree yield statistics.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1132909-g002.tif"/>
</fig>
<p>In the two-step model of this scheme, when each Cluster_Fruit is detected from a canopy image of fruit trees based on the CF-YD model, each Cluster_Fruit needs to be cut out as the input image of the statistical Single_Fruit quantity model. After using the SF-YD model to count the number of Single_Fruits in each image cut out in the previous step, it is necessary to calculate the yield of a single fruit tree according to the weight of the Single_Fruit.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Sensor system and image acquisition</title>
<p>To build the data set, 1100 valid longan images and 16 complete longan tree canopy videos were acquired at the Longan orchard in Guangzhou on July 1-25, 2021, and July 1-26, 2022, during two different time periods: morning (7:30&#x2013;11:30) and afternoon (13:30&#x2013;18:30). Furthermore, to perform modelling and verify the accuracy of the model, the actual numbers of Cluster_Fruits and Single_Fruits and the quality of 16 Cluster_Fruits from 16 fruit trees in the orchard were manually counted. A lightweight, small-size and high-resolution RGB camera mounted on a DJI Mavic 2 Pro UAV was used to collect orchard canopy images. The camera had 12 million pixels, the viewing angle was 85 degrees, the focal point ranged from 0.5&#xa0;m to infinity, and 120 images could be taken in one second at the fastest speed. <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref> is a schematic diagram of the image acquisition mode.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Schematic diagram of the image acquisition mode.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1132909-g003.tif"/>
</fig>
<p>According to the planting mode of modern orchards, the images were collected according to the following steps. &#x2460; Control the UAV to fly around the fruit trees, and during this process, have the RGB camera mounted on the UAV always look straight at the tree center to collect the canopy images of the fruit trees. &#x2461; Set the RGB image resolution to 1280&#xd7;720 pixels, and automatically save each image to the image acquisition card obtaining it. In order to ensure the diversity of images in the data set, images were taken on sunny and cloudy days respectively, including images of Shixia longan and Chuliang longan. <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref> shows examples of the UAV images.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Examples of the UAV images.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1132909-g004.tif"/>
</fig>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Image preprocessing</title>
<p>In this study, data sets are prepared for the two-step model. First, aiming at the images collected by the camera on the UAV, image cropping and size normalization are performed, and the bounding box of each Cluster_Fruit in each sample is manually marked, forming a Cluster_Fruit data set for the CF-YD model. Second, the trained CF-YD model is used to detect Cluster_Fruits in the original RGB images, each Cluster_Fruit image is cut out according to the output coordinate information, and the Single_Fruit positions in each image are manually marked to form a Single_Fruit data set for the SF-YD model. <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref> shows the whole process.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Diagram of the whole process.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1132909-g005.tif"/>
</fig>
<sec id="s2_3_1">
<label>2.3.1</label>
<title>Construction of the Cluster_Fruit image data set</title>
<p>During the process of constructing the Cluster_Fruit image dataset, firstly, to ensure the diversity of the training samples, a self-programmed random cropping algorithm is used to amplify the Cluster_Fruit images. After the initial longan dataset is expanded, 1100 longan images are obtained. Then, the image size is normalized to 1280&#xd7;1280 pixels. Finally, the 1100 images are manually annotated following the guidelines of the Pascal VOC 2010 reference challenge. The labelling information mainly includes the size of each image, the target category and the specific position coordinate information of the target area.</p>
</sec>
<sec id="s2_3_2">
<label>2.3.2</label>
<title>Construction of the Single_Fruit image dataset</title>
<p>As the SF-YD model adopted in this study first normalizes the input images of any size to 640&#xd7;640 pixels, the trained CF-YD model is used to detect each image in the Cluster_Fruit data set, and the coordinate information of each Cluster_Fruit image is obtained and directly cut out from the original image. After screening, 1100 Cluster_Fruit images are selected. LabelImg software is used to manually mark the positions of Single_Fruits in each image to form a Single_Fruit data set for training and testing the SF-YD model.</p>
<p>In summary, the Cluster_Fruit data set and Single_Fruit data set are constructed by the above two methods. The images in both data sets are divided into a training set, verification set, and test set according to a ratio of 8:1:1. <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref> presents the images and the annotation information.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Details of the Cluster_Fruit and Single_Fruit data sets.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="left">Data set</th>
<th valign="middle" colspan="2" align="center">Cluster_Fruit data set</th>
<th valign="middle" colspan="2" align="center">Single_Fruit data set</th>
</tr>
<tr>
<th valign="middle" align="center">Images</th>
<th valign="middle" align="center">Cluster_Fruit</th>
<th valign="middle" align="center">Images</th>
<th valign="middle" align="center">Single_Fruit</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Full data set</td>
<td valign="middle" align="center">1100</td>
<td valign="middle" align="center">11239</td>
<td valign="middle" align="center">1100</td>
<td valign="middle" align="center">24045</td>
</tr>
<tr>
<td valign="middle" align="left">Training data set</td>
<td valign="middle" align="center">880</td>
<td valign="middle" align="center">8898</td>
<td valign="middle" align="center">880</td>
<td valign="middle" align="center">19393</td>
</tr>
<tr>
<td valign="middle" align="left">Validation data set</td>
<td valign="middle" align="center">110</td>
<td valign="middle" align="center">1205</td>
<td valign="middle" align="center">110</td>
<td valign="middle" align="center">2285</td>
</tr>
<tr>
<td valign="middle" align="left">Test data set</td>
<td valign="middle" align="center">110</td>
<td valign="middle" align="center">1136</td>
<td valign="middle" align="center">110</td>
<td valign="middle" align="center">2367</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Model construction and statistical strategy for fruit tree yield estimation</title>
<p>In this section, according to the video image data of the canopy of a single fruit tree collected by the RGB camera on the UAV, a yield prediction scheme for a single fruit tree is proposed. First, according to the growth characteristics of Cluster_Fruit, the improved YOLOv5s target detection algorithm and Deepsort target tracking algorithm are incorporated into the CF-YD model, and the flow chart for quickly and accurately obtaining the numerical and location information of Cluster_Fruits from the video image of a single fruit tree is determined. Then, according to the growth characteristics of Single_Fruits, the YOLOv7 target detection algorithm and Deepsort target tracking algorithm are merged into the SF-YD model, and the flow chart for quickly obtaining the number of Single_Fruits from a video image of Cluster_Fruit is worked out. Finally, according to the prediction results regarding the quantities of Cluster_Fruits and Single_Fruits, a strategy for counting the output of a single fruit tree is proposed.</p>
<sec id="s3_1">
<label>3.1</label>
<title>Deepsort algorithm</title>
<p>The Deepsort algorithm (<xref ref-type="bibr" rid="B43">Wojke et&#xa0;al., 2017</xref>) is an algorithm with a multitarget tracking function, which is improved on the basis of the SORT algorithm (<xref ref-type="bibr" rid="B3">Bewley et&#xa0;al., 2016</xref>). Compared with the SORT algorithm, the Deepsort algorithm improves the content matching process to avoid ignoring multitarget ID transformations, uses appearance information to curb the frequency of target ID transformations, and adds a simple convolutional neural network (CNN) model to extract the appearance features of detected targets (expressed by low-dimensional vectors). The core of the Deepsort algorithm consists of prediction, observation and updating. The specific flow of the Deepsort algorithm is as follows. &#x2460; The target information predicted by You Only Look Once (YOLO) is input into the Deepsort algorithm as the observed value. The Kalman filter first judges whether a track is present, and if one is, it predicts the prior probability of the target information, then carries out cascade matching and IoU matching in the matching module, and finally obtains the matching success list. &#x2461; In the Kalman updating module, a posteriori prediction is performed on the successfully matched target to obtain the corrected target coordinates, and parameters such as the Kalman gain are updated. &#x2462; The above operations are repeated until all the videos are processed.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>CF-YD model for Cluster_Fruit statistics</title>
<p>The clusters of longan fruits are usually distributed in the canopies of fruit trees in disordered arrangements, and their sizes become very different as the distance from the camera increases. When quickly and accurately counting the number and locations of longan Cluster_Fruits from the canopy images of fruit trees, it is necessary to overcome the problem that target scale changes greatly affect the resulting detection accuracy. The most commonly used target detection algorithms are the R-CNN series (<xref ref-type="bibr" rid="B14">Girshick et&#xa0;al., 2015a</xref>; <xref ref-type="bibr" rid="B13">Girshick, 2015b</xref>; <xref ref-type="bibr" rid="B34">Ren et&#xa0;al., 2017</xref>) and YOLO series (<xref ref-type="bibr" rid="B32">Redmon et&#xa0;al., 2016</xref>; <xref ref-type="bibr" rid="B33">Redmon &amp; Farhadi, 2018</xref>; <xref ref-type="bibr" rid="B4">Bochkovskiy et&#xa0;al., 2020</xref>) models. R-CNN series algorithms, also known as target detection algorithms based on candidate areas, first generate candidate areas that may contain objects and then further classify and calibrate the candidate areas to obtain the final detection results. During the training process, YOLO series algorithms can pay more attention to the global information and the whole image in target detection. The core idea of YOLO is to use the whole picture as the input of the network and directly regress the position and category of the bounding box at the output.</p>
<p>Compared with the classic YOLOv3 algorithm, the data enhancement step of the YOLOv5 detection algorithm uses Mosaic to expand the input dataset, and it can also perform operations such as flipping, brightness adjustment and clipping. For a sample set with less data, the data can be effectively expanded. Four versions of YOLOv5 are available, namely, YOLOv5s, YOLOv5m, YOLOv5l and YOLOv5x, among which YOLOv5s is the network with the smallest depth and the smallest characteristic map width in this series of detection networks.</p>
<p>YOLOv5s mainly consists of three parts: a backbone, a neck and an output. The backbone is the basic feature extraction layer, which is used to extract feature information from images. It includes four modules: Focus, CBH, CSP1-x and spatial pyramid pooling (SPP) modules (<xref ref-type="bibr" rid="B17">He et&#xa0;al., 2015</xref>). The neck is a feature fusion layer whose function is to fuse image information with different scales to obtain better detection results. It uses the rectified linear unit (ReLU) activation function and adopts a feature pyramid network (FPN) (<xref ref-type="bibr" rid="B24">Lin et&#xa0;al., 2017</xref>) + PAN (<xref ref-type="bibr" rid="B26">Liu et&#xa0;al., 2018</xref>) network structure. The output is the output layer, whose function is to output the predicted target information, in which nonmaximum suppression (NMS) is performed on the last detection frame of the target to obtain the optimal target frame; three different detection scales (20&#xd7;20, 40&#xd7;40, 80&#xd7;80) are provided, which can predict longan Cluster_Fruits with different sizes. In the early stage, our team improves the YOLOv5s model for the Cluster_Fruit detection task and improves the accuracy of the model in the target detection task.</p>
<p>In this paper, according to the growth characteristics of Cluster_Fruit, the improved YOLOv5s target detection algorithm (<xref ref-type="bibr" rid="B22">Li et&#xa0;al., 2022</xref>) and Deepsort target tracking algorithm are incorporated into the CF-YD model. <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6A</bold>
</xref> shows the flow chart for the numerical Cluster_Fruit statistics in a single fruit tree. First, after the complete video data of a single fruit tree are input into the YOLOv5s algorithm, the YOLOv5s algorithm detects multiple targets in each frame and inputs the position information obtained from multiple Cluster_Fruits into the Deepsort algorithm to assign ID numbers. Then, the correlation filtering algorithm is used to compare whether anchor frames with the same size are present in the front and back frames (a target with the same anchor frame size continues to use the original number and assigns a new ID number to the new target). Finally, the maximum ID number is output, and the value of this number is used as the number of predicted Cluster_Fruits in the fruit tree canopy.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Flow chart of Cluster_Fruits and Single_Fruits quantity prediction. <bold>(A)</bold> The flow chart of Cluster_Fruit quantity statistics in a single fruit tree, <bold>(B)</bold> The flow chart of Single_Fruit quantity statistics in a Cluster_Fruit.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1132909-g006.tif"/>
</fig>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>SF-YD model for Single_Fruit statistics</title>
<p>The Single_Fruits of longan are usually distributed in Cluster_Fruits in a disordered manner and occupy very small areas in the canopy images of fruit trees, so it is difficult to accurately count the number of Single_Fruits in each Cluster_Fruit directly from the canopies of fruit trees. In this study, the latest YOLOv7 algorithm (<xref ref-type="bibr" rid="B41">Wang et&#xa0;al., 2022</xref>), which has a higher detection accuracy and a faster detection speed than other algorithms in the series, is adopted to realize the Single_Fruit detection task. The YOLOv7 algorithm adopts strategies such as the extended efficient layer aggregation network (E-ELAN), model scaling based on concatenation-based models (<xref ref-type="bibr" rid="B40">Wang et&#xa0;al., 2021</xref>), and convolutional reparameterization (<xref ref-type="bibr" rid="B7">Ding et&#xa0;al., 2021</xref>) and achieves a very good balance between detection efficiency and accuracy.</p>
<p>The YOLOv7 network mainly includes four parts: an input, a backbone, a head and a prediction module. The input module normalizes the input image of any size to the input pixel size set by the backbone network. The backbone module consists of several BConv layers, E-ELAN layers and MPConv layers. The BConv layer consists of a convolution layer, a batch normalization layer and a LeakyReLU activation function (<xref ref-type="bibr" rid="B19">Jiang &amp; Cheng, 2019</xref>), which is used to extract image features with different scales. The E-ELAN layer keeps the original ELAN design framework and improves the learning ability of the network without destroying the original gradient path by guiding the computing blocks of different feature groups to learn more diverse features. On the basis of the BConv layer, the MPConv layer adds a Maxpool layer to form two branches. The upper branch cuts the image length and width by half through Maxpool and the image channel by half through the BConv layer. In the lower branch, the image channel is halved by the first BConv layer, and the image length and width are halved by the second BConv layer. Finally, the features extracted by the upper and lower branches are fused by the Cat operation, which improves the feature extraction ability of the network. The head module is composed of a path aggregation FPN (PAFPN) (<xref ref-type="bibr" rid="B12">Ge et&#xa0;al., 2021</xref>) structure. By introducing the bottom-up path, the bottom-level information can be transmitted to the higher levels more easily, thus realizing the efficient integration of features at different levels. The prediction module adjusts the number of image channels for the three differently scaled features output by the PAFPN (P3, P4 and P5) through the REPVGG block structure and finally uses a 1&#xd7;1 convolution to predict the confidence, category and anchor frame.</p>
<p>According to the growth characteristics of Single_Fruits, the YOLOv7 target detection algorithm and Deepsort target tracking algorithm are incorporated into the SF-YD model. <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6B</bold>
</xref> shows the flow chart of the numerical Single_Fruit statistics in a Cluster_Fruit. First, each Cluster_Fruit with an ID number assigned by the CF-YD model is continuously cut out from the original video image to form Cluster_Fruit video data with ID numbers. The YOLOv7 algorithm detects multiple targets in each frame, inputs the obtained position information of multiple Single_Fruits into the Deepsort algorithm to assign ID numbers, then compares whether anchor frames of the same size are present in the previous and subsequent frames by using the correlation filtering algorithm (targets with the same anchor frame size continue to use the original number, and new ID numbers are assigned to new targets), and finally outputs the maximum ID number and takes the value of this number as the number of Single_Fruits in each Cluster_Fruit.</p>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Statistical strategy for a single fruit tree yield</title>    <p>In this section, according to the prediction results regarding the numbers of Cluster_Fruits and Single_Fruits in the previous two sections, the specific steps of longan yield estimation are formulated:</p>
<list list-type="simple">
<list-item>
<p>(1) Step 1: The CF-YD model is used to quickly obtain the number <italic>N<sub>CF</sub>
</italic>
<sub>1</sub> of Cluster_Fruits from the canopy images of fruit trees, and the total number <italic>N<sub>CF</sub>
</italic>
<sub>2</sub> of Cluster_Fruits is predicted by establishing a regression analysis model with the number of Cluster_Fruits counted in the real orchard.</p>
</list-item>
<list-item>
<p>(2) Step 2: The location information of the <italic>N<sub>CF</sub>
</italic>
<sub>1</sub> Cluster_Fruits obtained in the previous step is cut from the original image in turn and input into the SF-YD model to obtain the total number <italic>N<sub>SF</sub>
</italic>
<sub>1</sub> of Single_Fruits. The total number <italic>N<sub>SF</sub>
</italic>
<sub>2</sub> of Single_Fruits is predicted by establishing a regression analysis model with the number of Single_Fruits in the <italic>N<sub>CF</sub>
</italic>
<sub>1</sub> Cluster_Fruit statistics in the actual orchard.</p>
</list-item>
<list-item>
<p>(3) Step 3: According to the total number <italic>N<sub>SF</sub>
</italic>
<sub>2</sub> of Single_Fruits in <italic>N<sub>CF</sub>
</italic>
<sub>1</sub> Cluster_Fruits, the average number <italic>AVE<sub>nSF</sub>
</italic> of Single_Fruits in a single cluster can be calculated. Ten Cluster_Fruits are randomly selected and weighed, and their Single_Fruits are counted, to calculate the average mass <italic>AVE<sub>mSF</sub>
</italic> of Single_Fruits.</p>
</list-item>
<list-item>
<p>(4) Step 4: The formula for calculating the yield TQ of a single fruit tree is:</p>
</list-item>
</list>
<disp-formula>
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mtext>TQ</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>F</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>F</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>*</mml:mo>
<mml:mi>A</mml:mi>
<mml:mi>V</mml:mi>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>*</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>A</mml:mi>
<mml:mi>V</mml:mi>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Model experiment and results analysis</title>
<sec id="s4_1">
<label>4.1</label>
<title>Model training and parameter design</title>
<p>The training and testing processes of the CF-YD and SF-YD models are implemented on a workstation with the Ubuntu 18.04 LTS operating system. The main hardware devices of the workstation are as follows: GPU: NVIDIA GeForce RTX 3060 (configured with CUDA 10.1 and cuDNN 7.1); processor: 11<sup>th</sup> Gen Intel(R) Core(TM) i7-11800H; RAM: NVIDIA 16G; and hard disk: Samsung 1T. On the PyTorch deep learning framework, a CNN model is built with the Python programming language.</p>
<p>The image size of the CF-YD model training data is set to 1280&#xd7;1280 pixels. In terms of parameter settings, the intersection over union (IoU) is set to 0.5, the initial learning rate is 1e-4, and the learning rate at the end of training is set to 1e-5. The dataset is split at a 90-10 training-verification ratio, and a total of 500 epochs of iterative training are conducted. During the training process, after conducting a series of convolution and pooling operations with the CF-YD model, the input image uses the anchor box in the feature map layer to extract a series of features. In the feature layer, each cell is mapped to the original image, the premarked anchor box is found, and then the loss value between this anchor box and the ground truth is calculated. After training, the CF-YD model obtains a series of model parameters to fit the real border with the anchor box.</p>
<p>During the training process of the SF-YD model, input images of any size are normalized to 640&#xd7;640 pixels through the input module. The training process of the model is divided into the Freeze phase and UnFreeze phase, the optimizer is set to stochastic gradient descent (SGD), the initial learning rate is 0.01, the momentum is 0.9, and the weight decay is set to 5e-4. The cosine annealing algorithm is used to adjust the learning rate, and the minimum learning rate is 1e-4. The training and verification steps alternate. During the Freeze phase, the training duration is 50 epochs, each epoch has 220 iterations in the training phase and 28 iterations in the verification phase, and the batch size is 4. During the UnFreeze phase, the training duration is 250 epochs, with 440 iterations in the training phase and 55 iterations in the verification phase of each epoch, and the batch size is 2. After training, the SF-YD model obtains a series of model parameters to fit the real border with the anchor box.</p>
<p>During the training and testing processes of the CF-YD and SF-YD models, it is necessary to generate a series of anchor boxes (candidate areas) in the given image according to certain rules. In this study, k-means clustering and a genetic algorithm are used to obtain anchor boxes. The prediction layer of the YOLO network contains three scales of information (corresponding to three receptive fields), and each scale contains three anchors. Therefore, the YOLO network needs nine anchor scales; that is, the sizes of all the target bounding boxes in the dataset are clustered into nine categories. Through the analysis of the Cluster_Fruit and Single_Fruit datasets, the k-means clustering results of all the target bounding boxes in the two datasets are obtained. Each point in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7A</bold>
</xref> corresponds to a target bounding box in the Cluster_Fruit dataset. According to the overall size characteristics of the target bounding boxes, nine types of anchor boxes that are suitable for training and testing the CF-YD model are determined as [16,16, 21,28, 28,23, 30,39, 41,33, 46,52, 67,77, 116,135, 247,291]. Each point in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7B</bold>
</xref> corresponds to a target bounding box in the Single_Fruit dataset. According to the overall size characteristics of the target bounding box, nine types of anchor boxes that are suitable for training and testing the SF-YD model are determined as [45,43, 61,36, 61,53, 78,44, 70,66, 93,54, 87,78, 131,74, 110,104]. At the same time, it can be seen from the figure that the more points there are with the same color, the more targets with this cluster size, and the points with different colors represent targets with different cluster sizes. In other words, this figure can reflect the complicated situation regarding the targets to be detected in an orchard scene to some extent.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Clustering results of the target bounding boxes in the two datasets. <bold>(A)</bold> Cluster_Fruit dataset, <bold>(B)</bold> Single_Fruit dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1132909-g007.tif"/>
</fig>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Model evaluation indicators</title>
<sec id="s4_2_1">
<label>4.2.1</label>
<title>Evaluation indices for the object detection algorithm</title>
<p>In this study, precision (P), recall (R), the F1 score, average precision (AP) and frames per second (FPS) are used to evaluate the performance of the two target detection models. P, R, the F1 score and AP are calculated as shown in formulas (2), (3), (4) and (5), respectively.</p>
<disp-formula>
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mtext>P</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:mtext>R</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:mtext>F</mml:mtext>
<mml:mn>1</mml:mn>
<mml:mtext>&#xa0;score</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>*</mml:mo>
<mml:mi>P</mml:mi>
<mml:mo>*</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:mtext>AP</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mstyle displaystyle="true">
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x222b;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>&#xa0;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>R</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>In these formulas, TP represents true-positive cases, FP represents false-positive cases, TN represents true-negative cases, and FN represents false-negative cases.</p>
</sec>
<sec id="s4_2_2">
<label>4.2.2</label>
<title>Evaluation indices for the Deepsort algorithm</title>
<p>This study selects identity switches (IDSs), multiple-object tracking accuracy (MOTA) and multiple-object tracking precision (MOTP) to evaluate the effectiveness of the multitarget tracking algorithm. IDS is the number of times the tracking target ID changes. The smaller its value is, the better the tracking stability. MOTA considers missed detections, false alarms and IDSs simultaneously and measures the performance of the tracking algorithm in terms of detecting targets and keeping track of them; it is independent of the precision with which the target positions are estimated. The larger its value is, the better the performance of the algorithm. MOTP is used to quantify the positioning accuracy of the detector. The larger its value is, the higher the accuracy of the detector.</p>
</sec>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Results and discussion of the object detection and counting tasks</title>
<sec id="s4_3_1">
<label>4.3.1</label>
<title>Performance evaluation results of different models</title>
<p>To fully evaluate the performance of the CF-YD model in detecting Cluster_Fruits and the SF-YD model in detecting Single_Fruits, first, the CF-YD and SF-YD models are trained according to the training parameters set in Section 4.1, and the weight file with the best training effect in each model is used as the weight file for testing the model performance. Then, the CF-YD model on the Cluster_Fruit test set and the SF-YD model on the Single_Fruit test set are comprehensively evaluated in terms of P, R, AP, FPS, F1 score, etc., and the obtained results are shown in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Evaluation index results obtained on the test dataset under different models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Models/Evaluation index</th>
<th valign="middle" align="center">P (%)</th>
<th valign="middle" align="center">R (%)</th>
<th valign="middle" align="center">AP (%)</th>
<th valign="middle" align="center">F1 score</th>
<th valign="middle" align="center">FPS</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">CF-YD model (Cluster_Fruit test data set)</td>
<td valign="middle" align="center">83.50</td>
<td valign="middle" align="center">85.70</td>
<td valign="middle" align="center">82.40</td>
<td valign="middle" align="center">0.85</td>
<td valign="middle" align="center">56.21</td>
</tr>
<tr>
<td valign="middle" align="left">SF-YD model (Single_Fruit test data set)</td>
<td valign="middle" align="center">93.23</td>
<td valign="middle" align="center">91.97</td>
<td valign="middle" align="center">97.12</td>
<td valign="middle" align="center">0.93</td>
<td valign="middle" align="center">98.35</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The P-R curve and F1 score changes exhibited by the CF-YD model on the Cluster_Fruit test dataset are shown in <xref ref-type="fig" rid="f8">
<bold>Figures&#xa0;8A, B</bold>
</xref>, respectively. The area enclosed by the P-R curve and the two coordinate axes in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8A</bold>
</xref> corresponds to the AP value of Cluster_Fruit detection. As shown in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>, the AP value of the CF-YD model for Cluster_Fruit detection is 82.4% on the test set, and the detection accuracy is high. However, some Cluster_Fruits are still blocked by other Cluster_Fruits or branches and leaves and cannot be accurately detected. The F1 score in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8B</bold>
</xref> first changes slightly with increasing confidence value and then suddenly decreases sharply when the confidence value is greater than 0.7. Therefore, it is usually sufficient to set this parameter to 0.5 in the model training stage. The FPS value is the number of images that the model can detect per second, and the detection time of each image is only 18 ms. According to the clustering results of the target sizes in the Cluster_Fruit dataset, the Cluster_Fruit size exhibits the diversity characteristic. The above results show that the CF-YD model has good detection performance for multiscale targets.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
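<p>P-R curves and F1 scores of different detection methods. <bold>(A)</bold> P-R curve of the CF-YD model, <bold>(B)</bold> F1 score curve of the CF-YD model, <bold>(C)</bold> P-R curve of the SF-YD model, <bold>(D)</bold> F1 score curve of the SF-YD model.</p>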
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1132909-g008.tif"/>
</fig>
<p>The changes in the P-R curve and F1 score of the SF-YD model on the Single_Fruit test dataset are shown in <xref ref-type="fig" rid="f8">
<bold>Figures&#xa0;8C, D</bold>
</xref>, respectively. The area enclosed by the P-R curve and the two coordinate axes in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8C</bold>
</xref> basically covers the whole coordinate system. As shown in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>, the AP value of the SF-YD model on the test set for Single_Fruit is 97.12%, demonstrating high detection accuracy, and only a few Single_Fruits are undetected. The F1 score changes slightly with increasing confidence and suddenly decreases sharply when the confidence is greater than 0.85. Therefore, it is usually sufficient to set this parameter to 0.5 in the model training stage. The FPS value is the number of images that the model can detect per second, and the detection time of each image is only 10 ms. According to the clustering results of the target sizes in the Single_Fruit dataset, the sizes of Single_Fruits are generally small. The above results show that the SF-YD model also has good detection performance for small targets.</p>
</sec>
<sec id="s4_3_2">
<label>4.3.2</label>
<title>Detection effects of different models in real scenes</title>
<p>Many varieties of longan are available, and new varieties have appeared in recent years. To further evaluate the performance of the CF-YD and SF-YD models in detecting longan Cluster_Fruits and Single_Fruits in real and complicated mountain orchard environments, this section selects images of longan orchards with different varieties (Chuliang longan and Shixia longan), different illumination conditions (Sunny day and Cloudy day), different scales and different densities, tests the trained CF-YD and SF-YD models, and obtains the detection results of each model.</p>
<p>
<xref ref-type="fig" rid="f9">
<bold>Figures&#xa0;9A, B</bold>
</xref> are the test results obtained by the CF-YD model for Chuliang longan in different scenes of real orchards. <xref ref-type="fig" rid="f9">
<bold>Figures&#xa0;9C, D</bold>
</xref> are the test results obtained by the CF-YD model for Shixia longan in different scenes of real orchards. From the detection results, it can be seen that regardless of the longan variety and of whether the day is sunny or cloudy, Cluster_Fruits are accurately detected for both large-scale and small-scale targets. The above detection results show that the CF-YD model has good feature extraction performance, has strong generalization to different varieties of longan in real orchard environments and is not easily disturbed by uneven light. It also has a good detection effect on small targets, so it is suitable for target detection in longan orchards.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Cluster_Fruit detection results of the CF-YD model under different scenes.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1132909-g009.tif"/>
</fig>
<p>To evaluate the detection effect of the SF-YD model on different longan varieties in a real orchard scene, the CF-YD model is first used to identify Cluster_Fruit in a real orchard scene and cut out the video data to form Single_Fruits, which are then input into the SF-YD model for Single_Fruit detection. <xref ref-type="fig" rid="f10">
<bold>Figures&#xa0;10A, B</bold>
</xref> are the test results obtained by the SF-YD model for Chuliang longan and Shixia longan, respectively, in different scenes of real orchards. The fruit colors and shapes of the two longan varieties are quite different. They exhibit different glosses at different distances and under different light. It can be seen from the detection results that Single_Fruits of different varieties are accurately detected in different weather conditions, at different scales and in scenes with different densities. The above detection results show that the SF-YD model has good feature extraction performance, strong generalization for different varieties of longan Single_Fruits in a real orchard environment, and a good detection effect for multiscale targets, so it is suitable for small target detection in longan orchards.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Single_Fruit detection results of the SF-YD model under different scenes.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1132909-g010.tif"/>
</fig>
</sec>
<sec id="s4_3_3">
<label>4.3.3</label>
<title>Counting results of different models in real scenes</title>
<p>To evaluate the tracking performance of the CF-YD model on Cluster_Fruit and the SF-YD model on Single_Fruit, a video image of a fruit tree canopy is randomly selected to test the CF-YD model, and a video image of a Cluster_Fruit is selected to test the SF-YD model. The models are comprehensively evaluated in terms of the IDS, MOTA, MOTP and other metrics, and the results obtained are shown in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Evaluation index results obtained by different models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Models/Evaluation index</th>
<th valign="middle" align="center">IDS</th>
<th valign="middle" align="center">MOTA (%)</th>
<th valign="middle" align="center">MOTP (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">CF-YD model (Cluster_Fruit)</td>
<td valign="middle" align="center">5</td>
<td valign="middle" align="center">95.30</td>
<td valign="middle" align="center">92.60</td>
</tr>
<tr>
<td valign="middle" align="left">SF-YD model (Single_Fruit)</td>
<td valign="middle" align="center">2</td>
<td valign="middle" align="center">97.20</td>
<td valign="middle" align="center">94.70</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Regarding the IDS metric, the numbers of target ID changes observed during the process of tracking the target in the video images with the two models are very small at 5 and 2, respectively. The MOTA and MOTP values of the two models are basically above 90%, which shows that both tracking algorithms can track targets stably and accurately.</p>
<p>To further verify the performance of the CF-YD and SF-YD models in counting the numbers of longan Cluster_Fruits and Single_Fruits in the real and complicated mountain orchard environment, this section selects images of longan orchards with different varieties (Chuliang longan and Shixia longan) and different lighting scenes, tests the trained CF-YD and SF-YD models, and obtains the counting results of each model.</p>
<p>
<xref ref-type="fig" rid="f11">
<bold>Figures&#xa0;11A, B</bold>
</xref> are the Cluster_Fruit counting results obtained in different scenes of real orchards by the CF-YD model for Chuliang longan. <xref ref-type="fig" rid="f11">
<bold>Figures&#xa0;11C, D</bold>
</xref> show the counting results obtained by the CF-YD model for Shixia longan in different scenes of real orchards. It can be seen from the counting results that regardless of the variety and in sunny or cloudy weather, Cluster_Fruits yield accurate counting results for large-scale or small-scale targets. The above results show that the CF-YD model has good target tracking performance. It has strong generalization for different varieties of longan in real orchard environments and is not easily disturbed by uneven lighting. It also has a good tracking effect for multiple targets, so it is suitable for target counting tasks in longan orchards.</p>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>Cluster_Fruit counting results of the CF-YD model under different scenes.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1132909-g011.tif"/>
</fig>
<p>To verify that the SF-YD model can count different longan varieties in real orchard scenes, the Single_Fruit video data are input into the SF-YD model to count Single_Fruits. <xref ref-type="fig" rid="f12">
<bold>Figures&#xa0;12A, B</bold>
</xref> are the counting results of the SF-YD model for Chuliang longan and Shixia longan, respectively, in different scenes of real orchards. It can be seen from the counting results diagram that the different varieties of Single_Fruits are accurately counted in different weather conditions. The above detection results show that the SF-YD model has a good target tracking performance, strong generalization for different varieties of longan Single_Fruit in a real orchard environment, and a good tracking effect for multiscale targets, so it is suitable for counting small targets in longan orchards.</p>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Single_Fruit counting results of the SF-YD model under different scenes.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1132909-g012.tif"/>
</fig>
</sec>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>The models for estimating the numbers of Cluster_Fruits and Single_Fruits</title>
<p>To accurately obtain the yield of a single longan tree, it is necessary to correct the numbers of Cluster_Fruits and Single_Fruits identified by the two models. First, 10 longan trees of different ages are randomly selected from the longan orchard, and the true number of Cluster_Fruits on each longan tree and the true numbers of Single_Fruits on 10 randomly selected Cluster_Fruits are manually counted. Then, the canopy video images of these 10 longan trees are captured by UAVs, and the number of Cluster_Fruits on each longan tree and the numbers of Single_Fruits on the 10 randomly selected Cluster_Fruits are identified by the method described in the previous section. Finally, the manually counted numbers and the numbers identified by the models are fitted by an equation, and a number estimation model for the Cluster_Fruits on a single longan tree and a number estimation model for the Single_Fruits on a single Cluster_Fruit are constructed.</p>
<p>
<xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref> lists the numbers obtained by the manual counting approach and by the two identification models. The actual value of Cluster_Fruits on 10 longan trees ranges from 91 to 312, and the actual value of Single_Fruits on ten Cluster_Fruits ranges from 18 to 32. Because the 10 longan trees and 10 Cluster_Fruits are randomly selected, the numbers of fruits will be different in different runs. At the same time, during the process of growth, the fruit of longan trees is affected by external conditions such as nutritional components and light conditions, so the yield of each tree is different. Exponential fitting, linear fitting, logarithmic fitting, quadratic polynomial fitting, power fitting, etc., are performed between the numbers identified by the models and the actual manually counted numbers in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>. After performing a comprehensive analysis and comparing the fitting results, as shown in <xref ref-type="fig" rid="f13">
<bold>Figure&#xa0;13</bold>
</xref>, the best fitting method for the number of Cluster_Fruits on a single fruit tree is quadratic polynomial fitting. The fitting equation is <italic>y</italic>&#xa0;=&#xa0;0.0023<italic>x</italic>
<sup>2</sup>+0.7155<italic>x</italic>+19.562, and the determination coefficient <italic>R</italic>
<sup>2</sup> is 0.9970. The best fitting method for the number of Single_Fruits on a single cluster is exponential fitting, the fitting equation is <italic>y</italic>&#xa0;=&#xa0;7.822<italic>e</italic>
<sup>0.0565<italic>x</italic>
</sup>, and the determination coefficient <italic>R</italic>
<sup>2</sup> is 0.9953. Strong correlation is observed between the two samples.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Sample number information of the two identification models and manual statistics.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Class</th>
<th valign="middle" align="center">Identified value<break/>(Cluster_Fruit)</th>
<th valign="middle" align="center">Actual value<break/>(Cluster_Fruit)</th>
<th valign="middle" align="center">Class</th>
<th valign="middle" align="center">Identified value<break/>(Single_Fruit)</th>
<th valign="middle" align="center">Actual value<break/>(Single_Fruit)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">1</td>
<td valign="middle" align="center">275</td>
<td valign="middle" align="center">299</td>
<td valign="middle" align="center">1</td>
<td valign="middle" align="center">26</td>
<td valign="middle" align="center">34</td>
</tr>
<tr>
<td valign="middle" align="center">2</td>
<td valign="middle" align="center">283</td>
<td valign="middle" align="center">312</td>
<td valign="middle" align="center">2</td>
<td valign="middle" align="center">21</td>
<td valign="middle" align="center">26</td>
</tr>
<tr>
<td valign="middle" align="center">3</td>
<td valign="middle" align="center">261</td>
<td valign="middle" align="center">279</td>
<td valign="middle" align="center">3</td>
<td valign="middle" align="center">19</td>
<td valign="middle" align="center">23</td>
</tr>
<tr>
<td valign="middle" align="center">4</td>
<td valign="middle" align="center">213</td>
<td valign="middle" align="center">241</td>
<td valign="middle" align="center">4</td>
<td valign="middle" align="center">23</td>
<td valign="middle" align="center">29</td>
</tr>
<tr>
<td valign="middle" align="center">5</td>
<td valign="middle" align="center">195</td>
<td valign="middle" align="center">212</td>
<td valign="middle" align="center">5</td>
<td valign="middle" align="center">20</td>
<td valign="middle" align="center">24</td>
</tr>
<tr>
<td valign="middle" align="center">6</td>
<td valign="middle" align="center">153</td>
<td valign="middle" align="center">179</td>
<td valign="middle" align="center">6</td>
<td valign="middle" align="center">24</td>
<td valign="middle" align="center">30</td>
</tr>
<tr>
<td valign="middle" align="center">7</td>
<td valign="middle" align="center">182</td>
<td valign="middle" align="center">198</td>
<td valign="middle" align="center">7</td>
<td valign="middle" align="center">22</td>
<td valign="middle" align="center">27</td>
</tr>
<tr>
<td valign="middle" align="center">8</td>
<td valign="middle" align="center">93</td>
<td valign="middle" align="center">116</td>
<td valign="middle" align="center">8</td>
<td valign="middle" align="center">17</td>
<td valign="middle" align="center">20</td>
</tr>
<tr>
<td valign="middle" align="center">9</td>
<td valign="middle" align="center">113</td>
<td valign="middle" align="center">129</td>
<td valign="middle" align="center">9</td>
<td valign="middle" align="center">16</td>
<td valign="middle" align="center">20</td>
</tr>
<tr>
<td valign="middle" align="center">10</td>
<td valign="middle" align="center">77</td>
<td valign="middle" align="center">91</td>
<td valign="middle" align="center">10</td>
<td valign="middle" align="center">14</td>
<td valign="middle" align="center">17</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="f13" position="float">
<label>Figure&#xa0;13</label>
<caption>
<p>The fitting results of the actual and identified numbers of fruits.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1132909-g013.tif"/>
</fig>
</sec>
<sec id="s4_5">
<label>4.5</label>
<title>Experimental results of Cluster_Fruits and Single_Fruits in real orchard scenes</title>
<p>To further verify the quantity estimation model in Section 4.4, six other longan trees are randomly selected from real orchards, and the true number of Cluster_Fruits on each longan tree and the true numbers of Single_Fruits on the 6 randomly selected Cluster_Fruits are obtained by manual counting. Then, the CF-YD and SF-YD models are used to obtain the identification numbers of the Cluster_Fruits and Single_Fruits from the video data, respectively. By using the fitting equation obtained in Section 4.4, the identified numbers are corrected, and the predicted numbers of Cluster_Fruits and Single_Fruits are obtained. Finally, the error between the real quantity and the predicted quantity is analyzed. The error in this study is the absolute value of the predicted value minus the actual value, and the error rate is equal to the percentage value obtained by dividing this error by the actual value. The calculation formula for the error rate is:</p>
<disp-formula>
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:mtext>Error&#xa0;rate</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mo>|</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mtext>Predicted&#xa0;value</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mtext>Actual&#xa0;value</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>Actual&#xa0;value</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo>|</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>100</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>The actual numbers, identified numbers and predicted numbers of Cluster_Fruits on six longan trees and Single_Fruits on 6 Cluster_Fruits are counted in <xref ref-type="fig" rid="f14">
<bold>Figures&#xa0;14A, B</bold>
</xref>, respectively, and their error rate data are counted in <xref ref-type="fig" rid="f14">
<bold>Figures&#xa0;14C, D</bold>
</xref>. It can be seen from the data in <xref ref-type="fig" rid="f14">
<bold>Figures&#xa0;14A, B</bold>
</xref> that the number of Cluster_Fruits identified by the CF-YD model and the number of Single_Fruits identified by the SF-YD model are corrected by the fitting equation obtained in Section 4.4, and the predicted numbers are very close to the actual numbers. According to the data in <xref ref-type="fig" rid="f14">
<bold>Figure&#xa0;14C</bold>
</xref>, the average error rate of Cluster_Fruit of 6 longan trees is 2.66%. According to the data in <xref ref-type="fig" rid="f14">
<bold>Figure&#xa0;14D</bold>
</xref>, the average error rate for the Single_Fruits of 6 Cluster_Fruits is 2.99%. It can be seen from the data in <xref ref-type="fig" rid="f14">
<bold>Figures&#xa0;14C, D</bold>
</xref> that the prediction error rates of Cluster_Fruits and Single_Fruits are below 5%.</p>
<fig id="f14" position="float">
<label>Figure&#xa0;14</label>
<caption>
<p>Statistical information of Cluster_Fruits and Single_Fruits. <bold>(A)</bold> The actual numbers, identified numbers and predicted numbers of Cluster_Fruits on six longan trees, <bold>(B)</bold> The actual numbers, identified numbers and predicted numbers of Single_Fruits on six Cluster_Fruits, <bold>(C)</bold> Error rate information of Cluster_Fruit on six trees, <bold>(D)</bold> Error rate information of Single_Fruit on six Cluster_Fruits.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1132909-g014.tif"/>
</fig>
<p>According to the statistical results of Section 4.3, Section 4.4 and this section, there are errors between the real value and the identified value, which are mainly caused by two reasons. &#x2460; The two models are not perfectly accurate when detecting targets. &#x2461; The real values of Cluster_Fruits and Single_Fruits are obtained by manual statistics, which is a multiangle and full-range process. However, the UAV collects video images of the fruit tree canopy from the front angle and can only obtain Cluster_Fruit and Single_Fruit images outside the tree canopy.</p>
<p>According to the statistics of horticulture experts, the average fruit weight of the Chuliang longan variety is 13&#xa0;g and that of the Shixia longan variety is 8&#xa0;g. After using the method proposed in this paper to obtain the number of Cluster_Fruits on a single fruit tree and the number of Single_Fruits on each Cluster_Fruit, the yield data of a single longan tree can be obtained by using the yield estimation strategy for a single fruit tree in Section 3.4.</p>
</sec>
</sec>
<sec id="s5" sec-type="conclusion">
<title>Conclusion</title>
<p>In a complex longan orchard, fruits grow in clusters, and the shapes of Cluster_Fruits vary widely. It is difficult to estimate the yield of a single fruit tree simply by counting the number of Cluster_Fruits. Although the shapes of Single_Fruits are relatively consistent, the fruits are small, so it is difficult to accurately count the number of Single_Fruits directly by image analysis. Therefore, the yield estimation strategy based on UAV images proposed in this paper is of great significance and can improve the accuracy and efficiency of the yield statistics obtained for each fruit tree.</p>
<p>In this study, a method based on UAV images and computer vision technology is proposed to estimate the yield of a single longan fruit tree. First, a UAV is used to collect video images of the fruit tree canopy, and after preprocessing the images, two datasets are constructed, and the targets of the datasets are manually marked. Then, the CF-YD and SF-YD models are constructed to identify Cluster_Fruits and Single_Fruits, respectively, which realizes the task of automatically identifying the number of targets directly from each image. Finally, to further predict the yield of a single longan fruit tree accurately, two models for estimating the numbers of Cluster_Fruits and Single_Fruits are proposed, and two fitting equations are established for determining the actual number and predicted number of Cluster_Fruits on a single fruit tree and the number of Single_Fruits on a single Cluster_Fruit, and the models are tested and verified in real orchards. This study can quickly and accurately estimate the yield of a single fruit tree, which can not only provide guidance for the production management and market pricing of longan orchards but also improve the efficiency of deploying harvesting robots and transportation robots, which is conducive to maximizing the economic benefits of orchards. The research in this paper can apply UAV image migration to the harvests of clustered fruits such as grapes and <italic>Cerasus pseudocerasus</italic> and promote the development of smart agriculture and unmanned farms.</p>
<p>Since most longan orchards are currently unstructured, this work still has some limitations, and the target detection and tracking abilities of the proposed method need to be further improved. In this study, the UAV mainly collects canopy images of longan fruit trees from the perspective of elevation but cannot obtain all-around images inside the canopies of fruit trees. Therefore, there is an error between the collected data and the real values. In future research, we will first consider the use of UAVs that automatically plan flight routes in order to obtain orchard canopy images more easily. Secondly, an image analysis processor will be built on the UAV to calculate the output of fruit trees in real time. Finally, more manual counting data will continue to be added to further improve the accuracy of the quantities predicted by the fitting equations. In addition, the research objects will be expanded to more longan varieties in the future. In future work, we will continue to optimize the details of the solution to promote the development of smart agriculture.</p>
</sec>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>DL and XS conceived the study and wrote the paper. YJ, ZY, PL, YC, HZ, ZZ, and KW participated in the experiment and analyzed the experimental data. JL and LS supervised the manuscript and made valuable inputs. All authors contributed to the article and approved the submitted version.</p>
</sec>
</body>
<back>
<sec id="s8" sec-type="funding-information">
<title>Funding</title>
<p>This research is supported by the earmarked fund for the Laboratory of Lingnan Modern Agriculture Project (NZ2021009), the open competition program of top ten critical priorities of Agricultural Science and Technology Innovation for the 14th Five-Year Plan of Guangdong Province (2022SDZG03), the China Agriculture Research System (No. CARS-32-11), the Special Project of Rural Vitalization Strategy of Guangdong Academy of Agricultural Sciences (No. TS-1-4), and the Guangdong Provincial Modern Agricultural Industry Technology System (No. 2021KJ123).</p>
</sec>
<ack>
<title>Acknowledgments</title>
<p>We are very grateful to Guangdong Academy of Agricultural Sciences for providing us with the site for collecting experimental data.</p>
</ack>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s10" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<title>Abbreviations</title>
<fn fn-type="abbr">
<p>UAV, Unmanned aerial vehicle; CNN, Convolutional neural network; YOLO, You Only Look Once; SORT, Simple Online and Realtime Tracking; CF-YD, Cluster_Fruit-YOLOv5s_Deepsort; SF-YD, Single_Fruit-YOLOv7_Deepsort; RGB, Red Green Blue; ID, Identity document; IoU, Intersection over Union; R-CNN, Region-based convolutional neural network; SPP, Spatial Pyramid Pooling; FPN, Feature pyramid network; PAN, Path aggregation network; NMS, Non Maximum Suppression; E-ELAN, Extended Efficient Layer Aggregation Network; PAFPN, Path Aggregation Feature Pyramid Network; P, Precision; R, Recall; AP, Average precision of a category; FPS, Frames per second; TP, True positive; FP, False positive; TN, True negative; FN, False negative; P-R, Precision-Recall; IDS, Identity Switch; MOTA, Multiple Object Tracking Accuracy; MOTP, Multiple Object Tracking Precision.</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Alpaydin</surname> <given-names>E.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Neural networks and deep learning</article-title>. <source>Machine learning: The new AI</source>. <publisher-name>MIT Press</publisher-name>. <publisher-loc>Cambridge, Massachusetts, USA</publisher-loc>. <uri xlink:href="https://ieeexplore.ieee.org/document/7845182">https://ieeexplore.ieee.org/document/7845182</uri>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Anagnostis</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Tagarakis</surname> <given-names>A. C.</given-names>
</name>
<name>
<surname>Asiminari</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Papageorgiou</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Kateris</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Moshou</surname> <given-names>D.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>A deep learning approach for anthracnose infected trees classification in walnut orchards</article-title>. <source>Comput. Electron. Agric.</source> <volume>182</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2021.105998</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Bewley</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Ge</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Ott</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Ramos</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Upcroft</surname> <given-names>B.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Simple Online and Realtime Tracking</article-title>. <conf-name>2016 IEEE International Conference on Image Processing (ICIP)</conf-name>, <conf-loc>Phoenix, AZ, USA</conf-loc>, pp. <fpage>3464</fpage>&#x2013;<lpage>3468</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICIP.2016.7533003</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bochkovskiy</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>C. Y.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>YOLOv4: Optimal speed and accuracy of object detection</article-title>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2004.10934</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>da Silva</surname> <given-names>C. B.</given-names>
</name>
<name>
<surname>Bianchini</surname> <given-names>V. D. M.</given-names>
</name>
<name>
<surname>de Medeiros</surname> <given-names>A. D.</given-names>
</name>
<name>
<surname>de Moraes</surname> <given-names>M. H. D.</given-names>
</name>
<name>
<surname>Marassi</surname> <given-names>A. G.</given-names>
</name>
<name>
<surname>Tannus</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A novel approach for jatropha curcas seed health analysis based on multispectral and resonance imaging techniques</article-title>. <source>Ind. Crops Prod.</source> <volume>161</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.indcrop.2020.113186</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>de Medeiros</surname> <given-names>A. D.</given-names>
</name>
<name>
<surname>Bernardes</surname> <given-names>R. C.</given-names>
</name>
<name>
<surname>da Silva</surname> <given-names>L. J.</given-names>
</name>
<name>
<surname>de Freitas</surname> <given-names>B. A. L.</given-names>
</name>
<name>
<surname>Dias</surname> <given-names>D. C. F. D.</given-names>
</name>
<name>
<surname>da Silva</surname> <given-names>C. B.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Deep learning-based approach using X-ray images for classifying crambe abyssinica seed quality</article-title>. <source>Ind. Crops Prod.</source> <volume>164</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.indcrop.2021.113378</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ding</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Han</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Ding</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>RepVGG: Making VGG-style ConvNets Great Again</article-title>. <conf-name>2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Nashville, TN, USA</conf-loc>, pp. <fpage>13728</fpage>&#x2013;<lpage>13737</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR46437.2021.01352</pub-id>.</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Feng</surname> <given-names>A. J.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>J. F.</given-names>
</name>
<name>
<surname>Vories</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Sudduth</surname> <given-names>K. A.</given-names>
</name>
</person-group> (<year>2020</year>a). <article-title>Evaluation of cotton emergence using UAV-based imagery and deep learning</article-title>. <source>Comput. Electron. Agric.</source> <volume>177</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2020.105711</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Feng</surname> <given-names>A. J.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>J. F.</given-names>
</name>
<name>
<surname>Vories</surname> <given-names>E. D.</given-names>
</name>
<name>
<surname>Sudduth</surname> <given-names>K. A.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>M. N.</given-names>
</name>
</person-group> (<year>2020</year>b). <article-title>Yield estimation in cotton using UAV-based multi-sensor imagery</article-title>. <source>Biosyst. Eng.</source> <volume>193</volume>, <fpage>101</fpage>&#x2013;<lpage>114</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2020.02.014</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Flores</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Igathinathane</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Jithin</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Naik</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Stenger</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>Distinguishing seedling volunteer corn from soybean through greenhouse color, color-infrared, and fused images using machine and deep learning</article-title>. <source>Ind. Crops Prod.</source> <volume>161</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.indcrop.2020.113223</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gao</surname> <given-names>F. F.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>L. S.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Majeed</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Karkee</surname> <given-names>M.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>Multi-class fruit-on-plant detection for apple in SNAP system using Faster R-CNN</article-title>. <source>Comput. Electron. Agric.</source> <volume>176</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2020.105634</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ge</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>YOLOX: Exceeding YOLO series in 2021</article-title>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2107.08430</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2015</year>b). <article-title>Fast R-CNN</article-title>. <conf-name>2015 IEEE International Conference on Computer Vision (ICCV)</conf-name>, <conf-loc>Santiago, Chile</conf-loc>, pp. <fpage>1440</fpage>&#x2013;<lpage>1448</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/iccv.2015.169</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Donahue</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Darrell</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Malik</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>a). <article-title>Region-based convolutional networks for accurate object detection and segmentation</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>38</volume> (<issue>1</issue>), <fpage>142</fpage>&#x2013;<lpage>158</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/tpami.2015.2437384</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Fang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>R.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Fruit yield prediction and estimation in orchards: A state-of-the-art comprehensive review for both direct and indirect methods</article-title>. <source>Comput. Electron. Agric.</source> <volume>195</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2022.106812</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>Z. L.</given-names>
</name>
<name>
<surname>Xiong</surname> <given-names>J. T.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>S. M.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Z. X.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>S. F.</given-names>
</name>
<name>
<surname>Zhong</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>A method of green citrus detection based on a deep bounding box regression forest</article-title>. <source>Biosyst. Eng.</source> <volume>193</volume>, <fpage>206</fpage>&#x2013;<lpage>215</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2020.03.001</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>K. M.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X. Y.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>S. Q.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Spatial pyramid pooling in deep convolutional networks for visual recognition</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>37</volume> (<issue>9</issue>), <fpage>1904</fpage>&#x2013;<lpage>1916</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/tpami.2015.2389824</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jaisin</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Pathaveerat</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Terdwongworakul</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Determining the size and location of longans in bunches by image processing technique</article-title>. <source>Maejo Int. J. Sci. Technol.</source> <volume>7</volume> (<issue>3</issue>), <fpage>444</fpage>&#x2013;<lpage>455</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.14456/mijst.2013.37</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Jiang</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Cheng</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Target Recognition Based on CNN with LeakyReLU and PReLU Activation Functions</article-title>. <conf-name>2019 International Conference on Sensing, Diagnostics, Prognostics, and Control (SDPC)</conf-name>, <conf-loc>Beijing, China</conf-loc>, pp. <fpage>718</fpage>&#x2013;<lpage>722</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/SDPC.2019.00136</pub-id>.</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Koirala</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Walsh</surname> <given-names>K. B.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Z. L.</given-names>
</name>
<name>
<surname>McCarthy</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Deep learning - method overview and review of use for fruit detection and yield estimation</article-title>. <source>Comput. Electron. Agric.</source> <volume>162</volume>, <fpage>219</fpage>&#x2013;<lpage>234</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2019.04.017</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>D. H.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>X. X.</given-names>
</name>
<name>
<surname>Elkhouchlaa</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Jia</surname> <given-names>Y. H.</given-names>
</name>
<name>
<surname>Yao</surname> <given-names>Z. W.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>P. Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>Fast detection and location of longan fruits using UAV images</article-title>. <source>Comput. Electron. Agric.</source> <volume>190</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2021.106465</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>D. H.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>X. X.</given-names>
</name>
<name>
<surname>Lv</surname> <given-names>S. P.</given-names>
</name>
<name>
<surname>Elkhouchlaa</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Jia</surname> <given-names>Y. H.</given-names>
</name>
<name>
<surname>Yao</surname> <given-names>Z. W.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>A novel approach for 3D localization of branch picking points based on deep learning applied to fruit picking UAVs</article-title>. <source>Comput. Electron. Agric.</source> <volume>199</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2022.107191</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liang</surname> <given-names>C. X.</given-names>
</name>
<name>
<surname>Xiong</surname> <given-names>J. T.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>Z. H.</given-names>
</name>
<name>
<surname>Zhong</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Z. H.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>S. M.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>A visual detection method for nighttime litchi fruits and fruiting stems</article-title>. <source>Comput. Electron. Agric.</source> <volume>169</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2019.105192</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>T. Y.</given-names>
</name>
<name>
<surname>Dollar</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Hariharan</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Belongie</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Feature pyramid networks for object detection</article-title>. <conf-name>2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Honolulu, HI, USA</conf-loc>, pp. <fpage>936</fpage>&#x2013;<lpage>944</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2017.106</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Cheng</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Xiong</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Fruit detection in natural environment using partial shape matching and probabilistic Hough transform</article-title>. <source>Precis. Agric.</source> <volume>21</volume> (<issue>1</issue>), <fpage>160</fpage>&#x2013;<lpage>177</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11119-019-09662-w</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Qi</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Qin</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Shi</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Jia</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Path aggregation network for instance segmentation</article-title>. <conf-name>2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>, <conf-loc>Salt Lake City, UT, USA</conf-loc>, pp. <fpage>8759</fpage>&#x2013;<lpage>8768</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2018.00913</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X. L.</given-names>
</name>
<name>
<surname>Ye</surname> <given-names>Y. X.</given-names>
</name>
<name>
<surname>Yin</surname> <given-names>G. F.</given-names>
</name>
<name>
<surname>Johnson</surname> <given-names>B. A.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Deep learning in remote sensing applications: A meta-analysis and review</article-title>. <source>ISPRS J. Photogramm. Remote Sens.</source> <volume>152</volume>, <fpage>166</fpage>&#x2013;<lpage>177</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.isprsjprs.2019.04.015</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Marani</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Milella</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Petitti</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Reina</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Deep neural networks for grape bunch segmentation in natural images from a consumer-grade camera</article-title>. <source>Precis. Agric.</source> <volume>22</volume> (<issue>2</issue>), <fpage>387</fpage>&#x2013;<lpage>413</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11119-020-09736-0</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Norouzzadeh</surname> <given-names>M. S.</given-names>
</name>
<name>
<surname>Nguyen</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Kosmala</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Swanson</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Palmer</surname> <given-names>M. S.</given-names>
</name>
<name>
<surname>Packer</surname> <given-names>C.</given-names>
</name>
<etal/>
</person-group>. (<year>2018</year>). <article-title>Automatically identifying, counting, and describing wild animals in camera-trap images with deep learning</article-title>. <source>Proc. Natl. Acad. Sci. U.S.A.</source> <volume>115</volume> (<issue>25</issue>), <fpage>E5716</fpage>&#x2013;<lpage>E5725</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1073/pnas.1719367115</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Paoletti</surname> <given-names>M. E.</given-names>
</name>
<name>
<surname>Haut</surname> <given-names>J. M.</given-names>
</name>
<name>
<surname>Plaza</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Plaza</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Deep learning classifiers for hyperspectral imaging: A review</article-title>. <source>ISPRS J. Photogramm. Remote Sens.</source> <volume>158</volume>, <fpage>279</fpage>&#x2013;<lpage>317</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.isprsjprs.2019.09.006</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pham</surname> <given-names>V. T.</given-names>
</name>
<name>
<surname>Herrero</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Hormaza</surname> <given-names>J. I.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Phenological growth stages of longan (<italic>Dimocarpus longan</italic>) according to the BBCH scale</article-title>. <source>Sci. Hortic.</source> <volume>189</volume>, <fpage>201</fpage>&#x2013;<lpage>207</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.scienta.2015.03.036</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Divvala</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Farhadi</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>You only look once: Unified, real-time object detection</article-title>. <conf-name>2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Las Vegas, NV, USA</conf-loc>, pp. <fpage>779</fpage>&#x2013;<lpage>788</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2016.91</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Farhadi</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>YOLOv3: An incremental improvement</article-title>. <source>arXiv e-prints</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1804.02767</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ren</surname> <given-names>S. Q.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K. M.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Faster R-CNN: Towards real-time object detection with region proposal networks</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>39</volume> (<issue>6</issue>), <fpage>1137</fpage>&#x2013;<lpage>1149</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2016.2577031</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Singh</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Verma</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Alex</surname> <given-names>J. S. R.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Disease and pest infection detection in coconut tree through deep learning techniques</article-title>. <source>Comput. Electron. Agric.</source> <volume>182</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2021.105986</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sumesh</surname> <given-names>K. C.</given-names>
</name>
<name>
<surname>Ninsawat</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Som-ard</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Integration of RGB-based vegetation index, crop surface model and object-based image analysis approach for sugarcane yield estimation using unmanned aerial vehicle</article-title>. <source>Comput. Electron. Agric.</source> <volume>180</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2020.105903</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname> <given-names>Y. C.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>H. J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y. Q.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Fruit detection and positioning technology for a <italic>Camellia oleifera</italic> C. Abel orchard based on improved YOLOv4-tiny model and binocular stereo vision</article-title>. <source>Expert Syst. Appl.</source> <volume>211</volume>, <elocation-id>118573</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.eswa.2022.118573</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tetila</surname> <given-names>E. C.</given-names>
</name>
<name>
<surname>Machado</surname> <given-names>B. B.</given-names>
</name>
<name>
<surname>Menezes</surname> <given-names>G. K.</given-names>
</name>
<name>
<surname>Oliveira</surname> <given-names>A. D.</given-names>
</name>
<name>
<surname>Alvarez</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Amorim</surname> <given-names>W. P.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>Automatic recognition of soybean leaf diseases using UAV images and deep convolutional neural networks</article-title>. <source>IEEE Geosci. Remote Sens. Lett.</source> <volume>17</volume> (<issue>5</issue>), <fpage>903</fpage>&#x2013;<lpage>907</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/LGRS.2019.2932385</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vanegas</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Bratanov</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Powell</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Weiss</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Gonzalez</surname> <given-names>F.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>A novel methodology for improving plant pest surveillance in vineyards and crops using UAV-based hyperspectral and spatial data</article-title>. <source>Sensors</source> <volume>18</volume> (<issue>1</issue>). doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s18010260</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C. Y.</given-names>
</name>
<name>
<surname>Bochkovskiy</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Scaled-YOLOv4: Scaling cross stage partial network</article-title>,&#x201d; in <conf-name>2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>.</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C. Y.</given-names>
</name>
<name>
<surname>Bochkovskiy</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors</article-title>. <source>arXiv e-prints</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2207.02696</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C. L.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>T. H.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>L. J.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>Y. C.</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>X. J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Window zooming&#x2013;based localization algorithm of fruit and vegetable for harvesting robot</article-title>. <source>IEEE Access</source> <volume>7</volume>, <fpage>103639</fpage>&#x2013;<lpage>103649</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/access.2019.2925812</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wojke</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Bewley</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Paulus</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Simple online and realtime tracking with a deep association metric</article-title>. <conf-name>2017 IEEE International Conference on Image Processing (ICIP)</conf-name>, <conf-loc>Beijing, China</conf-loc>, pp. <fpage>3645</fpage>&#x2013;<lpage>3649</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICIP.2017.8296962</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>F. Y.</given-names>
</name>
<name>
<surname>Duan</surname> <given-names>J. L.</given-names>
</name>
<name>
<surname>Ai</surname> <given-names>P. Y.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Z. Y.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>X. J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Rachis detection and three-dimensional localization of cut off point for vision-based banana robot</article-title>. <source>Comput. Electron. Agric.</source> <volume>198</volume>, <elocation-id>107079</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2022.107079</pub-id>
</citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiong</surname> <given-names>J. T.</given-names>
</name>
<name>
<surname>He</surname> <given-names>Z. L.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Bu</surname> <given-names>R. B.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>Z. G.</given-names>
</name>
<etal/>
</person-group>. (<year>2018</year>). <article-title>Visual positioning technology of picking robots for dynamic litchi clusters with disturbance</article-title>. <source>Comput. Electron. Agric.</source> <volume>151</volume>, <fpage>226</fpage>&#x2013;<lpage>237</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2018.06.007</pub-id>
</citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiong</surname> <given-names>J. T.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>S. M.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>B. L.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>Z. H.</given-names>
</name>
<name>
<surname>Zhong</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>Visual detection of green mangoes by an unmanned aerial vehicle in orchards based on a deep learning method</article-title>. <source>Biosyst. Eng.</source> <volume>194</volume>, <fpage>261</fpage>&#x2013;<lpage>272</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2020.04.006</pub-id>
</citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhong</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Xiong</surname> <given-names>J. T.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>Z. H.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>B. L.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>S. S.</given-names>
</name>
<name>
<surname>Huo</surname> <given-names>Z. W.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>A method for litchi picking points calculation in natural environment based on main fruit bearing branch detection</article-title>. <source>Comput. Electron. Agric.</source> <volume>189</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2021.106398</pub-id>
</citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>Y. H.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>Y. C.</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>X. J.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>M. L.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Meng</surname> <given-names>F.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Adaptive active positioning of <italic>Camellia oleifera</italic> fruit picking points: Classical image processing and YOLOv7 fusion algorithm</article-title>. <source>Appl. Sci.-Basel</source> <volume>12</volume> (<issue>24</issue>), <elocation-id>12959</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/app122412959</pub-id>
</citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>J. F.</given-names>
</name>
<name>
<surname>Ye</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Ali</surname> <given-names>M. L.</given-names>
</name>
<name>
<surname>Nguyen</surname> <given-names>H. T.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>P. Y.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Classification of soybean leaf wilting due to drought stress using UAV-based imagery</article-title>. <source>Comput. Electron. Agric.</source> <volume>175</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2020.105576</pub-id>
</citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhuang</surname> <given-names>J. J.</given-names>
</name>
<name>
<surname>Hou</surname> <given-names>C. J.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>He</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>Q. W.</given-names>
</name>
<name>
<surname>Zhong</surname> <given-names>Z. Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>Computer vision-based localisation of picking points for automatic litchi harvesting applications towards natural scenarios</article-title>. <source>Biosyst. Eng.</source> <volume>187</volume>, <fpage>1</fpage>&#x2013;<lpage>20</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2019.08.016</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>