<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="review-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2025.1646871</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Review</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>A review of visual perception technology for intelligent fruit harvesting robots</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Huang</surname>
<given-names>Yikun</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1941088/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Xu</surname>
<given-names>Shuyan</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Chen</surname>
<given-names>Hao</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Gang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Dong</surname>
<given-names>Heng</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2924957/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yu</surname>
<given-names>Jie</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Xi</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Chen</surname>
<given-names>Riqing</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/966323/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>School of Future Technology, Fujian Agriculture and Forestry University</institution>, <addr-line>Fuzhou</addr-line>,&#xa0;<country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Concord University College, Fujian Normal University</institution>, <addr-line>Fuzhou</addr-line>,&#xa0;<country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Minnan University of Science and Technology</institution>, <addr-line>Quanzhou</addr-line>,&#xa0;<country>China</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Fujian Key Lab of Agricultural Internet of Things Applications, Sanming University</institution>, <addr-line>Sanming</addr-line>,&#xa0;<country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Aichen Wang, Jiangsu University, China</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Caner Beldek, University of Wollongong, Australia</p>
<p>Mingyou Chen, Foshan University, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Shuyan Xu, <email xlink:href="mailto:xsy@mnust.edu.cn">xsy@mnust.edu.cn</email>; Riqing Chen, <email xlink:href="mailto:riqing.chen@fafu.edu.cn">riqing.chen@fafu.edu.cn</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>19</day>
<month>08</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>16</volume>
<elocation-id>1646871</elocation-id>
<history>
<date date-type="received">
<day>19</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>17</day>
<month>07</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Huang, Xu, Chen, Li, Dong, Yu, Zhang and Chen.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Huang, Xu, Chen, Li, Dong, Yu, Zhang and Chen</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>With the development of smart agriculture, fruit picking robots have attracted widespread attention as one of the key technologies to improve agricultural productivity. Visual perception technology plays a crucial role in fruit picking robots, involving precise fruit identification, localization, and grasping operations. This paper reviews the research progress in the visual perception technology for fruit picking robots, focusing on key technologies such as camera types used in picking robots, object detection techniques, picking point recognition and localization, active vision, and visual servoing. First, the paper introduces the application characteristics and selection criteria of different camera types in the fruit picking process. Then, it analyzes how object detection techniques help robots accurately recognize fruits and achieve efficient fruit classification. Next, it discusses the picking point recognition and localization technologies, including vision-based 3D reconstruction and depth sensing methods. Subsequently, it elaborates on the adaptability of active vision technology in dynamic environments and how visual servoing technology achieves precise localization. Additionally, the review explores robot mobility perception technologies, focusing on V-SLAM, mobile path planning, and task scheduling. These technologies enhance harvesting efficiency across the entire orchard and facilitate better collaboration among multiple robots. Finally, the paper summarizes the challenges in current research and the future development trends, aiming to provide references for the optimization and promotion of fruit picking robot technology.</p>
</abstract>
<kwd-group>
<kwd>intelligent fruit harvesting robots</kwd>
<kwd>agricultural robotics</kwd>
<kwd>visual perception</kwd>
<kwd>object detection</kwd>
<kwd>visual servoing</kwd>
<kwd>V-SLAM</kwd>
</kwd-group>
<counts>
<fig-count count="8"/>
<table-count count="3"/>
<equation-count count="0"/>
<ref-count count="126"/>
<page-count count="18"/>
<word-count count="8699"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Sustainable and Intelligent Phytoprotection</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>With the continuous growth of the global population, agricultural production is facing increasingly severe challenges. Rising labor costs, increased labor intensity for farmers, and low agricultural productivity have become key factors limiting the development of modern agriculture (<xref ref-type="bibr" rid="B115">Zhang et&#xa0;al., 2024</xref>). To address these issues, agricultural automation technologies have emerged, with intelligent harvesting robots receiving widespread attention and research as an efficient and automated solution (<xref ref-type="bibr" rid="B17">Chunjiang et&#xa0;al., 2023</xref>). With the rapid advancements in artificial intelligence, robotics, and computer vision technologies, fruit harvesting robots have gradually become a focal point of research.</p>
<p>In order to provide a comprehensive understanding of the research trends in this field, we conducted a statistical analysis of related research articles from 2005 to 2024 based on the Web of Science database, as shown in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>. The results show a significant increase in the number of publications in the field of &#x201c;Fruit Harvesting,&#x201d; rising from 732 articles in 2005 to 2130 in 2024. This indicates that, with the rapid development of smart agriculture technologies, the research interest in this field has continued to grow, with visual perception and robotics technologies gradually becoming the core focus of research.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Statistics of article counts by keywords in Web of Science.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1646871-g001.tif">
<alt-text content-type="machine-generated">Line graph showing the number of articles related to various keywords in the Web of Science from 2005 to 2024. The keywords are &#x201c;Fruit Harvesting&#x201d; (blue), &#x201c;Fruit Picking&#x201d; (yellow), &#x201c;Picking Robot&#x201d; (green), &#x201c;Harvesting Robot&#x201d; (red), &#x201c;Intelligent Harvesting&#x201d; (purple), &#x201c;Intelligent Picking&#x201d; (brown), and &#x201c;Picking Point&#x201d; (pink). &#x201c;Fruit Harvesting&#x201d; has the highest counts, peaking in 2021 with 2130. Other keywords show lower counts and gradually rising trends.</alt-text>
</graphic>
</fig>
<sec id="s1_1">
<label>1.1</label>
<title>Development status of intelligent fruit harvesting robots</title>
<p>In 1968, the United States pioneered the study of fruit harvesting using mechanical or pneumatic vibration methods. Although these methods could perform basic harvesting tasks, vibration and pneumatics often caused significant damage to the fruit (<xref ref-type="bibr" rid="B76">Schertz and Brown, 1968</xref>). With the development of computer and control technologies, agricultural robots began to be applied in tasks such as harvesting, spraying, and weeding from the 1990s onward, assisted by computer vision. In particular, some robotic arms were able to simulate manual harvesting actions. However, due to the limitations of robot and sensor technologies at the time, automated harvesting robots still faced challenges such as high costs, low precision, and limited application scenarios. With the rapid development of Industry 4.0, advancements in artificial intelligence, the Internet of Things, and big data analysis have greatly propelled the progress of agricultural harvesting robots, especially in the precision of perception, autonomous decision-making, control, and execution (<xref ref-type="bibr" rid="B63">Oliveira et&#xa0;al., 2021</xref>). In particular, the continuous innovation of visual perception systems has provided harvesting robots with more powerful sensing capabilities. Modern intelligent fruit harvesting robots are now able to obtain real-time environmental information through devices such as cameras, LiDAR, and depth sensors, and identify the type, location, and status of fruits using image processing and pattern recognition technologies.</p>
<p>Harvesting robots can be divided into bulk harvesting robots and selective harvesting robots (<xref ref-type="bibr" rid="B123">Zhou et&#xa0;al., 2022</xref>). As shown in <xref ref-type="fig" rid="f2">
<bold>Figures 2a-c</bold>
</xref>, bulk harvesting robots are typically large and perform one-time harvesting by applying vibration or forced separation to the fruit trees. Examples include apple harvesting by vibrating branches (<xref ref-type="bibr" rid="B18">De Kleine and Karkee, 2015</xref>), cherry harvesting by vibration (<xref ref-type="bibr" rid="B122">Zhou et&#xa0;al., 2016</xref>), and bulk grape harvesting for industrial use (<xref ref-type="bibr" rid="B106">Yan et&#xa0;al., 2023</xref>). Although bulk harvesting methods are efficient, they cause significant damage to the fruits and cannot select fruits according to ripeness, making them suitable only for fruit destined for industrial processing, not for fruit intended for market sale.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Various types of harvesting robots. <bold>(a)</bold> Olive Shaking Bulk Harvesting Equipment (<xref ref-type="bibr" rid="B77">Sola-Guirado et&#xa0;al., 2023</xref>), <bold>(b)</bold> Apple Vibration Harvesting Robot (<xref ref-type="bibr" rid="B18">De Kleine and Karkee, 2015</xref>), <bold>(c)</bold> Apple Selective Harvesting Large-Scale Equipment (<xref ref-type="bibr" rid="B32">Jia et&#xa0;al., 2020</xref>), <bold>(d)</bold> Sweet Pepper Harvesting Robot (<xref ref-type="bibr" rid="B1">Arad et&#xa0;al., 2020</xref>), <bold>(e)</bold> Tomato Harvesting Robot (<xref ref-type="bibr" rid="B69">Rapado-Rinc&#xf3;n et&#xa0;al., 2023</xref>), <bold>(f)</bold> Cucumber harvesting robot (<xref ref-type="bibr" rid="B64">Park et&#xa0;al., 2023</xref>).</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1646871-g002.tif">
<alt-text content-type="machine-generated">(a) A large agricultural machine harvesting trees in an orchard.(b) A yellow mechanical arm in an apple orchard, with the human operator blurred for anonymity.(c) A tracked machine with a retractable arm harvesting apples in an open field.(d) An automatic pepper-picking machine with mechanical components labeled CE, PW, CB, PC, and FC, operating inside a greenhouse.(e) A robotic arm moving between two rows of tomato plants in a greenhouse.(f) A cucumber-harvesting robot system with labeled components including an end-effector, manipulator, camera, and a collection box.</alt-text>
</graphic>
</fig>
<p>Selective harvesting robots typically install the end effector on a robotic arm and use computer vision to identify ripe fruits, guiding the robotic arm and end effector to perform the harvesting task, as shown in <xref ref-type="fig" rid="f2">
<bold>Figures 2d-f</bold>
</xref>. These devices are usually smaller in size and can move freely in agricultural environments. Since their harvesting method is the closest to human picking, they have already been applied in harvesting fruits such as apples (<xref ref-type="bibr" rid="B32">Jia et&#xa0;al., 2020</xref>), peppers (<xref ref-type="bibr" rid="B1">Arad et&#xa0;al., 2020</xref>), tomatoes (<xref ref-type="bibr" rid="B69">Rapado-Rinc&#xf3;n et&#xa0;al., 2023</xref>), yellow peaches (<xref ref-type="bibr" rid="B97">Wang et&#xa0;al., 2023</xref>), and strawberries (<xref ref-type="bibr" rid="B80">Tafuro et&#xa0;al., 2022</xref>). Although selective harvesting robots have lower work efficiency, they support batch harvesting and effectively reduce fruit damage, thus preserving the commercial value of the fruits. This method is considered the most likely to fully replace human harvesters, which has led to widespread attention to selective harvesting robots in both academia and industry (<xref ref-type="bibr" rid="B74">Sanders, 2005</xref>).</p>
</sec>
<sec id="s1_2">
<label>1.2</label>
<title>The importance of visual perception technology in fruit harvesting</title>
<p>Visual perception technology plays a pivotal role in intelligent fruit harvesting robots, serving as one of the core technologies enabling automated picking. It facilitates the accurate identification and localization of target fruits through image processing and object detection, ensuring the efficient and precise execution of harvesting tasks. The visual system must adapt to varying lighting conditions, diverse fruit types, and complex background environments. The application of deep learning, 3D reconstruction, and image segmentation techniques enhances its robustness and accuracy. Furthermore, visual perception supports dynamic decision-making for the robot, allowing real-time adjustments to harvesting strategies in response to fruit displacement or occlusion, thereby ensuring operational continuity and stability. With technological advancements, the introduction of visual servo systems and closed-loop control has further improved manipulation precision and minimized fruit damage.</p>
<p>Scholars have developed models for detecting picking points using image analysis and deep learning techniques to guide robotic manipulators in the intelligent harvesting of fruits such as pepper (<xref ref-type="bibr" rid="B1">Arad et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B5">Babellahi et&#xa0;al., 2020</xref>), tomato (<xref ref-type="bibr" rid="B33">Jun et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B102">Wu et&#xa0;al., 2021</xref>), apple (<xref ref-type="bibr" rid="B32">Jia et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B42">Li C. et&#xa0;al., 2023</xref>), and grape (<xref ref-type="bibr" rid="B106">Yan et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B91">Wang J. et&#xa0;al., 2024</xref>). In intricate field settings, factors such as fluctuating illumination, fruit overlap, variations in fruit maturity, accurate peduncle/stem recognition, and precise localization of the picking point significantly impact the operational efficiency and harvesting accuracy of robots. Concurrently, when fruits are occluded, determining the optimal viewing angle for observation and planning effective manipulator trajectories become critical challenges for enhancing harvesting performance. Therefore, a thorough examination of the latest advancements, existing challenges, and future trends in visual perception technology for fruit harvesting robots holds substantial academic significance and practical value for advancing the field.</p>
</sec>
</sec>
<sec id="s2">
<label>2</label>
<title>Common camera types for harvesting robots</title>
<p>Efficient visual perception systems are fundamental to intelligent fruit harvesting robots, with cameras serving as core components whose performance is determined by sensor type and design. Driven by advancements in computer vision, deep learning, and sensor technology, traditional RGB cameras are increasingly being supplemented or replaced by various advanced sensors. Combining different sensors proves particularly effective in complex agricultural environments, significantly enhancing system robustness and accuracy. Common vision sensors include monocular cameras, binocular (stereo) cameras, RGB-D cameras and event cameras, each possessing distinct advantages and suitable application scenarios. <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref> presents a performance comparison of different types of cameras. The following will provide a detailed analysis of these camera types and explore their specific applications in fruit harvesting tasks.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Comparison of different depth sensing technologies.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Technology</th>
<th valign="top" align="center">Monocular (<xref ref-type="bibr" rid="B6">Baeten et&#xa0;al., 2008</xref>)</th>
<th valign="top" align="center">Binocular stereo (<xref ref-type="bibr" rid="B102">Wu et&#xa0;al., 2021</xref>)</th>
<th valign="top" align="center">Structured light camera (<xref ref-type="bibr" rid="B91">Wang J. et&#xa0;al., 2024</xref>)</th>
<th valign="top" align="center">Time of flight camera (<xref ref-type="bibr" rid="B46">Li Z. et&#xa0;al., 2022</xref>)</th>
<th valign="top" align="center">Event camera (<xref ref-type="bibr" rid="B70">Rebecq et&#xa0;al., 2018</xref>)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">Technology Principle</td>
<td valign="top" align="center">Captures 2D images using a single camera</td>
<td valign="top" align="center">Calculates depth<break/>information using the<break/>principle of disparity</td>
<td valign="top" align="center">Projects a light pattern<break/>and analyzes<break/>its<break/>deformation<break/>to acquire depth</td>
<td valign="top" align="center">Measures depth by the time<break/>difference of<break/>infrared light reflection</td>
<td valign="top" align="center">Pixel-level asynchronous<break/>brightness change detection</td>
</tr>
<tr>
<td valign="top" align="center">Depth Range</td>
<td valign="top" align="center">Estimated via algorithm</td>
<td valign="top" align="center">0.5&#x2013;10 meters</td>
<td valign="top" align="center">0.2&#x2013;5 meters</td>
<td valign="top" align="center">0.2&#x2013;10 meters</td>
<td valign="top" align="center">Wide-range</td>
</tr>
<tr>
<td valign="top" align="center">Accuracy</td>
<td valign="top" align="center">Dependent on algorithm, low accuracy</td>
<td valign="top" align="center">Moderate</td>
<td valign="top" align="center">High</td>
<td valign="top" align="center">Moderate</td>
<td valign="top" align="center">High accuracy</td>
</tr>
<tr>
<td valign="top" align="center">Dynamic Scene Performance</td>
<td valign="top" align="center">Dependent on algorithm,<break/>performance is poor</td>
<td valign="top" align="center">Moderate</td>
<td valign="top" align="center">Good for<break/>static scenes,<break/>moderate for<break/>dynamic scenes</td>
<td valign="top" align="center">Excellent, suitable for<break/>dynamic scenes</td>
<td valign="top" align="center">Excellent, suitable for<break/>dynamic scenes</td>
</tr>
<tr>
<td valign="top" align="center">Advantages</td>
<td valign="top" align="center">Lowest cost, highest resolution</td>
<td valign="top" align="center">Provides direct depth<break/>information, moderate cost</td>
<td valign="top" align="center">High<break/>precision depth<break/>perception,<break/>suitable for<break/>near-field object<break/>recognition, good light adaptability</td>
<td valign="top" align="center">High<break/>precision,<break/>suitable for<break/>long-range,<break/>relatively stable</td>
<td valign="top" align="center">Ultra-low latency,<break/>ultra-high<break/>dynamic<break/>range and low power consumption</td>
</tr>
<tr>
<td valign="top" align="center">Disadvantages</td>
<td valign="top" align="center">Difficult to obtain depth information, highly<break/>affected by external light</td>
<td valign="top" align="center">Requires good scene texture,<break/>limited in<break/>poor lighting conditions</td>
<td valign="top" align="center">Sensitive to ambient light, higher cost</td>
<td valign="top" align="center">Affected by strong<break/>ambient light, accuracy<break/>decreases at longer distances</td>
<td valign="top" align="center">There is no texture<break/>information,<break/>so a dedicated algorithm is needed.</td>
</tr>
<tr>
<td valign="top" align="center">Providers</td>
<td valign="top" align="center">MOKOSE, HIKRobot, etc.</td>
<td valign="top" align="center">ZED, Intel<break/>RealSense<break/>D400 Series, etc.</td>
<td valign="top" align="center">Microsoft<break/>Kinect 1, Intel<break/>RealSense<break/>LR200,<break/>Orbbec Astra, etc.</td>
<td valign="top" align="center">Microsoft<break/>Kinect 2, Intel<break/>RealSense<break/>L515,<break/>SEERsense, etc.</td>
<td valign="top" align="center">iniVation, etc.</td>
</tr>
</tbody>
</table>
</table-wrap>
<sec id="s2_1">
<label>2.1</label>
<title>Monocular camera</title>
<p>Monocular cameras, capturing color images through a single lens, are widely utilized for image acquisition in deep learning applications due to their simple structure and low cost, as shown in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3a</bold>
</xref>. However, they are incapable of directly capturing depth information, providing only two-dimensional scene data, and are primarily used for tasks like object detection and yield estimation. To address this limitation, researchers employ deep learning and other methods to process monocular images and estimate fruit positions (<xref ref-type="bibr" rid="B35">Khan et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B16">Cheng et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B109">Yin et&#xa0;al., 2023</xref>). For instance, Yang et&#xa0;al. proposed a self-supervised monocular depth estimation network (<xref ref-type="bibr" rid="B107">Yang et&#xa0;al., 2020</xref>), while Ban et&#xa0;al. tackled depth estimation in defocused images using Markov random fields and geometric constraints (<xref ref-type="bibr" rid="B7">Ban et&#xa0;al., 2022</xref>). Despite these efforts, the lack of inherent depth data means monocular depth estimation still relies on computationally intensive methods and achieves limited accuracy. This challenge is particularly pronounced in unstructured agricultural scenes, where environmental complexity and indistinct object features further complicate depth estimation.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Representatives of cameras from different technology types. <bold>(a)</bold> MOKOSE monocular camera, <bold>(b)</bold> ZED stereo camera, <bold>(c)</bold> Intel T265 stereo camera, <bold>(d)</bold> ORBBEC structured light camera, <bold>(e)</bold> SEERSENSE ToF (Time of Flight) camera, <bold>(f)</bold> iniVation event camera.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1646871-g003.tif">
<alt-text content-type="machine-generated">(a) MOKOSE monocular camera, (b) ZED stereo camera, (c) Intel T265 stereo camera, (d) ORBBEC structured light camera, (e) SEERSENSE ToF (Time of Flight) camera, (f) iniVation event camera.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Stereo camera</title>
<p>Binocular cameras, as shown in <xref ref-type="fig" rid="f3">
<bold>Figures&#xa0;3b, c</bold>
</xref>, also referred to as stereo cameras, capture images of a scene using two lenses from different viewpoints. They compute object depth by leveraging the principle of parallax (<xref ref-type="bibr" rid="B11">Chao et&#xa0;al., 2023</xref>). By mimicking the human binocular vision system to acquire three-dimensional (3D) information, binocular cameras provide depth data more directly compared to monocular cameras. Consequently, they are widely adopted in agricultural robotics and automated harvesting scenarios due to their ability to deliver more accurate spatial localization in complex environments (<xref ref-type="bibr" rid="B49">Ling et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B102">Wu et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B101">Wen et&#xa0;al., 2022</xref>). However, binocular cameras also exhibit certain limitations. For instance, they exhibit a high dependency on texture features within the scene; depth estimation performance may degrade in texture-poor regions or under suboptimal lighting conditions. Furthermore, the hardware configuration of binocular cameras is inherently more complex than that of monocular cameras, demanding precise calibration and stringent synchronization between the two lenses.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>RGB-D camera</title>
<p>To overcome the limitations of monocular and binocular cameras, RGB-D cameras have emerged as a solution. RGB-D cameras integrate an RGB color camera with a depth sensor, enabling simultaneous capture of color information and depth data from the scene, making them an increasingly popular choice for diverse applications. Beyond stereo vision, common methods for acquiring depth information with RGB-D cameras include structured light technology and Time of Flight (ToF) (<xref ref-type="bibr" rid="B121">Zhou et&#xa0;al., 2021</xref>) (as shown in <xref ref-type="fig" rid="f3">
<bold>Figures&#xa0;3d, e</bold>
</xref>). Structured light technology typically projects a known light pattern (e.g., stripes, dot arrays) onto object surfaces and captures the resulting deformation of this pattern using a camera to infer depth. Cameras employing this method offer high accuracy at close range and rapid depth acquisition, but depth measurement accuracy may decrease for objects with low reflectivity or lacking texture. Common structured light cameras include the Intel RealSense series, and the Intel RealSense D435 camera, valued for its compact size and high precision, is widely utilized in fruit harvesting tasks (<xref ref-type="bibr" rid="B51">Liu et&#xa0;al., 2024</xref>).</p>
<p>ToF calculates distance by emitting light pulses and measuring the time difference for the light to travel from the camera to the object and back. ToF can operate effectively under low-light conditions or significant illumination variations and provides rapid depth acquisition. However, its resolution is generally lower than that of structured light cameras, making it difficult to capture sufficiently detailed depth information in complex, close-range environments.</p>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Event camera</title>
<p>In addition to conventional frame-based cameras, emerging vision sensors&#x2014;such as event cameras&#x2014;have demonstrated significant potential in agricultural applications, particularly in complex environments with high dynamic lighting conditions, as shown in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3f</bold>
</xref>. Unlike traditional cameras that capture entire images at fixed frame rates, event cameras operate using an asynchronous imaging mechanism that records data only when changes in pixel brightness occur (<xref ref-type="bibr" rid="B19">Gallego et&#xa0;al., 2021</xref>). Each event contains a timestamp, pixel location, and the polarity of brightness change, enabling ultra-high temporal resolution at the microsecond level, extremely low latency, and substantially reduced data redundancy. One of the most prominent advantages of event cameras is their exceptionally high dynamic range, often exceeding 120 dB, making them particularly effective in agricultural scenarios (<xref ref-type="bibr" rid="B86">Wan et&#xa0;al., 2024</xref>). For instance, event cameras can produce stable outputs under highly variable lighting conditions, such as when sunlight filters through foliage or when transitions occur rapidly between shaded and sunlit areas. Furthermore, their low power consumption and compact size make event cameras well-suited for integration into embedded systems and various field-deployed agricultural automation platforms.</p>
<p>In the context of precision agriculture, event cameras offer potential for a variety of tasks, including crop monitoring, where subtle structural changes in plants can be more effectively detected; real-time navigation of agricultural robots and UAVs in dynamically lit environments; and high-speed target detection (<xref ref-type="bibr" rid="B20">Gehrig and Scaramuzza, 2023</xref>), such as rapid identification of field animals, tracking of pest movements (<xref ref-type="bibr" rid="B66">Pohle-Fr&#xf6;hlich et&#xa0;al., 2024</xref>), or detection of fruit maturity status.</p>
<p>Compared to the aforementioned camera types, RGB-D cameras offer more stable depth perception in complex environments and exhibit reduced dependency on scene texture. They demonstrate superior performance in localization accuracy and computational efficiency (<xref ref-type="bibr" rid="B123">Zhou et&#xa0;al., 2022</xref>), making them well-suited for scenarios demanding high-precision depth information, such as agricultural robotics and autonomous driving. Given these advantages, RGB-D cameras have been successfully applied to the harvesting of various fruits (<xref ref-type="bibr" rid="B111">Yoshida et&#xa0;al., 2022</xref>).</p>
</sec>
<sec id="s2_5">
<label>2.5</label>
<title>Camera installation position</title>
<p>The installation position of the camera directly determines the perception ability of the picking robot toward the fruits. A reasonable installation position can maximize the coverage of the visual perception system, enhance the ability to capture image details, and reduce the interference of external factors on recognition accuracy. Generally, the camera installation positions on a picking robot can be divided into Eye-To-Hand and Eye-In-Hand. Eye-To-Hand means the camera is installed at a fixed position separate from the robotic arm, typically on the robot&#x2019;s base, workbench, or another location that does not change with the movement of the robotic arm. For example, <xref ref-type="bibr" rid="B9">Birrell et&#xa0;al. (2020)</xref> fixed the camera on a bracket in their lettuce harvesting system, as shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>. This method provides stable visual information, but the fixed camera may fail to detect all the fruits due to occlusion. Eye-In-Hand refers to the camera being directly installed at the end of the robotic arm, where each movement of the arm directly affects the camera&#x2019;s view. For example, <xref ref-type="bibr" rid="B34">Junge et&#xa0;al. (2023)</xref> installed an RGB-D camera at the end of the robotic arm in their strawberry picking robot design, with the camera moving along with the arm, as shown in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>. This method is better at handling target localization and manipulation tasks in complex or confined spaces. However, its drawbacks include a larger computational load and a higher risk of the camera being damaged due to accidental collisions.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Eye-To-Hand robot.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1646871-g004.tif">
<alt-text content-type="machine-generated">A robot using the Eye-to-Hand configuration, with a camera mounted at a fixed position above the robotic arm to guide its operation.</alt-text>
</graphic>
</fig>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Eye-In-Hand robot.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1646871-g005.tif">
<alt-text content-type="machine-generated">An Eye-in-Hand configuration consisting of an Oak-D camera and silicone fingers mounted at the end-effector of a robotic arm.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Object detection technology in fruit picking</title>
<p>Object detection technology is the core technology in intelligent fruit harvesting (<xref ref-type="bibr" rid="B104">Xiao et&#xa0;al., 2024</xref>). Due to the vast variety of fruits, which exhibit significant variations in morphology, size, and color, object detection enables the training and optimization of recognition capabilities for different fruit types. Within harvesting tasks, object detection must first precisely locate fruit positions, assess maturity levels, evaluate occlusion conditions, and identify pickable points. Furthermore, it determines the picking sequence by analyzing fruit clustering before robotic arm execution, thereby enhancing harvesting efficiency and accuracy. Object detection techniques are typically categorized into traditional feature-based machine learning methods and deep learning-based approaches.</p>
<sec id="s3_1">
<label>3.1</label>
<title>Traditional object detection technology</title>
<p>Traditional object detection methods primarily rely on the sliding window strategy and manual feature extraction. These include color features (such as threshold segmentation in HSV, Lab, and other color spaces), texture features (e.g., Gray-Level Co-occurrence Matrix [GLCM] and Local Binary Patterns [LBP]), and shape features (e.g., edge detection and Hough transform). Due to their distinctiveness and stability, color features are widely employed in fruit recognition, particularly in scenarios with simple backgrounds and high contrast between the fruit and its surroundings. For instance, Arefi et&#xa0;al. achieved an accuracy of 96.36% by combining features extracted from the RGB, HSI, and YIQ color spaces for tomato recognition (<xref ref-type="bibr" rid="B2">Arefi et&#xa0;al., 2011</xref>). Tian et&#xa0;al. utilized components of the HSI and LAB color spaces for tomato leaf segmentation (<xref ref-type="bibr" rid="B82">Tian et&#xa0;al., 2019</xref>), while Yamamoto et&#xa0;al. implemented target identification for strawberry harvesting through color threshold analysis, achieving a harvest rate of 67% (<xref ref-type="bibr" rid="B105">Yamamoto et&#xa0;al., 2014</xref>). In complex agricultural environments, OTSU adaptive thresholding is extensively applied to extract target fruit locations based on color differences (<xref ref-type="bibr" rid="B100">Wei et&#xa0;al., 2014</xref>; <xref ref-type="bibr" rid="B56">Lv et&#xa0;al., 2016</xref>). While color models prove effective in distinguishing fruits from backgrounds, their performance deteriorates significantly in complex backgrounds or when encountering objects with similar colors.</p>
<p>Morphological characteristics also hold significant importance in traditional methods. Features such as shape can be extracted through edge detection (e.g., the Canny operator) and contour detection (e.g., Hough transform), proving particularly effective for regularly shaped fruits. For instance, Lv et&#xa0;al. achieved fruit recognition by combining RGB color features with the Canny operator and Hough transform (<xref ref-type="bibr" rid="B57">Lv et&#xa0;al., 2015</xref>), while Tan et&#xa0;al. utilized Canny edge detection to extract edge features from apples, lemons, and mangoes for subsequent classification using machine learning (<xref ref-type="bibr" rid="B81">Tan et&#xa0;al., 2021</xref>). However, the robustness of these traditional methods is often limited in complex scenarios or when detecting occluded fruits. To enhance accuracy, Rabby et&#xa0;al. successfully implemented fruit recognition and classification in controlled background settings by integrating color and morphological features (<xref ref-type="bibr" rid="B67">Rabby et&#xa0;al., 2018</xref>). Furthermore, texture features, including but not limited to those derived from the Gray-Level Co-occurrence Matrix (GLCM) and Local Binary Patterns (LBP), play a crucial role in fruit object detection (<xref ref-type="bibr" rid="B4">Ayg&#xfc;n and G&#xfc;ne&#x15f;, 2017</xref>; <xref ref-type="bibr" rid="B25">Gurubelli et&#xa0;al., 2020</xref>).</p>
<p>Furthermore, Haar-like features (<xref ref-type="bibr" rid="B8">Besnassi et&#xa0;al., 2020</xref>) and Histogram of Oriented Gradients (HOG) features (<xref ref-type="bibr" rid="B124">Zhou and Yu, 2021</xref>) are also widely employed for image description and fruit recognition. Haar-like features extract discriminative information by computing differences in pixel intensities within rectangular regions. While achieving notable success in facial recognition, this approach has also been effectively applied to fruit detection within the agricultural domain. Conversely, HOG features facilitate classifier recognition of fruits by quantifying the distribution of gradient orientations within localized image regions.</p>
<p>With the advancement of machine learning technologies, traditional methods have progressively been integrated with machine learning classifiers, forming feature-based + classifier frameworks for object detection. These classifiers encompass Support Vector Machines (SVM), Random Forests (RF), K-Nearest Neighbors (KNN), and Na&#xef;ve Bayes, among others. For instance, <xref ref-type="bibr" rid="B117">Zhang and Wu (2012)</xref> classified multiple fruit types using an SVM, achieving an accuracy of 88.2%, while Lin et&#xa0;al. successfully identified six fruit types employing the Hough transform combined with an SVM (<xref ref-type="bibr" rid="B48">Lin et&#xa0;al., 2020</xref>). RF enhances classification stability by aggregating predictions from multiple decision trees (<xref ref-type="bibr" rid="B68">Ramisetty et&#xa0;al., 2022</xref>), whereas KNN classifies fruits such as apples and dragon fruit based on sample similarity (<xref ref-type="bibr" rid="B3">Aulia et&#xa0;al., 2023</xref>). Na&#xef;ve Bayes performs well in relatively straightforward classification scenarios, demonstrating effectiveness in non-destructive testing applications for apples (<xref ref-type="bibr" rid="B61">Miriti, 2016</xref>; <xref ref-type="bibr" rid="B110">Yogesh et&#xa0;al., 2021</xref>).</p>
<p>Prior to the widespread adoption of deep learning, methods based on handcrafted features and machine learning classifiers constituted the mainstream approach in object detection. Although demonstrating satisfactory performance in simple scenarios, their heavy reliance on manually designed features resulted in suboptimal effectiveness when confronted with complex environments. However, the rise of deep learning has precipitated a paradigm shift, with automated feature learning progressively supplanting handcrafted feature engineering to become the dominant technology in object detection.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Object detection technology based on deep learning</title>
<p>Driven by the advancement of agricultural automation and intelligence, the application of deep learning technologies in fruit harvesting has emerged as a prominent research focus. Fruit harvesting confronts multiple challenges, including object recognition in complex environments, identification and localization of diverse fruit types, maturity assessment, and occlusion handling. Traditional manual or mechanical methods are often characterized by low efficiency, high costs, and significant environmental constraints. In contrast, deep learning techniques, particularly Convolutional Neural Networks (CNNs) and their extensions such as Faster R-CNN, DETR, and YOLO, have significantly propelled the intelligence and automation of fruit harvesting robots.</p>
<sec id="s3_2_1">
<label>3.2.1</label>
<title>Two-stage object detection methods</title>
<p>Early object detection methods primarily relied on traditional CNN architectures like LeNet and AlexNet. While successful in image classification tasks, these networks inherently lacked the capability to directly output positional information. To address this limitation, the R-CNN approach proposed by Girshick et&#xa0;al. pioneered the two-stage object detection paradigm by combining region proposal generation with deep feature extraction (<xref ref-type="bibr" rid="B21">Girshick, 2015</xref>). Subsequent advancements, namely Fast R-CNN and Faster R-CNN, substantially improved detection speed and accuracy through shared convolutional feature maps and the introduction of a Region Proposal Network (RPN) (<xref ref-type="bibr" rid="B73">Ren et&#xa0;al., 2016</xref>). The Feature Pyramid Network (FPN) further optimized Faster R-CNN by constructing a pyramid structure on feature maps of different scales, thereby enhancing multi-scale object detection capabilities (<xref ref-type="bibr" rid="B47">Lin et&#xa0;al., 2017</xref>). For example, Wan and Goudos achieved multiclass fruit detection using Faster R-CNN (<xref ref-type="bibr" rid="B85">Wan and Goudos, 2020</xref>), while Parvathi and Selvi applied Faster R-CNN for the detection of coconut maturity in complex backgrounds (<xref ref-type="bibr" rid="B65">Parvathi and Selvi, 2021</xref>).</p>
<p>Mask R-CNN is based on Faster R-CNN and achieves precise segmentation and localization of each instance object by adding pixel-level masks (<xref ref-type="bibr" rid="B26">He et&#xa0;al., 2017</xref>). This method has been applied to the identification of picking points; for example, <xref ref-type="bibr" rid="B53">L&#xf3;pez-Barrios et&#xa0;al. (2023)</xref> used Mask R-CNN to detect green bell peppers in greenhouses and successfully located the picking points. Despite the accuracy advantage of two-stage networks, they are computationally expensive and slow. Therefore, with the increasing demand for real-time performance, researchers have gradually shifted toward more efficient one-stage object detection methods.</p>
</sec>
<sec id="s3_2_2">
<label>3.2.2</label>
<title>One-stage object detection methods</title>
<p>The YOLO (You Only Look Once) family represents a milestone in one-stage object detection models by transforming object localization into a regression problem through a fully convolutional architecture, achieving high detection speed (<xref ref-type="bibr" rid="B71">Redmon, 2016</xref>). With successive iterations, YOLO models have steadily improved in both accuracy and efficiency. Among earlier versions, YOLOv5 gained widespread adoption in agricultural scenarios due to its streamlined architecture and training efficiency (<xref ref-type="bibr" rid="B99">Wang et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B28">Hou et&#xa0;al., 2022</xref>). For instance, <xref ref-type="bibr" rid="B78">Sozzi et&#xa0;al. (2022)</xref> validated YOLOv5&#x2019;s reliability in grape cluster detection through a comparison of the YOLOv3, YOLOv4, and YOLOv5 models.</p>
<p>Recent versions have introduced more advanced designs tailored for real-time and complex environments. YOLOv6 incorporates cross-layer feature fusion strategies to enhance real-time performance in industrial contexts (<xref ref-type="bibr" rid="B41">Li C. et&#xa0;al., 2022</xref>), while YOLOv8 significantly improves multi-scale object detection and feature extraction (<xref ref-type="bibr" rid="B31">Hussain, 2024</xref>). In agricultural applications, <xref ref-type="bibr" rid="B93">Wang et&#xa0;al. (2025)</xref> proposed a customized YOLO-ALW model based on YOLOv8, achieving 99.1% mAP in pepper detection tasks.</p>
<p>Further developments from YOLOv9 to YOLOv12 introduced architectural innovations such as reversible branches, the GELAN backbone, and modules like C2f-faster and Area Attention, improving detection precision while reducing inference latency (<xref ref-type="bibr" rid="B36">Khanam and Hussain, 2024</xref>; <xref ref-type="bibr" rid="B87">Wang A. et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B98">Wang CY. et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B83">Tian et&#xa0;al., 2025</xref>). <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref> presents a comparison of latency (left) and computational complexity (FLOPs, right) against mAP on the MS COCO dataset. YOLOv12 achieves superior mAP while maintaining low latency and FLOPs, demonstrating outstanding overall efficiency. However, Sapkota et&#xa0;al. conducted a comprehensive evaluation of YOLOv8 through YOLOv12 in complex orchard environments and found that YOLOv9 delivered the best performance for green apple detection and counting (<xref ref-type="bibr" rid="B75">Sapkota and Karkee, 2025</xref>). Most recently, YOLOv13 introduced HyperACE (Hypergraph Adaptive Correlation Enhancement) and the FullPAD mechanism, further boosting detection performance (<xref ref-type="bibr" rid="B39">Lei et&#xa0;al., 2025</xref>). These advances suggest strong potential for future application in intelligent fruit harvesting.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Performance comparison chart of YOLO series.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1646871-g006.tif">
<alt-text content-type="machine-generated">Two line graphs compare different YOLO model versions in terms of MS COCO mean Average Precision (mAP). The left graph plots mAP against latency in milliseconds, and the right graph plots mAP against FLOPs in gigaflops. YOLOv12 consistently shows the highest mAP. The graphs include a legend for various YOLO models and competing methods, with each model represented by a unique color and line style.</alt-text>
</graphic>
</fig>
<p>In summary, while newer YOLO variants offer enhanced accuracy and speed, their effectiveness in agricultural environments depends on task-specific factors such as target size, occlusion level, and real-time requirements. Selecting the most suitable version requires careful consideration of these variables.</p>
</sec>
<sec id="s3_2_3">
<label>3.2.3</label>
<title>Transformer-based object detection methods</title>
<p>Originally achieving remarkable success in natural language processing, Transformer architectures have recently been introduced into the field of object detection due to their ability to model global dependencies via self-attention mechanisms. Representative models include DETR (<xref ref-type="bibr" rid="B118">Zhao et&#xa0;al., 2024</xref>), Deformable DETR (<xref ref-type="bibr" rid="B126">Zhu et&#xa0;al., 2020</xref>), Swin Transformer (<xref ref-type="bibr" rid="B50">Liu et&#xa0;al., 2022</xref>), and Vision Transformer (ViT) (<xref ref-type="bibr" rid="B29">Huang et&#xa0;al., 2022</xref>). Compared with convolutional neural networks (CNNs), Transformer-based models enable end-to-end training without relying on predefined anchor boxes and offer strong global modeling capabilities, making them particularly suitable for complex agricultural environments with background clutter or occlusion.</p>
<p>Despite these advantages, Transformers still face several challenges in practical applications, including high computational cost, slow convergence, and a strong dependence on large-scale labeled datasets. To address these limitations, Guo et&#xa0;al. proposed a Transformer-based fruit detection framework, which effectively captures long-range dependencies but still struggles with tasks such as small object detection and fruit localization at boundaries (<xref ref-type="bibr" rid="B24">Guo et&#xa0;al., 2024</xref>).</p>
<p>To provide a comparative view of detection performance across different fruit types and detection models, <xref ref-type="table" rid="T2">
<bold>Tables&#xa0;2</bold>
</xref>, <xref ref-type="table" rid="T3">
<bold>3</bold>
</xref> summarize the results reported in recent studies. &#x201c;Results&#x201d; refers to the reported detection accuracy under specific datasets or field conditions, while &#x201c;Cycle Time&#x201d; indicates the average time to complete a full picking cycle for each fruit, including perception, motion planning and execution, and fruit placement. These comparisons help illustrate the trade-offs between detection performance and overall harvesting efficiency across various algorithms and application contexts. To balance real-time performance and accuracy, recent research has begun to explore hybrid models that integrate Transformer modules into YOLO frameworks. Additionally, fusing Transformer features with multi-modal sensor data&#x2014;such as RGB-Depth or thermal imagery&#x2014;has emerged as a promising direction for enhancing robustness and accuracy in agricultural detection tasks.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Part 1 of the research progress on various fruit harvesting visual perception technologies.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Fruit types</th>
<th valign="top" align="center">Technical solution</th>
<th valign="top" align="center">Results</th>
<th valign="top" align="center">Cycle time</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">Strawberry (<xref ref-type="bibr" rid="B80">Tafuro et&#xa0;al., 2022</xref>)</td>
<td valign="top" align="center">Detectron-2</td>
<td valign="top" align="center">AP50 = 94.19%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Tomato (<xref ref-type="bibr" rid="B102">Wu et&#xa0;al., 2021</xref>)</td>
<td valign="top" align="center">Stereo matching algorithm</td>
<td valign="top" align="center">/</td>
<td valign="top" align="center">13.2s</td>
</tr>
<tr>
<td valign="top" align="center">Grape (<xref ref-type="bibr" rid="B55">Luo et&#xa0;al., 2016</xref>)</td>
<td valign="top" align="center">Binocular stereo vision algorithm</td>
<td valign="top" align="center">Detection accuracy=87%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Coconuts (<xref ref-type="bibr" rid="B65">Parvathi and Selvi, 2021</xref>)</td>
<td valign="top" align="center">Improved Faster R-CNN with ResNet-50</td>
<td valign="top" align="center">mAP50 = 89.4%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Lychee (<xref ref-type="bibr" rid="B22">Guo et&#xa0;al., 2019</xref>)</td>
<td valign="top" align="center">Based on the CLAHE and Hough circle methods</td>
<td valign="top" align="center">F1 = 87.07%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Grape (<xref ref-type="bibr" rid="B78">Sozzi et&#xa0;al., 2022</xref>)</td>
<td valign="top" align="center">YOLOv3, YOLOv4, YOLOv5</td>
<td valign="top" align="center">F1 = 77%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Green Pepper (<xref ref-type="bibr" rid="B95">Wang F. et&#xa0;al., 2022</xref>)</td>
<td valign="top" align="center">YOLOv5s-CFL</td>
<td valign="top" align="center">mAP=95.46%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Lychee (<xref ref-type="bibr" rid="B120">Zhong et&#xa0;al., 2021</xref>)</td>
<td valign="top" align="center">MFBB</td>
<td valign="top" align="center">F1 = 83.8%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Citrus (<xref ref-type="bibr" rid="B28">Hou et&#xa0;al., 2022</xref>)</td>
<td valign="top" align="center">Improved YOLOv5s</td>
<td valign="top" align="center">F1 = 98.0%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Citrus (<xref ref-type="bibr" rid="B42">Li C. et&#xa0;al., 2023</xref>)</td>
<td valign="top" align="center">YOLOv5-CBAM</td>
<td valign="top" align="center">F1 = 92.41%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Zanthoxylum (<xref ref-type="bibr" rid="B23">Guo et&#xa0;al., 2023</xref>)</td>
<td valign="top" align="center">CA-DCNv2-YOLOv5</td>
<td valign="top" align="center">mAP=69.5%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Tomato (<xref ref-type="bibr" rid="B13">Chen W. et&#xa0;al., 2024</xref>)</td>
<td valign="top" align="center">YOLO-DNA</td>
<td valign="top" align="center">mAP=74%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Apple (<xref ref-type="bibr" rid="B43">Li H. et&#xa0;al., 2023</xref>)</td>
<td valign="top" align="center">BTC-YOLOv5s</td>
<td valign="top" align="center">mAP=84.3%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Green pepper (<xref ref-type="bibr" rid="B30">Huang et&#xa0;al., 2024</xref>)</td>
<td valign="top" align="center">Pepper-YOLO</td>
<td valign="top" align="center">mAP50 = 88.1%</td>
<td valign="top" align="center">/</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Part 2 of the research progress on various fruit harvesting visual perception technologies.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Fruit types</th>
<th valign="top" align="center">Technical solution</th>
<th valign="top" align="center">Results</th>
<th valign="top" align="center">Cycle time</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">Strawberry (<xref ref-type="bibr" rid="B112">Yu et&#xa0;al., 2020</xref>)</td>
<td valign="top" align="center">R-YOLO</td>
<td valign="top" align="center">recognition rate=94.43%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Grape (<xref ref-type="bibr" rid="B12">Chen J. et&#xa0;al., 2024</xref>)</td>
<td valign="top" align="center">YOLOv8-GP</td>
<td valign="top" align="center">mAP=89.7%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Longan (<xref ref-type="bibr" rid="B15">Chen et&#xa0;al., 2025</xref>)</td>
<td valign="top" align="center">Improved YOLOv8n</td>
<td valign="top" align="center">AP50 = 74.3%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Mango (<xref ref-type="bibr" rid="B40">Li et&#xa0;al., 2024</xref>)</td>
<td valign="top" align="center">Improved YOLOv8</td>
<td valign="top" align="center">mPA=84.9%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Strawberry (<xref ref-type="bibr" rid="B103">Xia, 2024</xref>)</td>
<td valign="top" align="center">Improved YOLOv8-Pose</td>
<td valign="top" align="center">mAP-kp=97.85%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Tomato (<xref ref-type="bibr" rid="B52">Liu et&#xa0;al., 2020</xref>)</td>
<td valign="top" align="center">YOLO-Tomato</td>
<td valign="top" align="center">AP=96.4%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Tomato (<xref ref-type="bibr" rid="B37">Lawal MO., 2021</xref>)</td>
<td valign="top" align="center">YOLO-Tomato-B</td>
<td valign="top" align="center">AP=99.3%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Green Sweet Pepper (<xref ref-type="bibr" rid="B53">L&#xf3;pez-Barrios et&#xa0;al., 2023</xref>)</td>
<td valign="top" align="center">Mask R-CNN</td>
<td valign="top" align="center">mAP50 = 72.64%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Mango (<xref ref-type="bibr" rid="B119">Zheng et&#xa0;al., 2021</xref>)</td>
<td valign="top" align="center">Mask R-CNN</td>
<td valign="top" align="center">AP=82.4%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Strawberry (<xref ref-type="bibr" rid="B60">Mia et&#xa0;al., 2023</xref>)</td>
<td valign="top" align="center">DANet</td>
<td valign="top" align="center">mAP=78.27%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Tomato (<xref ref-type="bibr" rid="B38">Lawal OM., 2021</xref>)</td>
<td valign="top" align="center">YOLOMixNet</td>
<td valign="top" align="center">AP=98.4%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Apple (<xref ref-type="bibr" rid="B45">Li et&#xa0;al., 2023b</xref>)</td>
<td valign="top" align="center">MARL</td>
<td valign="top" align="center">Detection accuracy:71.28%-80.45%</td>
<td valign="top" align="center">5.8-6.7s</td>
</tr>
<tr>
<td valign="top" align="center">Lotus (<xref ref-type="bibr" rid="B54">Lu et&#xa0;al., 2024</xref>)</td>
<td valign="top" align="center">Three-view depth visual positioning method</td>
<td valign="top" align="center">Detection accuracy=98%</td>
<td valign="top" align="center">/</td>
</tr>
<tr>
<td valign="top" align="center">Sweet Pepper (<xref ref-type="bibr" rid="B62">Ning et&#xa0;al., 2022</xref>)</td>
<td valign="top" align="center">AYDY</td>
<td valign="top" align="center">Picking Rate=90.04%</td>
<td valign="top" align="center">/</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Data labeling methods and localization techniques for fruit picking</title>
<p>The localization of picking points determines whether the fruit can be successfully harvested, making it one of the core aspects of the fruit picking process. In recent years, many scholars have focused on the labeling and research of fruit picking points. The methods for data labeling of fruit picking points and their localization and recognition are crucial elements in the research of intelligent harvesting robots. The goal is to ensure the accurate identification and localization of picking points through efficient and precise labeling methods and localization technologies, thereby enhancing the automation and intelligence of the harvesting machinery.</p>
<p>Selective picking methods are classified into two categories based on the way the fruit is harvested: picking the fruit itself and picking the fruit stem. The terminal operation methods differ between these two categories, and there are also significant differences in data labeling approaches. In recent years, many researchers have noted variations in the labeling of data for picking the same type of fruit, and these differences affect the picking accuracy.</p>
<p>Wang et&#xa0;al. applied prior knowledge of apples and used the Hough transform method and contour curvature to propose a method for calculating the contours of occluded apples to enable picking localization (<xref ref-type="bibr" rid="B94">Wang et&#xa0;al., 2016</xref>). This method struggles to identify the fruits when they overlap. Yu et&#xa0;al. labeled the strawberry body with a bounding box and used R-YOLO to predict the rotational boundaries of the strawberry and the physical size estimation of the picking point based on the strawberry&#x2019;s rotation angle to confirm the picking point (<xref ref-type="bibr" rid="B112">Yu et&#xa0;al., 2020</xref>), as shown in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7a</bold>
</xref>. Tafuro et&#xa0;al. used instance segmentation to label the strawberry body and calculated the fruit stem position and picking point localization by recognizing the boundary of the strawberry (<xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7b</bold>
</xref>) (<xref ref-type="bibr" rid="B80">Tafuro et&#xa0;al., 2022</xref>). Zhong et&#xa0;al., in their lychee picking labeling, only labeled the main fruit branch and took the center point of the bounding box as the picking point (<xref ref-type="bibr" rid="B120">Zhong et&#xa0;al., 2021</xref>), as shown in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7c</bold>
</xref>. If the center point is not exactly on the branch or is blocked by leaves, it can cause significant errors. <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7d</bold>
</xref> shows the sweet pepper picking labeling, where both the bounding box and the center point of the fruit are estimated to confirm the picking position (<xref ref-type="bibr" rid="B62">Ning et&#xa0;al., 2022</xref>).</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Labeling method for picking points of different fruits. <bold>(a)</bold> Strawberry picking point calculation (<xref ref-type="bibr" rid="B112">Yu et&#xa0;al., 2020</xref>), <bold>(b)</bold> Strawberry picking point calculation (<xref ref-type="bibr" rid="B80">Tafuro et&#xa0;al., 2022</xref>), <bold>(c)</bold> Litchi picking point calculation (<xref ref-type="bibr" rid="B120">Zhong et&#xa0;al., 2021</xref>), <bold>(d)</bold> Sweet pepper picking point (<xref ref-type="bibr" rid="B62">Ning et&#xa0;al., 2022</xref>), <bold>(e)</bold> Lotus pod picking point calculation (<xref ref-type="bibr" rid="B54">Lu et&#xa0;al., 2024</xref>), <bold>(f)</bold> Mango picking point calculation (<xref ref-type="bibr" rid="B119">Zheng et&#xa0;al., 2021</xref>), <bold>(g)</bold> Grape picking point calculation (<xref ref-type="bibr" rid="B13">Chen W. et&#xa0;al., 2024</xref>), <bold>(h)</bold> Pepper picking point calculation (<xref ref-type="bibr" rid="B30">Huang et&#xa0;al., 2024</xref>), <bold>(i)</bold> Mango picking point calculation (<xref ref-type="bibr" rid="B40">Li et&#xa0;al., 2024</xref>).</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1646871-g007.tif">
<alt-text content-type="machine-generated">A series of nine images, each labeled with a letter from (a) to (i), depicting various fruits with bounding boxes and annotations. (a) A ripe strawberry is highlighted with boxes and a &#x201c;ripe&#x201d; label. (b) Shows multiple strawberries with color-coded indicators. (c) Displays lychees with a blue box around one. (d) Features numerous green fruits marked with red boxes. (e) A lotus pod with labeled parts like &#x201c;stalk&#x201d; and &#x201c;mask&#x201d;. (f) Mangoes with annotated points and green outlines. (g) Grapes hanging in a cluster. (h) Peppers with several boxes indicating detection. (i) A single fruit labeled as &#x201c;fruit&#x201d; with a box around the stem connection.</alt-text>
</graphic>
</fig>
<p>Lu et&#xa0;al. in their lotus pod picking used YOLOv5-based instance segmentation to label both the fruit region and the fruit stem region separately, and then calculated the key points from the segmented regions, inferring the picking position from those key points, as shown in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7e</bold>
</xref> (<xref ref-type="bibr" rid="B54">Lu et&#xa0;al., 2024</xref>). These various methods show the diversity in approaches to fruit picking point labeling and localization across different fruit types. The key challenge lies in ensuring high accuracy despite differences in fruit shapes, growth environments, and occlusions.</p>
<p>With the development of deep learning technologies, some researchers have shifted the fruit picking point localization from traditional geometric computations to regression-based calculations. <xref ref-type="bibr" rid="B119">Zheng et&#xa0;al. (2021)</xref> applied a combination of fruit instance segmentation and key point labeling for mango picking point localization, as shown in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7f</bold>
</xref>. They used the Mask RCNN model to simultaneously perform regression on the instance regions and multiple key points, with the picking point location ultimately determined by the key points. Chen et&#xa0;al. (<xref ref-type="bibr" rid="B13">Chen W. et&#xa0;al., 2024</xref>), in their grape picking labeling work, used a fruit target bounding box and a fruit stem picking key point to label the data, and directly applied the YOLOv8-pose model for regression calculations to achieve picking point localization, as shown in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7g</bold>
</xref>. To address the issue of chili picking points being occluded in complex scenarios, <xref ref-type="bibr" rid="B30">Huang et&#xa0;al. (2024)</xref> improved the YOLOv8-pose model by introducing a reversible network structure and a feature fusion module to achieve the recognition of multiple key points of the chili. The precise estimation of the picking points is realized through these key points, with the detection results shown in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7h</bold>
</xref>. Li et&#xa0;al. (<xref ref-type="bibr" rid="B40">Li et&#xa0;al., 2024</xref>), in their mango picking work, combined object detection and instance segmentation. They first used two target bounding boxes to separately label the mango body and fruit stem, then applied instance segmentation to label the fruit stem region. After detecting the fruit stem using object detection, they performed instance segmentation on the stem region to obtain the skeleton line of the fruit stem, which was then used to calculate the picking point, as shown in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7i</bold>
</xref>.</p>
<p>In summary, we can observe that in recent years, there have been multiple labeling methods and picking point calculation approaches for the same fruit or different fruits with similar picking methods. The accuracy of the models trained or computed with different labeling methods also varies. In complex environments, how to develop a fruit labeling method that serves fruit picking tasks becomes particularly crucial. One of the key challenges in fruit picking work has always been how to minimize the position error of the fruit picking points.</p>
</sec>
<sec id="s5">
<label>5</label>
<title>Robot mobility and global environment perception technologies</title>
<sec id="s5_1">
<label>5.1</label>
<title>Visual perception and navigation</title>
<p>Visual perception is one of the core technologies enabling fruit harvesting robots to achieve autonomous navigation and environmental understanding. By integrating Visual Simultaneous Localization and Mapping (V-SLAM) systems, robots can construct 3D maps and localize themselves in complex orchard environments, thereby enhancing their autonomous navigation capabilities. <xref ref-type="bibr" rid="B14">Chen et&#xa0;al. (2021)</xref> proposed a framework combining eye-in-hand stereo vision with SLAM, addressing the limitations of traditional SLAM methods in orchard environments and providing a solution for large-scale orchard harvesting that adapts to complex terrain and varying lighting conditions. <xref ref-type="bibr" rid="B58">Maud et&#xa0;al. (2023)</xref> utilized object detection and RTAB-Map algorithms to propose a real-time 3D mapping and localization system, optimizing the detection and management of palm oil trees and improving tree localization accuracy in large-scale plantations. <xref ref-type="bibr" rid="B92">Wang P. et&#xa0;al. (2025)</xref> based their approach on visual SLAM combined with semantic segmentation networks, improving the representation of point clouds and enhancing real-time processing speed, thus enabling more precise navigation and perception in greenhouse environments. These studies show that the combination of stereo vision with SLAM, particularly with the introduction of semantic SLAM, significantly enhances the robot&#x2019;s perception and navigation accuracy in complex environments.</p>
</sec>
<sec id="s5_2">
<label>5.2</label>
<title>Path planning for mobile robots</title>
<p>Path planning is crucial for fruit harvesting robots to operate efficiently, particularly in complex orchard environments where optimizing paths to minimize time and energy consumption is essential. <xref ref-type="bibr" rid="B84">Urvina et&#xa0;al. (2024)</xref> proposed a combined global and local planning strategy, using the Traveling Salesman Problem (TSP) and the Informed Rapidly-exploring Random Tree (IRRT*) algorithm to optimize paths and avoid obstacles, improving navigation efficiency in complex terrain. <xref ref-type="bibr" rid="B96">Wang L. et&#xa0;al. (2022)</xref> introduced a full-coverage path planning method based on multi-objective constraints, which enhances the adaptability of path planning algorithms in irregular terrains, ensuring complete coverage. <xref ref-type="bibr" rid="B88">Wang et&#xa0;al. (2025a)</xref> developed a hybrid path planning approach, combining inner spiral and improved nested methods, significantly reducing non-work path length and improving operational coverage. These studies highlight the progression of path planning technologies toward combining global and local strategies, addressing path optimization challenges in complex agricultural environments.</p>
</sec>
<sec id="s5_3">
<label>5.3</label>
<title>Task scheduling</title>
<p>Task scheduling is vital for enhancing the efficiency of multi-tasking harvesting robots, especially when multiple tasks are performed simultaneously. Efficient task allocation and resource optimization are key to improving robot performance. <xref ref-type="bibr" rid="B44">Li et&#xa0;al. (2023a)</xref> proposed a Multi-Agent Reinforcement Learning (MARL)-based scheduling method that dynamically adjusts task allocation based on real-time environment changes and task priorities, boosting operational efficiency. <xref ref-type="bibr" rid="B89">Wang et&#xa0;al. (2025b)</xref> addressed collaborative scheduling between harvesters and transport robots, introducing a task allocation and path planning method based on topological maps, significantly enhancing operational efficiency. <xref ref-type="bibr" rid="B125">Zhu et&#xa0;al. (2025)</xref> developed a task scheduling method for dual-arm robots using Mixed-Integer Linear Programming (MILP), optimizing task coordination and substantially improving strawberry harvesting throughput. These studies demonstrate that incorporating multi-agent systems and optimization algorithms into task scheduling can effectively enhance multi-task coordination and improve overall operational efficiency.</p>
</sec>
</sec>
<sec id="s6">
<label>6</label>
<title>Optimal viewpoint planning for fruit picking</title>
<p>During the fruit picking process, environmental factors such as exposure, backlighting, shadows, occlusions, and vibrations may cause changes in the fruit&#x2019;s position or lead to recognition failures. These factors not only result in the loss of visual information but may also prevent the accurate localization of picking points, ultimately reducing picking efficiency (<xref ref-type="bibr" rid="B79">Suresh Kumar and Mohan, 2023</xref>). For example, under strong sunlight or backlighting conditions, the camera may fail to clearly capture the fruit&#x2019;s outline, while shadowed areas may obscure parts of the fruit, causing recognition errors. Vibration or mechanical movement can also shift the fruit&#x2019;s position in the visual sensor, further affecting the accuracy and efficiency of the picking task. In addition, different viewpoints may produce varying picking outcomes. To address these issues, viewpoint planning, as an important technical measure, aims to maximize the fruit&#x2019;s visibility and recognition rate by selecting the most appropriate angle, thereby minimizing the impact of external factors on recognition effectiveness (<xref ref-type="bibr" rid="B108">Yi et&#xa0;al., 2024</xref>). Viewpoint planning for fruit picking can be divided into four types based on the methods used: geometry-based viewpoint planning, information-based viewpoint planning, optimization-based viewpoint planning, and learning-based viewpoint planning.</p>
<sec id="s6_1">
<label>6.1</label>
<title>Geometry-based viewpoint planning method</title>
<p>The geometry-based viewpoint planning method focuses on selecting the optimal viewpoint by calculating the spatial relationships between the environment and the target object. It typically involves using depth cameras or LiDAR to create an environmental model, which includes geometric shapes such as tree structures, fruit positions, and the locations of branches and leaves. The visual system then identifies the position of the target fruit and analyzes the feasibility of viewpoint selection based on the geometric relationship between the fruit and the environment. Once the best viewpoint is selected, it notifies the robotic arm to carry out the picking task. Menon et&#xa0;al. planned the optimal picking viewpoint based on the completeness of the fruit&#x2019;s shape, as shown in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8a</bold>
</xref> (<xref ref-type="bibr" rid="B59">Menon et&#xa0;al., 2023</xref>). Hornung et&#xa0;al. proposed a 3D point cloud mapping based on octrees to simulate the robot&#x2019;s 3D environment (<xref ref-type="bibr" rid="B27">Hornung et&#xa0;al., 2013</xref>). RVP constructed a voxel map of the fruit region and used a utility function based on expected information of the fruit region to evaluate candidate viewpoints (<xref ref-type="bibr" rid="B114">Zaenker et&#xa0;al., 2021</xref>). Burusa et&#xa0;al. drove next-best-view (NBV) planning through the tomato plant&#x2019;s structural features and an attention mechanism (<xref ref-type="bibr" rid="B10">Burusa et&#xa0;al., 2024</xref>).</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Different methods for calculating the optimal viewpoint. <bold>(a)</bold> Evaluate the picking point location through fruit shape completion, <bold>(b)</bold> Calculate the unobstructed areas of grape picking points using a scoring function, <bold>(c)</bold> Identify the optimal viewpoint through deep learning.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1646871-g008.tif">
<alt-text content-type="machine-generated">A three-part image illustrating a robotic system evaluating fruits for harvesting. Panel (a) shows a real-world environment with a robotic arm detecting fruits on plants, highlighted by circles. Panel (b) displays a 3D voxel map of a fruit-picking region with green and red points indicating unoccluded and occluded points, respectively. Panel (c) depicts analysis of fruit visibility, with images showing fully visible, partially occluded, and truncated views, alongside bounding box data.</alt-text>
</graphic>
</fig>
<p>These methods have high computational complexity, are heavily dependent on equipment, and may become ineffective if the environment changes, such as when leaves or fruits sway, making pre-computed optimal viewpoints unsuitable.</p>
</sec>
<sec id="s6_2">
<label>6.2</label>
<title>Information-based and optimization-based viewpoint planning methods</title>
<p>Information-based and optimization-based viewpoint planning methods evaluate the characteristics of different viewpoints to select the ones that provide the maximum perceptual information or optimize task execution. These methods are widely applied in complex scenarios, such as fruit harvesting tasks. Yi et&#xa0;al. generated viewpoints randomly and guided the robotic arm to adjust its perspective by combining spatial coverage and motion cost to optimize the scoring function, as shown in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8b</bold>
</xref> (<xref ref-type="bibr" rid="B108">Yi et&#xa0;al., 2024</xref>); Menon et&#xa0;al. estimated missing information through shape completion and used an NBV-SC planner to find the best viewpoint (<xref ref-type="bibr" rid="B59">Menon et&#xa0;al., 2023</xref>); Burusa et&#xa0;al. made multi-viewpoint semantic perception decisions to determine the best viewpoint in tomato harvesting, achieving better results than active vision strategies (<xref ref-type="bibr" rid="B10">Burusa et&#xa0;al., 2024</xref>); Zaenker et&#xa0;al. designed a viewpoint motion planner to optimize the information gain for pepper detection (<xref ref-type="bibr" rid="B113">Zaenker et&#xa0;al., 2023</xref>). These methods require evaluating multiple viewpoints, resulting in a large computational load that affects real-time performance. Optimization-based viewpoint planning, on the other hand, uses optimization algorithms to select viewpoints, with objectives typically focused on minimizing occlusion, maximizing information gain, or improving task efficiency. These methods evaluate the quality of viewpoints by setting objective functions. For example, Li et&#xa0;al. improved YOLOv5 and combined it with the ant colony algorithm to optimize the harvesting sequence of citrus, addressing collision issues (<xref ref-type="bibr" rid="B42">Li C. et&#xa0;al., 2023</xref>); Li et&#xa0;al. used reinforcement learning to define a reward function for optimizing harvesting strategies in a multi-arm system (<xref ref-type="bibr" rid="B45">Li et&#xa0;al., 2023b</xref>); Yi et&#xa0;al. generated candidate viewpoints and scored them to select the best perspective (<xref ref-type="bibr" rid="B108">Yi et&#xa0;al., 2024</xref>). Optimization-based methods also require evaluating multiple viewpoints, which imposes a large computational burden, especially in large-scale and dynamic environments, affecting real-time performance.</p>
</sec>
<sec id="s6_3">
<label>6.3</label>
<title>Learning-based viewpoint planning methods</title>
<p>Learning-based planning methods utilize machine learning and deep learning techniques to train models that learn how to select the optimal viewpoint based on occlusion conditions. These methods offer high adaptability and flexibility, performing particularly well in complex and dynamic environments. Learning-based viewpoint planning works by automatically extracting features from a large amount of training data and making predictions using learned models. The models can include deep neural networks, reinforcement learning models, and others. The learning process typically involves using historical data to train the model, enabling it to generate reasonable viewpoint selection strategies based on input environmental information or task requirements. Zhang et&#xa0;al. applied deep learning techniques for multiview fruit detection in apple picking to determine the optimal picking location (<xref ref-type="bibr" rid="B116">Zhang et&#xa0;al., 2022</xref>). Wang et&#xa0;al. used a few-shot reinforcement learning approach to jointly train the Next Best View (NBV) and Next Best Point (NBP), with the model continuously optimizing viewpoint decisions through interaction with the environment (<xref ref-type="bibr" rid="B90">Wang G. et&#xa0;al., 2024</xref>). Chen et&#xa0;al. employed YOLOv8 for real-time object detection of longan fruits and guided a drone to perform fruit picking by establishing the relationship between the target points and the drone&#x2019;s speed (<xref ref-type="bibr" rid="B15">Chen et&#xa0;al., 2025</xref>). Rehman and Miura conducted viewpoint data collection by rotating 30 degrees from left to right around the target in a nighttime environment, using deep learning techniques to identify occluded areas and guide the harvesting robot in selecting the optimal viewpoint, as shown in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8c</bold>
</xref> (<xref ref-type="bibr" rid="B72">Rehman and Miura, 2021</xref>).</p>
<p>Overall, with the enhancement of perception and computational capabilities, significant progress has been made in fruit harvesting viewpoint planning technology. Geometric, information-based, optimization, and learning methods each have their advantages, adapting to different scenarios and requirements. Geometric methods are precise but complex and dependent on specific conditions; information-based methods optimize viewpoints but are computationally intensive; optimization methods are effective but burdensome in complex environments; and learning methods are highly adaptable but rely on training data and resources. Although existing research has improved recognition and harvesting efficiency, real-time performance, robustness, and accuracy in complex environments remain major challenges. Future research could explore the integration of multiple methods, such as combining optimization with deep learning, to enhance efficiency, reduce computational consumption, and improve the system&#x2019;s adaptability and real-time adjustment capabilities.</p>
</sec>
</sec>
<sec id="s7" sec-type="discussion">
<label>7</label>
<title>Discussion</title>
<p>The fruit-picking robot has made significant advancements in visual perception technology, which is central to the automation of fruit harvesting. However, despite continuous technological progress, there are still many challenges when it comes to applying these systems in real agricultural environments.</p>
<sec id="s7_1">
<label>7.1</label>
<title>Technical challenges and limitations</title>
<p>Various advanced cameras, such as monocular, binocular, and 3D depth sensors, have enhanced the precision of fruit recognition and localization for robots. Binocular cameras provide depth information through disparity, but they have limitations in calibration and adaptability. Complex depth sensors, such as Time-of-Flight (ToF) cameras and structured light cameras, offer excellent depth perception but are expensive and computationally intensive. Deep learning algorithms, such as YOLO, have improved fruit detection accuracy, but they require powerful computational resources, large training datasets, and depth data fusion. Striking a balance between computational efficiency and accuracy remains a key challenge for large-scale applications.</p>
</sec>
<sec id="s7_2">
<label>7.2</label>
<title>Impact of environmental variations</title>
<p>Intelligent fruit-picking robots face challenges such as lighting variations, plant positioning, and fruit occlusion in agricultural environments. These factors complicate the visual system&#x2019;s ability to detect and localize fruits accurately. Even advanced sensors struggle when confronted with real-world agricultural settings. For instance, differences in the shape, color, and growth patterns of various fruits increase the difficulty of segmentation and classification. Ensuring high-precision recognition amidst these variations remains an unsolved problem.</p>
</sec>
<sec id="s7_3">
<label>7.3</label>
<title>Picking accuracy and efficiency</title>
<p>Picking accuracy is crucial, particularly in minimizing damage and improving fruit quality. Visual reconstruction and depth perception technologies assist in pinpointing the picking location, but the high computational cost remains a bottleneck in real-time data processing. Enhancing operational precision and preventing fruit damage are key considerations. Additionally, the introduction of active vision technology, which adjusts the visual angle based on real-time perception, can further improve picking accuracy.</p>
</sec>
<sec id="s7_4">
<label>7.4</label>
<title>Future development directions</title>
<p>Despite the challenges, the future of intelligent fruit-picking robots remains promising. Future research could explore sensor fusion, integrating visual, tactile, and force data to enhance the robot&#x2019;s overall environmental perception. AI and machine learning, particularly unsupervised learning, hold the potential to reduce the reliance on large labeled datasets and improve the robot&#x2019;s adaptability to new environments. By incorporating deep learning-based visual servoing techniques, path planning and control strategies can be optimized. In the future, intelligent fruit-picking robots will achieve a better balance between real-time performance and accuracy.</p>
</sec>
</sec>
<sec id="s8" sec-type="conclusion">
<label>8</label>
<title>Conclusion</title>
<p>In this paper, we reviewed the research progress of visual perception technology in intelligent fruit-picking robots. First, we introduced the advantages and disadvantages of different types of cameras: monocular cameras are suitable for simple scenarios, binocular cameras provide depth information for moderately complex environments, while structured light and ToF depth cameras perform excellently in high-precision depth perception and complex environments.</p>
<p>Next, we explored the application of object detection technology in fruit picking, comparing traditional image processing methods with modern deep learning methods such as YOLO and SSD. While deep learning methods offer higher accuracy and better adaptability, they require large amounts of training data and high-performance hardware. Traditional methods still have advantages when resources are limited.</p>
<p>Regarding the localization of picking points, we reviewed vision-based 3D reconstruction and depth perception methods, emphasizing the importance of accurate localization to improve the picking success rate and reduce fruit damage. Additionally, we explored technologies such as V-SLAM, mobile path planning, and task scheduling, which contribute to enhancing the robot&#x2019;s operational efficiency throughout the entire orchard. We also discussed the combination of active vision and visual servoing techniques, showing that these two technologies can significantly enhance the robot&#x2019;s adaptability and precision in dynamic environments. By adjusting the visual angle in real-time and optimizing control strategies, robots can more accurately locate and manipulate targets, especially when dealing with fruit occlusion and complex backgrounds.</p>
<p>Finally, we summarized the current status and future development directions of visual perception technology. Despite significant progress, challenges such as poor environmental adaptability, low system integration, and high costs still exist in real agricultural environments. With the continuous development of computer vision, deep learning, and sensor technologies, the future intelligent fruit-picking robots, combining active vision and visual servoing techniques, will make greater breakthroughs in efficiency and accuracy and will be capable of addressing more complex application scenarios.</p>
</sec>
</body>
<back>
<sec id="s9" sec-type="author-contributions">
<title>Author contributions</title>
<p>YH: Conceptualization, Methodology, Investigation, Writing &#x2013; original draft, Supervision. RC: Writing &#x2013; original draft, Resources, Project administration. SX: Data curation, Writing &#x2013; review &amp; editing, Formal analysis. HC: Writing &#x2013; review &amp; editing. GL: Writing &#x2013; review &amp; editing. JY: Writing &#x2013; review &amp; editing. XZ: Writing &#x2013; review &amp; editing. HD: Methodology support, Writing &#x2013; review &amp; editing.</p>
</sec>
<sec id="s10" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research and/or publication of this article. This research was funded by the Natural Science Foundation of Fujian Province (No. 2022J01644), the Fujian Agriculture and Forestry University Science and Technology Innovation Fund (No. KFB24043), the Big Data in Agroforestry (Cross-Disciplinary) of Fujian Agriculture and Forestry University (No. 712023030), the Infrastructure Development Fund of Fujian Agriculture and Forestry University (No. KXNDM0001), and the Higher Education Scientific Research Planning Project (No. ZD202309).</p>
</sec>
<sec id="s11" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s12" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
</sec>
<sec id="s13" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Arad</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Balendonck</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Barth</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Ben-Shahar</surname> <given-names>O.</given-names>
</name>
<name>
<surname>Edan</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Hellstr&#xf6;m</surname> <given-names>T.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>Development of a sweet pepper harvesting robot</article-title>. <source>J. Field Robot.</source> <volume>37</volume>, <fpage>1027</fpage>&#x2013;<lpage>1039</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/rob.21937</pub-id>
</citation></ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Arefi</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Motlagh</surname> <given-names>A. M.</given-names>
</name>
<name>
<surname>Mollazade</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Teimourlou</surname> <given-names>R. F.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Recognition and localization of ripen tomato based on machine vision</article-title>. <source>Aust. J. Crop Sci.</source> <volume>5</volume>, <fpage>1144</fpage>&#x2013;<lpage>1149</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3316/informit.745798602538938</pub-id>
</citation></ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Aulia</surname> <given-names>L. H.</given-names>
</name>
<name>
<surname>Azhari</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Bimantara</surname> <given-names>M. D.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Implementation of hsv imagery with k-nearest neighbor for classification of maturity levels in tomatoes</article-title>. <source>Bigint. Comput. J.</source> <volume>1</volume>, <fpage>62</fpage>&#x2013;<lpage>69</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.55537/bigint.v1i2.779</pub-id>
</citation></ref>
<ref id="B4">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ayg&#xfc;n</surname> <given-names>S.</given-names>
</name>
<name>
<surname>G&#xfc;ne&#x15f;</surname> <given-names>E. O.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>A benchmarking: Feature extraction and classification of agricultural textures using lbp, glcm, rbo, neural networks, k-nn, and random forest</article-title>,&#x201d; in <conf-name>2017 6th International Conference on Agro-Geoinformatics</conf-name>. <fpage>1</fpage>&#x2013;<lpage>4</lpage> (<publisher-loc>Piscataway, NJ</publisher-loc>, <publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/Agro-Geoinformatics.2017.8047000</pub-id>
</citation></ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Babellahi</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Paliwal</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Erkinbaev</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Amodio</surname> <given-names>M. L.</given-names>
</name>
<name>
<surname>Chaudhry</surname> <given-names>M. M. A.</given-names>
</name>
<name>
<surname>Colelli</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Early detection of chilling injury in green bell peppers by hyperspectral imaging and chemometrics</article-title>. <source>Postharvest. Biol. Technol.</source> <volume>162</volume>, <fpage>111100</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.postharvbio.2019.111100</pub-id>
</citation></ref>
<ref id="B6">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Baeten</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Donn&#xe9;</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Boedrij</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Beckers</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Claesen</surname> <given-names>E.</given-names>
</name>
</person-group> (<year>2008</year>). &#x201c;<article-title>Autonomous fruit picking machine: A robotic apple harvester</article-title>,&#x201d; in <conf-name>Field and service robotics: Results of the 6th international conference</conf-name>. <fpage>531</fpage>&#x2013;<lpage>539</lpage> (<publisher-loc>Berlin, Heidelberg</publisher-loc>, <publisher-name>Springer</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-540-75404-6_51</pub-id>
</citation></ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ban</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Yin</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Depth estimation method for monocular camera defocus images in microscopic scenes</article-title>. <source>Electronics</source> <volume>11</volume>, <fpage>2012</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/electronics11132012</pub-id>
</citation></ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Besnassi</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Neggaz</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Benyettou</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Face detection based on evolutionary haar filter</article-title>. <source>Pattern Anal. Appl.</source> <volume>23</volume>, <fpage>309</fpage>&#x2013;<lpage>330</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10044-019-00784-5</pub-id>
</citation></ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Birrell</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Hughes</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Cai</surname> <given-names>J. Y.</given-names>
</name>
<name>
<surname>Iida</surname> <given-names>F.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A field-tested robotic harvesting system for iceberg lettuce</article-title>. <source>J. Field Robot.</source> <volume>37</volume>, <fpage>225</fpage>&#x2013;<lpage>245</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/rob.21888</pub-id>, PMID: <pub-id pub-id-type="pmid">32194355</pub-id></citation></ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Burusa</surname> <given-names>A. K.</given-names>
</name>
<name>
<surname>van Henten</surname> <given-names>E. J.</given-names>
</name>
<name>
<surname>Kootstra</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Attention-driven next-best-view planning for efficient reconstruction of plants and targeted plant parts</article-title>. <source>Biosyst. Eng.</source> <volume>246</volume>, <fpage>248</fpage>&#x2013;<lpage>262</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2024.08.002</pub-id>
</citation></ref>
<ref id="B11">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Chao</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Junxi</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Guowei</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Chuyan</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Analysis of parallax characteristics of binocular vision based on no-similar imaging</article-title>,&#x201d; in <source>AOPC 2022: optical sensing, imaging, and display technology</source>, vol. <volume>12557</volume>. (<publisher-loc>Bellingham, Washington, United States</publisher-loc>, <publisher-name>SPIE</publisher-name>), <fpage>576</fpage>&#x2013;<lpage>583</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1117/12.2669643</pub-id>
</citation></ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>Efficient and lightweight grape and picking point synchronous detection model based on key point detection</article-title>. <source>Comput. Electron. Agric.</source> <volume>217</volume>, <fpage>108612</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.108612</pub-id>
</citation></ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Rao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Jin</surname> <given-names>X.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>Mlp-based multimodal tomato detection in complex scenarios: Insights from task-specific analysis of feature fusion architectures</article-title>. <source>Comput. Electron. Agric.</source> <volume>221</volume>, <fpage>108951</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.108951</pub-id>
</citation></ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>3d global mapping of large-scale unstructured orchard integrating eye-in-hand stereo vision and slam</article-title>. <source>Comput. Electron. Agric.</source> <volume>187</volume>, <fpage>106237</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2021.106237</pub-id>
</citation></ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Mai</surname> <given-names>Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2025</year>). <article-title>A real-time vision guidance method for autonomous longan picking by the uav</article-title>. <source>Comput. Electron. Agric.</source> <volume>229</volume>, <fpage>109735</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.109735</pub-id>
</citation></ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Swin-depth: Using transformers and multi-scale fusion for monocular-based depth estimation</article-title>. <source>IEEE Sensors. J.</source> <volume>21</volume>, <fpage>26912</fpage>&#x2013;<lpage>26920</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/JSEN.2021.3120753</pub-id>
</citation></ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chunjiang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Beibei</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Jin</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Qingchun</surname> <given-names>F.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Agricultural robots: Technology progress, challenges and trends</article-title>. <source>Smart. Agric.</source> <volume>5</volume>, <fpage>1</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.12133/j.smartag.SA202312030</pub-id>
</citation></ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>De Kleine</surname> <given-names>M. E.</given-names>
</name>
<name>
<surname>Karkee</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>A semi-automated harvesting prototype for shaking fruit tree limbs</article-title>. <source>Trans. ASABE.</source> <volume>58</volume>, <fpage>1461</fpage>&#x2013;<lpage>1470</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.13031/trans.58.11011</pub-id>
</citation></ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gallego</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Delbr&#xfc;ck</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Orchard</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Bartolozzi</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Taba</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Censi</surname> <given-names>A.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>Event-based vision: A survey</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>44</volume>, <fpage>154</fpage>&#x2013;<lpage>180</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2020.3008413</pub-id>, PMID: <pub-id pub-id-type="pmid">32750812</pub-id></citation></ref>
<ref id="B20">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Gehrig</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Scaramuzza</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Recurrent vision transformers for object detection with event cameras</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>. <publisher-loc>Vancouver, Canada</publisher-loc>, <publisher-name>IEEE</publisher-name>, <fpage>13884</fpage>&#x2013;<lpage>13893</lpage>.</citation></ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Fast r-cnn</article-title>. <source>arXiv. preprint. arXiv:1504.08083</source>, <page-range>1440&#x2013;48</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2015.169</pub-id>
</citation></ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Guo</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhuang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>He</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Hou</surname> <given-names>C.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>Lychee fruit detection based on monocular machine vision in orchard environment</article-title>. <source>Sensors</source> <volume>19</volume>, <fpage>4091</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s19194091</pub-id>, PMID: <pub-id pub-id-type="pmid">31546669</pub-id></citation></ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Guo</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Xiao</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Miao</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Tian</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lan</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Design and experiment of a visual detection system for zanthoxylum-harvesting robot based on improved yolov5 model</article-title>. <source>Agriculture</source> <volume>13</volume>, <fpage>821</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agriculture13040821</pub-id>
</citation></ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Guo</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Cao</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>Q.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>End-to-End lightweight Transformer-Based neural network for grasp detection towards fruit robotic handling</article-title>. <source>Comput. Electron. Agric.</source> <volume>221</volume>, <fpage>109014</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.109014</pub-id>
</citation></ref>
<ref id="B25">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Gurubelli</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Malmathanraj</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Palanisamy</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Texture and colour gradient features for grade analysis of pomegranate and mango fruits using kernel-svm classifiers</article-title>,&#x201d; in <conf-name>2020 6th International Conference on Advanced Computing and Communication Systems (ICACCS)</conf-name>. <fpage>122</fpage>&#x2013;<lpage>126</lpage> (<publisher-loc>Coimbatore, India</publisher-loc>, <publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICACCS48705.2020.9074221</pub-id>
</citation></ref>
<ref id="B26">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Gkioxari</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Mask r-cnn</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE international conference on computer vision</conf-name>. <publisher-loc>Venice, Italy</publisher-loc>, <publisher-name>IEEE</publisher-name>, <fpage>2961</fpage>&#x2013;<lpage>2969</lpage>.</citation></ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hornung</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Wurm</surname> <given-names>K. M.</given-names>
</name>
<name>
<surname>Bennewitz</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Stachniss</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Burgard</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Octomap: An efficient probabilistic 3d mapping framework based on octrees</article-title>. <source>Autonomous. Robots.</source> <volume>34</volume>, <fpage>189</fpage>&#x2013;<lpage>206</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10514-012-9321-0</pub-id>
</citation></ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hou</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhuang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Tan</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>H.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Detection and localization of citrus fruit based on improved you only look once v5s and binocular vision in the orchard</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>, <elocation-id>972445</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.972445</pub-id>, PMID: <pub-id pub-id-type="pmid">35968138</pub-id></citation></ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>You</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Qian</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>LightViT: towards light-weight convolution-free vision transformers</article-title>. <source>arXiv. preprint. arXiv:2207.05557</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2207.05557</pub-id>
</citation></ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhong</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhong</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>Pepper-yolo: an lightweight model for green pepper detection and picking point localization in complex environments</article-title>. <source>Front. Plant Sci.</source> <volume>15</volume>, <elocation-id>1508258</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2024.1508258</pub-id>, PMID: <pub-id pub-id-type="pmid">39811717</pub-id></citation></ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hussain</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Yolov5, yolov8 and yolov10: The go-to detectors for real-time vision</article-title>. <source>arXiv. preprint. arXiv:2407.02988</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2407.02988</pub-id>
</citation></ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jia</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Lian</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Apple harvesting robot under information technology: A review</article-title>. <source>Int. J. Adv. Robot. Syst.</source> <volume>17</volume>, <fpage>1729881420925310</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1177/1729881420925310</pub-id>
</citation></ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jun</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Seol</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Son</surname> <given-names>H. I.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Towards an efficient tomato harvesting robot: 3d perception, manipulation, and end-effector</article-title>. <source>IEEE Access</source> <volume>9</volume>, <fpage>17631</fpage>&#x2013;<lpage>17640</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2021.3052240</pub-id>
</citation></ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Junge</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Pires</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Hughes</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Lab2field transfer of a robotic raspberry harvester enabled by a soft sensorized physical twin</article-title>. <source>Commun. Eng.</source> <volume>2</volume>, <fpage>40</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s44172-023-00089-w</pub-id>
</citation></ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Khan</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Salahuddin</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Javidnia</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Deep learning-based monocular depth estimation methods&#x2014;a state-of-the-art review</article-title>. <source>Sensors</source> <volume>20</volume>, <fpage>2272</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s20082272</pub-id>, PMID: <pub-id pub-id-type="pmid">32316336</pub-id></citation></ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Khanam</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Hussain</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Yolov11: An overview of the key architectural enhancements</article-title>. <source>arXiv. preprint. arXiv:2410.17725</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2410.17725</pub-id>
</citation></ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lawal</surname> <given-names>M. O.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Tomato detection based on modified yolov3 framework</article-title>. <source>Sci. Rep.</source> <volume>11</volume>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-021-81216-5</pub-id>, PMID: <pub-id pub-id-type="pmid">33446897</pub-id></citation></ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lawal</surname> <given-names>O. M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Development of tomato detection model for robotic platform using deep learning</article-title>. <source>Multimedia. Tools Appl.</source> <volume>80</volume>, <fpage>26751</fpage>&#x2013;<lpage>26772</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11042-021-10933-w</pub-id>
</citation></ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lei</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>X.</given-names>
</name>
<etal/>
</person-group>. (<year>2025</year>). <article-title>YOLOv13: real-time object detection with hypergraph-enhanced adaptive visual perception</article-title>. <source>arXiv. preprint. arXiv:2506.17733</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2506.17733</pub-id>
</citation></ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Gu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>He</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Positioning of mango picking point using an improved yolov8 architecture with object detection and instance segmentation</article-title>. <source>Biosyst. Eng.</source> <volume>247</volume>, <fpage>202</fpage>&#x2013;<lpage>220</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2024.09.015</pub-id>
</citation></ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Weng</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Geng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Yolov6: A single-stage object detection framework for industrial applications</article-title>. <source>arXiv. preprint. arXiv:2209.02976</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2209.02976</pub-id>
</citation></ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Fang</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Recognition of citrus fruit and planning the robotic picking sequence in orchards</article-title>. <source>Signal. Image. Video. Process.</source> <volume>17</volume>, <fpage>4425</fpage>&#x2013;<lpage>4434</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11760-023-02676-y</pub-id>
</citation></ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Shi</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Fang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Yin</surname> <given-names>F.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Real-time detection of apple leaf diseases in natural scenes based on yolov5</article-title>. <source>Agriculture</source> <volume>13</volume>, <fpage>878</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agriculture13040878</pub-id>
</citation></ref>
<ref id="B44">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Qiu</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>Q.</given-names>
</name>
</person-group> (<year>2023</year>a). &#x201c;<article-title>Multi-arm robot task planning for fruit harvesting using multi-agent reinforcement learning</article-title>,&#x201d; in <conf-name>2023 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name>. <fpage>4176</fpage>&#x2013;<lpage>4183</lpage> (<publisher-loc>Detroit, MI, USA</publisher-loc>, <publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/IROS55552.2023.10341822</pub-id>
</citation></ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>Q.</given-names>
</name>
</person-group> (<year>2023</year>b). <article-title>A multi-arm robot system for efficient apple harvesting: Perception, task plan and control</article-title>. <source>Comput. Electron. Agric.</source> <volume>211</volume>, <fpage>107979</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.107979</pub-id>
</citation></ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A review on structural development and recognition&#x2013;localization methods for end-effector of fruit&#x2013;vegetable picking robots</article-title>. <source>Int. J. Adv. Robot. Syst.</source> <volume>19</volume>, <fpage>17298806221104906</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1177/17298806221104906</pub-id>
</citation></ref>
<ref id="B47">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>T. Y.</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Hariharan</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Belongie</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Feature pyramid networks for object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. <publisher-loc>Honolulu, HI, USA</publisher-loc>, <publisher-name>IEEE</publisher-name>, <fpage>2117</fpage>&#x2013;<lpage>2125</lpage>.</citation></ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Cheng</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Xiong</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Fruit detection in natural environment using partial shape matching and probabilistic hough transform</article-title>. <source>Precis. Agric.</source> <volume>21</volume>, <fpage>160</fpage>&#x2013;<lpage>177</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11119-019-09662-w</pub-id>
</citation></ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ling</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Gong</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Dual-arm cooperation and implementing for robotic harvesting tomato using binocular vision</article-title>. <source>Robot. Auton. Syst.</source> <volume>114</volume>, <fpage>134</fpage>&#x2013;<lpage>143</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.robot.2019.01.019</pub-id>
</citation></ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Yao</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Swin transformer v2: Scaling up capacity and resolution</article-title>. <source>ArXiv. preprint. arXiv:2111.09883</source>, <page-range>12009&#x2013;19</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52688.2022.01170</pub-id>
</citation></ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Jing</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Younas</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Dang</surname> <given-names>H.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>Performance evaluation of newly released cameras for fruit detection and localization in complex kiwifruit orchard environments</article-title>. <source>J. Field Robot.</source> <volume>41</volume>, <fpage>881</fpage>&#x2013;<lpage>894</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/rob.22297</pub-id>
</citation></ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Nouaze</surname> <given-names>J. C.</given-names>
</name>
<name>
<surname>Touko Mbouembe</surname> <given-names>P. L.</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>J. H.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Yolo-tomato: A robust algorithm for tomato detection based on yolov3</article-title>. <source>Sensors</source> <volume>20</volume>, <fpage>2145</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s20072145</pub-id>, PMID: <pub-id pub-id-type="pmid">32290173</pub-id></citation></ref>
<ref id="B53">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>L&#xf3;pez-Barrios</surname> <given-names>J. D.</given-names>
</name>
<name>
<surname>Escobedo Cabello</surname> <given-names>J. A.</given-names>
</name>
<name>
<surname>G&#xf3;mez-Espinosa</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Montoya-Cavero</surname> <given-names>L. E.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Green sweet pepper fruit and peduncle detection using mask r-cnn in greenhouses</article-title>. <source>Appl. Sci.</source> <volume>13</volume>, <fpage>6296</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/app13106296</pub-id>
</citation></ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lu</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Lv</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>Q.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>An effective picking point localization method for multi-posture lotus pods based on three-view depth vision observation</article-title>. <source>Comput. Electron. Agric.</source> <volume>227</volume>, <fpage>109492</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.109492</pub-id>
</citation></ref>
<ref id="B55">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Luo</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Ye</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Vision-based extraction of spatial information in grape clusters for harvesting robots</article-title>. <source>Biosyst. Eng.</source> <volume>151</volume>, <fpage>90</fpage>&#x2013;<lpage>104</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2016.08.026</pub-id>
</citation></ref>
<ref id="B56">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lv</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>Z.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Peach fruit recognition method under natural environment</article-title>,&#x201d; in <conf-name>Eighth International Conference on Digital Image Processing (ICDIP 2016)</conf-name>, Vol. <volume>10033</volume>. <fpage>232</fpage>&#x2013;<lpage>236</lpage> (<publisher-loc>Bellingham</publisher-loc>, <publisher-name>SPIE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1117/12.2244945</pub-id>
</citation></ref>
<ref id="B57">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lv</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Rong</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Yellow apple recognition method under natural environment</article-title>,&#x201d; in <conf-name>2015 7th International Conference on Intelligent Human-Machine Systems and Cybernetics</conf-name>, Vol. <volume>1</volume>. <fpage>46</fpage>&#x2013;<lpage>49</lpage> (<publisher-loc>New York</publisher-loc>, <publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/IHMSC.2015.91</pub-id>
</citation></ref>
<ref id="B58">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Maud</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Kadim</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Hon</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Real-time 3d mapping and localization of palm oil tree for harvest data management system</article-title>. <source>J. Inf. Syst. Technol. Manage. (JISTM)</source> <volume>8</volume>, <fpage>91</fpage>&#x2013;<lpage>101</lpage>.</citation></ref>
<ref id="B59">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Menon</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Zaenker</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Dengler</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Bennewitz</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Nbv-sc: Next best view planning based on shape completion for fruit mapping and reconstruction</article-title>,&#x201d; in <conf-name>2023 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name>. <fpage>4197</fpage>&#x2013;<lpage>4203</lpage> (<publisher-loc>New York</publisher-loc>, <publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/IROS55552.2023.10341855</pub-id>
</citation></ref>
<ref id="B60">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Mia</surname> <given-names>M. S.</given-names>
</name>
<name>
<surname>Voban</surname> <given-names>A. A. B.</given-names>
</name>
<name>
<surname>Arnob</surname> <given-names>A. B. H.</given-names>
</name>
<name>
<surname>Naim</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Ahmed</surname> <given-names>M. K.</given-names>
</name>
<name>
<surname>Islam</surname> <given-names>M. S.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Danet: Enhancing small object detection through an efficient deformable attention network</article-title>,&#x201d; in <conf-name>2023 International Conference on the Cognitive Computing and Complex Data (ICCD)</conf-name>. <fpage>51</fpage>&#x2013;<lpage>62</lpage> (<publisher-loc>New York</publisher-loc>, <publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCD59681.2023.10420622</pub-id>
</citation></ref>
<ref id="B61">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Miriti</surname> <given-names>E.</given-names>
</name>
</person-group> (<year>2016</year>). <source>Classification of selected apple fruit varieties using Naive Bayes</source> (<publisher-name>University of Nairobi</publisher-name>, <publisher-loc>Nairobi, Kenya</publisher-loc>).</citation></ref>
<ref id="B62">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ning</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Ding</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Cai</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Recognition of sweet peppers and planning the robotic picking sequence in high-density orchards</article-title>. <source>Comput. Electron. Agric.</source> <volume>196</volume>, <fpage>106878</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2022.106878</pub-id>
</citation></ref>
<ref id="B63">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Oliveira</surname> <given-names>L. F.</given-names>
</name>
<name>
<surname>Moreira</surname> <given-names>A. P.</given-names>
</name>
<name>
<surname>Silva</surname> <given-names>M. F.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Advances in agriculture robotics: A state-of-the-art review and challenges ahead</article-title>. <source>Robotics</source> <volume>10</volume>, <fpage>52</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/robotics10020052</pub-id>
</citation></ref>
<ref id="B64">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Park</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Seol</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Pak</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Jo</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Son</surname> <given-names>H. I.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Human-centered approach for an efficient cucumber harvesting robot system: Harvest ordering, visual servoing, and end-effector</article-title>. <source>Comput. Electron. Agric.</source> <volume>212</volume>, <elocation-id>108116</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.108116</pub-id>
</citation></ref>
<ref id="B65">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Parvathi</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Selvi</surname> <given-names>S. T.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Detection of maturity stages of coconuts in complex background using faster r-cnn model</article-title>. <source>Biosyst. Eng.</source> <volume>202</volume>, <fpage>119</fpage>&#x2013;<lpage>132</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2020.12.002</pub-id>
</citation></ref>
<ref id="B66">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pohle-Fr&#xf6;hlich</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Gebler</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Bolten</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Stereo-event-camera-technique for insect monitoring</article-title>. <source>VISIGRAPP</source> <volume>3</volume>, <fpage>375</fpage>&#x2013;<lpage>384</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.5220/0012326500003660</pub-id>
</citation></ref>
<ref id="B67">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Rabby</surname> <given-names>M. K. M.</given-names>
</name>
<name>
<surname>Chowdhury</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>J. H.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>A modified canny edge detection algorithm for fruit detection &amp; classification</article-title>,&#x201d; in <conf-name>2018 10th International Conference on Electrical and Computer Engineering (ICECE)</conf-name>, <conf-loc>Dhaka, Bangladesh</conf-loc>. <fpage>237</fpage>&#x2013;<lpage>240</lpage> (<publisher-loc>New York</publisher-loc>, <publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICECE.2018.8636811</pub-id>
</citation></ref>
<ref id="B68">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ramisetty</surname> <given-names>U. M.</given-names>
</name>
<name>
<surname>Gundavarapu</surname> <given-names>V. N. K.</given-names>
</name>
<name>
<surname>Rajender</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Segovia Ram&#xed;rez</surname> <given-names>I.</given-names>
</name>
<name>
<surname>Garc&#xed;a M&#xe1;rquez</surname> <given-names>F. P.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Prediction analysis of crop and their futuristic yields using random forest regression</article-title>,&#x201d; in <conf-name>The International Conference on Industrial Engineering and Industrial Management</conf-name>. <fpage>280</fpage>&#x2013;<lpage>285</lpage> (<publisher-loc>Cham</publisher-loc>, <publisher-name>Springer</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-031-27915-7_50</pub-id>
</citation></ref>
<ref id="B69">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rapado-Rinc&#xf3;n</surname> <given-names>D.</given-names>
</name>
<name>
<surname>van Henten</surname> <given-names>E. J.</given-names>
</name>
<name>
<surname>Kootstra</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Development and evaluation of automated localisation and reconstruction of all fruits on tomato plants in a greenhouse based on multi-view perception and 3d multi-object tracking</article-title>. <source>Biosyst. Eng.</source> <volume>231</volume>, <fpage>78</fpage>&#x2013;<lpage>91</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2023.06.003</pub-id>
</citation></ref>
<ref id="B70">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rebecq</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Gallego</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Mueggler</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Scaramuzza</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>EMVS: event-based multi-view stereo&#x2014;3D reconstruction with an event camera in real-time</article-title>. <source>International Journal of Computer Vision</source> <volume>126</volume>, <fpage>1394</fpage>&#x2013;<lpage>1414</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11263-017-1050-6</pub-id>
</citation></ref>
<ref id="B71">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>You only look once: Unified, real-time object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. <publisher-loc>Las Vegas, NV, USA</publisher-loc>, <publisher-name>IEEE</publisher-name>, <fpage>1</fpage>&#x2013;<lpage>10</lpage>.</citation></ref>
<ref id="B72">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Rehman</surname> <given-names>H. U.</given-names>
</name>
<name>
<surname>Miura</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Viewpoint planning for automated fruit harvesting using deep learning</article-title>,&#x201d; in <conf-name>2021 IEEE/SICE International Symposium on System Integration (SII)</conf-name>. <fpage>409</fpage>&#x2013;<lpage>414</lpage> (<publisher-loc>New York</publisher-loc>, <publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/IEEECONF49454.2021.9382628</pub-id>
</citation></ref>
<ref id="B73">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Faster r-cnn: Towards real-time object detection with region proposal networks</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>39</volume>, <fpage>1137</fpage>&#x2013;<lpage>1149</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2016.2577031</pub-id>, PMID: <pub-id pub-id-type="pmid">27295650</pub-id></citation></ref>
<ref id="B74">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sanders</surname> <given-names>K.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>Orange harvesting systems review</article-title>. <source>Biosyst. Eng.</source> <volume>90</volume>, <fpage>115</fpage>&#x2013;<lpage>125</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2004.10.006</pub-id>
</citation></ref>
<ref id="B75">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sapkota</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Karkee</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Improved YOLOv12 with LLM-generated synthetic data for enhanced apple detection and benchmarking against YOLOv11 and YOLOv10</article-title>. <source>arXiv</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2503.00057</pub-id>. ArXiv preprint arXiv:2503.00057.</citation></ref>
<ref id="B76">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Schertz</surname> <given-names>C. E.</given-names>
</name>
<name>
<surname>Brown</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>1968</year>). <article-title>Basic considerations in mechanizing citrus harvest</article-title>. <source>Trans. ASAE.</source> <volume>11</volume>, <fpage>343</fpage>&#x2013;<lpage>346</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.13031/2013.39405</pub-id>
</citation></ref>
<ref id="B77">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sola-Guirado</surname> <given-names>R. R.</given-names>
</name>
<name>
<surname>S&#xe1;nchez-Cachinero</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Blanco-Rold&#xe1;n</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Simultaneous trunk and branch shaking in an over-the-row olive harvester</article-title>. <source>Biosyst. Eng.</source> <volume>231</volume>, <fpage>92</fpage>&#x2013;<lpage>103</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2023.06.005</pub-id>
</citation></ref>
<ref id="B78">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sozzi</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Cantalamessa</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Cogato</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Kayad</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Marinello</surname> <given-names>F.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Automatic bunch detection in white grape varieties using yolov3, yolov4, and yolov5 deep learning algorithms</article-title>. <source>Agronomy</source> <volume>12</volume>, <fpage>319</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy12020319</pub-id>
</citation></ref>
<ref id="B79">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Suresh Kumar</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Mohan</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Selective fruit harvesting: Research, trends and developments towards fruit detection and localization&#x2013;a review</article-title>. <source>Proc. Inst. Mech. Eng. Part C: J. Mech. Eng. Sci.</source> <volume>237</volume>, <fpage>1405</fpage>&#x2013;<lpage>1444</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1177/09544062221128443</pub-id>
</citation></ref>
<ref id="B80">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Tafuro</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Adewumi</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Parsa</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Amir</surname> <given-names>G. E.</given-names>
</name>
<name>
<surname>Debnath</surname> <given-names>B.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Strawberry picking point localization ripeness and weight estimation</article-title>,&#x201d; in <conf-name>2022 International conference on robotics and automation (ICRA)</conf-name>. <fpage>2295</fpage>&#x2013;<lpage>2302</lpage> (<publisher-loc>Philadelphia, PA, USA</publisher-loc>, <publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICRA46639.2022.9812200</pub-id>
</citation></ref>
<ref id="B81">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tan</surname> <given-names>S. H.</given-names>
</name>
<name>
<surname>Lam</surname> <given-names>C. K.</given-names>
</name>
<name>
<surname>Kamarudin</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Ismail</surname> <given-names>A. H.</given-names>
</name>
<name>
<surname>Rahim</surname> <given-names>N. A.</given-names>
</name>
<name>
<surname>Azmi</surname> <given-names>M. S. M.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>Vision-based edge detection system for fruit recognition</article-title>. <source>J. Phys.: Conf. Ser.</source> <volume>2107</volume>, <fpage>012066</fpage>.</citation></ref>
<ref id="B82">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tian</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zeng</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Evans</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Segmentation of tomato leaf images based on adaptive clustering number of k-means algorithm</article-title>. <source>Comput. Electron. Agric.</source> <volume>165</volume>, <elocation-id>104962</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2019.104962</pub-id>
</citation></ref>
<ref id="B83">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tian</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Ye</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Doermann</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>YOLOv12: attention-centric real-time object detectors</article-title>. <source>arXiv</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2502.12524</pub-id>
</citation></ref>
<ref id="B84">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Urvina</surname> <given-names>R. P.</given-names>
</name>
<name>
<surname>Guevara</surname> <given-names>C. L.</given-names>
</name>
<name>
<surname>V&#xe1;sconez</surname> <given-names>J. P.</given-names>
</name>
<name>
<surname>Prado</surname> <given-names>A. J.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>An integrated route and path planning strategy for skid&#x2013;steer mobile robots in assisted harvesting tasks with terrain traversability constraints</article-title>. <source>Agriculture</source> <volume>14</volume>, <fpage>1206</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agriculture14081206</pub-id>
</citation></ref>
<ref id="B85">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wan</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Goudos</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Faster r-cnn for multi-class fruit detection using a robotic vision system</article-title>. <source>Comput. Networks</source> <volume>168</volume>, <fpage>107036</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.comnet.2019.107036</pub-id>
</citation></ref>
<ref id="B86">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wan</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Ou</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Guan</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Review of the perception technologies for unmanned agricultural machinery operating environment</article-title>. <source>Transactions of the Chinese Society of Agricultural Engineering</source> <volume>40</volume>, <fpage>1</fpage>&#x2013;<lpage>18</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.11975/j.issn.1002-6819.202402020</pub-id>
</citation></ref>
<ref id="B87">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Han</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>Yolov10: Real-time end-to-end object detection</article-title>. <source>arXiv. preprint. arXiv:2405.14458</source> <volume>37</volume>, <page-range>107984&#x2013;108011</page-range>.</citation></ref>
<ref id="B88">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Jin</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Xiao</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>H.</given-names>
</name>
<etal/>
</person-group>. (<year>2025</year>a). <article-title>Hybrid path planning methods for complete coverage in harvesting operation scenarios</article-title>. <source>Comput. Electron. Agric.</source> <volume>231</volume>, <fpage>109946</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2025.109946</pub-id>
</citation></ref>
<ref id="B89">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Xiao</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Han</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>H.</given-names>
</name>
<etal/>
</person-group>. (<year>2025</year>b). <article-title>A collaborative scheduling and planning method for multiple machines in harvesting and transportation operations-Part I: Harvester task allocation and sequence optimization</article-title>. <source>Comput. Electron. Agric.</source> <volume>232</volume>, <fpage>110060</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2025.110060</pub-id>
</citation></ref>
<ref id="B90">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Observe then act: Asynchronous active vision-action model for robotic manipulation</article-title>. <source>arXiv. preprint. arXiv:2409.14891</source>.</citation></ref>
<ref id="B91">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>Cognition of grape cluster picking point based on visual knowledge distillation in complex vineyard environment</article-title>. <source>Comput. Electron. Agric.</source> <volume>225</volume>, <fpage>109216</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.109216</pub-id>
</citation></ref>
<ref id="B92">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>S.</given-names>
</name>
<etal/>
</person-group>. (<year>2025</year>). <article-title>Real-time semantic SLAM-based 3D reconstruction robot for greenhouse vegetables</article-title>. <source>Comput. Electron. Agric.</source> <volume>237</volume>, <fpage>110582</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2025.110582</pub-id>
</citation></ref>
<ref id="B93">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Ouyang</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Peng</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Deng</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>H.</given-names>
</name>
<etal/>
</person-group>. (<year>2025</year>). <article-title>YOLO-ALW: an enhanced high-precision model for chili maturity detection</article-title>. <source>Sensors</source> <volume>25</volume>, <fpage>1405</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s25051405</pub-id>, PMID: <pub-id pub-id-type="pmid">40096232</pub-id></citation></ref>
<ref id="B94">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Song</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Tie</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>He</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Recognition and localization of occluded apples using k-means clustering algorithm and convex hull theory: a comparison</article-title>. <source>Multimedia. Tools Appl.</source> <volume>75</volume>, <fpage>3177</fpage>&#x2013;<lpage>3198</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11042-014-2429-9</pub-id>
</citation></ref>
<ref id="B95">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Xiaomila green pepper target detection method under complex environment based on improved YOLOv5s</article-title>. <source>Agronomy</source> <volume>12</volume>, <fpage>1477</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy12061477</pub-id>
</citation></ref>
<ref id="B96">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Ying</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Meng</surname> <given-names>Q.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Full coverage path planning methods of harvesting robot with multi-objective constraints</article-title>. <source>J. Intelligent. Robot. Syst.</source> <volume>106</volume>, <fpage>17</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10846-022-01722-0</pub-id>
</citation></ref>
<ref id="B97">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Ye</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Qian</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Continuous picking of yellow peaches with recognition and collision-free path</article-title>. <source>Comput. Electron. Agric.</source> <volume>214</volume>, <fpage>108273</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.108273</pub-id>
</citation></ref>
<ref id="B98">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C. Y.</given-names>
</name>
<name>
<surname>Yeh</surname> <given-names>I. H.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H. Y. M.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>YOLOv9: Learning what you want to learn using programmable gradient information</article-title>,&#x201d; in <conf-name>European conference on computer vision</conf-name>. <volume>15089</volume>, <fpage>1</fpage>&#x2013;<lpage>21</lpage> (<publisher-loc>Cham</publisher-loc>, <publisher-name>Springer</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-031-72751-1_1</pub-id>
</citation></ref>
<ref id="B99">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Jin</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Apple stem/calyx real-time recognition using YOLO-v5 algorithm for fruit automatic loading system</article-title>. <source>Postharvest Biol. Technol.</source> <volume>185</volume>, <fpage>111808</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.postharvbio.2021.111808</pub-id>
</citation></ref>
<ref id="B100">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wei</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Jia</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Lan</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zeng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Automatic method of fruit object extraction under complex agricultural background for vision system of fruit picking robot</article-title>. <source>Optik</source> <volume>125</volume>, <fpage>5684</fpage>&#x2013;<lpage>5689</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ijleo.2014.07.001</pub-id>
</citation></ref>
<ref id="B101">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Yin</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Pan</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Fan</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Detection of wheat lodging by binocular cameras during harvesting operation</article-title>. <source>Agriculture</source> <volume>13</volume>, <fpage>120</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agriculture13010120</pub-id>
</citation></ref>
<ref id="B102">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Qiu</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Tomato harvesting robot system based on binocular vision</article-title>,&#x201d; in <conf-name>2021 IEEE international conference on unmanned systems (ICUS)</conf-name>. <fpage>757</fpage>&#x2013;<lpage>761</lpage> (<publisher-loc>Beijing, China</publisher-loc>, <publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICUS52573.2021.9641260</pub-id>
</citation></ref>
<ref id="B103">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xia</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Rapid strawberry ripeness detection and 3D localization of picking point based on improved YOLO v8-pose with RGB-camera</article-title>. <source>J. Electric. Syst.</source> <volume>20</volume>, <fpage>2171</fpage>&#x2013;<lpage>2187</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.52783/jes.1840</pub-id>
</citation></ref>
<ref id="B104">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiao</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Review of research advances in fruit and vegetable harvesting robots</article-title>. <source>J. Electric. Eng. Technol.</source> <volume>19</volume>, <fpage>773</fpage>&#x2013;<lpage>789</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s42835-023-01596-8</pub-id>
</citation></ref>
<ref id="B105">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yamamoto</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Hayashi</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Yoshida</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Kobayashi</surname> <given-names>K.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Development of a stationary robotic strawberry harvester with a picking mechanism that approaches the target fruit from below</article-title>. <source>Jpn. Agric. Res. Quarterly.: JARQ.</source> <volume>48</volume>, <fpage>261</fpage>&#x2013;<lpage>269</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.6090/jarq.48.261</pub-id>
</citation></ref>
<ref id="B106">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yan</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>Q.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Vibration analysis and experimental study of the effects of mechanised grape picking on the fruit&#x2013;stem system</article-title>. <source>Biosyst. Eng.</source> <volume>227</volume>, <fpage>82</fpage>&#x2013;<lpage>94</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2023.01.019</pub-id>
</citation></ref>
<ref id="B107">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Yang</surname> <given-names>N.</given-names>
</name>
<name>
<surname>von Stumberg</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Cremers</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>D3VO: Deep depth, deep pose and deep uncertainty for monocular visual odometry</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>. <fpage>1281</fpage>&#x2013;<lpage>1292</lpage> (<publisher-loc>Seattle, WA, USA</publisher-loc>, <publisher-name>IEEE</publisher-name>).</citation></ref>
<ref id="B108">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yi</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>View planning for grape harvesting based on active vision strategy under occlusion</article-title>. <source>IEEE Robot. Automat. Lett.</source> <volume>9</volume>, <fpage>2535</fpage>&#x2013;<lpage>2542</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/LRA.2024.3357397</pub-id>
</citation></ref>
<ref id="B109">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yin</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Cheng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Song</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Handroos</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Monocular camera-based robotic pick-and-place in fusion applications</article-title>. <source>Appl. Sci.</source> <volume>13</volume>, <fpage>4487</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/app13074487</pub-id>
</citation></ref>
<ref id="B110">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yogesh</surname>
</name>
<name>
<surname>Dubey</surname> <given-names>A. K.</given-names>
</name>
<name>
<surname>Arora</surname> <given-names>R. R.</given-names>
</name>
<name>
<surname>Mathur</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Fruit defect prediction model (FDPM) based on three-level validation</article-title>. <source>J. Nondestruct. Eval.</source> <volume>40</volume>, <fpage>45</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10921-021-00778-6</pub-id>
</citation></ref>
<ref id="B111">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yoshida</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Kawahara</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Fukao</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Fruit recognition method for a harvesting robot with RGB-D cameras</article-title>. <source>ROBOMECH. J.</source> <volume>9</volume>, <fpage>15</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s40648-022-00230-y</pub-id>
</citation></ref>
<ref id="B112">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Real-time visual localization of the picking points for a ridge-planting strawberry harvesting robot</article-title>. <source>IEEE Access</source> <volume>8</volume>, <fpage>116556</fpage>&#x2013;<lpage>116568</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2020.3003034</pub-id>
</citation></ref>
<ref id="B113">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zaenker</surname> <given-names>T.</given-names>
</name>
<name>
<surname>R&#xfc;ckin</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Menon</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Popovi&#x107;</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Bennewitz</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Graph-based view motion planning for fruit detection</article-title>,&#x201d; in <conf-name>2023 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name>. <fpage>4219</fpage>&#x2013;<lpage>4225</lpage> (<publisher-loc>Detroit, MI, USA</publisher-loc>, <publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/IROS55552.2023.10341874</pub-id>
</citation></ref>
<ref id="B114">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zaenker</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Smitt</surname> <given-names>C.</given-names>
</name>
<name>
<surname>McCool</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Bennewitz</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Viewpoint planning for fruit size and position estimation</article-title>,&#x201d; in <conf-name>2021 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name>. <fpage>3271</fpage>&#x2013;<lpage>3277</lpage> (<publisher-name>IEEE</publisher-name>).</citation></ref>
<ref id="B115">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Kang</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Qu</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Automatic fruit picking technology: a comprehensive review of research advances</article-title>. <source>Artif. Intell. Rev.</source> <volume>57</volume>, <fpage>54</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10462-023-10674-2</pub-id>
</citation></ref>
<ref id="B116">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Lammers</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Chu</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Dickinson</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Algorithm design and integration for a robotic apple harvesting system</article-title>,&#x201d; in <conf-name>2022 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name>. <fpage>9217</fpage>&#x2013;<lpage>9224</lpage> (<publisher-loc>Kyoto, Japan</publisher-loc>, <publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/IROS47612.2022.9981417</pub-id>
</citation></ref>
<ref id="B117">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Classification of fruits using computer vision and a multiclass support vector machine</article-title>. <source>Sensors</source> <volume>12</volume>, <fpage>12489</fpage>&#x2013;<lpage>12505</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s120912489</pub-id>, PMID: <pub-id pub-id-type="pmid">23112727</pub-id></citation></ref>
<ref id="B118">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Lv</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Dang</surname> <given-names>Q.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). &#x201c;<article-title>DETRs beat YOLOs on real-time object detection</article-title>,&#x201d; in <conf-name>2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <fpage>16965</fpage>&#x2013;<lpage>16974</lpage> (<publisher-loc>Seattle, WA, USA</publisher-loc>, <publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52733.2024.01605</pub-id>
</citation></ref>
<ref id="B119">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zheng</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Pang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Tu</surname> <given-names>S.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>A mango picking vision algorithm on instance segmentation and key point detection from RGB images in an open orchard</article-title>. <source>Biosyst. Eng.</source> <volume>206</volume>, <fpage>32</fpage>&#x2013;<lpage>54</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2021.03.012</pub-id>
</citation></ref>
<ref id="B120">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhong</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Xiong</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Huo</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>A method for litchi picking points calculation in natural environment based on main fruit bearing branch detection</article-title>. <source>Comput. Electron. Agric.</source> <volume>189</volume>, <fpage>106398</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2021.106398</pub-id>
</citation></ref>
<ref id="B121">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Fan</surname> <given-names>D. P.</given-names>
</name>
<name>
<surname>Cheng</surname> <given-names>M. M.</given-names>
</name>
<name>
<surname>Shen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Shao</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>RGB-D salient object detection: A survey</article-title>. <source>Comput. Visual Media.</source> <volume>7</volume>, <fpage>37</fpage>&#x2013;<lpage>69</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s41095-020-0199-z</pub-id>, PMID: <pub-id pub-id-type="pmid">33432275</pub-id></citation></ref>
<ref id="B122">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>J.</given-names>
</name>
<name>
<surname>He</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Karkee</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Q.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Analysis of shaking-induced cherry fruit motion and damage</article-title>. <source>Biosyst. Eng.</source> <volume>144</volume>, <fpage>105</fpage>&#x2013;<lpage>114</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2016.02.007</pub-id>
</citation></ref>
<ref id="B123">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Au</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Kang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Intelligent robots for fruit harvesting: Recent developments and future challenges</article-title>. <source>Precis. Agric.</source> <volume>23</volume>, <fpage>1856</fpage>&#x2013;<lpage>1907</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11119-022-09913-3</pub-id>
</citation></ref>
<ref id="B124">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Research on pedestrian detection technology based on the SVM classifier trained by HOG and LTP features</article-title>. <source>Future Generat. Comput. Syst.</source> <volume>125</volume>, <fpage>604</fpage>&#x2013;<lpage>615</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.future.2021.06.016</pub-id>
</citation></ref>
<ref id="B125">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Ying</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Vougioukas</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Peng</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Optimal scheduling of a dual-arm robot for efficient strawberry harvesting in plant factories</article-title>. <source>arXiv. preprint. arXiv:2507.04240</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2507.04240</pub-id>
</citation></ref>
<ref id="B126">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Su</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Dai</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Deformable DETR: deformable transformers for end-to-end object detection</article-title>. <source>arXiv. preprint. arXiv:2010.04159</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2010.04159</pub-id>
</citation></ref>
</ref-list>
</back>
</article>