<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Mech. Eng.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Mechanical Engineering</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Mech. Eng.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2297-3079</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1741396</article-id>
<article-id pub-id-type="doi">10.3389/fmech.2026.1741396</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Harvesting target positioning and robotic arm obstacle avoidance algorithm based on improved YOLOv8 and BIT&#x2a;</article-title>
<alt-title alt-title-type="left-running-head">Xu</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fmech.2026.1741396">10.3389/fmech.2026.1741396</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Xu</surname>
<given-names>Yingwu</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3271642"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
</contrib>
</contrib-group>
<aff id="aff1">
<institution>Anqing Vocational and Technical College</institution>, <city>Anqing</city>, <country country="CN">China</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Yingwu Xu, <email xlink:href="mailto:godfatherwww@163.com">godfatherwww@163.com</email>
</corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-01-21">
<day>21</day>
<month>01</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>12</volume>
<elocation-id>1741396</elocation-id>
<history>
<date date-type="received">
<day>07</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>04</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>05</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Xu.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Xu</copyright-holder>
<license>
<ali:license_ref start_date="2026-01-21">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>To address the core challenges of inaccurate fruit occlusion localization and inefficient robotic arm dynamic obstacle avoidance in complex, unstructured agricultural environments, this study proposes an integrated algorithm for harvesting.</p>
</sec>
<sec>
<title>Methods</title>
<p>The proposed algorithm is built upon an improved YOLOv8 model and the BIT&#x2a; planner. The YOLOv8 model was enhanced by introducing the Swin Transformer module to improve multi-scale feature fusion and global context modeling. The BIT&#x2a; planner was integrated with a BiLSTM network to endow it with dynamic obstacle prediction capabilities, thereby constructing a unified architecture for visual perception and motion planning.</p>
</sec>
<sec>
<title>Results</title>
<p>Experimental results demonstrated that the algorithm achieved real-time performance with a processing frame rate of 32.7 fps and an inference time of 32.6 ms for target localization, with a localization error standard deviation as low as 1.70 mm. In obstacle avoidance planning, it achieved a balance with manipulator energy consumption of 124.58 J, while controlling the computational load and memory resource consumption per task to 22.7 GFlops and 187 MB, respectively.</p>
</sec>
<sec>
<title>Discussion</title>
<p>This approach provides a high-precision, low-energy-consumption cooperative control solution for agricultural harvesting robots, advancing the practical application of automated fruit and vegetable harvesting.</p>
</sec>
</abstract>
<kwd-group>
<kwd>agriculture</kwd>
<kwd>automated harvesting</kwd>
<kwd>BIT&#x2a;</kwd>
<kwd>robotic arm</kwd>
<kwd>YOLOv8</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. The research is supported by: Project Level: Key Provincial Teaching Research Project of Higher Education Institutions in Anhui Province; Research on Strategies and Paths to Improve Teachers&#x2019; Informatization Ability under Educational Digital Transformation; (2022jyxm946).</funding-statement>
</funding-group>
<counts>
<fig-count count="13"/>
<table-count count="3"/>
<equation-count count="6"/>
<ref-count count="32"/>
<page-count count="16"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Mechatronics</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Background</title>
<p>Harvesting is one of the most labor-intensive and time-consuming steps in the production of fruits and vegetables. Its level of automation and intelligence directly impacts production efficiency, cost control, and industrial upgrading (<xref ref-type="bibr" rid="B14">Liu and Liu, 2024</xref>). Therefore, developing efficient, precise, and autonomous intelligent harvesting robot systems holds significant practical and economic value for freeing up labor, advancing agricultural modernization, and ensuring food security (<xref ref-type="bibr" rid="B32">Zhou et al., 2022</xref>). Among these, the precise target localization of the perception module and the dexterous obstacle avoidance path planning of the execution module represent two critical technological bottlenecks determining system performance (<xref ref-type="bibr" rid="B29">Zeeshan and Aized, 2023</xref>). In complex, unstructured natural field environments, harvesting targets (such as fruits) are frequently disrupted by factors including variable lighting, foliage obstruction, similar colors and textures, variable scales, and overlapping clusters. This places extremely high demands on the robustness and accuracy of visual detection algorithms (<xref ref-type="bibr" rid="B19">Panduranga et al., 2024</xref>). Simultaneously, when executing grasping tasks, robotic arms must navigate dense, intertwined crop branches to plan collision-free, highly efficient trajectories. Any planning failure or delay may lead to task interruption or crop damage. The performance and dependability of path planning algorithms in real-time are severely hampered by this (<xref ref-type="bibr" rid="B3">Droukas et al., 2023</xref>).</p>
<p>To address these challenges, numerous experts in the field of smart agriculture have embarked on exploratory research. To overcome the difficulties of identifying clustered tomato fruits and selecting the best picking locations in challenging situations, <xref ref-type="bibr" rid="B2">Bai et al. (2023)</xref> developed a two-step localization technique that integrated multi-feature extraction and geometry analysis for target recognition in harvesting. This approach could achieve precise fruit region identification and accurate stem-picking point localization (<xref ref-type="bibr" rid="B2">Bai et al., 2023</xref>). To address the challenge of accurately detecting tomato fruits and stems in complex agricultural environments, <xref ref-type="bibr" rid="B16">Miao et al. (2023)</xref> proposed an integrated detection algorithm combining traditional image processing with you only look once version 5 (YOLOv5). Through multi-method fusion and error compensation strategies, this research could achieve precise determination of tomato ripeness and accurate stem localization, providing reliable guidance for efficient robotic harvesting (<xref ref-type="bibr" rid="B16">Miao et al., 2023</xref>). <xref ref-type="bibr" rid="B6">Gong et al. (2022)</xref> suggested a geometric feature reconstruction technique based on multi-source image fusion and an extended mask region-based convolutional neural network (Mask R-CNN) to address the problem of inadequate visual positioning accuracy in fruit-picking robots operating in obscured situations. By integrating multi-source image registration with shape-position recovery algorithms, this approach could achieve high-precision 3D geometric reconstruction and picking point localization for occluded tomatoes (<xref ref-type="bibr" rid="B6">Gong et al., 2022</xref>). To address the high labor costs and fruit identification/localization challenges in strawberry picking, <xref ref-type="bibr" rid="B7">Hu et al. 
(2022)</xref> proposed a recognition and localization method integrating instance segmentation with stereo vision. By combining a dual-network architecture of Mask R-CNN and YOLOv3 with the 3D localization technology of the ZED stereo vision camera, this research could achieve precise identification and 3D spatial localization of ripe strawberries, providing accurate target location information for picking robots (<xref ref-type="bibr" rid="B7">Hu et al., 2022</xref>).</p>
<p>To solve the problems of excessive path planning time and low picking efficiency in unstructured orchard environments, <xref ref-type="bibr" rid="B31">Zhang et al. (2024)</xref> suggested a heuristic dynamic rapidly-exploring random tree connect (HDRRT) motion planning algorithm for robotic arm obstacle avoidance planning. By using a dual-structure strategy that combined heuristic dynamic step size strategies and adaptive target gravity, this study could successfully decrease path planning time and path cost while increasing planning success rates (<xref ref-type="bibr" rid="B31">Zhang et al., 2024</xref>). <xref ref-type="bibr" rid="B13">Liu (2022)</xref> addressed the low efficiency of apple-picking robots in unstructured orchard situations by proposing the hierarchical optimal path planning (HOPP) method. This study significantly reduced the computational time required for three-dimensional picking path planning by combining a two-layer structure with distance-constrained K-means clustering and traveling salesman problem solutions. This approach achieved globally optimal harvesting path planning for multi-objective fruit harvesting (<xref ref-type="bibr" rid="B13">Liu, 2022</xref>). A view planner based on an active vision technique was proposed by <xref ref-type="bibr" rid="B27">Yi et al. (2024)</xref> to solve the problem of accurately localizing fruit-picking points in heavily obstructed settings. Through a three-step structure, including candidate view generation, spatial coverage score function optimization, and iterative viewpoint adjustment, this research effectively addressed stem occlusion issues, significantly improving the robot&#x2019;s picking success rate and operational efficiency (<xref ref-type="bibr" rid="B27">Yi et al., 2024</xref>). <xref ref-type="bibr" rid="B25">Xu et al. 
(2021)</xref> proposed an improved artificial potential field algorithm to address the issues of local minima and insufficient obstacle shape perception in traditional methods for robotic arms 3D path planning. By incorporating a repulsive isopotential surface movement mechanism and a local path optimization structure, this research effectively resolved local minima traps and enabled obstacle shape perception, significantly enhancing path planning success rates and motion smoothness (<xref ref-type="bibr" rid="B25">Xu et al., 2021</xref>). In summary, existing research exhibits a typical architecture characterized by &#x201c;decoupling perception and planning modules&#x201d; in its technical approach. Its core advantages lie in its perception layer. Techniques such as multi-source information fusion, the integration of traditional and deep learning, and stereo vision effectively enhance the robustness of target recognition and the accuracy of positioning for fruits and vegetables in static environments. At the planning layer, strategies including heuristic random sampling, hierarchical task decomposition, and active perception decision-making significantly optimize path cost and static obstacle avoidance success rates. However, this architecture has fundamental limitations. The perception and planning stages operate in an unidirectional, open-loop manner. They lack real-time visual feedback adjustments based on motion states. The visual module exhibits insufficient generalization capabilities against dynamic occlusions and sudden lighting changes. Moreover, the planning module generally lacks explicit modeling and prediction of dynamic obstacle movement trends. Consequently, the system faces constraints in overall adaptability, real-time responsiveness, and closed-loop stability within highly unstructured, dynamically changing field environments.</p>
<p>YOLOv8 extracts features through a backbone network (BN), fuses multi-scale information via a neck network, and finally performs both bounding box (BOB) regression and classification prediction simultaneously through a detection head (<xref ref-type="bibr" rid="B12">Li et al., 2024</xref>). Batch informed trees&#x2a; (BIT&#x2a;) combines graph search with random sampling, pruning ineffective regions using heuristic information, and progressively optimizes path costs through iterative batch processing (<xref ref-type="bibr" rid="B11">Kyaw et al., 2022</xref>). However, YOLOv8 exhibits insufficient perception of occluded objects and small fruit stems. BIT&#x2a; lacks a mechanism for reacting to dynamic barriers and has poor processing efficiency in high-dimensional areas (<xref ref-type="bibr" rid="B24">Xu and Li, 2025</xref>; <xref ref-type="bibr" rid="B20">Tamizi et al., 2024</xref>). To address these limitations, this study proposes an integrated algorithm composed of a perception module and a planning module. Among them, the perception module uses YOLOv8 as its framework and incorporates the Swin Transformer as its BN. Its sliding window attention mechanism improves the accuracy of fruit target recognition and localization in complex occlusion environments by enhancing multi-scale feature fusion (MSFF) and global context modeling. The planning module utilizes the BIT&#x2a; framework, integrating a BiLSTM network to predict dynamic obstacle movement trends. Temporal modeling enhances the robotic arm&#x2019;s foresight and adaptability in path search, enabling efficient and smooth obstacle avoidance in dynamic, unstructured environments. Both modules achieve information integration through hand-eye calibration and coordinate transformation, ultimately forming a unified &#x201c;perception-decision-control&#x201d; collaborative system. This approach ensures positioning accuracy and planning efficiency while significantly reducing computational and energy consumption costs. Its innovation lies in its ability to achieve synergistic breakthroughs in perception, decision-making, and control. 
This is accomplished through multi-scale feature enhancement, spatio-temporal context modeling, adaptive sampling strategies, and dynamic cost function optimization.</p>
</sec>
<sec sec-type="methods" id="s2">
<label>2</label>
<title>Methodology</title>
<p>This section comprises two parts. The first part introduces the Swin Transformer module based on the YOLOv8 object detection framework to construct a rapid and precise fruit-picking target localization module. It enhances fruit recognition accuracy (RA) in complex environments through MSFF and global context modeling. The second part combines the BiLSTM&#x2019;s temporal prediction capabilities with the BIT&#x2a; path planning algorithm to develop a RAOA module with dynamic obstacle response capabilities. Finally, the two modules are integrated through hand-eye calibration and coordinate transformation mechanisms to form a complete vision-motion control closed-loop system. This realizes the YOLOv8-B&#x2a; algorithm architecture from fruit recognition to picking path planning.</p>
<sec id="s2-1">
<label>2.1</label>
<title>Harvesting target positioning module based on YOLOv8</title>
<p>In automated harvesting systems for fruits and vegetables, robotic arms serve as the core execution units. Their grasping success rate and operational efficiency heavily depend on the precise spatial localization of target fruits. Accurate, real-time identification and localization of fruit positions are fundamental prerequisites for achieving damage-free grasping while avoiding collisions and mispicks. Consequently, this study employs YOLOv8 as the foundation for target localization during harvesting operations. YOLOv8 is selected as the core visual localization framework primarily due to its classic balance in object detection tasks, robust multi-scale feature extraction (MSFE) capabilities, and potential for lightweight deployment. Its efficient cross stage partial network with feature fusion (C2F) architecture and decoupled detector head design provide a stable and scalable baseline. Compared to subsequent versions that focus on specific tasks or architectures, YOLOv8 has broader industrial deployment validation and more experience with lightweight optimization. This makes it better suited for agricultural embedded scenarios with dual constraints on reliability and computational resources (<xref ref-type="bibr" rid="B15">Ma et al., 2024</xref>). The architecture of YOLOv8 is illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Schematic illustration of the YOLOv8 architecture.</p>
</caption>
<graphic xlink:href="fmech-12-1741396-g001.tif">
<alt-text content-type="machine-generated">Flowchart of a neural network architecture showing three main sections: Backbone, Neck, and Head. Each section includes labeled modules such as CBS, C2F, SPPF, Concat, and Upsample. Additional components are Conv2d, BCE Loss, DF Loss, and CIoU Loss, represented by various colored shapes. Arrows indicate the data flow between sections.</alt-text>
</graphic>
</fig>
<p>In <xref ref-type="fig" rid="F1">Figure 1</xref>, the YOLOv8 network architecture primarily consists of three components: the BN, the neck network, and the detection head. It achieves MSFE and fusion through modules such as convolution &#x2b; batchnorm &#x2b; sigmoid, C2F, and spatial pyramid pooling fast (<xref ref-type="bibr" rid="B5">Gao et al., 2024</xref>). The detection performance of YOLOv8 relies on optimizing the total loss function. The model learns end-to-end by minimizing the discrepancy between projected values and ground truth annotations while concurrently predicting item BOB coordinates, category labels, and object presence confidence scores during training (<xref ref-type="bibr" rid="B4">Gao et al., 2023</xref>). <xref ref-type="disp-formula" rid="e1">Equation 1</xref> illustrates that the weighted sum of the three terms is the definition of the total loss function.<disp-formula id="e1">
<mml:math id="m1">
<mml:mrow>
<mml:msup>
<mml:mi>L</mml:mi>
<mml:mtext>total</mml:mtext>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:msup>
<mml:mi>L</mml:mi>
<mml:mtext>cls</mml:mtext>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:msup>
<mml:mi>L</mml:mi>
<mml:mtext>box</mml:mtext>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:msup>
<mml:mi>L</mml:mi>
<mml:mtext>obj</mml:mtext>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
<p>In <xref ref-type="disp-formula" rid="e1">Equation 1</xref>, <inline-formula id="inf1">
<mml:math id="m2">
<mml:mrow>
<mml:msup>
<mml:mi>L</mml:mi>
<mml:mtext>cls</mml:mtext>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represents the classification loss. <inline-formula id="inf2">
<mml:math id="m3">
<mml:mrow>
<mml:msup>
<mml:mi>L</mml:mi>
<mml:mtext>box</mml:mtext>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the BOB regression loss. <inline-formula id="inf3">
<mml:math id="m4">
<mml:mrow>
<mml:msup>
<mml:mi>L</mml:mi>
<mml:mtext>obj</mml:mtext>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> signifies the object confidence loss. <inline-formula id="inf4">
<mml:math id="m5">
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> serves as the weighting coefficient for each loss term, balancing the optimization scales across different tasks. Specifically, <inline-formula id="inf5">
<mml:math id="m6">
<mml:mrow>
<mml:msup>
<mml:mi>L</mml:mi>
<mml:mtext>cls</mml:mtext>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> employs binary cross-entropy (BCE) loss to calculate the discrepancy between predicted and ground-truth categories. <inline-formula id="inf6">
<mml:math id="m7">
<mml:mrow>
<mml:msup>
<mml:mi>L</mml:mi>
<mml:mtext>box</mml:mtext>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> utilizes a combination of distribution focal (DF) loss and complete intersection over union (CIoU) loss. While CIoU thoroughly takes into account overlap area, center point distance, and aspect ratio to obtain more accurate BOB regression, DF optimizes the focused distribution of BOB position probability. <inline-formula id="inf7">
<mml:math id="m8">
<mml:mrow>
<mml:msup>
<mml:mi>L</mml:mi>
<mml:mtext>obj</mml:mtext>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> also employs BCE loss to determine whether an object exists within the BOB (<xref ref-type="bibr" rid="B1">Ayyad et al., 2025</xref>). Localization results can be directly output as fruit center coordinates and size information for subsequent robotic arms motion planning and grasp pose estimation.</p>
<p>However, the CNN backbone of YOLOv8 has limited capabilities for modeling global contextual information and long-range dependencies. The ST achieves powerful global modeling capabilities while maintaining computational efficiency through its sliding window mechanism (<xref ref-type="bibr" rid="B18">Pal et al., 2023</xref>). Therefore, this study incorporates the ST into the BN of YOLOv8 to enable more precise feature extraction and localization of occluded or densely clustered objects in complex environments. <xref ref-type="fig" rid="F2">Figure 2</xref> depicts the structure of the ST.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Schematic illustration of the ST architecture.</p>
</caption>
<graphic xlink:href="fmech-12-1741396-g002.tif">
<alt-text content-type="machine-generated">Diagram illustrating a neural network architecture with stages, showing a progression from input through patch partition and linear embedding, followed by four distinct ST blocks. Each block contains Layer Normalization (LN), Window-based Multi-head Self-Attention (W-MSA), Shifted Window Multi-head Self-Attention (SW-MSA), and Multi-layer Perceptron (MLP) components. The process is repeated over two successive ST blocks.</alt-text>
</graphic>
</fig>
<p>In <xref ref-type="fig" rid="F2">Figure 2</xref>, the ST adopts a hierarchical architecture. Based on window-based multi-head self-attention (W-MSA) and shifted window MSA (SW-MSA), it constructs a general-purpose BN capable of efficiently processing visual tasks. Its core lies in the W-MSA computation, where the standard self-attention (SA) calculation is expressed in <xref ref-type="disp-formula" rid="e2">Equation 2</xref>.<disp-formula id="e2">
<mml:math id="m9">
<mml:mrow>
<mml:mtext>Attention</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>Softmax</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:msup>
<mml:mi>K</mml:mi>
<mml:mi>T</mml:mi>
</mml:msup>
</mml:mrow>
<mml:msqrt>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msqrt>
</mml:mfrac>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>In <xref ref-type="disp-formula" rid="e2">Equation 2</xref>, <inline-formula id="inf8">
<mml:math id="m10">
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> represents the query, key, and value matrices. <inline-formula id="inf9">
<mml:math id="m11">
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the dimension of the key vector. <inline-formula id="inf10">
<mml:math id="m12">
<mml:mrow>
<mml:msqrt>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msqrt>
</mml:mrow>
</mml:math>
</inline-formula> is used to scale the dot product results, preventing softmax gradient saturation. <inline-formula id="inf11">
<mml:math id="m13">
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the relative position bias, introducing spatial position priors for each attention head to enhance the model&#x2019;s perception of geometric structures. To greatly reduce computational complexity, the ST splits the input image into non-overlapping windows and calculates SA within each window (<xref ref-type="bibr" rid="B22">Wang et al., 2023</xref>). To further enable cross-window connections, the alternately applied SW-MSA shifts window partitions, allowing attention computations to extend beyond original window boundaries. <xref ref-type="disp-formula" rid="e3">Equation 3</xref> can be used to represent two consecutive ST blocks.<disp-formula id="e3">
<mml:math id="m14">
<mml:mrow>
<mml:mfenced open="{" close="" separators="|">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msup>
<mml:mover accent="true">
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>l</mml:mi>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">W</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mtext>MSA</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>LN</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>MLP</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>LN</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mover accent="true">
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>l</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mover accent="true">
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>l</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msup>
<mml:mover accent="true">
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>SW</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mtext>MSA</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>LN</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>MLP</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>LN</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mover accent="true">
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mover accent="true">
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
<p>In <xref ref-type="disp-formula" rid="e3">Equation 3</xref>, <inline-formula id="inf12">
<mml:math id="m15">
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> represents the output features of layers <inline-formula id="inf13">
<mml:math id="m16">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf14">
<mml:math id="m17">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf15">
<mml:math id="m18">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. <inline-formula id="inf16">
<mml:math id="m19">
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msup>
<mml:mover accent="true">
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>l</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mover accent="true">
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> denotes the residual output after the MSA module. LN indicates the layer normalization (LN) operation. <inline-formula id="inf17">
<mml:math id="m20">
<mml:mtext>MLP</mml:mtext>
</mml:math>
</inline-formula> refers to the multilayer perceptron (MLP), which performs nonlinear transformation and feature enhancement. This architecture ensures trainability in deep networks through residual connections and LN, while progressively integrating local and global information at each stage via the alternating W-MSA and SW-MSA mechanism (<xref ref-type="bibr" rid="B21">Tang et al., 2025</xref>). Consequently, the study centers on introducing the ST-based YOLOv8 to construct a harvesting target positioning module. Its structure is illustrated in <xref ref-type="fig" rid="F3">Figure 3</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Schematic illustration of the harvesting target positioning module architecture.</p>
</caption>
<graphic xlink:href="fmech-12-1741396-g003.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a neural network architecture with labeled sections: Input, Backbone, Neck, and Head. Various blocks include Patch Partition, Linear Embedding, and ST Block. Functions like Concat and Upsample are indicated along with losses: BCE, DF, and CIoU.</alt-text>
</graphic>
</fig>
<p>In <xref ref-type="fig" rid="F3">Figure 3</xref>, this module centers on the YOLOv8 network, replacing the original backbone with an ST for deep feature extraction to enhance representation capabilities for occluded and small target fruits. A feature pyramid network (FPN) and path aggregation network (PANet) structure make up the neck after the backbone, allowing multi-scale feature propagation using both top-down and bottom-up methods. Lastly, a decoupled detection head allows the independent prediction of object category confidence scores and precise spatial coordinates by separating the classification task from the bounding box regression task. Through these enhancements, the harvesting target positioning module achieves accurate fruit object recognition and highly reliable localization in complex agricultural environments, providing high-quality visual input for subsequent robotic arm grasping planning.</p>
</sec>
<sec id="s2-2">
<label>2.2</label>
<title>Obstacle avoidance module for robotic arms based on BIT&#x2a; and YOLOv8-B&#x2a; algorithm construction</title>
<p>The harvesting target positioning module developed in this study achieves high-precision spatial localization of fruit targets. However, its output provides only static coordinate information and lacks dynamic path planning capabilities for robotic arm movements. In unstructured orchard environments, effective obstacle avoidance along the robotic arm&#x2019;s path is crucial for successful harvesting. BIT&#x2a; significantly enhances RRT&#x2019;s convergence efficiency through batch sampling and heuristic pruning mechanisms. Its incremental graph update structure continuously integrates real-time perception data to adapt to dynamic environments. Unlike gradient-based optimization or data-driven planning methods, BIT&#x2a; does not require differentiable environment models or large-scale labeled trajectories. Through state space sampling and pruning, it achieves robust and efficient dynamic obstacle avoidance in unstructured scenarios. Therefore, this study utilizes BIT&#x2a; as the foundation for RAOA operations. <xref ref-type="fig" rid="F4">Figure 4</xref> provides an illustration of its operational procedures (<xref ref-type="bibr" rid="B17">Nenavath and Perumal, 2024</xref>).</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Schematic of the BIT&#x2a; operating flow.</p>
</caption>
<graphic xlink:href="fmech-12-1741396-g004.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a search process using a heuristic approach. The first panel shows an outward search from a minimal solution. The second panel indicates the batch process stops once a solution is found. The third panel shows a new batch of samples, and the search restarts. The final panel suggests repeating the steps with each improved solution.</alt-text>
</graphic>
</fig>
<p>In <xref ref-type="fig" rid="F4">Figure 4</xref>, the operational flow of BIT&#x2a; constitutes an iterative batch sampling process. It intelligently expands sampling batches within the state space and searches for random geometric configurations to identify and continuously optimize paths. BIT&#x2a; explores the solution space by maintaining a tree structure <inline-formula id="inf18">
<mml:math id="m21">
<mml:mrow>
<mml:mi mathvariant="script">T</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>E</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Among these, the vertex set <inline-formula id="inf19">
<mml:math id="m22">
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> represents explored states, while the edge set <inline-formula id="inf20">
<mml:math id="m23">
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes feasible paths between states. Its core lies in generating a sampling batch during each iteration and computing heuristic values to guide the search direction. For any configuration <inline-formula id="inf21">
<mml:math id="m24">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> formed by the joint angles of an arbitrary robotic arm, its heuristic value is jointly determined by the cost <inline-formula id="inf22">
<mml:math id="m25">
<mml:mrow>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mtext>current</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> of the current solution and the estimated cost (EC) <inline-formula id="inf23">
<mml:math id="m26">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>h</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> to the target. The algorithm first constructs two search trees from the start and target points, respectively, and continuously performs heuristic sorting, as shown in <xref ref-type="disp-formula" rid="e4">Equation 4</xref> (<xref ref-type="bibr" rid="B8">Huynh et al., 2023</xref>).<disp-formula id="e4">
<mml:math id="m27">
<mml:mrow>
<mml:mfenced open="{" close="" separators="|">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mover accent="true">
<mml:mi>h</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>min</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mtext>current</mml:mtext>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mover accent="true">
<mml:mi>h</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>In <xref ref-type="disp-formula" rid="e4">Equation 4</xref>, <inline-formula id="inf24">
<mml:math id="m28">
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represents the actual path cost from the starting point <inline-formula id="inf25">
<mml:math id="m29">
<mml:mrow>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mtext>start</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to the current state <inline-formula id="inf26">
<mml:math id="m30">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (e.g., path length (PL)). <inline-formula id="inf27">
<mml:math id="m31">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>h</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes the heuristic EC from <inline-formula id="inf28">
<mml:math id="m32">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to the target point <inline-formula id="inf29">
<mml:math id="m33">
<mml:mrow>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mtext>goal</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, typically using Euclidean distance. <inline-formula id="inf30">
<mml:math id="m34">
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is used to prioritize candidate expansion nodes, favoring exploration in potentially optimal path directions. <inline-formula id="inf31">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mtext>current</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the total path cost of currently known feasible solutions. <inline-formula id="inf32">
<mml:math id="m36">
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes the upper bound on path cost achievable via the node, used for ranking and pruning (<xref ref-type="bibr" rid="B26">Xu et al., 2022</xref>). In each batch processing, if <inline-formula id="inf33">
<mml:math id="m37">
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3e;</mml:mo>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mtext>current</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> holds, it indicates that the node cannot produce a better solution and is pruned. The algorithm only expands vertices that satisfy <inline-formula id="inf34">
<mml:math id="m38">
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2264;</mml:mo>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mtext>current</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf35">
<mml:math id="m39">
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2264;</mml:mo>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mtext>current</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, thereby effectively pruning search regions unlikely to improve the current solution. Whenever a new solution or a better solution is found, <inline-formula id="inf36">
<mml:math id="m40">
<mml:mrow>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mtext>current</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is updated, and the search restarts to find a better path on a more finely sampled graph (<xref ref-type="bibr" rid="B9">Johnson et al., 2023</xref>).</p>
<p>However, the standard BIT&#x2a; algorithm is primarily optimized for static environments and struggles to effectively handle dynamic changes such as leaf swaying in orchards. Long-range relationships and forward-backward contextual information in time-series data can be effectively captured by BiLSTM thanks to its special bidirectional gated recurrent structure (<xref ref-type="bibr" rid="B28">Yu et al., 2024</xref>). Therefore, this study introduces BiLSTM into BIT&#x2a;. Its core function is to capture the temporal movement patterns of dynamic obstacles. It learns trends in direction and velocity changes from historical trajectories through a bidirectional gating mechanism, enabling predictions of future positions within short time intervals. These predictions serve as prior knowledge that is fed into the BIT&#x2a; algorithm. This allows the algorithm to proactively avoid areas where dynamic obstacles are expected to be during the path search. This enhances the planning system&#x2019;s foresight and improves the success rate of dynamic obstacle avoidance. The structure of BiLSTM is shown in <xref ref-type="fig" rid="F5">Figure 5</xref>.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Schematic illustration of the BiLSTM architecture.</p>
</caption>
<graphic xlink:href="fmech-12-1741396-g005.tif">
<alt-text content-type="machine-generated">Diagram of a Bidirectional Long Short-Term Memory (LSTM) network. It shows two LSTM sequences: LSTM_L (green) processing left-to-right and LSTM_R (blue) processing right-to-left. Input is at the bottom, with arrows indicating the flow of data between the LSTM cells. Intermediate states are represented by vertical stacks of circles, and each LSTM cell is shown as a rectangular block.</alt-text>
</graphic>
</fig>
<p>In <xref ref-type="fig" rid="F5">Figure 5</xref>, the BiLSTM consists of two independent LSTM layers, forward and backward, which process the sequence input in the forward and reverse directions, respectively. The hidden state (HS) outputs from both directions are ultimately combined to capture the full contextual information. The core of the BiLSTM is its gating mechanism. Its computational steps involve the forget gate <inline-formula id="inf37">
<mml:math id="m41">
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, the input gate <inline-formula id="inf38">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, the output gate <inline-formula id="inf39">
<mml:math id="m43">
<mml:mrow>
<mml:msub>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and cell state (CS) updates. At time step <inline-formula id="inf40">
<mml:math id="m44">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, the forward LSTM (denoted as LSTM<sub>L</sub>) first determines which information should be forgotten and which new information needs to be stored, as shown in <xref ref-type="disp-formula" rid="e5">Equation 5</xref> (<xref ref-type="bibr" rid="B10">Kumudham et al., 2024</xref>).<disp-formula id="e5">
<mml:math id="m45">
<mml:mrow>
<mml:mfenced open="{" close="" separators="|">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo>&#xb7;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#xb7;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>C</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="italic">tanh</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>C</mml:mi>
</mml:msub>
<mml:mo>&#xb7;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mi>C</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2a;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2a;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>C</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<p>In <xref ref-type="disp-formula" rid="e5">Equation 5</xref>, <inline-formula id="inf41">
<mml:math id="m46">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the current input. <inline-formula id="inf42">
<mml:math id="m47">
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mi>C</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> denotes the corresponding bias. <inline-formula id="inf43">
<mml:math id="m48">
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>C</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> signifies the corresponding weight. <inline-formula id="inf44">
<mml:math id="m49">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> indicates the HS from the previous time step. <inline-formula id="inf45">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> determines which information from the previous CS <inline-formula id="inf46">
<mml:math id="m51">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> should be retained or forgotten. <inline-formula id="inf47">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, together with the candidate CS <inline-formula id="inf48">
<mml:math id="m53">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>C</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, jointly determines which information needs to be updated into the CS at the current time step. <inline-formula id="inf49">
<mml:math id="m54">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the current CS, computed jointly by <inline-formula id="inf50">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf51">
<mml:math id="m56">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf52">
<mml:math id="m57">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>C</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Next, based on the updated CS, LSTM<sub>L</sub> computes <inline-formula id="inf53">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and the current HS <inline-formula id="inf54">
<mml:math id="m59">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, as shown in <xref ref-type="disp-formula" rid="e6">Equation 6</xref>.<disp-formula id="e6">
<mml:math id="m60">
<mml:mrow>
<mml:mfenced open="{" close="" separators="|">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>o</mml:mi>
</mml:msub>
<mml:mo>&#xb7;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mi>o</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2a;</mml:mo>
<mml:mo>&#x2061;</mml:mo>
<mml:mi mathvariant="italic">tanh</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
<p>In <xref ref-type="disp-formula" rid="e6">Equation 6</xref>, <inline-formula id="inf55">
<mml:math id="m61">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>o</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf56">
<mml:math id="m62">
<mml:mrow>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mi>o</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represent the weights and bias of <inline-formula id="inf57">
<mml:math id="m63">
<mml:mrow>
<mml:msub>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, respectively. For the reverse LSTM (denoted as LSTM<sub>R</sub>), it is computed in the same manner but operates in reverse along the time series, thereby generating the reverse HS <inline-formula id="inf58">
<mml:math id="m64">
<mml:mrow>
<mml:msubsup>
<mml:mi>h</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="bibr" rid="B30">Zhai et al., 2024</xref>). Finally, the output of the BiLSTM at time step <inline-formula id="inf59">
<mml:math id="m65">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the concatenation of the forward HS <inline-formula id="inf60">
<mml:math id="m66">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and the backward HS <inline-formula id="inf61">
<mml:math id="m67">
<mml:mrow>
<mml:msubsup>
<mml:mi>h</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, yielding <inline-formula id="inf62">
<mml:math id="m68">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>h</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. This enables the model to fuse bidirectional contextual information across the entire sequence. The BiLSTM takes as input a time-based, sliding-window sequence of dynamic obstacle states, each of which typically contains three-dimensional position coordinates. This sequence is continuously acquired and provided by the system during operation through its real-time perception and tracking module. The network&#x2019;s final output is a predicted sequence of dynamic obstacle positions over several future planning cycles. This sequence is converted into a dynamic cost map that directly guides the generation of collision-free trajectories for the BIT&#x2a; search. Consequently, this study investigates the BIT&#x2a; based on the fusion capabilities of the BiLSTM for temporal prediction, constructing a RAOA module. <xref ref-type="fig" rid="F6">Figure 6</xref> displays its structure.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Schematic illustration of the RAOA module architecture.</p>
</caption>
<graphic xlink:href="fmech-12-1741396-g006.tif">
<alt-text content-type="machine-generated">Diagram illustrating a sequence prediction model for dynamic obstacle trajectories. The State sequence inputs into LSTM layers labeled LSTML and LSTMR, producing outputs h0, h1, and h2, leading to predictions of dynamic obstacle trajectories. The output generates a collision-free path point sequence displayed as a graphic with paths, dynamic obstacles, and a goal indicated by a star.</alt-text>
</graphic>
</fig>
<p>In <xref ref-type="fig" rid="F6">Figure 6</xref>, this module employs the BIT&#x2a; algorithm as its core framework. Through its iterative batch sampling and heuristic pruning mechanisms, it achieves efficient and asymptotically optimal path planning for robotic arms in complex, unstructured environments. This module integrates a BiLSTM neural network, leveraging its powerful bidirectional long-range temporal dependency modeling capabilities to accurately predict the movement trends of dynamic obstacles such as swaying branches and leaves. This predictive information is incorporated into the BIT&#x2a; search process from the beginning, which significantly enhances the planning system&#x2019;s forward-looking decision-making capabilities and the robustness of dynamic obstacle avoidance. Ultimately, this ensures the robotic arm generates collision-free trajectories that are safe, smooth, and actively adapt to environmental changes. In summary, this research integrates the harvesting target positioning module with the RAOA module to construct the YOLOv8-B&#x2a; harvesting target positioning and RAOA algorithm. Its overall structure is illustrated in <xref ref-type="fig" rid="F7">Figure 7</xref>.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Schematic illustration of the YOLOv8-B&#x2a; structure.</p>
</caption>
<graphic xlink:href="fmech-12-1741396-g007.tif">
<alt-text content-type="machine-generated">Diagram illustrating a neural network architecture for RGB image processing, highlighting the input module, backbone, neck, and head for feature extraction. It shows pathways leading to the prediction of dynamic obstacle trajectories and collision-free paths. Includes LSTM modules for sequence prediction, generating 2D pixel coordinates, 3D poses, and depth data for hand-eye calibration.</alt-text>
</graphic>
</fig>
<p>In <xref ref-type="fig" rid="F7">Figure 7</xref>, the algorithm first employs the ST BN within the harvesting target positioning module to extract multi-scale global features, enhancing the model&#x2019;s perception of occluded targets and complex backgrounds. The YOLOv8 framework then utilizes its FPN to achieve MSFF. A decoupled detection head simultaneously performs fruit classification and precise localization, ultimately outputting the fruit&#x2019;s exact pixel coordinates. Subsequently, hand-eye calibration converts the 2D coordinates into a 3D pose within the robot&#x2019;s base coordinate system. This pose, along with depth point cloud data, is input into the RAOA module. Within this module, a BiLSTM network predicts the motion trajectories of dynamic obstacles. The BIT&#x2a; algorithm performs real-time, collision-free path planning based on environmental geometry and dynamic prediction results. It ultimately generates an optimal sequence of motion trajectories for the joint space of the robotic arm.</p>
<p>Additionally, in practical deployment, the visual system adopted in this research employs an &#x201c;eye-to-hand&#x201d; configuration, where the camera is fixed outside the robot&#x2019;s workspace. This setup stabilizes the camera&#x2019;s field of view during robotic arm movements. This enables continuous observation of the relationships between the robotic arm, the target fruit, and dynamic obstacles. It provides the BIT&#x2a; planner with stable, global environmental perception input. This setup avoids the severe perspective shifts and occlusion issues inherent in &#x201c;eye-on-hand&#x201d; configurations caused by robotic arm motion. It simplifies the complexity of hand-eye calibration and coordinate transformation, thereby enhancing the robustness and real-time performance of the entire vision servo system.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Results and analysis</title>
<p>Testing is done in two dimensions: target localization and obstacle avoidance planning, to confirm the efficacy of the suggested YOLOv8-B&#x2a; algorithm in intricate agricultural settings. The target localization dimension evaluates fruit RA and localization deviation by constructing test sets with varying occlusions and lighting conditions. The obstacle avoidance planning dimension analyzes path planning efficiency by generating dynamic and static obstacles in typical orchard scenarios. The algorithm&#x2019;s efficacy is comprehensively validated through comparative experiments. The study conducts systematic testing on a mobile robotic platform equipped with a six-degree-of-freedom robotic arm. Using peach trees and their fruits as representative subjects, algorithm validation is performed specifically for their characteristics of dense growth and susceptibility to obstruction by branches and foliage. Subsequent simulations and performance analyses are all based on this specific crop scenario.</p>
<p>To ensure the validity of statistical inference, the study rigorously selects appropriate statistical methods based on data characteristics. Performance metrics for the target localization experiment are calculated using a large-scale independent test set. To account for environmental uncertainty, metrics for the obstacle avoidance planning experiment are obtained through independent, repeated runs across 30 randomly generated dynamic scenarios. For all intergroup comparisons of continuous performance metrics, this study employs independent samples t-tests to assess the significance of mean differences. Benefiting from ample samples and experimental repetitions, the sample mean distributions of performance metrics satisfy the conditions of the central limit theorem, meeting the requirements for parametric testing. All significance results (e.g., <italic>p</italic> &#x3c; 0.05, <italic>p</italic> &#x3c; 0.01) are based on this test, indicating that improvements in algorithm performance are statistically significant.</p>
<sec id="s3-1">
<label>3.1</label>
<title>Target positioning performance testing</title>
<p>In target localization performance testing, the study leverages the PyTorch deep learning framework to implement the YOLOv8-B&#x2a; architecture. OpenCV is utilized for image preprocessing and result visualization, with the Ultralytics YOLOv8 open-source code repository serving as the foundation for algorithm development. An automated testing framework implemented in Python 3.8 simulates various typical agricultural scenarios, including multi-object occlusion, sudden lighting changes, and foliage interference. This approach supports configurable dynamic environmental parameters and real-time system stress testing. Parameter settings align with those described in the research methodology section. The study employs the PhenoBench dataset as both the test and training sets (stratified randomly split 2:8). This dataset comprises over 100,000 high-resolution aerial images of farmland captured by drones, providing pixel-level annotated crop semantic segmentation masks and annotations for more than 500,000 crop leaf instances. The PhenoBench dataset closely mirrors the visual challenges encountered in close-range harvesting scenarios by encompassing dense crop arrangements, complex foliage occlusions, and variable lighting conditions. Its large-scale, high-quality pixel-level annotations enable models to learn more generalizable feature representations, thereby enhancing robustness in both structured and unstructured orchard environments. Consequently, selecting this dataset for algorithm validation is both reasonable and representative (<xref ref-type="bibr" rid="B23">Weyler et al., 2024</xref>).</p>
<p>Additionally, the study compares methods from references (<xref ref-type="bibr" rid="B16">Miao et al., 2023</xref>; <xref ref-type="bibr" rid="B6">Gong et al., 2022</xref>; <xref ref-type="bibr" rid="B7">Hu et al., 2022</xref>) with YOLOv8-B&#x2a;, specifically YOLOv5 and traditional image processing fusion algorithm (YOLOv5T), multisource image-fused mask R-CNN (MMRC), and YOLOv3 and mask R-CNN integrated dual-network framework (YOLOv3MR). These methods represent state-of-the-art approaches from 2022 to 2024, encompassing technical paradigms such as traditional and deep learning fusion, multi-source information perception, and dual-network collaborative optimization. They provide a comprehensive validation of YOLOv8-B&#x2a;&#x2019;s object localization performance. To validate the performance of the algorithm in complex, unstructured field environments, as described in the background section, field images of peach trees exhibiting typical occlusions, uneven lighting, and foliage interference are selected for testing. The target localization performance of different methods is visually compared, with results shown in <xref ref-type="fig" rid="F8">Figure 8</xref>.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Visual validation of the model&#x2019;s object localization performance.</p>
</caption>
<graphic xlink:href="fmech-12-1741396-g008.tif">
<alt-text content-type="machine-generated">Five images compare the original image of a tree with red fruit to various filtered versions: Yv5T, MMRC, Y3MR, and Yv8-B*. Each filter highlights the fruit differently, altering their apparent color from light to dark hues. The tree and fruit remain the central focus in each variant.</alt-text>
</graphic>
</fig>
<p>In <xref ref-type="fig" rid="F8">Figure 8</xref>, YOLOv5T&#x2019;s feature extraction capability is constrained by the simple fusion of traditional image processing with YOLOv5, resulting in the detection of only 17 fruits (recall rate of 77.3%). This highlights the limitations of local modeling mechanisms in complex environments. MMRC detects 18 fruits (81.8%) by relying on multi-source image registration strategies, but its geometric reconstruction process suffers from cumulative errors. Although YOLOv3MR receives 19 detections (86.4%) through dual network integration with YOLOv3 and Mask R-CNN, it fails to resolve issues of insufficient feature alignment and sensitivity to occlusion. Additionally, YOLOv8-B&#x2a; significantly enhances spatial perception of partially occluded fruits through ST&#x2019;s W-MSA/SW-MSA, achieving 21 detections (95.5%) to lead the evaluation. The W-MSA/SW-MSA mechanism allows the model to infer and fill in visual details in areas blocked by foliage. This is done by creating connections between non-local windows. This allows the model to use contextual information from unobscured parts of the fruit. This directly validates its effective handling of unstructured challenges, such as &#x201c;branch occlusion&#x201d; and &#x201c;scale variation,&#x201d; as defined in the background. It demonstrates that the introduced global attention mechanism significantly improves robustness of visual perception in complex, real-world environments. Subsequently, to quantitatively assess model accuracy and robustness, the study compares RA and recall under occlusion (RO) across different methods. The former represents the proportion of correctly identified fruits compared to the total number of fruits. The latter indicates the proportion of successfully detected fruits among all obscured fruits under occlusion conditions. The results are shown in <xref ref-type="fig" rid="F9">Figure 9</xref>.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Validation of target localization accuracy and robustness. <bold>(a)</bold> MRA difference <bold>(b)</bold> RO difference.</p>
</caption>
<graphic xlink:href="fmech-12-1741396-g009.tif">
<alt-text content-type="machine-generated">Two box plots comparing four methods: YOLOv5T, YOLOv3MR, MMRC, and YOLOv8-B*. Plot (a) shows MRA percentages, with values around 85% to 95%. Plot (b) shows RO percentages, ranging from 70% to 90%. Each color-coded box represents a different method.</alt-text>
</graphic>
</fig>
<p>In <xref ref-type="fig" rid="F9">Figure 9a</xref>, YOLOv8-B&#x2a; achieves a significantly higher RA range of 93.0%&#x2013;96.5% compared to the baseline model (<italic>p</italic> &#x3c; 0.001). By incorporating the ST module to enhance MSFF and global context modeling, it effectively improves fruit RA detection under complex occlusions. This improvement stems from the Swin Transformer&#x2019;s ability to surpass the local receptive field limitations of traditional CNNs by incorporating discriminative features throughout the entire image. This makes it more robust against inter-class confusion caused by uneven lighting or similar colors. YOLOv5T, relying on traditional image processing and simple YOLOv5 fusion, is limited in feature extraction capability, achieving an RA range of only 85.2%&#x2013;89.8%. MMRC partially improves perception through its multi-source image fusion strategy, attaining an RA of 88.8%&#x2013;92.4%. In <xref ref-type="fig" rid="F9">Figure 9b</xref>, YOLOv8-B&#x2a; also demonstrates a significant lead in RO ranges of 85.1%&#x2013;89.7% under occlusion scenarios (<italic>p</italic> &#x3c; 0.001). This advantage stems from the ST&#x2019;s sliding window mechanism, which enhances feature retention and spatial reasoning capabilities for partially occluded objects. Specifically, SW-MSA enables cross-window information exchange through window shifting, allowing the model to &#x201c;borrow&#x201d; features from adjacent visible regions to enhance the representation of the occluded fruit body. Although YOLOv3MR achieves relatively high recall rates (80.2%&#x2013;85.9%) by integrating YOLOv3 and Mask R-CNN, it does not fundamentally resolve the issue of feature loss caused by occlusion. MMRC relies on multi-source registration and geometric reconstruction, yielding RO values of 75.3%&#x2013;82.7%. The localization error (LE) of the different methods is then compared to evaluate the positioning accuracy of the models. 
LE is defined as the Euclidean distance between the predicted fruit center and the ground-truth center, as shown in <xref ref-type="fig" rid="F10">Figure 10</xref>.</p>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Validation of the model&#x2019;s localization accuracy. <bold>(a)</bold> LE difference <bold>(b)</bold> Means and standard deviation.</p>
</caption>
<graphic xlink:href="fmech-12-1741396-g010.tif">
<alt-text content-type="machine-generated">Graph (a) shows a scatter plot of task scenarios versus LE (mm) with different methods indicated: YOLOv5T, MMRC, YOLOv3MR, and YOLOv8-B*, each marked by distinct shapes and colors. Graph (b) is a bar chart comparing the means and standard deviations of LE (mm) for the four methods, with YOLOv5T showing the highest mean.</alt-text>
</graphic>
</fig>
<p>In <xref ref-type="fig" rid="F10">Figures 10a,b</xref>, YOLOv8-B&#x2a; exhibits an average LE of 12.33&#xa0;mm with the lowest standard deviation (1.70&#xa0;mm), demonstrating significantly superior performance compared to YOLOv5T (23.21&#xa0;mm, <italic>p</italic> &#x3c; 0.001), MMRC (19.18&#xa0;mm, <italic>p</italic> &#x3c; 0.01), and YOLOv3MR (15.70&#xa0;mm, <italic>p</italic> &#x3c; 0.05). The main source of this benefit is the W-MSA and SW-MSA processes of the ST, which improve its capacity to represent global spatial relationships. This mechanism enables BOB regression to anchor more precisely to the visible portion of the fruit and its geometric center, reducing drift errors caused by misleading local features. In typical scenarios, YOLOv8-B&#x2a; achieves an optimal value of 9.8&#xa0;mm in Scenario 6, where its sliding window attention effectively captures the geometric features of occluded fruits. The model can more accurately infer the complete contours and center positions of partially obscured fruits through global context, thereby achieving millimeter-level positioning accuracy. YOLOv3MR achieves 13.5&#xa0;mm in Scenario 1 but overlaps with YOLOv8-B&#x2a;&#x27;s 12.9&#xa0;mm performance in Scenario 10, revealing limitations in feature alignment during dual-network integration. MMRC&#x2019;s minimum value of 15.5&#xa0;mm in Scenario 28 remains higher than YOLOv8-B&#x2a; in most scenarios, indicating that multi-source image registration fails to resolve cumulative error issues. YOLOv5T exhibits a maximum error of 28.2&#xa0;mm in Scenario 10, highlighting the instability of traditional frameworks under dynamic lighting conditions. To evaluate the model&#x2019;s object localization efficiency and real-time performance, this study compares the processing frame rate (PFR) and inference time (IT) across different methods, as displayed in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Validation of the model&#x2019;s object localization efficiency and real-time performance.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Task scenarios</th>
<th colspan="4" align="center">PFR (fps)</th>
<th colspan="4" align="center">IT (ms)</th>
</tr>
<tr>
<th align="center">YOLOv5T</th>
<th align="center">MMRC</th>
<th align="center">YOLOv3MR</th>
<th align="center">YOLOv8-B&#x2a;</th>
<th align="center">YOLOv5T</th>
<th align="center">MMRC</th>
<th align="center">YOLOv3MR</th>
<th align="center">YOLOv8-B&#x2a;</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">5</td>
<td align="center">46.3</td>
<td align="center">22.3</td>
<td align="center">30.1</td>
<td align="center">32.0</td>
<td align="center">25.3</td>
<td align="center">62.4</td>
<td align="center">35.3</td>
<td align="center">32.3</td>
</tr>
<tr>
<td align="center">10</td>
<td align="center">43.7</td>
<td align="center">22.4</td>
<td align="center">28.1</td>
<td align="center">31.3</td>
<td align="center">20.8</td>
<td align="center">57.6</td>
<td align="center">39.3</td>
<td align="center">34.9</td>
</tr>
<tr>
<td align="center">15</td>
<td align="center">45.1</td>
<td align="center">15.4</td>
<td align="center">26.9</td>
<td align="center">31.1</td>
<td align="center">24.3</td>
<td align="center">46.1</td>
<td align="center">38.8</td>
<td align="center">34.3</td>
</tr>
<tr>
<td align="center">20</td>
<td align="center">46.1</td>
<td align="center">19.3</td>
<td align="center">25.4</td>
<td align="center">34.9</td>
<td align="center">20.9</td>
<td align="center">52.9</td>
<td align="center">33.8</td>
<td align="center">28.3</td>
</tr>
<tr>
<td align="center">25</td>
<td align="center">44.0</td>
<td align="center">20.0</td>
<td align="center">26.9</td>
<td align="center">35.7</td>
<td align="center">19.3</td>
<td align="center">52.4</td>
<td align="center">31.7</td>
<td align="center">34.4</td>
</tr>
<tr>
<td align="center">30</td>
<td align="center">40.3</td>
<td align="center">18.5</td>
<td align="center">25.3</td>
<td align="center">31.0</td>
<td align="center">22.1</td>
<td align="center">59.1</td>
<td align="center">30.8</td>
<td align="center">31.6</td>
</tr>
<tr>
<td align="center">Means</td>
<td align="center">44.3</td>
<td align="center">19.7</td>
<td align="center">27.1</td>
<td align="center">32.7</td>
<td align="center">22.1</td>
<td align="center">55.1</td>
<td align="center">35.0</td>
<td align="center">32.6</td>
</tr>
<tr>
<td align="center">Standard deviation</td>
<td align="center">2.0</td>
<td align="center">2.4</td>
<td align="center">1.6</td>
<td align="center">1.9</td>
<td align="center">2.1</td>
<td align="center">5.3</td>
<td align="center">3.2</td>
<td align="center">2.3</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In <xref ref-type="table" rid="T1">Table 1</xref>, YOLOv8-B&#x2a; achieves the optimal performance-speed balance with a frame rate of 32.7 fps and a latency of 32.6&#xa0;ms. Its IT is significantly lower than MMRC (55.1&#xa0;ms, <italic>p</italic> &#x3c; 0.001) and YOLOv3MR (35.0&#xa0;ms, <italic>p</italic> &#x3c; 0.01), attributed to YOLOv8&#x2019;s C2F module and decoupled detection head effectively mitigating the computational overhead of ST. Although Swin Transformer introduces global computations, its windowed attention design effectively complements YOLOv8&#x2019;s efficient feature extraction pipeline and keeps computational complexity within acceptable limits. Although YOLOv5T achieves the highest frame rate of 46.1 fps in Scenario 20, this comes at the expense of localization accuracy. MMRC exhibits a worst latency of 59.1&#xa0;ms in Scenario 30, revealing inherent bottlenecks in multi-source fusion. YOLOv3MR achieves the best IT of 31.7&#xa0;ms in Scenario 25, overlapping with YOLOv8-B&#x2a; performance, yet its average frame rate of 27.1 fps remains insufficient. YOLOv8-B&#x2a; simultaneously achieves 34.9 fps and 28.3&#xa0;ms latency in Scenario 20, validating the synergistic advantages of global modeling and lightweight design.</p>
</sec>
<sec id="s3-2">
<label>3.2</label>
<title>Obstacle avoidance performance verification of robotic arms</title>
<p>For RAOA performance validation, the study constructs a multi-scenario integrated testing environment within the Gazebo simulation platform, featuring dense orchards, crop row aisles, and mobile obstacles. Robot control and algorithm deployment are implemented via ROS. Continuous multi-source data streams are captured at high precision: point cloud from depth cameras (30&#xa0;Hz), LiDAR scans (40&#xa0;Hz), and robotic arm joint torques (1&#xa0;kHz sampling). These streams encompass typical stress events marked by dynamic foliage interference, sudden obstacle intrusions, and multi-target harvesting path conflicts. The robot and algorithm implementation architecture aligns with the target positioning performance testing. Furthermore, the study compares YOLOv8-B&#x2a; with methods from (<xref ref-type="bibr" rid="B31">Zhang et al., 2024</xref>; <xref ref-type="bibr" rid="B13">Liu, 2022</xref>; <xref ref-type="bibr" rid="B27">Yi et al., 2024</xref>): HDRRT, HOPP, and active vision-based view planner (AVVP). These advanced methods from 2022&#x2013;2024 encompass dynamic sampling path planning, hierarchical optimization decision-making, and active perception planning, comprehensively validating YOLOv8-B&#x2a;&#x2019;s RAOA capabilities. The study first selects four ripe fruits as targets within a 1.5&#xa0;m<sup>3</sup> space. Different methods are employed to control the robotic arms for fruit picking. By comparing the picking paths generated by each method, their planning efficiency is intuitively evaluated, as shown in <xref ref-type="fig" rid="F11">Figure 11</xref>.</p>
<fig id="F11" position="float">
<label>FIGURE 11</label>
<caption>
<p>Efficiency verification of harvesting path planning. <bold>(a)</bold> Yv8-B&#x2a; <bold>(b)</bold> HDRRT <bold>(c)</bold> HOPP <bold>(d)</bold> AVVP.</p>
</caption>
<graphic xlink:href="fmech-12-1741396-g011.tif">
<alt-text content-type="machine-generated">Diagram illustrating pathfinding in a grid with four panels labeled (a) to (d). Each grid contains blue circles and several red circles connected by lines, indicating different paths. The grid size is labeled 0.5 meters in each panel. Panels show variations in path connections and configurations.</alt-text>
</graphic>
</fig>
<p>In <xref ref-type="fig" rid="F11">Figure 11a</xref>, the YOLOv8-B&#x2a; algorithm effectively avoids obstacles and generates a globally optimal path through its improved heuristic search structure and dynamic weight adjustment mechanism, achieving a minimum distance of 2.27&#xa0;m, significantly outperforming the comparison model. The BiLSTM&#x2019;s dynamic obstacle prediction prior enables the BIT&#x2a; algorithm to proactively avoid areas where obstacles may appear in the future during heuristic pruning. This directs the search toward safer, more direct pathways and prevents path detours caused by temporary obstacle avoidance. In <xref ref-type="fig" rid="F11">Figure 11b</xref>, HDRRT (2.39&#xa0;m) enhances exploration efficiency through random tree expansion but remains inferior to YOLOv8-B&#x2a;&#x27;s structured search strategy. This fully demonstrates the core influence of algorithmic architecture on path planning performance in complex environments. In <xref ref-type="fig" rid="F11">Figure 11c</xref>, HOPP (2.46&#xa0;m) relies on a traditional rule base, resulting in numerous sharp angles in the path and generating redundant acceleration/deceleration phases during RA motion. In <xref ref-type="fig" rid="F11">Figure 11d</xref>, AVVP (3.06&#xa0;m) integrates visual perception but fails to prioritize targets, resulting in the longest planned path. The YOLOv8-B&#x2a; model has shorter global paths, which directly reduces the overall exposure risk and cumulative collision probability for robotic arms navigating through dense obstacles. This extends the fault-free operation time of robotic arms in unstructured environments with tangled branches, thereby enhancing picking efficiency. Subsequently, the study compares the single obstacle avoidance PL and single planning time (SPT) across different methods to evaluate the overall efficiency of obstacle avoidance planning, as shown in <xref ref-type="fig" rid="F12">Figure 12</xref>.</p>
<fig id="F12" position="float">
<label>FIGURE 12</label>
<caption>
<p>Validation of the model&#x2019;s comprehensive efficiency in obstacle avoidance planning. <bold>(a)</bold> PL difference <bold>(b)</bold> SPT difference.</p>
</caption>
<graphic xlink:href="fmech-12-1741396-g012.tif">
<alt-text content-type="machine-generated">Two box plot graphs compare four methods labeled HDRRT, HOPP, AVVP, and YOLOv8-B*. Graph (a) shows &#x22;PL (cm)&#x22; on the y-axis with values ranging from 9 to 25. HDRRT shows the highest median. Graph (b) shows &#x22;SPT (s)&#x22; on the y-axis with values from 2 to 7. AVVP records the highest median here. Each method is visually distinct with different patterns.</alt-text>
</graphic>
</fig>
<p>In <xref ref-type="fig" rid="F12">Figure 12a</xref>, the PL range of YOLOv8-B&#x2a; (9.8&#xa0;cm to 15.0&#xa0;cm) is significantly shorter than that of HDRRT (18.5&#xa0;cm to 24.3&#xa0;cm, <italic>p</italic> &#x3c; 0.001), HOPP&#x2019;s 15.2&#xa0;cm&#x2013;20.4&#xa0;cm (<italic>p</italic> &#x3c; 0.01), and AVVP&#x2019;s 13.0&#xa0;cm&#x2013;18.7&#xa0;cm (<italic>p</italic> &#x3c; 0.05). Its BIT&#x2a; algorithm generates compact paths through heuristic pruning and BiLSTM dynamic obstacle prediction. The prediction error of BiLSTM primarily influences the conservatism of pruning: high-confidence predictions enable BIT&#x2a; to prune future safe regions more aggressively, directly planning shorter paths, whereas with low confidence, the algorithm retains a wider safety margin, slightly increasing PL to ensure robustness. AVVP achieves an optimal single point of 13.0&#xa0;cm, but increases to 18.4&#xa0;cm in Scenario 30, indicating instability in its view iteration mechanism. HDRRT&#x2019;s random sampling results in the highest path redundancy, reaching 23.4&#xa0;cm in Scenario 0. In environments with dense obstacles, the more compact path of YOLOv8-B&#x2a; enables the robotic arm&#x2019;s end-effector to navigate narrow spaces with smaller movements and closer adherence to the intended trajectory. This significantly reduces unexpected scrapes or collisions caused by path redundancy. In <xref ref-type="fig" rid="F12">Figure 12b</xref>, YOLOv8-B&#x2a; also significantly outperforms competitors (<italic>p</italic> &#x3c; 0.001) with an SPT range of 1.45&#xa0;s&#x2013;2.86&#xa0;s, where its BiLSTM-augmented architecture compresses the search space through spatio-temporal modeling. By preemptively excluding a large number of invalid sampling regions containing future collision risks, BiLSTM&#x2019;s predictions reduce the number of vertices and edges that BIT&#x2a; needs to evaluate. 
This substantially lowers the computational overhead per iteration. Although HOPP achieves 2.42&#xa0;s in Scenario 0, its peak value of 3.87&#xa0;s overlaps with YOLOv8-B&#x2a;, revealing the computational burden of hierarchical optimization. HDRRT is the least efficient in random sampling, taking 3.04&#xa0;s&#x2013;5.25&#xa0;s. YOLOv8-B&#x2a;&#x27;s extremely short planning time enables the system to perform high-frequency replanning. The robotic arm can adjust its trajectory nearly in real time when encountering sudden dynamic obstacles, such as swaying branches, or target position updates. This ability is a prerequisite for achieving reliable dynamic obstacle avoidance. The study also compares the energy consumption of manipulator (ECM) across different methods to evaluate model efficiency, as shown in <xref ref-type="fig" rid="F13">Figure 13</xref>.</p>
<fig id="F13" position="float">
<label>FIGURE 13</label>
<caption>
<p>Validation of the model&#x2019;s economic efficiency. <bold>(a)</bold> ECM difference <bold>(b)</bold> Means and standard deviation.</p>
</caption>
<graphic xlink:href="fmech-12-1741396-g013.tif">
<alt-text content-type="machine-generated">Chart (a) displays ECM values against task scenario numbers, comparing methods HDRRT, HOPP, AVVP, and YOLOv8-B*. Chart (b) shows means and standard deviations for each method. Bars represent means with overlays for standard deviations.</alt-text>
</graphic>
</fig>
<p>In <xref ref-type="fig" rid="F13">Figures 13a,b</xref>, the average ECM of YOLOv8-B&#x2a; is 124.58&#xa0;J, significantly lower than that of HDRRT (228.35&#xa0;J, <italic>p</italic> &#x3c; 0.001), HOPP (186.68&#xa0;J, <italic>p</italic> &#x3c; 0.01), and AVVP (158.52&#xa0;J, <italic>p</italic> &#x3c; 0.05). This advantage stems from the BIT&#x2a; algorithm generating optimal paths to minimize redundant motion, combined with BiLSTM dynamic prediction to avoid abrupt stops and re-planning. BiLSTM&#x2019;s precise predictions enable the robotic arm to smoothly navigate around dynamic obstacles in advance, avoiding the abrupt braking and re-acceleration processes common in traditional reactive obstacle avoidance. This represents one of the key mechanisms for reducing energy consumption. YOLOv8-B&#x2a; achieves the lowest energy consumption of 95&#xa0;J in Scenario 1, where its heuristic pruning and spatio-temporal prediction effectively optimize trajectories. The shorter PL combined with forward-looking speed planning enables the joint motor to operate within its high-efficiency range most of the time. This reduces the additional torque required to overcome inertia and minimizes energy loss. Although AVVP achieves 128&#xa0;J in Scenario 20, overlapping with YOLOv8-B&#x2a;&#x27;s 142&#xa0;J performance in Scenario 1, its view iteration mechanism causes additional kinetic energy consumption to rise to 143&#xa0;J in Scenario 8. HOPP achieves low energy consumption of 153&#xa0;J in Scenario 23, but does not consider joint torque continuity, resulting in energy consumption as high as 223&#xa0;J in Scenarios 11 and 12. HDRRT&#x2019;s random sampling leads to path redundancy, causing the highest energy consumption of 283&#xa0;J in Scenario 30. YOLOv8-B&#x2a;&#x2019;s lower kinetic energy consumption directly reflects the smoothness and efficiency of the robotic arm&#x2019;s trajectory. 
This avoids abrupt acceleration and deceleration caused by emergency obstacle avoidance or suboptimal path planning. This stable motion further reduces the risk of contact collisions between the end-effector and fruits or branches due to vibration or inertia. The study subsequently compares the single-task computational load (CL) and memory consumption (MC) of different methods to evaluate the models&#x2019; potential for broader application, as displayed in <xref ref-type="table" rid="T2">Table 2</xref>.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Validation of the model&#x2019;s potential for promotion and application.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Task scenarios</th>
<th colspan="4" align="center">CL (GFlops)</th>
<th colspan="4" align="center">MC (MB)</th>
</tr>
<tr>
<th align="center">HDRRT</th>
<th align="center">HOPP</th>
<th align="center">AVVP</th>
<th align="center">YOLOv8-B&#x2a;</th>
<th align="center">HDRRT</th>
<th align="center">HOPP</th>
<th align="center">AVVP</th>
<th align="center">YOLOv8-B&#x2a;</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">5</td>
<td align="center">20.3</td>
<td align="center">25.5</td>
<td align="center">72.2</td>
<td align="center">22.7</td>
<td align="center">171</td>
<td align="center">315</td>
<td align="center">212</td>
<td align="center">187</td>
</tr>
<tr>
<td align="center">10</td>
<td align="center">23.5</td>
<td align="center">23.9</td>
<td align="center">49.4</td>
<td align="center">21.3</td>
<td align="center">160</td>
<td align="center">224</td>
<td align="center">221</td>
<td align="center">205</td>
</tr>
<tr>
<td align="center">15</td>
<td align="center">25.1</td>
<td align="center">32.8</td>
<td align="center">54.5</td>
<td align="center">21.9</td>
<td align="center">167</td>
<td align="center">245</td>
<td align="center">253</td>
<td align="center">207</td>
</tr>
<tr>
<td align="center">20</td>
<td align="center">24.4</td>
<td align="center">23.6</td>
<td align="center">64.4</td>
<td align="center">24.5</td>
<td align="center">160</td>
<td align="center">247</td>
<td align="center">186</td>
<td align="center">171</td>
</tr>
<tr>
<td align="center">25</td>
<td align="center">23.2</td>
<td align="center">28.2</td>
<td align="center">70.1</td>
<td align="center">24.2</td>
<td align="center">126</td>
<td align="center">232</td>
<td align="center">231</td>
<td align="center">190</td>
</tr>
<tr>
<td align="center">30</td>
<td align="center">19.9</td>
<td align="center">22.4</td>
<td align="center">72.6</td>
<td align="center">21.3</td>
<td align="center">167</td>
<td align="center">230</td>
<td align="center">259</td>
<td align="center">160</td>
</tr>
<tr>
<td align="center">Means</td>
<td align="center">22.7</td>
<td align="center">26.1</td>
<td align="center">63.9</td>
<td align="center">22.7</td>
<td align="center">159</td>
<td align="center">249</td>
<td align="center">227</td>
<td align="center">187</td>
</tr>
<tr>
<td align="center">Standard deviation</td>
<td align="center">2.0</td>
<td align="center">3.5</td>
<td align="center">9.0</td>
<td align="center">1.3</td>
<td align="center">15</td>
<td align="center">31</td>
<td align="center">25</td>
<td align="center">17</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>
<xref ref-type="table" rid="T2">Table 2</xref> shows that YOLOv8-B&#x2a; achieves optimal resource efficiency with a CL of 22.7 GFlops and MC of 187&#xa0;MB. Its CL is significantly lower than AVVP&#x2019;s 63.9 GFlops (<italic>p</italic> &#x3c; 0.001). Its MC is significantly lower than HOPP&#x2019;s 249&#xa0;MB (<italic>p</italic> &#x3c; 0.01). This advantage stems from the synergistic optimization of BIT&#x2a; heuristic search and BiLSTM prediction, which reduces computational iterations, while the ST window attention mechanism minimizes memory usage through parameter sharing. As a lightweight temporal module, BiLSTM replaces the dynamic environment modeling traditionally achieved through extensive sampling and collision detection, fundamentally reducing the computational complexity of the planner. Among these, YOLOv8-B&#x2a; achieves the lowest CL of 21.3 GFlops in Scenarios 10 and 30. While HDRRT reaches 19.9 GFlops in Scenario 30, overlapping with YOLOv8-B&#x2a;&#x2019;s performance, this comes at the cost of reduced path quality. AVVP, benefiting from multi-source perception and view iteration optimization, achieves peak loads of 72.2 GFlops and 72.6 GFlops in Scenarios 5 and 30, respectively. HOPP&#x2019;s hierarchical structure requires pre-storing global path information, leading to MC of 315&#xa0;MB in Scenario 5, significantly exceeding YOLOv8-B&#x2a;&#x2019;s 187&#xa0;MB performance in the same scenario. The reduced computational and memory demands of YOLOv8-B&#x2a; ensure stable operation of the algorithm on onboard computing units. This frees ample resources for processing high-frequency visual feedback and continuous obstacle avoidance planning. This safeguards the real-time performance and reliability of the entire perception-planning loop in complex scenarios, forming the foundational system for achieving sustained safe obstacle avoidance. 
The deep collaboration between modules significantly reduces the system&#x2019;s overall resource consumption compared to the simple sum of individual modules, demonstrating the superiority of the architectural design.</p>
<p>To validate the necessity of the base module design, the study performs ablation tests on the base module. Visual backbones are replaced with YOLOv5, YOLOv10, efficient channel attention (ECA), squeeze-and-excitation (SE) attention, mobile vision Transformer (MobileViT), and convolution-enhanced Transformer (ConvFormer). Planners are replaced with rapidly-exploring random tree star (RRT&#x2a;) and RRT with eXact anytime optimization (RRTX). LSTM and temporal convolutional network (TCN) are used to replace the temporal predictor (BiLSTM). The results are shown in <xref ref-type="table" rid="T3">Table 3</xref>.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Verification of ablation/replacement for basic modules.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Settings</th>
<th align="center">YOLOv8</th>
<th align="center">Swin transformer</th>
<th align="center">BIT&#x2a;</th>
<th align="center">BiLSTM</th>
<th align="center">RA (%)</th>
<th align="center">PL (cm)</th>
<th align="center">CL (GFlops)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Full (Yv8-B&#x2a;)</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">96.5</td>
<td align="center">12.4</td>
<td align="center">22.7</td>
</tr>
<tr>
<td align="center">A1 (w/o Swin Transformer)</td>
<td align="center">&#x2713;</td>
<td align="center">&#xd7;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">90.1</td>
<td align="center">13.1</td>
<td align="center">18.4</td>
</tr>
<tr>
<td align="center">A2 (w/o BiLSTM)</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#xd7;</td>
<td align="center">95.8</td>
<td align="center">18.7</td>
<td align="center">21.8</td>
</tr>
<tr>
<td align="center">A3 (w/o SwinT and BiLSTM)</td>
<td align="center">&#x2713;</td>
<td align="center">&#xd7;</td>
<td align="center">&#x2713;</td>
<td align="center">&#xd7;</td>
<td align="center">89.6</td>
<td align="center">19.2</td>
<td align="center">17.2</td>
</tr>
<tr>
<td align="center">YOLOv5</td>
<td align="center">&#x2014;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">93</td>
<td align="center">13.8</td>
<td align="center">24.5</td>
</tr>
<tr>
<td align="center">YOLOv10</td>
<td align="center">&#x2014;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">95.3</td>
<td align="center">12.9</td>
<td align="center">25.7</td>
</tr>
<tr>
<td align="center">ECA</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2014;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">92.7</td>
<td align="center">14.5</td>
<td align="center">19.3</td>
</tr>
<tr>
<td align="center">SE attention</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2014;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">93.4</td>
<td align="center">14.2</td>
<td align="center">19.5</td>
</tr>
<tr>
<td align="center">MobileViT</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2014;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">94.6</td>
<td align="center">13.6</td>
<td align="center">20.3</td>
</tr>
<tr>
<td align="center">ConvFormer</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2014;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">95</td>
<td align="center">13.3</td>
<td align="center">23</td>
</tr>
<tr>
<td align="center">RRT&#x2a;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2014;</td>
<td align="center">&#x2713;</td>
<td align="center">96</td>
<td align="center">15.9</td>
<td align="center">19.7</td>
</tr>
<tr>
<td align="center">RRTX</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2014;</td>
<td align="center">&#x2713;</td>
<td align="center">96.1</td>
<td align="center">16.4</td>
<td align="center">26.4</td>
</tr>
<tr>
<td align="center">LSTM</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2014;</td>
<td align="center">95.9</td>
<td align="center">13.8</td>
<td align="center">22</td>
</tr>
<tr>
<td align="center">TCN</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2014;</td>
<td align="center">95.8</td>
<td align="center">14.1</td>
<td align="center">23.4</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As shown in <xref ref-type="table" rid="T3">Table 3</xref>, the complete model (Yv8-B&#x2a;) achieves optimal overall performance in terms of RA (96.5%), PL (12.4&#xa0;cm), and CL (22.7 GFlops). Ablation experiments reveal that removing Swin Transformer (A1) significantly degrades RA (<italic>p</italic> &#x3c; 0.001) and increases PL. This demonstrates its critical role in enhancing RA and generating compact paths through global modeling. Removing BiLSTM (A2) substantially increases PL to 18.7&#xa0;cm (<italic>p</italic> &#x3c; 0.001), validating dynamic prediction&#x2019;s core contribution to path planning efficiency. In module replacements, MobileViT and ConvFormer both yield lower RA (94.6%, 95.0%) than the full model with longer paths. ECA and SE demonstrate weaker accuracy and path performance. Replacing BIT&#x2a; with RRT&#x2a; and RRTX increases PL to 15.9&#xa0;cm and 16.4&#xa0;cm respectively (<italic>p</italic> &#x3c; 0.01), with RRTX achieving higher CL. Substituting BiLSTM with LSTM and TCN also results in longer PLs. Experiments quantitatively confirm that the selected Swin Transformer and BiLSTM modules achieve the optimal balance among RA, path planning efficiency, and CL.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Discussion and conclusion</title>
<p>To address the challenges of inaccurate fruit localization and inefficient dynamic obstacle avoidance in complex agricultural environments, this study proposed the YOLOv8-B&#x2a; fusion algorithm based on an enhanced YOLOv8 and BIT&#x2a;. By incorporating the ST module to enhance MSFF and global context modeling, and integrating a BiLSTM network to endow the BIT&#x2a; algorithm with dynamic obstacle prediction capabilities, an integrated perception-decision-control harvesting robot system was constructed. Experiments demonstrated that YOLOv8-B&#x2a; achieved RA of 93.0%&#x2013;96.5%, RO of 85.1%&#x2013;89.7%, and a mean LE of 12.33&#xa0;mm in the target localization dimension. Compared to the optimal reference model, it improved accuracy by 3.5% and reduced LE by 21.5%. Moreover, in the obstacle avoidance planning dimension, it achieved a PL of 9.8&#xa0;cm&#x2013;5.0&#xa0;cm and a planning time of 1.45&#xa0;s&#x2013;2.86&#xa0;s, reducing PL by 17.8% and improving planning efficiency by 38.2% compared to the optimal comparison model. In actual deployment, the ECM is reduced to 124.58&#xa0;J, with single-task CL and MC at 22.7 GFlops and 187&#xa0;MB respectively. Compared to mainstream methods, resource consumption is reduced by an average of 42.3% and 24.9%, validating the algorithm&#x2019;s comprehensive advantages in accuracy, efficiency, energy consumption, and resource economy.</p>
<p>The architectural innovation of YOLOv8-B&#x2a; lies in its dual-module coordination mechanism: The visual perception module based on ST overcomes the local perception limitations of traditional CNNs through a sliding window attention mechanism, significantly enhancing the representation capability of occluded object features. The BIT&#x2a;-enhanced planning module addresses the response lag issue for dynamic obstacles by combining spatio-temporal context prediction with heuristic search. The two components form a closed-loop system through hand-eye calibration, enabling seamless transition from fruit recognition to path planning. This research has achieved a significant reduction in the CL and memory footprint of single-task operations, compared to the typical computing power and memory capacity of mainstream embedded AI computing platforms like Jetson Orin NX. This indicates that the Yv8-B&#x2a; algorithm architecture possesses the potential for direct porting to such platforms and achieving real-time operation. However, the research has several limitations. First, the ST module has high computational demands, so it needs to be optimized and validated further for deployment on embedded devices. BiLSTM&#x2019;s dynamic prediction relies on historical data quality, potentially leading to error accumulation under extreme occlusion scenarios. Future work will address these challenges through the design of a lightweight hybrid attention mechanism that balances computational efficiency and model performance. Additionally, the development of model lightweighting and operator optimization deployment strategies tailored for edge computing platforms like Jetson will ensure the stable, real-time operation of the algorithm in actual onboard robot systems. Furthermore, a multi-sensor fusion dynamic obstacle trajectory compensation algorithm will be developed to enhance system robustness in adverse environments.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s6">
<title>Author contributions</title>
<p>YX: Writing &#x2013; original draft, Formal Analysis, Methodology, Investigation, Writing &#x2013; review and editing, Data curation, Conceptualization.</p>
</sec>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/134506/overview">Hamid Reza Karimi</ext-link>, Polytechnic University of Milan, Italy</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1691713/overview">Yingxing Jiang</ext-link>, Jiangsu University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3313228/overview">Peng Huo</ext-link>, Inner Mongolia Agricultural University, China</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ayyad</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Sallam</surname>
<given-names>N. M.</given-names>
</name>
<name>
<surname>Gamel</surname>
<given-names>S. A.</given-names>
</name>
<name>
<surname>Ali</surname>
<given-names>Z. H.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Particle swarm optimization with YOLOv8 for improved detection performance of tomato plants</article-title>. <source>J. Big Data</source> <volume>12</volume> (<issue>1</issue>), <fpage>152</fpage>&#x2013;<lpage>153</lpage>. <pub-id pub-id-type="doi">10.1186/s40537-025-01206-6</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bai</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Mao</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Clustered tomato detection and picking point location using machine learning-aided image analysis for automatic robotic harvesting</article-title>. <source>Precis. Agric.</source> <volume>24</volume> (<issue>2</issue>), <fpage>727</fpage>&#x2013;<lpage>743</lpage>. <pub-id pub-id-type="doi">10.1007/s11119-022-09972-6</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Droukas</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Doulgeri</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Tsakiridis</surname>
<given-names>N. L.</given-names>
</name>
<name>
<surname>Triantafyllou</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Kleitsiotis</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Mariolis</surname>
<given-names>I.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>A survey of robotic harvesting systems and enabling technologies</article-title>. <source>J. Intell. Robot. Syst.</source> <volume>107</volume> (<issue>2</issue>), <fpage>21</fpage>&#x2013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.1007/s10846-022-01793-z</pub-id>
<pub-id pub-id-type="pmid">36721646</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Detection of fruit using YOLOv8-based single stage detectors</article-title>. <source>Int. J. Adv. Comput. Sci. Appl.</source> <volume>14</volume> (<issue>12</issue>), <fpage>83</fpage>&#x2013;<lpage>84</lpage>. <pub-id pub-id-type="doi">10.14569/IJACSA.2023.0141208</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gao</surname>
<given-names>Z. C.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>J. X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J. H.</given-names>
</name>
<name>
<surname>Shao</surname>
<given-names>T. Y.</given-names>
</name>
<name>
<surname>Ni</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Cai</surname>
<given-names>H. H.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Deep transfer learning-based computer vision for real-time harvest period classification and impurity detection of porphyra haitnensis</article-title>. <source>Aquac. Int.</source> <volume>32</volume> (<issue>4</issue>), <fpage>5171</fpage>&#x2013;<lpage>5198</lpage>. <pub-id pub-id-type="doi">10.1007/s10499-024-01422-6</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gong</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Robotic harvesting of the occluded fruits with a precise shape and position reconstruction approach</article-title>. <source>J. Field Robot.</source> <volume>39</volume> (<issue>1</issue>), <fpage>69</fpage>&#x2013;<lpage>84</lpage>. <pub-id pub-id-type="doi">10.1002/rob.22041</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Kaizu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H. D.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Y. W.</given-names>
</name>
<name>
<surname>Imou</surname>
<given-names>K. J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Recognition and localization of strawberries from 3D binocular cameras for a strawberry picking robot using coupled YOLO/mask R-CNN</article-title>. <source>Int. J. Agric. Biol. Eng.</source> <volume>15</volume> (<issue>6</issue>), <fpage>175</fpage>&#x2013;<lpage>179</lpage>. <pub-id pub-id-type="doi">10.25165/j.ijabe.20221506.7306</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huynh</surname>
<given-names>L. Q.</given-names>
</name>
<name>
<surname>Tran</surname>
<given-names>L. V.</given-names>
</name>
<name>
<surname>Phan</surname>
<given-names>P. N.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Dao</surname>
<given-names>S. V.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Intermediary RRT&#x2217;-PSO: a multi-directional hybrid fast convergence sampling-based path planning algorithm</article-title>. <source>Comput. Mater. Contin.</source> <volume>76</volume> (<issue>2</issue>), <fpage>2281</fpage>&#x2013;<lpage>2300</lpage>. <pub-id pub-id-type="doi">10.32604/cmc.2023.034872</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Johnson</surname>
<given-names>J. J.</given-names>
</name>
<name>
<surname>Qureshi</surname>
<given-names>A. H.</given-names>
</name>
<name>
<surname>Yip</surname>
<given-names>M. C.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Learning sampling dictionaries for efficient and generalizable robot motion planning with transformers</article-title>. <source>IEEE Robot. Autom. Lett.</source> <volume>8</volume> (<issue>12</issue>), <fpage>7946</fpage>&#x2013;<lpage>7953</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2023.3322087</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kumudham</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Shakir</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Abishek B</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Enhancing brix value prediction in strawberries using machine learning: a fusion of physiochemical and color-based features for improved sweetness assessment</article-title>. <source>Malays. J. Comput. Sci.</source> <volume>37</volume> (<issue>2</issue>), <fpage>107</fpage>&#x2013;<lpage>123</lpage>. <pub-id pub-id-type="doi">10.22452/mjcs</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kyaw</surname>
<given-names>P. T.</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>A. V.</given-names>
</name>
<name>
<surname>Veerajagadheswar</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Elara</surname>
<given-names>M. R.</given-names>
</name>
<name>
<surname>Thu</surname>
<given-names>T. T.</given-names>
</name>
<name>
<surname>Nhan</surname>
<given-names>N. H. K.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Energy-efficient path planning of reconfigurable robots in complex environments</article-title>. <source>IEEE Trans. Robot.</source> <volume>38</volume> (<issue>4</issue>), <fpage>2481</fpage>&#x2013;<lpage>2494</lpage>. <pub-id pub-id-type="doi">10.1109/TRO.2022.3147408</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Positioning of mango picking point using an improved YOLOv8 architecture with object detection and instance segmentation</article-title>. <source>Biosyst. Eng.</source> <volume>247</volume> (<issue>1</issue>), <fpage>202</fpage>&#x2013;<lpage>220</lpage>. <pub-id pub-id-type="doi">10.1016/j.biosystemseng.2024.09.015</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>D. W.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Hierarchial optimal path planning (HOPP) for robotic apple harvesting</article-title>. <source>Int. J. Health Sci. Res.</source> <volume>4</volume> (<issue>3</issue>), <fpage>6</fpage>&#x2013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.36838/v4i3.2</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>The vision-based target recognition, localization, and control for harvesting robots: a review</article-title>. <source>Int. J. Precis. Eng. Manuf.</source> <volume>25</volume> (<issue>2</issue>), <fpage>409</fpage>&#x2013;<lpage>428</lpage>. <pub-id pub-id-type="doi">10.1007/s12541-023-00911-7</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Hua</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Pu</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Using an improved lightweight YOLOv8 model for real-time detection of multi-stage apple fruit in complex orchard environments</article-title>. <source>Artif. Intell. Agric.</source> <volume>11</volume> (<issue>1</issue>), <fpage>70</fpage>&#x2013;<lpage>82</lpage>. <pub-id pub-id-type="doi">10.1016/j.aiia.2024.02.001</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Miao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Efficient tomato harvesting robot based on image processing and deep learning</article-title>. <source>Precis. Agric.</source> <volume>24</volume> (<issue>1</issue>), <fpage>254</fpage>&#x2013;<lpage>287</lpage>. <pub-id pub-id-type="doi">10.1007/s11119-022-09944-w</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nenavath</surname>
<given-names>D. N.</given-names>
</name>
<name>
<surname>Perumal</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>On-tree mango fruit count using live video-split image dataset to predict better yield at pre-harvesting stage</article-title>. <source>Int. J. Elect. Comput. Eng. Syst.</source> <volume>15</volume> (<issue>9</issue>), <fpage>771</fpage>&#x2013;<lpage>782</lpage>. <pub-id pub-id-type="doi">10.32985/ijeces.15.9.5</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pal</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Roy</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Shivakumara</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Pal</surname>
<given-names>U.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Adapting a swin transformer for license plate number and text detection in drone images</article-title>. <source>Artif. Intell. Appl.</source> <volume>1</volume> (<issue>3</issue>), <fpage>145</fpage>&#x2013;<lpage>154</lpage>. <pub-id pub-id-type="doi">10.47852/bonviewAIA3202549</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Panduranga</surname>
<given-names>K. M.</given-names>
</name>
<name>
<surname>Ranganathasharma</surname>
<given-names>R. H.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Sustainability insights on learning-based approaches in precision agriculture in internet-of-things</article-title>. <source>Int. J. Elect. Comput. Eng.</source> <volume>14</volume> (<issue>3</issue>), <fpage>3495</fpage>&#x2013;<lpage>3511</lpage>. <pub-id pub-id-type="doi">10.11591/IJECE.V14I3.PP3495-3511</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tamizi</surname>
<given-names>M. G.</given-names>
</name>
<name>
<surname>Honari</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Nozdryn-Plotnicki</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Najjaran</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>End-to-end deep learning-based framework for path planning and collision checking: bin-picking application</article-title>. <source>Robotica</source> <volume>42</volume> (<issue>4</issue>), <fpage>1094</fpage>&#x2013;<lpage>1112</lpage>. <pub-id pub-id-type="doi">10.1017/S0263574724000109</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Shao</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>TransSSA: invariant cue perceptual feature focused learning for dynamic fruit target detection</article-title>. <source>Comput. Mater. Contin.</source> <volume>83</volume> (<issue>2</issue>), <fpage>2829</fpage>&#x2013;<lpage>2850</lpage>. <pub-id pub-id-type="doi">10.32604/cmc.2025.063287</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>A transformer-based mask R-CNN for tomato detection and segmentation</article-title>. <source>J. Intell. Fuzzy Syst.</source> <volume>44</volume> (<issue>5</issue>), <fpage>8585</fpage>&#x2013;<lpage>8595</lpage>. <pub-id pub-id-type="doi">10.3233/JIFS-222954</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Weyler</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Magistri</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Marks</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Chong</surname>
<given-names>Y. L.</given-names>
</name>
<name>
<surname>Sodano</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Roggiolani</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Phenobench: a large dataset and benchmarks for semantic image interpretation in the agricultural domain</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>46</volume> (<issue>12</issue>), <fpage>9583</fpage>&#x2013;<lpage>9594</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2024.3419548</pub-id>
<pub-id pub-id-type="pmid">38923484</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Lightweight improvement algorithm for target detection of Pu&#x27;er tea harvesting robotic arm based on YOLOv8</article-title>. <source>Int. J. Inf. Commun. Technol.</source> <volume>26</volume> (<issue>8</issue>), <fpage>1</fpage>&#x2013;<lpage>18</lpage>. <pub-id pub-id-type="doi">10.1504/IJICT.2025.145720</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Ju</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Peng</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Mechanical arm obstacle avoidance path planning based on improved artificial potential field method</article-title>. <source>Ind. Robot.</source> <volume>49</volume> (<issue>2</issue>), <fpage>271</fpage>&#x2013;<lpage>279</lpage>. <pub-id pub-id-type="doi">10.1108/IR-06-2021-0120</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A random path sampling-based method for motion planning in many dimensions</article-title>. <source>IEEE Trans. Instrum. Meas.</source> <volume>71</volume> (<issue>1</issue>), <fpage>1</fpage>&#x2013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1109/TIM.2022.3212036</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yi</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>View planning for grape harvesting based on active vision strategy under occlusion</article-title>. <source>IEEE Robot. Autom. Lett.</source> <volume>9</volume> (<issue>3</issue>), <fpage>2535</fpage>&#x2013;<lpage>2542</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2024.3357397</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>An</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>A vision system based on CNN-LSTM for robotic citrus sorting</article-title>. <source>Inf. Process. Agric.</source> <volume>11</volume> (<issue>1</issue>), <fpage>14</fpage>&#x2013;<lpage>25</lpage>. <pub-id pub-id-type="doi">10.1016/j.inpa.2022.06.002</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zeeshan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Aized</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Performance analysis of path planning algorithms for fruit harvesting robot</article-title>. <source>J. Biosyst. Eng.</source> <volume>48</volume> (<issue>2</issue>), <fpage>178</fpage>&#x2013;<lpage>197</lpage>. <pub-id pub-id-type="doi">10.1007/s42853-023-00184-y</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhai</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Xiong</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chung</surname>
<given-names>S. O.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Feature deformation network with multi-range feature enhancement for agricultural machinery operation mode identification</article-title>. <source>Int. J. Agric. Biol. Eng.</source> <volume>17</volume> (<issue>4</issue>), <fpage>265</fpage>&#x2013;<lpage>275</lpage>. <pub-id pub-id-type="doi">10.25165/j.ijabe.20241704.8831</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Harvest motion planning for mango picking robot based on improved RRT-connect</article-title>. <source>Biosyst. Eng.</source> <volume>248</volume> (<issue>1</issue>), <fpage>177</fpage>&#x2013;<lpage>189</lpage>. <pub-id pub-id-type="doi">10.1016/j.biosystemseng.2024.10.008</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Au</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Kang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Intelligent robots for fruit harvesting: recent developments and future challenges</article-title>. <source>Precis. Agric.</source> <volume>23</volume> (<issue>5</issue>), <fpage>1856</fpage>&#x2013;<lpage>1907</lpage>. <pub-id pub-id-type="doi">10.1007/s11119-022-09913-3</pub-id>
</mixed-citation>
</ref>
</ref-list>
</back>
</article>