<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2026.1739203</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>TeaNeRF: an integrated 3D visual perception pipeline for tea bud harvesting</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Chen</surname><given-names>Weiheng</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2937525/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Li</surname><given-names>Xun</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3330976/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Rao</surname><given-names>Lei</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Xia</surname><given-names>Xiang</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>School of Computer Science and Engineering, Wuhan Institute of Technology</institution>, <city>Wuhan</city>,&#xa0;<country country="CN">China</country></aff>
<aff id="aff2"><label>2</label><institution>Hubei Key Laboratory of Intelligent Robot, Wuhan Institute of Technology</institution>, <city>Wuhan</city>,&#xa0;<country country="CN">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Xun Li, <email xlink:href="mailto:lixun@wit.edu.cn">lixun@wit.edu.cn</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-03-02">
<day>02</day>
<month>03</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>17</volume>
<elocation-id>1739203</elocation-id>
<history>
<date date-type="received">
<day>10</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>12</day>
<month>02</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>07</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Chen, Li, Rao and Xia.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Chen, Li, Rao and Xia</copyright-holder>
<license>
<ali:license_ref start_date="2026-03-02">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Accurate perception of tea buds is a fundamental prerequisite for intelligent and precise tea harvesting planning. However, in real tea plantation environments, reliable harvesting-oriented perception at the planning level remains highly challenging due to the small size of tea buds, severe occlusion, complex background clutter, and the lack of accurate three-dimensional spatial information. To address these challenges, we propose TeaNeRF, an integrated three-dimensional visual perception pipeline designed for harvesting-oriented tea bud analysis. Instead of treating detection, segmentation, and spatial analysis as independent tasks, TeaNeRF integrates sequential two-dimensional recognition, monocular depth estimation, and neural radiance field reconstruction into a coherent perception pipeline, allowing accurate spatial understanding of tea buds in complex natural scenes. It should be noted that the proposed integration is conducted at the perception-output level, where multiple modular components are connected through fixed interfaces, rather than through joint optimization or an end-to-end trainable formulation. The proposed framework combines an enhanced YOLO-based detector, prompt-guided segmentation, and monocular depth priors to guide NeRF-based three-dimensional reconstruction. By incorporating depth supervision and semantic-aware neural fields, TeaNeRF generates dense and geometrically consistent point clouds with reliable semantic separation. Quantitative evaluations show consistent improvements in reconstruction fidelity, as reflected by increased PSNR and reduced LPIPS across multiple tea tree scenes. Based on the reconstructed semantic point cloud, a three-dimensional clustering and geometric fitting strategy is further developed to enable tea bud counting and harvesting-oriented candidate point estimation at the perception level. 
Experiments conducted on a real-world dataset of 4,700 tea plantation images demonstrate that TeaNeRF improves detection accuracy (mAP@50 = 91.7%), segmentation quality (IoU = 0.640), and overall three-dimensional perception performance. Case-level counting results on representative tea trees indicate that the proposed 3D semantic point cloud&#x2013;based approach can provide feasible tea bud counting behavior and consistent spatial guidance cues for downstream harvesting planning. By providing structured three-dimensional spatial information, including tea bud locations, counts, and harvesting-oriented candidate points, TeaNeRF offers practical perception-level outputs for downstream planning in automated tea harvesting systems.</p>
</abstract>
<kwd-group>
<kwd>3D reconstruction</kwd>
<kwd>automated harvesting</kwd>
<kwd>NeRF</kwd>
<kwd>SAM2</kwd>
<kwd>semantic segmentation</kwd>
<kwd>tea bud detection</kwd>
<kwd>YOLOv11</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="17"/>
<table-count count="7"/>
<equation-count count="12"/>
<ref-count count="39"/>
<page-count count="21"/>
<word-count count="11112"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Sustainable and Intelligent Phytoprotection</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Tea is a major agricultural product in China, supported by a long-standing tea-drinking tradition and a rapidly expanding market. China accounts for approximately 50% of global tea production, ranking as the world&#x2019;s largest producer (<xref ref-type="bibr" rid="B31">Yu and He, 2018</xref>). Tea harvesting is highly seasonal and labor-intensive, creating substantial workforce demand during peak periods. Although reciprocating cutting tea plucking machines and adaptive canopy-following control strategies have been developed to improve mechanized harvesting stability (<xref ref-type="bibr" rid="B36">Zhang et&#xa0;al., 2025</xref>), they often damage tea buds and unintentionally collect mature leaves, limiting product quality. As a result, manual picking remains the dominant harvesting method. These limitations indicate that precise and selective tea bud harvesting is not merely a mechanical challenge, but fundamentally a perception problem. Harvesting-oriented automation requires accurate tea bud identification and reliable three-dimensional spatial understanding in complex plantation environments, which are not sufficiently supported by existing harvesting systems.</p>
<p>To realize the mechanization and precision of tea picking, accurate identification and localization of tea buds in complex but controlled field environments are essential. Early studies relied primarily on traditional image processing techniques for tea shoot detection. For example, Yang et&#xa0;al. (<xref ref-type="bibr" rid="B29">Yang et&#xa0;al., 2009</xref>) achieved a recognition accuracy of 94% using color component extraction and edge detection. Wu et&#xa0;al. (<xref ref-type="bibr" rid="B24">Wu et&#xa0;al., 2013</xref>) analyzed the color differences between the tea buds and the background regions based on components G and B, reporting recognition accuracies that exceed 92%. Long et&#xa0;al. (<xref ref-type="bibr" rid="B13">Long et&#xa0;al., 2022</xref>) proposed an image segmentation approach based on ultra-green features and the Otsu thresholding method, combined with morphological operations, to effectively extract tea shoot regions. Despite these promising results under relatively controlled conditions, traditional image processing methods heavily depend on hand-crafted features and predefined thresholds, making them highly sensitive to illumination variation, background clutter, and growth-stage diversity in real tea plantation environments. As a result, their robustness and generalization ability are limited in practical harvesting scenarios, motivating the adoption of learning-based approaches with stronger feature representation capability.</p>
<p>With the rapid development of deep learning, tea bud perception has increasingly adopted neural network&#x2013;based methods to overcome the limitations of hand-crafted features, particularly for small-target detection in complex but controlled field environments. Compared with traditional image processing approaches, deep models learn hierarchical feature representations with improved robustness and generalization. Representative studies have focused on enhancing YOLO-based detectors through multi-scale feature extraction, lightweight architectures, and improved loss designs. <xref ref-type="bibr" rid="B37">Zhou et al. (2024)</xref> proposed an optimized YOLOv8-based framework for tea bud detection and yield estimation, demonstrating the feasibility of integrating object detection with production-related analysis in tea plantation environments. For example, Yang et&#xa0;al. (<xref ref-type="bibr" rid="B26">Yang et&#xa0;al., 2019</xref>) and Wang et&#xa0;al. (<xref ref-type="bibr" rid="B22">Wang et&#xa0;al., 2024</xref>, <xref ref-type="bibr" rid="B23">Wang et&#xa0;al., 2025</xref>) improved detection performance by introducing pyramid structures, lightweight convolutions, and decoupled heads, while Gui et&#xa0;al. (<xref ref-type="bibr" rid="B5">Gui et&#xa0;al., 2023</xref>, <xref ref-type="bibr" rid="B6">Gui et&#xa0;al., 2024</xref>), Liu et&#xa0;al. (<xref ref-type="bibr" rid="B12">Liu et&#xa0;al., 2024</xref>), and Jianqiang et&#xa0;al. (<xref ref-type="bibr" rid="B8">Jianqiang et&#xa0;al., 2024</xref>) emphasized lightweight design and robustness under occlusion and high background similarity. 
Beyond detection, several studies explored two-dimensional segmentation and picking point localization, including attention-enhanced semantic segmentation networks (<xref ref-type="bibr" rid="B33">Zhang et&#xa0;al., 2023a</xref>; <xref ref-type="bibr" rid="B2">Chen et&#xa0;al., 2024a</xref>) and joint detection&#x2013;keypoint frameworks based on heatmap supervision or instance segmentation (<xref ref-type="bibr" rid="B21">Song et&#xa0;al., 2025</xref>; <xref ref-type="bibr" rid="B25">Yan et&#xa0;al., 2022</xref>). Pan et&#xa0;al. (<xref ref-type="bibr" rid="B16">Pan et&#xa0;al., 2024</xref>) further combined traditional algorithms with Transformer-based detection and segmentation models to improve performance across multiple tea categories. Zhu et al. (<xref ref-type="bibr" rid="B38">Zhu et al., 2023a</xref>) proposed a tea bud detection and localization method based on an improved YOLOv5s model combined with 3D point cloud processing, enabling three-dimensional picking point estimation from detected regions. Despite these advances, existing deep learning&#x2013;based methods predominantly operate in two-dimensional image space and remain limited in providing reliable harvesting-oriented spatial information. Severe occlusion, bud&#x2013;leaf adhesion, and depth ambiguity often lead to inaccurate localization when inferring picking points from 2D results alone. Consequently, although deep models substantially improve detection and segmentation accuracy, they still fall short of delivering explicit three-dimensional spatial cues required for precise and automated tea bud harvesting.</p>
<p>In recent years, three-dimensional reconstruction technologies have been increasingly adopted in agricultural and crop-related studies, as they provide richer and more accurate spatial information than conventional two-dimensional methods and enable multi-view structural analysis of plants. Representative works have combined deep learning&#x2013;based detection with point cloud reconstruction to estimate picking points (<xref ref-type="bibr" rid="B39">Zhu et&#xa0;al., 2023b</xref>), employed UAV-based multi-view imagery for crop canopy reconstruction (<xref ref-type="bibr" rid="B38">Zhu et&#xa0;al., 2023a</xref>), and developed multi-sensor robotic platforms for high-resolution 3D plant scanning in field environments (<xref ref-type="bibr" rid="B4">Esser et&#xa0;al., 2023</xref>). Three-dimensional reconstruction has also been widely explored for agricultural phenotyping, including NeRF-based peanut plant reconstruction (<xref ref-type="bibr" rid="B19">Saeed et&#xa0;al., 2023</xref>), 3D Gaussian splatting for cotton phenotyping (<xref ref-type="bibr" rid="B7">Jiang et&#xa0;al., 2024</xref>), and SfM-MVS&#x2013;based beet reconstruction for trait extraction (<xref ref-type="bibr" rid="B2">Chen et&#xa0;al., 2024b</xref>). More recently, NeRF has been applied to complex orchard and crop scenes, such as strawberry garden reconstruction (<xref ref-type="bibr" rid="B34">Zhang et&#xa0;al., 2024</xref>), rice spike reconstruction combining YOLOv8 and SAM (<xref ref-type="bibr" rid="B28">Yang et&#xa0;al., 2024b</xref>), and fruit yield estimation using FruitNeRF (<xref ref-type="bibr" rid="B14">Meyer et&#xa0;al., 2024</xref>). From a harvesting-oriented perspective, however, most existing 3D reconstruction and NeRF-based agricultural studies primarily focus on phenotyping, yield estimation, or visual rendering quality, rather than actionable harvesting perception. 
In particular, they often lack tight integration with robust two-dimensional detection and segmentation, fail to produce dense and reliable bud-level semantic point clouds, and do not explicitly generate precise three-dimensional cues, such as picking points, required for automated harvesting. These limitations motivate the development of an integrated 3D perception pipeline that bridges two-dimensional recognition and three-dimensional reconstruction for practical tea bud harvesting.</p>
<p>In summary, although significant progress has been made in tea bud detection, segmentation, and three-dimensional crop reconstruction, existing approaches remain insufficient to meet the practical requirements of automated tea harvesting. Current methods either lack robustness in complex natural environments, fail to provide dense and reliable bud-level spatial representations, or do not effectively integrate two-dimensional perception with high-fidelity three-dimensional reconstruction.</p>
<p>To address these challenges, this study proposes TeaNeRF, an integrated three-dimensional visual perception pipeline tailored for tea bud harvesting planning. By coherently integrating robust 2D detection and segmentation with depth-aware neural radiance field reconstruction, TeaNeRF enables accurate three-dimensional localization, counting, and candidate picking point estimation of tea buds in complex but controlled field environments, thereby providing a practical and reliable perception-level foundation for mechanized tea harvesting preparation.</p>
<p>The main contributions of this study are summarized as follows:</p>
<list list-type="simple">
<list-item>
<p>1. We formulate tea bud harvesting as an integrated three-dimensional visual perception problem and propose a harvesting-oriented perception pipeline that bridges two-dimensional recognition and three-dimensional reconstruction.</p></list-item>
<list-item>
<p>2. We develop a depth-enhanced semantic NeRF representation that supports bud-level three-dimensional reconstruction and semantic point cloud generation for individual tea plants under real-world imaging conditions.</p></list-item>
<list-item>
<p>3. Based on the reconstructed semantic point cloud, we enable accurate tea bud counting and three-dimensional harvesting-oriented candidate point estimation, providing actionable spatial cues for automated tea harvesting.</p></list-item>
</list>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Pipeline overview</title>
<p>In this paper, we propose an integrated three-dimensional visual perception pipeline that integrates tea bud recognition, semantic segmentation, counting, and harvesting-oriented candidate picking point estimation to support harvesting planning and preparation in real tea plantation environments. The overall workflow of the proposed framework is illustrated in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Overview of the proposed TeaNeRF pipeline. <bold>(A)</bold> Multi-view images of tea trees are acquired, and camera poses are estimated using COLMAP; <bold>(B)</bold> YOLOv11 is employed to detect tea buds, and the detection results are used to guide SAM2 for accurate semantic segmentation; <bold>(C)</bold> Depth Anything V2 is applied to estimate monocular depth maps; <bold>(D)</bold> Based on the images and depth priors, a semantic NeRF model reconstructs the 3D structure of the tea tree, from which bud-level point clouds are extracted for tea bud counting and harvesting-oriented candidate point localization.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1739203-g001.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a workflow for extracting features from tea plants using video data, including modules for data acquisition, 2D image processing, depth estimation, and 3D reconstruction, with labeled steps, sample input images, neural network components, segmentation outputs, and point cloud results for tea and bud counting tasks.</alt-text>
</graphic></fig>
<p>The selection of YOLOv11, SAM2, Depth Anything V2, and NeRF was guided by a balance between accuracy, robustness, and practical applicability in complex outdoor harvesting environments. YOLOv11 provides strong performance for small-object detection with high efficiency, while SAM2 enables flexible and accurate segmentation with minimal annotation overhead. Depth Anything V2 offers reliable monocular depth estimation with strong cross-scene generalization, which is well-suited for natural environments. NeRF, combined with depth supervision and semantic rendering, is adopted to obtain spatially coherent fine-scale geometry under repetitive textures and self-occlusion, providing a coherent three-dimensional representation that better supports subsequent harvesting-oriented analysis.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Data acquisition</title>
<p>This study focuses on tea bud image acquisition in complex outdoor environments, and all data were collected autonomously. To improve acquisition efficiency and satisfy the requirements of subsequent 3D reconstruction, an automated image acquisition system was developed. The device enables controlled rotation around individual tea trees and flexible adjustment of the number of captured images by setting the acquisition interval.</p>
<p>All images were captured using an Obsmeet 4K camera, with a resolution of 3840 &#xd7; 2160 pixels and stored in JPG format. Data collection was conducted in April under natural field conditions in Chongyang County, Xianning City, Hubei Province, China. The final dataset contains 4,700 static images of yellow tea buds, covering diverse viewpoints and occlusion conditions, as illustrated in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>. Tea bud targets in the images were manually annotated using the Trex annotation tool to generate bounding-box labels for model training.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Tea tree images captured from different viewpoints. <bold>(A&#x2013;D)</bold> Representative images of the same tea tree captured from four different perspectives.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1739203-g002.tif">
<alt-text content-type="machine-generated">Panel A shows a person standing behind healthy green shrubs with fresh light green shoots. Panel B presents another shrub with slightly more discolored leaves, while a person stands partially visible. Panel C features a dense shrub with green and some purplish leaves, displaying numerous new shoots. Panel D depicts a shrub with several leaves bearing purplish-brown discoloration, and a person&#x2019;s feet and lower legs are visible behind it.</alt-text>
</graphic></fig>
<p>The dataset used in this study was collected from a single geographic region and tea variety, which limits direct evaluation of cross-region and cross-variety generalization. The primary objective of this work is to validate the feasibility and effectiveness of a harvesting-oriented three-dimensional perception pipeline under controlled field conditions, rather than to establish generalization across different tea cultivars, growth stages, or environmental settings. Systematic evaluation under more diverse regions, tea varieties, and growth conditions will be explored in future work.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Data preprocessing</title>
<p>In the data preprocessing stage, COLMAP was first employed to estimate the camera poses (including position and orientation) of the original images, thereby restoring the geometric information during image acquisition; in this work, COLMAP is used for camera pose estimation rather than dense surface reconstruction.</p>
<p>Meanwhile, to further enhance data diversity and improve the generalization of the YOLO-based tea bud recognition model under the available training data conditions, various data augmentation techniques were applied to the dataset, as shown in <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>. These augmentations were only used during the training of the two-dimensional recognition model.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Examples of data augmentation used for tea bud recognition. <bold>(A)</bold> Original image, <bold>(B)</bold> Grayscale conversion, <bold>(C)</bold> Brightness adjustment, <bold>(D)</bold> Gaussian noise addition, <bold>(E)</bold> Hue-saturation adjustment, and <bold>(F)</bold> Cutout.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1739203-g003.tif">
<alt-text content-type="machine-generated">Panel A shows a person standing behind green leafy bushes with visible shoes and lower legs. Panel B presents the same scene in grayscale. Panel C displays the same scene with higher brightness. Panel D matches Panel A for comparison. Panel E shows a cropped version focusing on the left bush and partial shoes. Panel F replicates Panel A with three black rectangles obscuring parts of the plants and ground.</alt-text>
</graphic></fig>
<p>Specifically, the applied methods included converting images to grayscale and adjusting brightness to approximate different illumination appearances, rather than explicitly modeling real-world illumination variations. Random noise was also synthetically added to increase data variability and expose the recognition model to diverse perturbations during training. In addition, random hue and saturation adjustments were applied to increase color diversity and improve generalization in complex scenes.</p>
<p>In contrast, for the three-dimensional reconstruction process, data augmentation and noise injection were not applied. Instead, images were further screened based on quality indicators such as lighting conditions and sharpness. Low-quality images&#x2014;such as those affected by overexposure, underexposure, or low contrast&#x2014;were manually removed to ensure the geometric stability of camera pose estimation and subsequent neural reconstruction, rather than to optimize any specific reconstruction method. After selection, the number of high-quality images retained in each group ranged from 168 to 225.</p>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>2D image processing</title>
<sec id="s2_4_1">
<label>2.4.1</label>
<title>YOLOv11</title>
<p>YOLOv11 (<xref ref-type="bibr" rid="B9">Khanam and Hussain, 2024</xref>) is the new version of the YOLO series, which combines accuracy, speed, and efficiency in real-time target detection tasks. Compared with its predecessor, YOLOv11 is deeply optimized in terms of network structure and training strategy, significantly enhancing feature extraction capability and inference performance, and is suitable for high-precision detection tasks in complex visual scenes.</p>
<p>The traditional nearest-neighbor interpolation upsampling strategy adopted in YOLOv11 relies on fixed interpolation rules and lacks adaptability to local image features. Since only neighboring pixels are involved in the interpolation process, the effective receptive field is limited, making it difficult to capture broader contextual information. For small tea buds distributed in dense and cluttered plantation scenes, this limitation often leads to blurred spatial details and weakened semantic representation, thereby degrading detection accuracy.</p>
<p>To address this issue, we introduce the lightweight DySample dynamic upsampling module (<xref ref-type="bibr" rid="B11">Liu et&#xa0;al., 2023</xref>). DySample employs a point-based adaptive sampling mechanism that dynamically adjusts sampling locations according to local feature responses. Compared with conventional interpolation-based upsampling, DySample significantly reduces parameter count and computational overhead while preserving fine-grained spatial details.</p>
<p>By enhancing feature alignment during upsampling, DySample strengthens the model&#x2019;s focus on tea bud regions and suppresses background interference from leaves and branches. This design improves the robustness of small-object detection in complex natural environments. The specific structure of the DySample module is illustrated in <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Architecture of the proposed Dysample module. The input feature map is first processed by a sampling point generator to produce a sampling set. The generated sampling grid is then applied through a grid sampling operation to obtain the output feature map with scaled spatial resolution. The height, width, and channel dimensions are preserved in the channel domain while the spatial dimensions are adjusted according to the sampling scale.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1739203-g004.tif">
<alt-text content-type="machine-generated">Flowchart diagram illustrating the Dysample structure, with data x of dimensions&#xa0;H by W by C&#xa0;entering a sampling point generator, feeding into a sampling set labeled&#xa0;sH by sW by 2g, then processed by a grid sample step producing x prime&#xa0;of dimensions&#xa0;sH by sW by C.</alt-text>
</graphic></fig>
<p>In the backbone of YOLOv11, the C3k2 module is widely adopted in lightweight variants (e.g., n/s) for efficient feature extraction. While shallow C3k2 blocks are effective in capturing local texture and edge information, conventional convolution operations rely on fixed local receptive fields and exhibit strong sensitivity to background clutter. Such locality and noise sensitivity become more pronounced when detecting small or low-contrast targets, such as tea buds under dense occlusion and complex foliage backgrounds, thereby limiting feature discrimination and localization accuracy.</p>
<p>To alleviate these limitations, we propose a C3k2_DG-SimAM module that introduces lightweight gated feature modulation and parameter-free attention into the original C3k2 design. As illustrated in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref>, the proposed module consists of two main components: a Bottleneck_DG-SimAM unit and a modified C3k2 aggregation structure.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Structure of the Convolutional Gated Linear Unit (CGLU). The module consists of parallel linear projections, a depthwise 3&#xd7;3 convolution, and a nonlinear activation function, followed by a gated multiplication operation and a residual connection to enhance feature representation.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1739203-g005.tif">
<alt-text content-type="machine-generated">Flowchart on a blue background illustrating a neural network module: input splits into two paths, both labeled Linear; one path continues through DW Conv three by three, Activation, then merges with the other path via multiplication, followed by Linear and addition, forming an output.</alt-text>
</graphic></fig>
<p>Bottleneck_DG-SimAM. As shown in <xref ref-type="fig" rid="f6"><bold>Figures&#xa0;6</bold></xref>, <xref ref-type="fig" rid="f7"><bold>7</bold></xref>, the Bottleneck_DG-SimAM integrates a Convolutional Gated Linear Unit (CGLU) with the Simple Attention Module (SimAM) (<xref ref-type="bibr" rid="B30">Yang et al., 2021</xref>). The CGLU introduces lightweight depthwise gating to suppress redundant activations and selectively emphasize informative feature responses, enabling more robust feature extraction under cluttered background conditions. SimAM further refines feature representations by modeling neuron-level importance across spatial and channel dimensions without introducing additional learnable parameters, thereby enhancing feature discrimination while preserving computational efficiency.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Structure of the Simple Attention Module (SimAM). The module generates attention weights from the input feature map, expands them to match the original feature dimensions, and performs feature fusion to adaptively recalibrate channel-wise responses.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1739203-g006.tif">
<alt-text content-type="machine-generated">Diagram illustrating the Simam structure, showing a cuboid labeled x with dimensions H, W, and C. Three processes labeled Generation, Fusion, and Expansion transform and rearrange colored matrices, ending with three colored blocks labeled C, H, and W.</alt-text>
</graphic></fig>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Structure of the proposed C3k2 DG-SimAM module integrating CGLU and SimAM. The left panel illustrates the Bottleneck_DG-SimAM block, which consists of convolution, CGLU, and SimAM attention operations. The right panel shows the overall C3k2-DG-SimAM architecture, including feature splitting, skip connection, stacked Bottleneck_DG-SimAM blocks, concatenation, and final convolution for feature aggregation.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1739203-g007.tif">
<alt-text content-type="machine-generated">Flowchart diagram of two deep learning modules. Left box shows Bottleneck_DG-SimAM with sequential blocks: Input, Conv, CGLU, SimAM_Attention, and Output. Right box shows C3K2_DG-SimAM: Input, Conv, Split into two paths, one to Branch1_Skip, one to multiple Bottleneck_DG-SimAM units, then Concat, Conv, and Output.</alt-text>
</graphic></fig>
<p>C3k2_DG-SimAM. As illustrated in <xref ref-type="fig" rid="f7"><bold>Figure&#xa0;7</bold></xref>, multiple Bottleneck_DG-SimAM units are embedded into the original C3k2 framework following a split&#x2013;transform&#x2013;concat strategy. This design preserves the efficient multi-branch feature aggregation characteristics of C3k2 while incorporating gated modulation and attention-enhanced feature refinement. By strengthening the representation of fine-grained and low-contrast features, the proposed C3k2_DG-SimAM module improves robustness.</p>
<p>In addition, a new loss function, InnerIoU (Inner Intersection over Union), is introduced to improve the accuracy of the bounding box for small and difficult-to-locate tea bud targets. Unlike traditional IoU, which only evaluates the overlapping area between the bounding boxes, InnerIoU (<xref ref-type="bibr" rid="B33">Zhang et&#xa0;al., 2023b</xref>) further considers the degree of alignment between the predicted box and the actual box, thus providing a more rigorous criterion for the localization of the target. The calculation method is shown in <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8</bold></xref>.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Illustration of the Inner-IoU bounding box relationship. The solid boxes represent the target and inner target regions, while the dashed boxes denote the anchor and inner anchor regions. The diagram shows how the inner bounding boxes are defined within the corresponding target and anchor boxes to guide the Inner-IoU prediction.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1739203-g008.tif">
<alt-text content-type="machine-generated">Diagram comparing two sets of overlapping rectangles, labeled as Target box, InnerTarget box, Anchor box, and InnerAnchor box, each with different colored solid and dashed outlines, coordinates, and center points. Mathematical symbols denote box dimensions and centers, illustrating spatial relationships between target and anchor boxes for a computer vision context.</alt-text>
</graphic></fig>
<p>Step 1: Compute the inner ground-truth box by shrinking the GT box with the ratio <italic>r</italic>, as defined in <xref ref-type="disp-formula" rid="eq1">Equation (1)</xref>:</p>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mrow><mml:mtable columnalign="left" equalrows="true" equalcolumns="true"><mml:mtr columnalign="left"><mml:mtd columnalign="left"><mml:mrow><mml:msubsup><mml:mi>b</mml:mi><mml:mi>l</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mrow><mml:mo>=</mml:mo><mml:msubsup><mml:mi>x</mml:mi><mml:mi>c</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msubsup><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mi>w</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msup><mml:mo>&#xd7;</mml:mo><mml:mi>r</mml:mi></mml:mrow><mml:mn>2</mml:mn></mml:mfrac><mml:mo>,</mml:mo><mml:mtext>&#x2003;</mml:mtext><mml:msubsup><mml:mi>b</mml:mi><mml:mi>r</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msubsup><mml:mi>x</mml:mi><mml:mi>c</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mi>w</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msup><mml:mo>&#xd7;</mml:mo><mml:mi>r</mml:mi></mml:mrow><mml:mn>2</mml:mn></mml:mfrac></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign="left"><mml:mtd 
columnalign="left"><mml:mrow><mml:msubsup><mml:mi>b</mml:mi><mml:mi>t</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mrow><mml:mo>=</mml:mo><mml:msubsup><mml:mi>y</mml:mi><mml:mi>c</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msubsup><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mi>h</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msup><mml:mo>&#xd7;</mml:mo><mml:mi>r</mml:mi></mml:mrow><mml:mn>2</mml:mn></mml:mfrac><mml:mo>,</mml:mo><mml:mtext>&#x2003;</mml:mtext><mml:msubsup><mml:mi>b</mml:mi><mml:mi>b</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msubsup><mml:mi>y</mml:mi><mml:mi>c</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mi>h</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msup><mml:mo>&#xd7;</mml:mo><mml:mi>r</mml:mi></mml:mrow><mml:mn>2</mml:mn></mml:mfrac></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>Step 2: Compute the inner anchor box by shrinking the predicted box with the ratio <italic>r</italic>, as defined in <xref ref-type="disp-formula" rid="eq2">Equation (2)</xref>:</p>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mrow><mml:mtable columnalign="left" equalrows="true" equalcolumns="true"><mml:mtr columnalign="left"><mml:mtd columnalign="left"><mml:mrow><mml:msub><mml:mi>b</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mrow><mml:mi>w</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>r</mml:mi></mml:mrow><mml:mn>2</mml:mn></mml:mfrac><mml:mo>,</mml:mo><mml:mtext>&#x2003;</mml:mtext><mml:msub><mml:mi>b</mml:mi><mml:mi>r</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mi>w</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>r</mml:mi></mml:mrow><mml:mn>2</mml:mn></mml:mfrac></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign="left"><mml:mtd columnalign="left"><mml:mrow><mml:msub><mml:mi>b</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mrow><mml:mi>h</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>r</mml:mi></mml:mrow><mml:mn>2</mml:mn></mml:mfrac><mml:mo>,</mml:mo><mml:mtext>&#x2003;</mml:mtext><mml:msub><mml:mi>b</mml:mi><mml:mi>b</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mi>h</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>r</mml:mi></mml:mrow><mml:mn>2</mml:mn></mml:mfrac></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math>
</disp-formula>
<p>Step 3: Calculate the intersection and union areas of the inner boxes, as given in <xref ref-type="disp-formula" rid="eq3">Equations (3)</xref> and <xref ref-type="disp-formula" rid="eq4">(4)</xref>:</p>
<disp-formula id="eq3"><label>(3)</label>
<mml:math display="block" id="M3"><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mtext>min</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>b</mml:mi><mml:mi>r</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mi>r</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mtext>max</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>b</mml:mi><mml:mi>l</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo>&#xd7;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mtext>min</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>b</mml:mi><mml:mi>b</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mi>b</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mtext>max</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>b</mml:mi><mml:mi>t</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq4"><label>(4)</label>
<mml:math display="block" id="M4"><mml:mrow><mml:mi>u</mml:mi><mml:mi>n</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msup><mml:mi>w</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msup><mml:mo>&#xd7;</mml:mo><mml:msup><mml:mi>h</mml:mi><mml:mrow><mml:mtext>gt</mml:mtext></mml:mrow></mml:msup><mml:mo>&#xd7;</mml:mo><mml:msup><mml:mi>r</mml:mi><mml:mn>2</mml:mn></mml:msup><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>w</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>h</mml:mi><mml:mo>&#xd7;</mml:mo><mml:msup><mml:mi>r</mml:mi><mml:mn>2</mml:mn></mml:msup><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi></mml:mrow></mml:math>
</disp-formula>
<p>Finally, InnerIoU is given by <xref ref-type="disp-formula" rid="eq5">Equation (5)</xref>:</p>
<disp-formula id="eq5"><label>(5)</label>
<mml:math display="block" id="M5"><mml:mrow><mml:mi>I</mml:mi><mml:mi>o</mml:mi><mml:msup><mml:mi>U</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>u</mml:mi><mml:mi>n</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<p>Through the above improvements, the accuracy and robustness of target detection are improved. Its overall structure is shown in <xref ref-type="fig" rid="f9"><bold>Figure&#xa0;9</bold></xref>.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Overall architecture of the proposed tea bud detection model based on YOLOv11. The network incorporates DG-SimAM blocks in the backbone and neck, a DySample module for feature upsampling, and the Inner-IoU loss for improved localization.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1739203-g009.tif">
<alt-text content-type="machine-generated">Network architecture diagram divided into Backbone, Neck, and Head sections; Backbone includes layers such as Conv, C3K2_DG-Simam, SPPF, and C2PSA; Neck features Concat, Dysample, and Conv operations; Head outputs three Detect layers.</alt-text>
</graphic></fig>
</sec>
<sec id="s2_4_2">
<label>2.4.2</label>
<title>Image semantic segmentation</title>
<p>Segment Anything Model 2 (<xref ref-type="bibr" rid="B17">Ravi et&#xa0;al., 2024</xref>) (SAM2) is a new generation of image segmentation model introduced by Meta AI, which outperforms its predecessor SAM (<xref ref-type="bibr" rid="B10">Kirillov et&#xa0;al., 2023</xref>) in terms of inference efficiency and segmentation performance. SAM2 adopts a more compact and efficient architectural design, which reduces latency and shows better segmentation capability in complex backgrounds and small targets. SAM2 supports flexible segmentation of regions of interest in the form of prompts such as points, bounding boxes, or text.</p>
<p>To realize accurate extraction and background rejection of the tea leaf region, this article combines YOLOv11 and SAM2 to construct an efficient semantic segmentation framework. The method uses the detection boxes provided by YOLOv11 as prompts to guide SAM2 to generate corresponding masks, thus overcoming the limitation that SAM2 only outputs binary masks and lacks semantic annotations, and completing the semantic-level segmentation.</p>
<p>To improve the accuracy and robustness of the segmentation mask, a filtering strategy based on the area and segmentation stability is further introduced to eliminate mask regions that do not meet the threshold requirements, to reduce the impact of noise and false detection. At the same time, a morphological erosion operation is used to optimize the edges of the retained mask to remove small artifacts in the area and improve the integrity and clarity of the target contour. Finally, the optimized mask is converted to a JPG format image to complete the semantic segmentation process.</p>
<p>This method not only significantly reduces the cost of manual annotation, but also improves the segmentation accuracy of tea leaf images in complex environments, which provides a reliable basis for subsequent target analysis and 3D reconstruction.</p>
</sec>
<sec id="s2_4_3">
<label>2.4.3</label>
<title>Depth estimation</title>
<p>The task of three-dimensional reconstruction of tea leaves poses significant challenges to traditional image-based matching methods due to their intricate structure, severe leaf occlusion, high surface texture repetition, and susceptibility to lighting conditions. To address this, the incorporation of high-quality depth maps can significantly enhance the geometric accuracy and robustness of the reconstruction. Depth maps provide spatial location information for each pixel, effectively alleviating matching difficulties caused by occlusion, texture loss, or sparse features, particularly for natural objects like tea stems and leaves that exhibit non-rigid shapes and locally repetitive structures.</p>
<p>To achieve the depth estimation of the images, the Depth Anything V2 model (<xref ref-type="bibr" rid="B28">Yang et&#xa0;al., 2024a</xref>) is used in this paper. This model is a generalized depth estimation framework built on a vision foundation model, which is capable of high-quality depth prediction for a wide range of natural images in unsupervised and weakly supervised scenarios. Depth Anything V2 architecturally combines an image encoder and a multiscale depth decoder, with good cross-scene generalization ability and robustness to complex lighting and texture occlusion. By applying this model to multi-view images, it can provide dense and continuous depth information for subsequent 3D reconstruction tasks, which provides strong support for improving the training accuracy and geometric restoration capability of NeRF.</p>
</sec>
</sec>
<sec id="s2_5">
<label>2.5</label>
<title>Tea tree 3D reconstruction</title>
<p>To reconstruct high-quality three-dimensional representations of tea trees, we adopt the Nerfacto framework within NeRFStudio (<xref ref-type="bibr" rid="B15">Mildenhall et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B32">Zhang et&#xa0;al., 2021</xref>). Nerfacto provides an efficient implementation of neural radiance fields by optimizing the sampling strategy and network design, achieving a favorable balance between rendering quality and computational efficiency. This makes it suitable for high-resolution reconstruction of complex plant structures under field conditions.</p>
<p>To further enhance geometric fidelity and accelerate convergence, monocular depth priors are incorporated during training through a depth-supervised loss, enabling the model to better capture fine-scale structures such as thin branches and densely clustered tea buds. In addition, a semantic branch is introduced to extend Nerfacto from pure appearance modeling to semantic-aware 3D reconstruction. The semantic field predicts tea bud probabilities as a function of spatial location, allowing semantic information to be consistently aligned with reconstructed geometry. Standard volumetric rendering formulations for color, depth, and semantics are consistent with the original NeRF framework and are therefore not detailed here.</p>
</sec>
<sec id="s2_6">
<label>2.6</label>
<title>Tea bud point cloud processing</title>
<p>Unlike most existing NeRF-based agricultural studies that primarily focus on visual reconstruction or phenotypic analysis, this section introduces a harvesting-oriented three-dimensional perception strategy. By explicitly processing bud-level semantic point clouds, the proposed method bridges three-dimensional reconstruction with practical harvesting tasks, including tea bud counting and harvesting-oriented candidate point estimation. The harvesting-oriented candidate point is obtained from bud-level semantic point clouds and provides a three-dimensional spatial reference for subsequent harvesting planning. This design enables the extraction of actionable spatial cues from reconstructed point clouds, establishing an effective perception foundation for harvesting-oriented applications.</p>
<sec id="s2_6_1">
<label>2.6.1</label>
<title>Point cloud processing and bud-level clustering</title>
<p>Before clustering analysis, the reconstructed three-dimensional point cloud is first preprocessed to remove isolated noise points. Specifically, a radius-based filtering strategy is applied, in which points with an insufficient number of neighboring points within a predefined radius are discarded. This step effectively reduces spurious points introduced during reconstruction and improves the reliability of subsequent clustering and geometric analysis.</p>
<p>To enable reliable tea bud instance separation and counting, a density-based spatial clustering algorithm (DBSCAN) (<xref ref-type="bibr" rid="B20">Schubert et&#xa0;al., 2017</xref>) is adopted. DBSCAN identifies core points by evaluating local point density within a predefined neighborhood and groups density-reachable points into clusters, while automatically rejecting sparse outliers. Through this process, individual tea buds are separated into distinct three-dimensional clusters, each corresponding to a candidate tea bud instance.</p>
<p>For clusters with a small spatial scale, an additional merging strategy is applied. If the Euclidean distance between the centroids of two clusters is smaller than the average radius of a tea bud, the clusters are considered to belong to the same tea bud structure and are merged accordingly. The remaining tiny clusters are further examined by estimating their volumetric size. Clusters whose volumes are significantly smaller than those of typical tea bud clusters are regarded as non-target structures and are removed. After these steps, the resulting bud-level clusters provide a robust basis for tea bud counting and subsequent spatial analysis.</p>
<p>All hyperparameters that directly affect the reproducibility of the proposed three-dimensional perception pipeline, including density-based clustering, semantic filtering thresholds, loss weighting, and geometric estimation criteria, are summarized in the <xref ref-type="supplementary-material" rid="SM1"><bold>Supplementary Material</bold></xref> (<xref ref-type="supplementary-material" rid="SM1"><bold>Supplementary Table&#xa0;1</bold></xref>).</p>
</sec>
<sec id="s2_6_2">
<label>2.6.2</label>
<title>Harvesting-oriented candidate point estimation</title>
<p>Based on the bud-level point cloud clusters obtained in the previous step, a harvesting-oriented candidate point is estimated for each tea bud. This estimation aims to provide a stable three-dimensional spatial reference derived from perception, rather than a final robot-executable cutting command.</p>
<p>For each tea bud cluster, all points are first sorted along the Z-axis (vertical direction), and a subset of points within the lowest height range (e.g., the lowest 5%) is extracted. Rather than assuming a fixed anatomical stem&#x2013;bud junction, these points are used as a geometric approximation of the basal-side surface region under a harvesting-oriented prior. Such a proxy often corresponds to the attachment side of the bud cluster when the basal structure is visible in the reconstructed point cloud, but it does not rely on the assumption that all buds grow strictly upward in the global coordinate system. Consequently, laterally growing or obliquely oriented buds can still be handled, as the candidate point is not defined as an anatomically exact junction.</p>
<p>To mitigate the influence of noise and isolated extreme points, a robust estimation strategy is employed. Specifically, RANSAC-based plane fitting (<xref ref-type="bibr" rid="B3">Derpanis, 2010</xref>) or local geometric center estimation is applied to the selected lower subset, providing a local geometric prior for basal-side localization. The geometric center of the fitted plane (or the estimated local center when plane fitting is degenerate) is taken as the candidate point. Compared with directly selecting individual extreme points, this strategy yields improved stability and repeatability under point cloud perturbations.</p>
<p>The estimated candidate point is derived purely from three-dimensional perception and represents a harvesting-oriented guidance cue rather than a ground-truth stem&#x2013;bud junction. It provides a consistent spatial reference for tea bud localization and can support downstream harvesting planning. Specific implementations related to cutting execution, such as end-effector configuration and motion control, are system-dependent and therefore beyond the scope of this study. An illustrative example of the candidate point estimation process is shown in <xref ref-type="fig" rid="f10"><bold>Figure&#xa0;10</bold></xref>.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Harvesting-oriented candidate picking point estimation from the tea bud point cloud.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1739203-g010.tif">
<alt-text content-type="machine-generated">Three-dimensional scatter plot illustration labeled with tea bud point cloud in green, a purple dashed clustering boundary ellipse, an orange fit plane, lowers points in orange below the plane, and a red candidate picking point identified at the bottom center.</alt-text>
</graphic></fig>
</sec>
</sec>
</sec>
<sec id="s3" sec-type="results">
<label>3</label>
<title>Results</title>
<sec id="s3_1">
<label>3.1</label>
<title>Detection modeling of tea buds</title>
<p>The detection models in this study were evaluated using precision (P), recall (R), and mean average precision (mAP), where P denotes the proportion of accurate predictions in all prediction examples and R denotes the proportion of accurate predictions in all true examples. mAP denotes the composite accuracy metric used to evaluate the detection models. The formulas for the computation of P, R, and mAP are shown in <xref ref-type="disp-formula" rid="eq6">Equations (6</xref>&#x2013;<xref ref-type="disp-formula" rid="eq8">8)</xref>.</p>
<disp-formula id="eq6"><label>(6)</label>
<mml:math display="block" id="M6"><mml:mrow><mml:mi>P</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq7"><label>(7)</label>
<mml:math display="block" id="M7"><mml:mrow><mml:mi>R</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq8"><label>(8)</label>
<mml:math display="block" id="M8"><mml:mrow><mml:mtext>mAP</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:msubsup><mml:mi>P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>&#xb7;</mml:mo><mml:mtext>&#x394;</mml:mtext><mml:msub><mml:mi>R</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mi>N</mml:mi></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<p>TP: number of positive samples predicted as positive samples.</p>
<p>FP: number of negative samples predicted as positive samples.</p>
<p>FN: number of positive samples predicted as negative samples.</p>
<p>N: number of bud types detected (only one type of tea bud is studied in this paper, so N is equal to 1).</p>
<p>It can be seen from <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref> that each component, including DySample, DG-SimAM, and InnerIoU, contributes positively to the overall detection performance. Specifically, DySample mainly improves detection coverage, as reflected by the increase in mAP@50, while InnerIoU enhances localization accuracy, resulting in a higher mAP@50:95. DG-SimAM further improves recall and robustness under complex backgrounds.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Results of ablation experiments.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Number</th>
<th valign="middle" align="center">+DySample</th>
<th valign="middle" align="center">+DG-SimAM</th>
<th valign="middle" align="center">+InnerIoU</th>
<th valign="middle" align="center">P</th>
<th valign="middle" align="center">R</th>
<th valign="middle" align="center">mAP@50</th>
<th valign="middle" align="center">mAP@50:95</th>
<th valign="middle" align="center">Params</th>
<th valign="middle" align="center">GFLOPs</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">0</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">0.776</td>
<td valign="middle" align="center">0.801</td>
<td valign="middle" align="center">0.869</td>
<td valign="middle" align="center">0.624</td>
<td valign="middle" align="center">2 938 835</td>
<td valign="middle" align="center">6.7</td>
</tr>
<tr>
<td valign="middle" align="center">1</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">0.813</td>
<td valign="middle" align="center">0.839</td>
<td valign="middle" align="center">0.907</td>
<td valign="middle" align="center">0.567</td>
<td valign="middle" align="center">2 602 387</td>
<td valign="middle" align="center">6.5</td>
</tr>
<tr>
<td valign="middle" align="center">2</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">0.789</td>
<td valign="middle" align="center">0.826</td>
<td valign="middle" align="center">0.889</td>
<td valign="middle" align="center">0.603</td>
<td valign="middle" align="center">3 157 075</td>
<td valign="middle" align="center">7.1</td>
</tr>
<tr>
<td valign="middle" align="center">3</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">0.800</td>
<td valign="middle" align="center">0.829</td>
<td valign="middle" align="center">0.897</td>
<td valign="middle" align="center">0.631</td>
<td valign="middle" align="center">2 938 835</td>
<td valign="middle" align="center">6.7</td>
</tr>
<tr>
<td valign="middle" align="center">4</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">0.819</td>
<td valign="middle" align="center">0.841</td>
<td valign="middle" align="center">0.911</td>
<td valign="middle" align="center">0.582</td>
<td valign="middle" align="center">2 602 387</td>
<td valign="middle" align="center">6.5</td>
</tr>
<tr>
<td valign="middle" align="center">5</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">0.799</td>
<td valign="middle" align="center">0.837</td>
<td valign="middle" align="center">0.901</td>
<td valign="middle" align="center">0.621</td>
<td valign="middle" align="center">3 157 075</td>
<td valign="middle" align="center">7.1</td>
</tr>
<tr>
<td valign="middle" align="center">6</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">0.815</td>
<td valign="middle" align="center">0.837</td>
<td valign="middle" align="center">0.909</td>
<td valign="middle" align="center">0.605</td>
<td valign="middle" align="center">3,169,427</td>
<td valign="middle" align="center">7.1</td>
</tr>
<tr>
<td valign="middle" align="center">7</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">0.827</td>
<td valign="middle" align="center">0.843</td>
<td valign="middle" align="center">0.917</td>
<td valign="middle" align="center">0.651</td>
<td valign="middle" align="center">3,169,427</td>
<td valign="middle" align="center">7.1</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>When combined, the proposed model achieves the best results, with Precision, Recall, mAP@50, and mAP@50:95 reaching 0.827, 0.843, 0.917, and 0.651, respectively. Compared to the baseline YOLOv11n, these values represent improvements of 5.1% in Precision, 4.2% in Recall, 4.8% in mAP@50, and 2.7% in mAP@50:95, while maintaining comparable parameters and computational cost (GFLOPs). This indicates that the introduced modules not only enhance feature extraction and localization accuracy, but also achieve a favorable trade-off between accuracy and efficiency.</p>
<p>Furthermore, as summarized in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref> and illustrated in <xref ref-type="fig" rid="f11"><bold>Figure&#xa0;11</bold></xref>, the proposed model consistently outperforms other YOLO variants across all evaluation metrics. Although YOLOv5n, YOLOv8n, and YOLOv10n demonstrate competitive performance, the improved YOLOv11-based model achieves superior detection accuracy and localization stability. In particular, the improvement in mAP@50:95 indicates more precise bounding box regression under stricter IoU thresholds, which is crucial for detecting small, densely distributed, and partially occluded tea buds in complex tea garden environments.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Comparison of the improved model with other models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Model</th>
<th valign="middle" align="center">P</th>
<th valign="middle" align="center">R</th>
<th valign="middle" align="center">mAP@50</th>
<th valign="middle" align="center">mAP@50:95</th>
<th valign="middle" align="center">Params</th>
<th valign="middle" align="center">GFLOPs</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">YOLOv5n</td>
<td valign="middle" align="center">0.759</td>
<td valign="middle" align="center">0.786</td>
<td valign="middle" align="center">0.851</td>
<td valign="middle" align="center">0.554</td>
<td valign="middle" align="center">2,188,019</td>
<td valign="middle" align="center">5.9</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv8n</td>
<td valign="middle" align="center">0.778</td>
<td valign="middle" align="center">0.790</td>
<td valign="middle" align="center">0.862</td>
<td valign="middle" align="center">0.550</td>
<td valign="middle" align="center">2,690,403</td>
<td valign="middle" align="center">6.9</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv10n</td>
<td valign="middle" align="center">0.789</td>
<td valign="middle" align="center">0.805</td>
<td valign="middle" align="center">0.875</td>
<td valign="middle" align="center">0.599</td>
<td valign="middle" align="center">2,707,430</td>
<td valign="middle" align="center">8.4</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv11n</td>
<td valign="middle" align="center">0.776</td>
<td valign="middle" align="center">0.801</td>
<td valign="middle" align="center">0.869</td>
<td valign="middle" align="center">0.624</td>
<td valign="middle" align="center">2,938,835</td>
<td valign="middle" align="center">6.7</td>
</tr>
<tr>
<td valign="middle" align="left">Ours</td>
<td valign="middle" align="center">0.827</td>
<td valign="middle" align="center">0.843</td>
<td valign="middle" align="center">0.917</td>
<td valign="middle" align="center">0.651</td>
<td valign="middle" align="center">3,169,427</td>
<td valign="middle" align="center">7.1</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>Tea bud detection results. Red ovals mark misidentified and missed tea buds.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1739203-g011.tif">
<alt-text content-type="machine-generated">Comparison of tea bud detection results across multiple methods, shown in rows for two example scenes. Detected objects are indicated by bounding boxes, and red circles highlight example regions of missed or incorrect detections. The final row is labeled &#x201c;OURS.&#x201d;</alt-text>
</graphic></fig>
<p>Accurate localization is especially important in the proposed pipeline, as the detection results are directly used as prompts to guide SAM2 for fine-grained semantic segmentation. Inaccurate or shifted bounding boxes may propagate errors to the segmentation stage, leading to incomplete or imprecise tea bud masks.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>2D image segmentation</title>
<p>To achieve semantic segmentation of tea buds, we employed three methods: YOLO+SAM, YOLO+SAM2, and a self-trained U-Net (<xref ref-type="bibr" rid="B18">Ronneberger et&#xa0;al., 2015</xref>). In addition, a representative single-stage instance segmentation model, Mask R-CNN, was included as a baseline for comparison. The U-Net training data set consisted of two parts: 60 manually annotated images from our captured data set and 30 additional images refined from the YOLO+SAM2 segmentation results. Data augmentation techniques, including color transformation (brightness, contrast, and saturation adjustments within 0.5&#x2013;1.5), image flipping, rotation, scaling, and noise injection, were applied to improve robustness. The U-Net model was implemented in PyTorch.</p>
<p>The comparative performance of all methods is summarized in <xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref>. YOLO+SAM2 achieved the highest IoU (0.640) and Dice coefficient (0.779) while maintaining a reasonable inference time (0.511 s). YOLO+SAM closely followed with an IoU of 0.629 and a Dice score of 0.771. U-Net was significantly faster (0.013 s) but less accurate (IoU 0.597, Dice 0.747). Mask R-CNN exhibited lower segmentation accuracy (IoU 0.578, Dice 0.725) while remaining computationally efficient (0.028 s).</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Comparison of different segmentation methods for tea bud images.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Model</th>
<th valign="middle" align="center">IoU</th>
<th valign="middle" align="center">Dice</th>
<th valign="middle" align="center">Inference time (s)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">YOLO + SAM</td>
<td valign="middle" align="center">0.629</td>
<td valign="middle" align="center">0.771</td>
<td valign="middle" align="center">0.523</td>
</tr>
<tr>
<td valign="middle" align="left">YOLO + SAM2</td>
<td valign="middle" align="center">0.640</td>
<td valign="middle" align="center">0.779</td>
<td valign="middle" align="center">0.511</td>
</tr>
<tr>
<td valign="middle" align="left">U-Net</td>
<td valign="middle" align="center">0.597</td>
<td valign="middle" align="center">0.747</td>
<td valign="middle" align="center">0.013</td>
</tr>
<tr>
<td valign="middle" align="left">Mask R-CNN</td>
<td valign="middle" align="center">0.578</td>
<td valign="middle" align="center">0.725</td>
<td valign="middle" align="center">0.028</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As shown in <xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12</bold></xref>, YOLO+SAM2 produces more precise segmentation of small and occluded tea buds, with improved boundary delineation compared to SAM, benefiting from its more efficient attention mechanism and lighter architecture. YOLO+SAM also shows competitive performance, though with slightly reduced accuracy. U-Net, despite its high speed and independence from prompt information, tends to over-segment tea leaves and struggles with fine or deeply embedded tea buds under complex canopy conditions. Mask R-CNN, while capable of directly predicting instance-level masks, often suffers from incomplete or fragmented bud segmentation in densely occluded regions, which limits its effectiveness in this scenario.</p>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Examples of segmentation results for different models. <bold>(A)</bold> Original <bold>(B)</bold> SAM2 <bold>(C)</bold> SAM <bold>(D)</bold> U-Net <bold>(E)</bold> Mask R-CNN.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1739203-g012.tif">
<alt-text content-type="machine-generated">Panel A shows a tea plant with light green buds and young leaves against a background of soil and other plants, partially revealing a person&#x2019;s legs. Panels B, C, D, and E display segmented extractions of these bright leaf tips and buds represented as scattered yellow-green shapes on a black background, with different distribution patterns in each panel.</alt-text>
</graphic></fig>
<p>Overall, the quantitative results in <xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref> and the qualitative comparisons in <xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12</bold></xref> demonstrate that YOLO+SAM2 achieves the best balance between segmentation accuracy and inference efficiency. Although segmentation errors remain under realistic harvesting conditions&#x2014;mainly due to boundary ambiguity and severe occlusion&#x2014;the improved boundary precision of YOLO+SAM2 leads to more compact and consistent semantic point clusters, which is beneficial for downstream tasks such as tea bud counting and candidate picking-point estimation in harvesting-oriented 3D perception.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Tea tree 3D reconstruction</title>
<p>Semantic 3D reconstruction is a key part of the process. Classical 3D representations&#x2014;point clouds, meshes, and voxels&#x2014;are limited in representing complex scenes as well as fine-grained geometry and view-dependent appearance in densely cluttered canopies, motivating the use of NeRF to model radiance and density as continuous fields. With volume rendering techniques, NeRF can generate high-quality images from arbitrary viewpoints. Nerfstudio is an open-source framework that provides modular implementations of many NeRF methods. We extend the Nerfacto method by adding a semantic layer that maps points and viewpoint directions in 3D space to semantic information as well, which can be viewed as a semantic field.</p>
<p>As shown in <xref ref-type="fig" rid="f13"><bold>Figure&#xa0;13</bold></xref>, the proposed 3D reconstruction framework for tea trees adopts multimodal encoding and collaborative neural network modeling to establish an efficient mapping between 3D space and 2D images at the reconstruction level. Specifically, spatial points (<italic>x, y, z</italic>) along each ray and their corresponding view direction vectors d are encoded using hash encoding and spherical harmonic (SH) encoding for feature extraction, respectively. Hash encoding effectively reduces the computational overhead of traditional positional encoding and supports high-resolution geometric modeling, while SH encoding captures view-dependent orientation information to improve the modeling accuracy of lighting and material appearance. The fused appearance embedding vectors are nonlinearly mapped by a multilayer perceptron (MLP) to predict volume density (<italic>&#x3c3;</italic>) and color (RGB), and novel views are rendered via volume rendering. This decoupled architecture improves reconstruction efficiency compared to conventional NeRF formulations while maintaining reconstruction quality. On an NVIDIA RTX 4090 (24GB VRAM), training on a set of 3840&#xd7;2160 resolution images takes approximately 5 minutes per tea tree. This runtime reflects an offline, per-tree reconstruction setting focused on high-fidelity geometric and semantic perception, rather than real-time harvesting execution.</p>
<fig id="f13" position="float">
<label>Figure&#xa0;13</label>
<caption>
<p>Network architecture of the proposed TeaNeRF reconstruction framework. The model takes spatial coordinates and appearance embeddings as input, applies hash encoding and multi-layer perceptrons, and performs volume rendering to generate RGB images, depth maps, and segmentation outputs. The training loss consists of depth, image reconstruction, and segmentation supervision terms.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1739203-g013.tif">
<alt-text content-type="machine-generated">Diagram illustrating a neural rendering pipeline where spatial coordinates pass through a hash encoder and viewing directions and appearance embeddings pass through a spherical harmonics encoder, then processed through neural networks with outputs rendered as RGB image, depth map, and segmentation map, guided by a combined loss function.</alt-text>
</graphic></fig>
<p>For completeness, an approximate runtime breakdown of the proposed pipeline is provided in the <xref ref-type="supplementary-material" rid="SM1"><bold>Supplementary Material</bold></xref> (<xref ref-type="supplementary-material" rid="SM1"><bold>Supplementary Table&#xa0;2</bold></xref>).</p>
<p>In the training phase, the photometric loss is calculated as defined in <xref ref-type="disp-formula" rid="eq9">Equation (9)</xref> by comparing the difference between the predicted color of each ray and the real pixel RGB value, thus optimizing the color reconstruction performance of the model. For semantic rendering, the semantic loss is defined in <xref ref-type="disp-formula" rid="eq10">Equation (10)</xref>, where the model estimates the probability of each pixel belonging to the tea bud or the background based on the prediction density and introduces the binary cross-entropy loss to enhance semantic segmentation accuracy. Meanwhile, to enhance the model&#x2019;s ability to model the geometric structure of the scene, the depth-supervised loss is defined in <xref ref-type="disp-formula" rid="eq11">Equation (11)</xref>, improving the accuracy and robustness of 3D reconstruction by constraining the consistency between the predicted depth and the true depth.</p>
<disp-formula id="eq9"><label>(9)</label>
<mml:math display="block" id="M9"><mml:mrow><mml:msub><mml:mi mathvariant="script">L</mml:mi><mml:mrow><mml:mtext>image</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mo>|</mml:mo><mml:mi mathvariant="script">R</mml:mi><mml:mo>|</mml:mo></mml:mrow></mml:mfrac><mml:mo>|</mml:mo><mml:mo>|</mml:mo><mml:mi>C</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>r</mml:mi></mml:mstyle><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mover accent="true"><mml:mi>C</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mo stretchy="false">(</mml:mo><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>r</mml:mi></mml:mstyle><mml:mo stretchy="false">)</mml:mo><mml:msubsup><mml:mo>||</mml:mo><mml:mn>2</mml:mn><mml:mn>2</mml:mn></mml:msubsup></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq10"><label>(10)</label>
<mml:math display="block" id="M10"><mml:mrow><mml:msub><mml:mi mathvariant="script">L</mml:mi><mml:mrow><mml:mtext>sem</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mo>|</mml:mo><mml:mi mathvariant="script">R</mml:mi><mml:mo>|</mml:mo></mml:mrow></mml:mfrac><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>r</mml:mi></mml:mstyle><mml:mo>&#x2208;</mml:mo><mml:mi mathvariant="script">R</mml:mi></mml:mrow></mml:munder><mml:mo stretchy="false">[</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>r</mml:mi></mml:mstyle><mml:mo stretchy="false">)</mml:mo><mml:mi>log</mml:mi><mml:mover accent="true"><mml:mi>p</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mo stretchy="false">(</mml:mo><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>r</mml:mi></mml:mstyle><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>r</mml:mi></mml:mstyle><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo>&#xa0;</mml:mo><mml:mtext>log</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mover accent="true"><mml:mi>p</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mo stretchy="false">(</mml:mo><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>r</mml:mi></mml:mstyle><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq11"><label>(11)</label>
<mml:math display="block" id="M11"><mml:mrow><mml:msub><mml:mi mathvariant="script">L</mml:mi><mml:mrow><mml:mtext>depth</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mo>|</mml:mo><mml:mi mathvariant="script">R</mml:mi><mml:mo>|</mml:mo></mml:mrow></mml:mfrac><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>r</mml:mi></mml:mstyle><mml:mo>&#x2208;</mml:mo><mml:mi mathvariant="script">R</mml:mi></mml:mrow></mml:munder><mml:mo>|</mml:mo><mml:mo>|</mml:mo><mml:mi>D</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>r</mml:mi></mml:mstyle><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mover accent="true"><mml:mi>D</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>r</mml:mi></mml:mstyle><mml:mo stretchy="false">)</mml:mo><mml:mo>|</mml:mo><mml:mo>|</mml:mo></mml:mrow><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>
The overall training objective is defined in <xref ref-type="disp-formula" rid="eq12">Equation (12)</xref>:</p>
<disp-formula id="eq12"><label>(12)</label>
<mml:math display="block" id="M12"><mml:mrow><mml:mi mathvariant="script">L</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi mathvariant="script">L</mml:mi><mml:mrow><mml:mtext>image</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x3bb;</mml:mi><mml:mrow><mml:mtext>sem</mml:mtext></mml:mrow></mml:msub><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi mathvariant="script">L</mml:mi><mml:mrow><mml:mtext>sem</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x3bb;</mml:mi><mml:mrow><mml:mtext>depth</mml:mtext></mml:mrow></mml:msub><mml:mo>&#xb7;</mml:mo><mml:msub><mml:mi mathvariant="script">L</mml:mi><mml:mrow><mml:mtext>depth</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>In this study, NeRF is applied to achieve 3D reconstruction of tea trees based on multi-view images. The model synthesizes geometrically consistent views from different perspectives, thereby generating dense point clouds that support subsequent structural and semantic analysis. Nevertheless, conventional NeRF often produces rendered images that appear less sharp than the original photos, particularly in regions with densely distributed tea buds. This is mainly because NeRF prioritizes geometric consistency and multi-view synthesis fidelity, while fine-grained details may accumulate rendering errors and lead to local blurring. To overcome this limitation, depth information was incorporated into NeRF (Depth-NeRF), allowing more accurate depiction of subtle tea bud structures, clearer leaf-edge contours, and improved overall sharpness, as illustrated in <xref ref-type="fig" rid="f14"><bold>Figure&#xa0;14</bold></xref>.</p>
<fig id="f14" position="float">
<label>Figure&#xa0;14</label>
<caption>
<p>Qualitative NeRF rendering results. The top rows show rendered views of tea plants, with highlighted regions indicating areas of interest. The bottom rows present zoomed-in views for detailed comparison of reconstructed structures.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1739203-g014.tif">
<alt-text content-type="machine-generated">Comparison of NeRF rendering results for tea plant scenes. The figure shows three columns of rendered images with three rows of different viewpoints. Yellow rectangles indicate selected regions for close-up visualization. Corresponding zoomed-in views highlight structural details within the marked areas.</alt-text>
</graphic></fig>
<p>To systematically evaluate the contribution of monocular depth priors to Neural Radiance Fields (NeRF) for reconstructing plant-like natural objects, three tea trees with distinct morphologies (denoted as Tree1, Tree2, and Tree3) were selected. The baseline NeRFacto model was compared with a depth-supervised variant, Depth-NeRF, which incorporates monocular depth priors estimated using Depth Anything v2 during training iterations ranging from 2,000 to 30,000. Reconstruction performance was evaluated using Peak Signal-to-Noise Ratio (PSNR), Structural Similarity Index (SSIM), and Learned Perceptual Image Patch Similarity (LPIPS), providing a comprehensive assessment of both image fidelity and perceptual quality.</p>
<p>As shown in <xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref>, Depth-NeRF consistently outperforms NeRFacto in all three test cases. In the Tree1 scene, Depth-NeRF achieves a PSNR of 24.08, SSIM of 0.704, and LPIPS of 0.209 at 30,000 iterations, compared to 23.91, 0.698, and 0.247 for NeRFacto, respectively. Although the absolute PSNR and SSIM improvements appear modest, the substantial reduction in LPIPS highlights a clear enhancement in perceptual realism and fine structural detail. A similar trend is observed for Tree2 and Tree3, with Tree3 showing the most significant gain (LPIPS reduced from 0.599 to 0.405), indicating improved robustness in handling occlusion, texture repetition, and complex leaf structures. However, residual reconstruction uncertainty remains in densely occluded regions and around thin structures, where monocular depth ambiguity may cause local geometric blur or incomplete surfaces that can propagate to subsequent semantic point cloud extraction.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Evaluation results for different iteration counts.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="left">Tree</th>
<th valign="middle" rowspan="2" align="left">Iterations</th>
<th valign="middle" colspan="3" align="center">NeRF</th>
<th valign="middle" colspan="3" align="center">Depth-NeRF</th>
</tr>
<tr>
<th valign="middle" align="center">PSNR</th>
<th valign="middle" align="center">SSIM</th>
<th valign="middle" align="center">LPIPS</th>
<th valign="middle" align="center">PSNR</th>
<th valign="middle" align="center">SSIM</th>
<th valign="middle" align="center">LPIPS</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="5" align="left">Tree1</td>
<td valign="middle" align="left">2000</td>
<td valign="middle" align="center">21.78</td>
<td valign="middle" align="center">0.543</td>
<td valign="middle" align="center">0.493</td>
<td valign="middle" align="center">22.11</td>
<td valign="middle" align="center">0.569</td>
<td valign="middle" align="center">0.473</td>
</tr>
<tr>
<td valign="middle" align="left">4000</td>
<td valign="middle" align="center">22.37</td>
<td valign="middle" align="center">0.589</td>
<td valign="middle" align="center">0.411</td>
<td valign="middle" align="center">22.67</td>
<td valign="middle" align="center">0.605</td>
<td valign="middle" align="center">0.393</td>
</tr>
<tr>
<td valign="middle" align="left">8000</td>
<td valign="middle" align="center">23.10</td>
<td valign="middle" align="center">0.631</td>
<td valign="middle" align="center">0.334</td>
<td valign="middle" align="center">23.43</td>
<td valign="middle" align="center">0.652</td>
<td valign="middle" align="center">0.323</td>
</tr>
<tr>
<td valign="middle" align="left">10000</td>
<td valign="middle" align="center">23.43</td>
<td valign="middle" align="center">0.655</td>
<td valign="middle" align="center">0.288</td>
<td valign="middle" align="center">23.62</td>
<td valign="middle" align="center">0.674</td>
<td valign="middle" align="center">0.303</td>
</tr>
<tr>
<td valign="middle" align="left">30000</td>
<td valign="middle" align="center">23.91</td>
<td valign="middle" align="center">0.698</td>
<td valign="middle" align="center">0.247</td>
<td valign="middle" align="center">24.08</td>
<td valign="middle" align="center">0.704</td>
<td valign="middle" align="center">0.209</td>
</tr>
<tr>
<td valign="middle" rowspan="5" align="left">Tree2</td>
<td valign="middle" align="left">2000</td>
<td valign="middle" align="center">17.07</td>
<td valign="middle" align="center">0.399</td>
<td valign="middle" align="center">0.771</td>
<td valign="middle" align="center">17.21</td>
<td valign="middle" align="center">0.405</td>
<td valign="middle" align="center">0.747</td>
</tr>
<tr>
<td valign="middle" align="left">4000</td>
<td valign="middle" align="center">17.17</td>
<td valign="middle" align="center">0.404</td>
<td valign="middle" align="center">0.749</td>
<td valign="middle" align="center">17.43</td>
<td valign="middle" align="center">0.413</td>
<td valign="middle" align="center">0.710</td>
</tr>
<tr>
<td valign="middle" align="left">8000</td>
<td valign="middle" align="center">17.43</td>
<td valign="middle" align="center">0.410</td>
<td valign="middle" align="center">0.722</td>
<td valign="middle" align="center">17.68</td>
<td valign="middle" align="center">0.421</td>
<td valign="middle" align="center">0.678</td>
</tr>
<tr>
<td valign="middle" align="left">10000</td>
<td valign="middle" align="center">17.79</td>
<td valign="middle" align="center">0.425</td>
<td valign="middle" align="center">0.677</td>
<td valign="middle" align="center">17.81</td>
<td valign="middle" align="center">0.428</td>
<td valign="middle" align="center">0.657</td>
</tr>
<tr>
<td valign="middle" align="left">30000</td>
<td valign="middle" align="center">17.88</td>
<td valign="middle" align="center">0.431</td>
<td valign="middle" align="center">0.681</td>
<td valign="middle" align="center">18.13</td>
<td valign="middle" align="center">0.447</td>
<td valign="middle" align="center">0.572</td>
</tr>
<tr>
<td valign="middle" rowspan="5" align="left">Tree3</td>
<td valign="middle" align="left">2000</td>
<td valign="middle" align="center">18.70</td>
<td valign="middle" align="center">0.456</td>
<td valign="middle" align="center">0.749</td>
<td valign="middle" align="center">19.44</td>
<td valign="middle" align="center">0.485</td>
<td valign="middle" align="center">0.635</td>
</tr>
<tr>
<td valign="middle" align="left">4000</td>
<td valign="middle" align="center">19.45</td>
<td valign="middle" align="center">0.486</td>
<td valign="middle" align="center">0.649</td>
<td valign="middle" align="center">19.60</td>
<td valign="middle" align="center">0.493</td>
<td valign="middle" align="center">0.620</td>
</tr>
<tr>
<td valign="middle" align="left">8000</td>
<td valign="middle" align="center">19.46</td>
<td valign="middle" align="center">0.485</td>
<td valign="middle" align="center">0.633</td>
<td valign="middle" align="center">20.15</td>
<td valign="middle" align="center">0.551</td>
<td valign="middle" align="center">0.580</td>
</tr>
<tr>
<td valign="middle" align="left">10000</td>
<td valign="middle" align="center">20.05</td>
<td valign="middle" align="center">0.501</td>
<td valign="middle" align="center">0.599</td>
<td valign="middle" align="center">20.83</td>
<td valign="middle" align="center">0.613</td>
<td valign="middle" align="center">0.405</td>
</tr>
<tr>
<td valign="middle" align="left">30000</td>
<td valign="middle" align="center">20.90</td>
<td valign="middle" align="center">0.599</td>
<td valign="middle" align="center">0.399</td>
<td valign="middle" align="center">21.03</td>
<td valign="middle" align="center">0.622</td>
<td valign="middle" align="center">0.394</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Another important observation is that Depth-NeRF exhibits faster convergence during the early training phase (2k&#x2013;8k iterations), producing sharper and less noisy reconstructions compared to the baseline. This demonstrates that monocular depth priors not only improve final reconstruction quality but also stabilize training and accelerate geometry learning under limited iterations.</p>
<p><xref ref-type="fig" rid="f15"><bold>Figure&#xa0;15</bold></xref> further illustrates the qualitative improvements. The RGB renderings generated by Depth-NeRF under natural lighting accurately restore the tea tree morphology, with clearer branch-leaf hierarchies than NeRFacto. In semantic rendering, tea buds (highlighted in red) are accurately separated from surrounding leaves (blue-green), verifying the capability of Depth-NeRF to achieve fine-grained semantic segmentation in cluttered environments. The depth maps reveal an enhanced contrast between the body of the tea tree (blue-to-yellow gradient) and the background (orange-red), strengthening the model&#x2019;s ability to capture meaningful depth differences.</p>
<fig id="f15" position="float">
<label>Figure&#xa0;15</label>
<caption>
<p>Volume rendering results of the proposed TeaNeRF framework. From left to right, the columns show the rendered RGB images, semantic predictions, and depth maps under different viewpoints.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1739203-g015.tif">
<alt-text content-type="machine-generated">Three rows each show a set of plant images, with each row containing three panels: a standard RGB photo, a semantic segmentation map in green and orange, and a depth map using blue to red hues to indicate spatial distances.</alt-text>
</graphic></fig>
<p>Overall, the integration of monocular depth priors significantly improves both perceptual realism and structural fidelity in NeRF-based plant reconstruction. These improvements are particularly valuable for downstream applications, such as tea bud counting and candidate picking-point estimation for harvesting-oriented perception, where precise structural representation and semantic separation are essential.</p>
<p>After training the neural radiation field model, we utilize the volume sampling module provided by Nerfstudio to reconstruct the scene in three dimensions and generate high-quality point cloud data that integrate geometric structure, appearance color, and semantic information. The model is designed with three cooperative components. The Density Field encodes the volume density of each spatial point, which enables effective separation of solid surfaces such as tea tree leaves and branches from the background. The Appearance Field associates each point with RGB color information to recover realistic lighting and material properties. The Semantic Field predicts the semantic probabilities of key targets such as tea buds, thereby supporting semantic-level 3D reconstruction.</p>
<p>Nevertheless, during neural field training, semantic features may propagate along the line-of-sight direction, and direct sampling of the semantic field often introduces noisy labels in invalid regions, such as empty background space. To overcome this limitation, we propose a Density&#x2013;Semantic Coupling (DSC) strategy. In this method, a density threshold (<italic>&#x3c3;</italic> &#x2265; 0.45) is applied and semantic predictions are only retained for spatial points where the density field indicates the presence of solid surfaces. This constraint effectively suppresses spurious semantic predictions in invalid space and ensures that semantic labels remain consistent with actual physical structures, as shown in <xref ref-type="fig" rid="f16"><bold>Figure&#xa0;16</bold></xref>. By constraining semantic predictions with density responses, DSC suppresses a major source of structured semantic noise and improves the reliability of downstream clustering and counting.</p>
<fig id="f16" position="float">
<label>Figure&#xa0;16</label>
<caption>
<p>Tea tree and tea bud point clouds. <bold>(A)</bold> Tea tree point cloud. <bold>(B)</bold> Tea bud point cloud.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1739203-g016.tif">
<alt-text content-type="machine-generated">Panel A shows a tea tree point cloud forming the overall plant structure. Panel B shows a tea bud point cloud as a sparser set of points corresponding to detected bud regions on a white background.</alt-text>
</graphic></fig>
<p>Based on the reconstructed semantic point cloud, tea bud counting and harvesting-oriented candidate point estimation are performed as perception-level outputs to support downstream harvesting tasks. Once the point cloud data of the tea buds are obtained, a clustering algorithm is applied to structure the data for tea bud counting and candidate point estimation. Before clustering, the point clouds are preprocessed using Statistical Outlier Removal and Radius Outlier Removal to eliminate noise and anomalies, thereby improving the reliability of subsequent analysis. In practice, multiple small tea buds located in proximity may be geometrically merged into the same cluster, leading to biased counting results. To alleviate this issue, a DBSCAN-based clustering strategy with an adaptive neighborhood radius (guided by local point density) and a normal-consistency constraint (clamp angle &#x2264; 15<sup>&#xb0;</sup>) is adopted, which helps reduce cluster adhesion in dense bud regions.</p>
<p>Following clustering, the RANSAC algorithm is applied to fit the dominant growth plane of each tea bud cluster. A distance threshold is then used to retain in-plane points, which are subsequently projected along the normal vector of the fitted plane. The point with the smallest projection value is selected as the harvesting-oriented candidate point for the corresponding tea bud cluster. It should be noted that this candidate point represents a perception-level three-dimensional guidance cue derived from geometric structure, rather than an anatomically exact stem&#x2013;bud junction or a robot-executable cutting command.</p>
<p>To illustrate the feasibility of three-dimensional tea bud counting based on semantic point clouds under different occlusion conditions, three individual tea trees are selected as representative case studies. These trees are not randomly sampled; instead, they are deliberately chosen to reflect increasing levels of occlusion complexity, ranging from relatively sparse foliage to severe self-occlusion. The tea buds on each tree are manually harvested, and the corresponding counts are recorded as reference values. Three-dimensional reconstruction is then performed using NeRF, and the resulting semantic point clouds are used for clustering-based counting. The qualitative correspondence between reconstructed bud clusters and manually observed buds is visualized in <xref ref-type="fig" rid="f17"><bold>Figure&#xa0;17</bold></xref>, while the case-level counting results are summarized in <xref ref-type="table" rid="T5"><bold>Table&#xa0;5</bold></xref>. This experiment is intended as an illustrative, case-level evaluation rather than a population-level statistical analysis.</p>
<fig id="f17" position="float">
<label>Figure&#xa0;17</label>
<caption>
<p>Illustration of tea bud counting and harvesting-oriented candidate point estimation. <bold>(A&#x2013;C)</bold> Representative examples from three different tea tree scenes. For each row, the left panel shows the reconstructed tea tree point cloud, and the right panel shows the extracted tea bud point cloud. Red dots indicate harvesting-oriented candidate points.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1739203-g017.tif">
<alt-text content-type="machine-generated">Panel A shows a tea shrub on the left and a corresponding scatter-style visualization of detected tea buds on the right. Panel B shows a denser shrub structure with a corresponding scatter distribution of detected tea buds. Panel C shows a taller shrub with a corresponding scatter visualization of detected tea buds. Each row presents a different tea plant example and its associated bud distribution.</alt-text>
</graphic></fig>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Case-level tea bud counting results for three representative tea trees with increasing occlusion complexity.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Tree</th>
<th valign="middle" align="center">The number of predictions</th>
<th valign="middle" align="center">GT</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">A</td>
<td valign="middle" align="center">25</td>
<td valign="middle" align="center">30</td>
</tr>
<tr>
<td valign="middle" align="left">B</td>
<td valign="middle" align="center">198</td>
<td valign="middle" align="center">228</td>
</tr>
<tr>
<td valign="middle" align="left">C</td>
<td valign="middle" align="center">74</td>
<td valign="middle" align="center">87</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Proxy-based evaluation of candidate picking point localization</title>
<p>To quantitatively assess the localization reliability of candidate picking points under realistic conditions, we employ a proxy-based evaluation strategy that focuses on geometric stability under repeated perturbations. Specifically, for each tea bud cluster, the candidate picking point is re-estimated multiple times under random subsampling of the point cloud, and the variability of the resulting points is measured.</p>
<p>It should be clarified that this proxy-based evaluation does not aim to directly measure biological correctness with respect to manually annotated stem&#x2013;bud junctions. Due to the lack of reliable junction-level ground truth for real tea plants at scale, the adopted evaluation focuses on repeatability and consistency, which are necessary properties for harvesting-oriented guidance cues used in downstream planning.</p>
<p>Three quantitative metrics are adopted to assess localization stability: (1) the mean point deviation <italic>&#xb5;<sub>p</sub></italic>, which measures the average Euclidean distance between repeated estimations; (2) the 90th percentile deviation <italic>p</italic>90<italic><sub>p</sub></italic>, which reflects worst-case sensitivity under perturbation; and (3) the pass rate, defined as the proportion of estimations whose deviation falls below a predefined tolerance threshold. Lower values of <italic>&#xb5;<sub>p</sub></italic> and <italic>p</italic>90<italic><sub>p</sub></italic>, together with a higher pass rate, indicate more stable and reliable candidate point estimation.</p>
<p><xref ref-type="table" rid="T6"><bold>Table&#xa0;6</bold></xref> summarizes the proxy-based localization stability results across three individual tea trees under consistent experimental settings (<italic>p</italic> = 5, subsample ratio = 0.5, noise = 0). The proposed method (<italic>ours</italic>) achieves substantially lower spatial deviation than PCA-based baselines and demonstrates competitive stability compared with centroid-based approaches. Although the cluster centroid method yields the lowest numerical deviation and the highest pass rate, it tends to favor geometrically central locations that lack explicit harvesting-oriented structural cues, as it does not incorporate any basal-side or attachment-related priors of tea buds.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Proxy-based localization stability evaluation of candidate picking point estimation methods across different tea trees (<italic>p</italic> &#xa0;=&#xa0;5, subsample ratio = 0.5, noise = 0).</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left" rowspan="2">Tree</th>
<th valign="middle" rowspan="2" align="left">Method</th>
<th valign="middle" colspan="3" align="center">Localization stability metrics</th>
</tr>
<tr>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im1"><mml:mrow><mml:msub><mml:mi>&#x3bc;</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mo>&#x2193;</mml:mo></mml:mrow></mml:math></inline-formula></td>
<td valign="middle" align="center"><inline-formula>
<mml:math display="inline" id="im2"><mml:mrow><mml:mi>p</mml:mi><mml:msub><mml:mrow><mml:mn>90</mml:mn></mml:mrow><mml:mi>p</mml:mi></mml:msub><mml:mo>&#x2193;</mml:mo></mml:mrow></mml:math></inline-formula></td>
<td valign="middle" align="center">Pass rate <inline-formula>
<mml:math display="inline" id="im3"><mml:mo>&#x2191;</mml:mo></mml:math></inline-formula></td>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="7" align="left">Tree1</td>
<td valign="middle" align="left">ours</td>
<td valign="middle" align="center">0.00986</td>
<td valign="middle" align="center">0.01583</td>
<td valign="middle" align="center">0.0849</td>
</tr>
<tr>
<td valign="middle" align="left">low_centroid</td>
<td valign="middle" align="center">0.01403</td>
<td valign="middle" align="center">0.02396</td>
<td valign="middle" align="center">0.0683</td>
</tr>
<tr>
<td valign="middle" align="left">ours_offset (dk=1.5)</td>
<td valign="middle" align="center">0.01351</td>
<td valign="middle" align="center">0.02129</td>
<td valign="middle" align="center">0.0708</td>
</tr>
<tr>
<td valign="middle" align="left">ours_offset (dk=3.0)</td>
<td valign="middle" align="center">0.02107</td>
<td valign="middle" align="center">0.03306</td>
<td valign="middle" align="center">0.0557</td>
</tr>
<tr>
<td valign="middle" align="left">ours_offset (dk=5.0)</td>
<td valign="middle" align="center">0.03256</td>
<td valign="middle" align="center">0.05051</td>
<td valign="middle" align="center">0.0481</td>
</tr>
<tr>
<td valign="middle" align="left">cluster_centroid</td>
<td valign="middle" align="center">0.00403</td>
<td valign="middle" align="center">0.00645</td>
<td valign="middle" align="center">0.6368</td>
</tr>
<tr>
<td valign="middle" align="left">pca_low</td>
<td valign="middle" align="center">0.06240</td>
<td valign="middle" align="center">0.08161</td>
<td valign="middle" align="center">0.0038</td>
</tr>
<tr>
<td valign="middle" rowspan="7" align="left">Tree2</td>
<td valign="middle" align="left">ours</td>
<td valign="middle" align="center">0.01221</td>
<td valign="middle" align="center">0.01976</td>
<td valign="middle" align="center">0.0980</td>
</tr>
<tr>
<td valign="middle" align="left">low_centroid</td>
<td valign="middle" align="center">0.01623</td>
<td valign="middle" align="center">0.03545</td>
<td valign="middle" align="center">0.0911</td>
</tr>
<tr>
<td valign="middle" align="left">ours_offset (dk=1.5)</td>
<td valign="middle" align="center">0.01511</td>
<td valign="middle" align="center">0.02438</td>
<td valign="middle" align="center">0.0940</td>
</tr>
<tr>
<td valign="middle" align="left">ours_offset (dk=3.0)</td>
<td valign="middle" align="center">0.02131</td>
<td valign="middle" align="center">0.03443</td>
<td valign="middle" align="center">0.0880</td>
</tr>
<tr>
<td valign="middle" align="left">ours_offset (dk=5.0)</td>
<td valign="middle" align="center">0.03105</td>
<td valign="middle" align="center">0.04957</td>
<td valign="middle" align="center">0.0820</td>
</tr>
<tr>
<td valign="middle" align="left">cluster_centroid</td>
<td valign="middle" align="center">0.00415</td>
<td valign="middle" align="center">0.00636</td>
<td valign="middle" align="center">0.4880</td>
</tr>
<tr>
<td valign="middle" align="left">pca_low</td>
<td valign="middle" align="center">0.06877</td>
<td valign="middle" align="center">0.08681</td>
<td valign="middle" align="center">0.0000</td>
</tr>
<tr>
<td valign="middle" rowspan="7" align="left">Tree3</td>
<td valign="middle" align="left">ours</td>
<td valign="middle" align="center">0.01003</td>
<td valign="middle" align="center">0.01646</td>
<td valign="middle" align="center">0.2076</td>
</tr>
<tr>
<td valign="middle" align="left">low_centroid</td>
<td valign="middle" align="center">0.01502</td>
<td valign="middle" align="center">0.02545</td>
<td valign="middle" align="center">0.1895</td>
</tr>
<tr>
<td valign="middle" align="left">ours_offset (dk=1.5)</td>
<td valign="middle" align="center">0.01284</td>
<td valign="middle" align="center">0.02050</td>
<td valign="middle" align="center">0.1973</td>
</tr>
<tr>
<td valign="middle" align="left">ours_offset (dk=3.0)</td>
<td valign="middle" align="center">0.01811</td>
<td valign="middle" align="center">0.02913</td>
<td valign="middle" align="center">0.1781</td>
</tr>
<tr>
<td valign="middle" align="left">ours_offset (dk=5.0)</td>
<td valign="middle" align="center">0.02626</td>
<td valign="middle" align="center">0.04258</td>
<td valign="middle" align="center">0.1570</td>
</tr>
<tr>
<td valign="middle" align="left">cluster_centroid</td>
<td valign="middle" align="center">0.00353</td>
<td valign="middle" align="center">0.00560</td>
<td valign="middle" align="center">0.6908</td>
</tr>
<tr>
<td valign="middle" align="left">pca_low</td>
<td valign="middle" align="center">0.08069</td>
<td valign="middle" align="center">0.10390</td>
<td valign="middle" align="center">0.0027</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To further analyze the sensitivity of the proposed method to candidate point displacement, an offset strategy is introduced, where the estimated candidate picking point is shifted along the negative normal direction of the fitted local plane by a scaled distance &#x394;<italic>k</italic>. As &#x394;<italic>k</italic> increases from 1.5 to 5.0, both <italic>&#xb5;<sub>p</sub></italic> and <italic>p</italic>90<italic><sub>p</sub></italic> increase monotonically, accompanied by a gradual decrease in pass rate. This trend indicates that excessive displacement amplifies instability under perturbations, highlighting the importance of moderate offsets when incorporating harvesting-oriented geometric priors.</p>
<p>Overall, the results demonstrate that the proposed method provides a stable and task-consistent harvesting-oriented guidance cue under geometric perturbations. While centroid-based approaches achieve strong numerical stability, the proposed strategy integrates basal-side geometric priors that are more aligned with harvesting-oriented localization. Its localization objective focuses on harvesting-oriented geometric referencing rather than precise anatomical reconstruction of the stem&#x2013;bud junction. The offset analysis further clarifies the trade-off between robustness and harvesting-oriented displacement in candidate point estimation.</p>
</sec>
</sec>
<sec id="s4" sec-type="discussion">
<label>4</label>
<title>Discussion</title>
<sec id="s4_1">
<label>4.1</label>
<title>Multi-model fusion for tea bud semantic segmentation</title>
<p>The experimental results demonstrate that the proposed tea bud segmentation framework benefits from the complementary strengths of the large-scale foundation model SAM2 and the lightweight detector YOLOv11. Tea bud segmentation remains a challenging task due to limited annotated data and severe occlusion among buds and surrounding leaves, which often leads to boundary ambiguity and missed detections in complex canopy environments.</p>
<p>While SAM2 exhibits strong generalization capability on unseen samples, its performance on small-scale targets such as tea buds is constrained in the absence of effective localization cues. Conversely, the YOLOv11-based detector provides reliable coarse localization but may suffer from incomplete boundaries under heavy occlusion. By introducing YOLOv11 detection results as prompts for SAM2, the proposed fusion strategy reduces segmentation ambiguity for small and partially occluded buds, leading to improved segmentation consistency and reduced annotation effort.</p>
<p>Nevertheless, segmentation errors are not entirely eliminated. Residual over-segmentation and under-segmentation persist in regions with extreme occlusion or weak visual contrast, and such errors may propagate to subsequent 3D reconstruction stages. In particular, missed or truncated bud regions in 2D segmentation cannot be recovered in later processing, highlighting the critical role of segmentation recall in harvesting-oriented 3D perception pipelines.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>3D reconstruction effect</title>
<p>Most existing tea bud studies focus on recognition and localization in two-dimensional images, where occlusion and perspective effects fundamentally limit reliable structural analysis. By introducing a NeRF-based reconstruction framework with monocular depth priors, this study extends tea bud perception into three-dimensional space, enabling bud counting and candidate picking-point estimation from a geometric perspective.</p>
<p>Incorporating depth information derived from Depth Anything v2 into the Nerfacto framework improves reconstruction sharpness and structural fidelity, particularly during the early stages of training and in regions with complex leaf&#x2013;bud interactions. From a methodological perspective, NeRF-based reconstruction provides a more flexible representation for modeling fine-scale plant organs with severe self-occlusion and repetitive textures, which often pose challenges to traditional structure-from-motion pipelines (e.g., COLMAP) by degrading feature matching and geometric consistency.</p>
<p>Nevertheless, reconstruction uncertainty remains an inherent challenge when modeling tea trees with fine-grained geometry and heavy self-occlusion. Monocular depth estimation is prone to ambiguity around thin structures and densely layered foliage, leading to local geometric blur or incomplete surfaces in the reconstructed space. Such errors are spatially correlated rather than random and may propagate into the semantic field and subsequent point cloud extraction, although depth supervision substantially suppresses reconstruction noise.</p>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Depth error analysis in ambiguous regions</title>
<p>While monocular depth supervision improves the overall geometric consistency of NeRF reconstruction, it is inherently subject to estimation errors in visually ambiguous regions such as thin branches and dense foliage. To quantitatively assess how such depth errors may affect the reconstructed geometry, we conduct a region-aware depth reliability analysis by comparing Depth Anything V2 predictions with sparse depth estimates derived from COLMAP reconstruction.</p>
<p>Specifically, sparse 3D points reconstructed by COLMAP are projected onto the image plane, and their camera-space depths are treated as reference depth samples. At corresponding pixel locations, monocular depth predictions are extracted and aligned using robust median scaling to account for the inherent scale ambiguity of monocular depth estimation. Depth errors are then evaluated for all valid samples (Global), manually selected ambiguous regions of interest (ROIs), including thin branches and dense foliage, and non-ROI regions as a control group.</p>
<p>Quantitative results are summarized in <xref ref-type="table" rid="T7"><bold>Table&#xa0;7</bold></xref>. The global region exhibits moderate depth errors, with performance comparable to that of non-ROI regions, indicating that monocular depth supervision provides reasonably consistent structural guidance at the scene level. In contrast, error magnitudes increase noticeably in visually ambiguous regions. Thin-branch ROIs show moderately higher errors, reflecting the difficulty of estimating depth for fine, elongated structures. Dense-foliage ROIs exhibit substantially larger absolute and relative errors, together with significantly heavier-tailed error distributions, as evidenced by the elevated p90 AbsRel values. This behavior can be attributed to severe occlusion, multi-layer leaf overlap, and weak monocular depth cues in dense foliage areas.</p>
<table-wrap id="T7" position="float">
<label>Table&#xa0;7</label>
<caption>
<p>ROI-based depth error statistics after scale alignment.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Region</th>
<th valign="middle" align="center">Points</th>
<th valign="middle" align="center">AbsRel &#x2193;</th>
<th valign="middle" align="center">RMSE &#x2193;</th>
<th valign="middle" align="center">p90 AbsRel &#x2193;</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Global</td>
<td valign="middle" align="center">24,765</td>
<td valign="middle" align="center">1.507</td>
<td valign="middle" align="center">9.727</td>
<td valign="middle" align="center">3.975</td>
</tr>
<tr>
<td valign="middle" align="left">Thin branches ROI</td>
<td valign="middle" align="center">402</td>
<td valign="middle" align="center">1.826</td>
<td valign="middle" align="center">10.862</td>
<td valign="middle" align="center">3.243</td>
</tr>
<tr>
<td valign="middle" align="left">Dense foliage ROI</td>
<td valign="middle" align="center">505</td>
<td valign="middle" align="center">4.234</td>
<td valign="middle" align="center">15.008</td>
<td valign="middle" align="center">7.487</td>
</tr>
<tr>
<td valign="middle" align="left">Non-ROI</td>
<td valign="middle" align="center">23,858</td>
<td valign="middle" align="center">1.444</td>
<td valign="middle" align="center">9.563</td>
<td valign="middle" align="center">3.859</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>These results indicate that depth estimation errors are not uniformly distributed across the scene, but are spatially concentrated in regions with inherent visual ambiguity. Consequently, monocular depth supervision may introduce localized geometric blur or noise in such regions, while preserving global structural fidelity. This region-dependent error characteristic explains the localized reconstruction artifacts observed in dense foliage areas and supports the subsequent stability analysis of candidate picking point estimation.</p>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Analysis of tea bud counts and harvesting-oriented candidate points</title>
<p>Most existing tea bud yield estimation approaches rely on two-dimensional images, where occlusion and projection overlap frequently lead to counting ambiguity. By leveraging a reconstructed semantic point cloud, the proposed method enables tea bud counting from a three-dimensional perspective, which helps alleviate ambiguity caused by overlapping buds in 2D views. Rather than serving as a population-level statistical evaluation, the counting results presented in this study are intended as a case-level illustration to examine the feasibility and behavior of 3D-based counting under different occlusion conditions.</p>
<p>As observed in the representative cases, discrepancies between reconstructed counts and manual reference values still occur. Under-counting typically arises when severely occluded tea buds are incompletely reconstructed and fail to form sufficiently dense point cloud clusters, or when closely adjacent buds are geometrically merged into a single cluster. In contrast, partial fragmentation of an individual bud may occasionally lead to over-counting. These error patterns are not randomly distributed; instead, they reflect the stage-wise accumulation of uncertainty across segmentation, depth estimation, and point cloud clustering, rather than isolated inaccuracies at a single processing step.</p>
<p>For harvesting-oriented candidate point estimation, existing approaches predominantly rely on two-dimensional localization cues or point clouds acquired through LiDAR scanning. In contrast, this study derives candidate points directly from reconstructed semantic point clouds by combining bud-level clustering with RANSAC-based local geometric modeling. The resulting candidate points represent perception-level three-dimensional guidance cues that capture the relative spatial structure of tea buds, rather than anatomically exact stem&#x2013;bud junctions. Although their spatial consistency depends on the completeness of bud reconstruction and the stability of cluster-level geometric fitting, these candidate points provide actionable spatial references for downstream harvesting planning. Systematic validation across larger-scale datasets and explicit occlusion-level stratification are left for future work.</p>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Limitations</title>
<p>Despite the promising results achieved by the proposed pipeline, several limitations remain. First, the proposed framework is designed for high-precision, single-tree-level three-dimensional perception and harvesting planning, where reconstruction quality and geometric reliability are prioritized over processing throughput. Consequently, the current implementation targets offline analysis and is not intended for real-time deployment in large-scale tea gardens, where efficiency and coverage are critical considerations.</p>
<p>Second, the performance of the system is sensitive to data acquisition conditions. Reliable reconstruction requires relatively stable environmental settings, including limited wind disturbance and reasonably uniform illumination. Under dynamic outdoor conditions with strong wind or rapidly changing lighting, image quality degradation may affect both reconstruction accuracy and downstream point cloud analysis.</p>
<p>Third, the dataset used in this study was collected from a single geographic region and a single tea variety, which limits a comprehensive evaluation of cross-region and cross-variety generalization. However, the primary objective of this work is to validate a harvesting-oriented three-dimensional perception framework rather than to claim universal generalization across diverse tea species or plantation conditions. The proposed pipeline relies on general visual and geometric cues, such as object appearance, depth structure, and spatial consistency, rather than handcrafted features specific to a particular tea variety. Systematic evaluation on multi-region and multi-variety datasets will be explored in future work.</p>
<p>In addition, the evaluation of picking-point estimation remains challenging. Due to the dense distribution and large quantity of tea buds on a single tea tree, it is difficult to obtain precise ground-truth picking locations for direct quantitative comparison. Consequently, the current validation primarily focuses on perceptual consistency rather than execution-level accuracy.</p>
<p>Addressing these limitations will require further investigation, including improving data acquisition robustness under complex environmental conditions, enhancing system efficiency for larger-scale scenarios, expanding dataset diversity, and developing reliable strategies for validating picking-point estimates under practical harvesting constraints.</p>
</sec>
<sec id="s6" sec-type="conclusions">
<label>6</label>
<title>Conclusions</title>
<p>This study presents a harvesting-oriented three-dimensional perception framework for tea bud analysis that integrates two-dimensional recognition and segmentation with depth-assisted NeRF reconstruction and semantic point cloud processing. By combining lightweight detection, prompt-guided semantic segmentation based on SAM2, and monocular depth priors, the proposed pipeline improves robustness under occlusion and enables structured three-dimensional representation of tea buds in complex plantation environments.</p>
<p>Based on the reconstructed semantic point cloud, the framework provides two perception-level outputs relevant to harvesting applications: tea bud counting and harvesting-oriented candidate point estimation. Experimental results on captured tea tree scenes indicate that the estimated tea bud counts are generally consistent with manual measurements, demonstrating the potential of three-dimensional perception to support yield estimation. In addition, the estimated candidate points capture the relative spatial structure of tea buds and serve as three-dimensional spatial references that can support downstream harvesting planning.</p>
<p>Overall, this work demonstrates the feasibility of unifying two-dimensional visual perception and neural field-based 3D reconstruction into a single pipeline for harvesting-oriented tea bud analysis. While further improvements are required to enhance scalability, robustness under varying environmental conditions, and execution-level validation, the proposed framework establishes a solid perceptual foundation for future research on intelligent tea harvesting systems. </p>
</sec>
</body>
<back>
<sec id="s7" sec-type="data-availability">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p></sec>
<sec id="s8" sec-type="author-contributions">
<title>Author contributions</title>
<p>WC: Writing &#x2013; review &amp; editing, Writing &#x2013; original draft, Methodology. XL: Resources, Funding acquisition, Writing &#x2013; review &amp; editing, Methodology. LR: Writing &#x2013; review &amp; editing, Data curation, Visualization. XX: Data curation, Writing &#x2013; review &amp; editing, Investigation.</p></sec>
<sec id="s10" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s11" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s12" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<sec id="S13" sec-type="supplementary-material">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fpls.2026.1739203/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fpls.2026.1739203/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="DataSheet1.pdf" id="SM1" mimetype="application/pdf"/></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Chen</surname> <given-names>T.</given-names></name>
<name><surname>Li</surname> <given-names>H.</given-names></name>
<name><surname>Lv</surname> <given-names>J.</given-names></name>
<name><surname>Chen</surname> <given-names>J.</given-names></name>
<name><surname>Wu</surname> <given-names>W.</given-names></name>
</person-group> (<year>2024</year>a). 
<article-title>Segmentation network for multi-shape tea bud leaves based on attention and path feature aggregation</article-title>. <source>Agriculture</source> <volume>14</volume>, <fpage>1388</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agriculture14081388</pub-id>, PMID: <pub-id pub-id-type="pmid">41725453</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Chen</surname> <given-names>H.</given-names></name>
<name><surname>Zhang</surname> <given-names>M.</given-names></name>
<name><surname>Xiao</surname> <given-names>S.</given-names></name>
<name><surname>Wang</surname> <given-names>Q.</given-names></name>
<name><surname>Cai</surname> <given-names>Z.</given-names></name>
<name><surname>Dong</surname> <given-names>Q.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>b). 
<article-title>Quantitative analysis and planting optimization of multi-genotype sugar beet plant types based on 3D plant architecture</article-title>. <source>Comput. Electron. Agric.</source> <volume>225</volume>, <elocation-id>109231</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.109231</pub-id>, PMID: <pub-id pub-id-type="pmid">41737640</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Derpanis</surname> <given-names>K. G.</given-names></name>
</person-group> (<year>2010</year>). 
<article-title>Overview of the RANSAC algorithm</article-title>. <source>Img. Rochester. NY.</source> <volume>4</volume>, <fpage>2</fpage>&#x2013;<lpage>3</lpage>.
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Esser</surname> <given-names>F.</given-names></name>
<name><surname>Rosu</surname> <given-names>R. A.</given-names></name>
<name><surname>Corneli&#xdf;en</surname> <given-names>A.</given-names></name>
<name><surname>Klingbeil</surname> <given-names>L.</given-names></name>
<name><surname>Kuhlmann</surname> <given-names>H.</given-names></name>
<name><surname>Behnke</surname> <given-names>S.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Field robot for high-throughput and high-resolution 3D plant phenotyping: towards efficient and sustainable crop production</article-title>. <source>IEEE Robot. Automat. Mag.</source> <volume>30</volume>, <fpage>20</fpage>&#x2013;<lpage>29</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/MRA.2023.3321402</pub-id>, PMID: <pub-id pub-id-type="pmid">41116384</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gui</surname> <given-names>Z.</given-names></name>
<name><surname>Chen</surname> <given-names>J.</given-names></name>
<name><surname>Li</surname> <given-names>Y.</given-names></name>
<name><surname>Chen</surname> <given-names>Z.</given-names></name>
<name><surname>Wu</surname> <given-names>C.</given-names></name>
<name><surname>Dong</surname> <given-names>C.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>A lightweight tea bud detection model based on Yolov5</article-title>. <source>Comput. Electron. Agric.</source> <volume>205</volume>, <elocation-id>107636</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.107636</pub-id>, PMID: <pub-id pub-id-type="pmid">41737640</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gui</surname> <given-names>J.</given-names></name>
<name><surname>Wu</surname> <given-names>J.</given-names></name>
<name><surname>Wu</surname> <given-names>D.</given-names></name>
<name><surname>Chen</surname> <given-names>J.</given-names></name>
<name><surname>Tong</surname> <given-names>J.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>A lightweight tea buds detection model with occlusion handling</article-title>. <source>J. Food Meas. Character.</source> <volume>18</volume>, <fpage>7533</fpage>&#x2013;<lpage>7549</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11694-024-02746-w</pub-id>, PMID: <pub-id pub-id-type="pmid">41737715</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Jiang</surname> <given-names>L.</given-names></name>
<name><surname>Li</surname> <given-names>C.</given-names></name>
<name><surname>Sun</surname> <given-names>J.</given-names></name>
<name><surname>Chee</surname> <given-names>P.</given-names></name>
<name><surname>Fu</surname> <given-names>L.</given-names></name>
</person-group> (<year>2024</year>). &#x201c;
<article-title>Estimation of cotton boll number and main stem length based on 3D Gaussian splatting</article-title>,&#x201d; in <conf-name>Proceedings of the ASABE Annual International Meeting (AIM)</conf-name>, (<publisher-loc>Anaheim, CA, USA</publisher-loc>: 
<publisher-name>American Society of Agricultural and Biological Engineers, ASABE</publisher-name>), <conf-date>28&#x2013;31 July</conf-date>. <fpage>1</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.13031/aim.202400898</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Jianqiang</surname> <given-names>L.</given-names></name>
<name><surname>Haoxuan</surname> <given-names>L.</given-names></name>
<name><surname>Chaoran</surname> <given-names>Y.</given-names></name>
<name><surname>Xiao</surname> <given-names>L.</given-names></name>
<name><surname>Jiewei</surname> <given-names>H.</given-names></name>
<name><surname>Haiwei</surname> <given-names>W.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>Tea Bud DG: A lightweight tea bud detection model based on dynamic detection head and adaptive loss function</article-title>. <source>Comput. Electron. Agric.</source> <volume>227</volume>, <fpage>109522</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.109522</pub-id>, PMID: <pub-id pub-id-type="pmid">41737640</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Khanam</surname> <given-names>R.</given-names></name>
<name><surname>Hussain</surname> <given-names>M.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>YOLOv11: An overview of the key architectural enhancements</article-title>. <source>arXiv</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2410.17725</pub-id>, PMID: <pub-id pub-id-type="pmid">41363103</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Kirillov</surname> <given-names>A.</given-names></name>
<name><surname>Mintun</surname> <given-names>E.</given-names></name>
<name><surname>Ravi</surname> <given-names>N.</given-names></name>
<name><surname>Mao</surname> <given-names>H.</given-names></name>
<name><surname>Rolland</surname> <given-names>C.</given-names></name>
<name><surname>Gustafson</surname> <given-names>L.</given-names></name>
<etal/>
</person-group>. (<year>2023</year>). 
<article-title>Segment anything</article-title>. In: <source>Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)</source>. (<publisher-loc>Paris, France</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>4015</fpage>&#x2013;<lpage>4026</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV51070.2023.00371</pub-id>, PMID: <pub-id pub-id-type="pmid">41116384</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>W.</given-names></name>
<name><surname>Lu</surname> <given-names>H.</given-names></name>
<name><surname>Fu</surname> <given-names>H.</given-names></name>
<name><surname>Cao</surname> <given-names>Z.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Learning to upsample by learning to sample</article-title>. In: <source>Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)</source>. (<publisher-loc>Paris, France</publisher-loc>: 
<publisher-name>IEEE</publisher-name>) <fpage>6004</fpage>&#x2013;<lpage>6014</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV51070.2023.00554</pub-id>, PMID: <pub-id pub-id-type="pmid">41116384</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>F.</given-names></name>
<name><surname>Wang</surname> <given-names>S.</given-names></name>
<name><surname>Pang</surname> <given-names>S.</given-names></name>
<name><surname>Han</surname> <given-names>Z.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Detection and recognition of tea buds by integrating deep learning and image-processing algorithm</article-title>. <source>J. Food Meas. Character.</source> <volume>18</volume>, <fpage>2744</fpage>&#x2013;<lpage>2761</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11694-023-02351-3</pub-id>, PMID: <pub-id pub-id-type="pmid">41737715</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Long</surname> <given-names>Z.</given-names></name>
<name><surname>Jiang</surname> <given-names>Q.</given-names></name>
<name><surname>Wang</surname> <given-names>J.</given-names></name>
<name><surname>Zhu</surname> <given-names>H.</given-names></name>
<name><surname>Li</surname> <given-names>B.</given-names></name>
<name><surname>Wen</surname> <given-names>F. J.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Research on method of tea flushes vision recognition and picking point localization</article-title>. <source>Sens. Microsyst.</source> <volume>41</volume>, <fpage>39</fpage>&#x2013;<lpage>41</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.13873/J.1000-9787(2022)02-0039-03</pub-id>, PMID: <pub-id pub-id-type="pmid">41740205</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Meyer</surname> <given-names>L.</given-names></name>
<name><surname>Gilson</surname> <given-names>A.</given-names></name>
<name><surname>Schmid</surname> <given-names>U.</given-names></name>
<name><surname>Stamminger</surname> <given-names>M.</given-names></name>
</person-group> (<year>2024</year>). &#x201c;
<article-title>FruitNeRF: A unified neural radiance field based fruit counting framework</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name>, (<publisher-loc>Abu Dhabi, UAE</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>1</fpage>&#x2013;<lpage>8</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/IROS58592.2024.10802065</pub-id>, PMID: <pub-id pub-id-type="pmid">41116384</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Mildenhall</surname> <given-names>B.</given-names></name>
<name><surname>Srinivasan</surname> <given-names>P. P.</given-names></name>
<name><surname>Tancik</surname> <given-names>M.</given-names></name>
<name><surname>Barron</surname> <given-names>J. T.</given-names></name>
<name><surname>Ramamoorthi</surname> <given-names>R.</given-names></name>
<name><surname>Ng</surname> <given-names>R.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>NeRF: Representing scenes as neural radiance fields for view synthesis</article-title>. <source>Commun. ACM</source> <volume>65</volume>, <fpage>99</fpage>&#x2013;<lpage>106</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1145/3503250</pub-id>, PMID: <pub-id pub-id-type="pmid">40727313</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Pan</surname> <given-names>Z.</given-names></name>
<name><surname>Gu</surname> <given-names>J.</given-names></name>
<name><surname>Wang</surname> <given-names>W.</given-names></name>
<name><surname>Fang</surname> <given-names>X.</given-names></name>
<name><surname>Xia</surname> <given-names>Z.</given-names></name>
<name><surname>Wang</surname> <given-names>Q.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>Picking point identification and localization method based on swin-transformer for high-quality tea</article-title>. <source>J. King Saud Univ. Comput. Inf. Sci</source>. <volume>36</volume>, <fpage>102262</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jksuci.2024.102262</pub-id>, PMID: <pub-id pub-id-type="pmid">41737640</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ravi</surname> <given-names>N.</given-names></name>
<name><surname>Gabeur</surname> <given-names>V.</given-names></name>
<name><surname>Hu</surname> <given-names>Y.-T.</given-names></name>
<name><surname>Hu</surname> <given-names>R.</given-names></name>
<name><surname>Ryali</surname> <given-names>C.</given-names></name>
<name><surname>Ma</surname> <given-names>T.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>SAM 2: Segment anything in images and videos</article-title>. <source>arXiv preprint</source>. <volume>arXiv</volume>, <fpage>2408.00714</fpage>. Available online at: <uri xlink:href="https://arxiv.org/abs/2408.00714">https://arxiv.org/abs/2408.00714</uri> (Accessed <date-in-citation content-type="access-date">March 20, 2026</date-in-citation>).
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Ronneberger</surname> <given-names>O.</given-names></name>
<name><surname>Fischer</surname> <given-names>P.</given-names></name>
<name><surname>Brox</surname> <given-names>T.</given-names></name>
</person-group> (<year>2015</year>). &#x201c;
<article-title>U-net: convolutional networks for biomedical image segmentation</article-title>,&#x201d; in <conf-name>Proceedings of the International Conference on Medical Image Computing and Computer-Assisted Intervention (MICCAI)</conf-name>, (<publisher-loc>Munich, Germany</publisher-loc>: 
<publisher-name>Springer</publisher-name>) <fpage>234</fpage>&#x2013;<lpage>241</lpage>.
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Saeed</surname> <given-names>F.</given-names></name>
<name><surname>Sun</surname> <given-names>J.</given-names></name>
<name><surname>Ozias-Akins</surname> <given-names>P.</given-names></name>
<name><surname>Chu</surname> <given-names>Y. J.</given-names></name>
<name><surname>Li</surname> <given-names>C. C.</given-names></name>
</person-group> (<year>2023</year>). &#x201c;
<article-title>PeanutNeRF: 3D radiance field for peanuts</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)</conf-name>, (<publisher-loc>Vancouver, BC, Canada</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). <fpage>6254</fpage>&#x2013;<lpage>6263</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPRW59228.2023.00665</pub-id>, PMID: <pub-id pub-id-type="pmid">41116384</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Schubert</surname> <given-names>E.</given-names></name>
<name><surname>Sander</surname> <given-names>J.</given-names></name>
<name><surname>Ester</surname> <given-names>M.</given-names></name>
<name><surname>Kriegel</surname> <given-names>H. P.</given-names></name>
<name><surname>Xu</surname> <given-names>X.</given-names></name>
</person-group> (<year>2017</year>). 
<article-title>DBSCAN revisited, revisited: why and how you should (Still) use DBSCAN</article-title>. <source>ACM Trans. Database Syst.</source> <volume>42</volume>, <fpage>1</fpage>&#x2013;<lpage>19</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1145/3068335</pub-id>, PMID: <pub-id pub-id-type="pmid">40727313</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Song</surname> <given-names>Y.</given-names></name>
<name><surname>Zheng</surname> <given-names>Z.</given-names></name>
<name><surname>Zhang</surname> <given-names>H.</given-names></name>
<name><surname>Liu</surname> <given-names>Z.</given-names></name>
<name><surname>Chen</surname> <given-names>L.</given-names></name>
<name><surname>Ning</surname> <given-names>J.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>High-precision method for simultaneous tea bud and picking point detection using morphology heatmap labels</article-title>. <source>Appl. Eng. Agric</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.13031/aea.16407</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>M.</given-names></name>
<name><surname>Li</surname> <given-names>Y.</given-names></name>
<name><surname>Meng</surname> <given-names>H.</given-names></name>
<name><surname>Chen</surname> <given-names>Z.</given-names></name>
<name><surname>Gui</surname> <given-names>Z.</given-names></name>
<name><surname>Li</surname> <given-names>Y.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>Small target tea bud detection based on improved YOLOv5 in complex background</article-title>. <source>Front. Plant Sci.</source> <volume>15</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2024.1393138</pub-id>, PMID: <pub-id pub-id-type="pmid">38887461</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>X.</given-names></name>
<name><surname>Wu</surname> <given-names>Z.</given-names></name>
<name><surname>Xiao</surname> <given-names>G.</given-names></name>
<name><surname>Han</surname> <given-names>C.</given-names></name>
<name><surname>Fang</surname> <given-names>C.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>YOLOv7-DWS: Tea bud recognition and detection network in multi-density environment via improved YOLOv7</article-title>. <source>Front. Plant Sci.</source> <volume>15</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2024.1503033</pub-id>, PMID: <pub-id pub-id-type="pmid">39840356</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wu</surname> <given-names>X.</given-names></name>
<name><surname>Zhang</surname> <given-names>F.</given-names></name>
<name><surname>L&#xfc;</surname> <given-names>J.</given-names></name>
</person-group> (<year>2013</year>). 
<article-title>Research on recognition of tea tender leaf based on image color information</article-title>. <source>J. Tea. Sci.</source> <volume>33</volume>, <fpage>584</fpage>&#x2013;<lpage>589</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3969/j.issn.1000-369X.2013.06.018</pub-id>, PMID: <pub-id pub-id-type="pmid">35900448</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yan</surname> <given-names>L.</given-names></name>
<name><surname>Wu</surname> <given-names>K.</given-names></name>
<name><surname>Lin</surname> <given-names>J.</given-names></name>
<name><surname>Xu</surname> <given-names>X.</given-names></name>
<name><surname>Zhang</surname> <given-names>J.</given-names></name>
<name><surname>Zhao</surname> <given-names>X.</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). 
<article-title>Identification and picking point positioning of tender tea shoots based on MR3P-TS model</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>, <elocation-id>962391</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.962391</pub-id>, PMID: <pub-id pub-id-type="pmid">36035663</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>H.</given-names></name>
<name><surname>Chen</surname> <given-names>L.</given-names></name>
<name><surname>Chen</surname> <given-names>M.</given-names></name>
<name><surname>Ma</surname> <given-names>Z.</given-names></name>
<name><surname>Deng</surname> <given-names>F.</given-names></name>
<name><surname>Li</surname> <given-names>M.</given-names></name>
<etal/>
</person-group>. (<year>2019</year>). 
<article-title>Tender tea shoots recognition and positioning for picking robot using improved YOLO-V3 model</article-title>. <source>IEEE Access</source> <volume>7</volume>, <fpage>180998</fpage>&#x2013;<lpage>181011</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2019.2958614</pub-id>, PMID: <pub-id pub-id-type="pmid">41116384</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>L.</given-names></name>
<name><surname>Kang</surname> <given-names>B.</given-names></name>
<name><surname>Huang</surname> <given-names>Z.</given-names></name>
<name><surname>Zhao</surname> <given-names>Z.</given-names></name>
<name><surname>Xu</surname> <given-names>X.</given-names></name>
<name><surname>Feng</surname> <given-names>J.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>a). 
<article-title>Depth anything V2</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>37</volume>, <fpage>21875</fpage>&#x2013;<lpage>21911</lpage>.
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>X.</given-names></name>
<name><surname>Lu</surname> <given-names>X.</given-names></name>
<name><surname>Xie</surname> <given-names>P.</given-names></name>
<name><surname>Guo</surname> <given-names>Z.</given-names></name>
<name><surname>Fang</surname> <given-names>H.</given-names></name>
<name><surname>Fu</surname> <given-names>H.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>b). 
<article-title>PanicleNeRF: Low-cost, high-precision in-field phenotyping of rice panicles with smartphone</article-title>. <source>Plant Phenomics.</source> <volume>6</volume>, <elocation-id>279</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.34133/plantphenomics.0279</pub-id>, PMID: <pub-id pub-id-type="pmid">39639877</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>F.</given-names></name>
<name><surname>Yang</surname> <given-names>L.</given-names></name>
<name><surname>Tian</surname> <given-names>Y.</given-names></name>
<name><surname>Yang</surname> <given-names>Q.</given-names></name>
</person-group> (<year>2009</year>). 
<article-title>Recognition of the tea sprout based on color and shape features</article-title>. <source>Trans. Chin. Soc Agric. Mach.</source> <volume>40</volume>, <fpage>119</fpage>&#x2013;<lpage>123</lpage>.
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>L.</given-names></name>
<name><surname>Zhang</surname> <given-names>R.-Y.</given-names></name>
<name><surname>Li</surname> <given-names>L.</given-names></name>
<name><surname>Xie</surname> <given-names>X.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>SimAM: A simple, parameter-free attention module for convolutional neural networks</article-title>,&#x201d; in <conf-name>Proceedings of the 38th International Conference on Machine Learning (ICML)</conf-name>. (<publisher-loc>Virtual Event</publisher-loc>: 
<publisher-name>PMLR</publisher-name>) <fpage>11863</fpage>&#x2013;<lpage>11874</lpage>.
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yu</surname> <given-names>X.-L.</given-names></name>
<name><surname>He</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2018</year>). 
<article-title>Optimization of tea-leaf saponins water extraction and relationships between their contents and tea (<italic>Camellia sinensis</italic>) tree varieties</article-title>. <source>Food Sci. Nutr.</source> <volume>6</volume>, <fpage>1734</fpage>&#x2013;<lpage>1740</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/fsn3.724</pub-id>, PMID: <pub-id pub-id-type="pmid">30258618</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>X.</given-names></name>
<name><surname>Srinivasan</surname> <given-names>P. P.</given-names></name>
<name><surname>Deng</surname> <given-names>B.</given-names></name>
<name><surname>Debevec</surname> <given-names>P.</given-names></name>
<name><surname>Freeman</surname> <given-names>W. T.</given-names></name>
<name><surname>Barron</surname> <given-names>J. T.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>NeRFactor: Neural factorization of shape and reflectance under an unknown illumination</article-title>. <source>ACM Trans. Graph.</source> <volume>40</volume>, <fpage>1</fpage>&#x2013;<lpage>237</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1145/3478513.3480496</pub-id>, PMID: <pub-id pub-id-type="pmid">40727313</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>F.</given-names></name>
<name><surname>Sun</surname> <given-names>H.</given-names></name>
<name><surname>Xie</surname> <given-names>S.</given-names></name>
<name><surname>Dong</surname> <given-names>C.</given-names></name>
<name><surname>Li</surname> <given-names>Y.</given-names></name>
<name><surname>Xu</surname> <given-names>Y.</given-names></name>
<etal/>
</person-group>. (<year>2023</year>a). 
<article-title>A tea bud segmentation, detection and picking point localization based on the MDY7-3PTB model</article-title>. <source>Front. Plant Sci.</source> <volume>14</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2023.1199473</pub-id>, PMID: <pub-id pub-id-type="pmid">37841621</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>J.</given-names></name>
<name><surname>Wang</surname> <given-names>X.</given-names></name>
<name><surname>Ni</surname> <given-names>X.</given-names></name>
<name><surname>Dong</surname> <given-names>F.</given-names></name>
<name><surname>Tang</surname> <given-names>L.</given-names></name>
<name><surname>Sun</surname> <given-names>J.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>Neural radiance fields for multi-scale constraint-free 3D reconstruction and rendering in orchard scenes</article-title>. <source>Comput. Electron. Agric.</source> <volume>217</volume>, <elocation-id>108629</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.108629</pub-id>, PMID: <pub-id pub-id-type="pmid">41737640</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>H.</given-names></name>
<name><surname>Xu</surname> <given-names>C.</given-names></name>
<name><surname>Zhang</surname> <given-names>S.</given-names></name>
</person-group> (<year>2023</year>b). 
<article-title>Inner-IoU: More effective intersection over union loss with auxiliary bounding box</article-title>. <source>arXiv preprint</source>. <volume>arXiv</volume>, <fpage>2311.02877</fpage>. Available online at: <uri xlink:href="https://arxiv.org/abs/2311.02877">https://arxiv.org/abs/2311.02877</uri> (Accessed <date-in-citation content-type="access-date">March 20, 2026</date-in-citation>).
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>D.</given-names></name>
<name><surname>Zhang</surname> <given-names>R.</given-names></name>
<name><surname>Chen</surname> <given-names>L.</given-names></name>
<name><surname>Zhang</surname> <given-names>L.</given-names></name>
<name><surname>Yi</surname> <given-names>T.</given-names></name>
<name><surname>Feng</surname> <given-names>Q.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Adaptive tracking and cutting control system for tea canopy: design and experimental evaluation</article-title>. <source>Agriculture</source> <volume>15</volume>, <fpage>557</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agriculture15050557</pub-id>, PMID: <pub-id pub-id-type="pmid">41725453</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhou</surname> <given-names>C.</given-names></name>
<name><surname>Zhu</surname> <given-names>Y.</given-names></name>
<name><surname>Zhang</surname> <given-names>J.</given-names></name>
<name><surname>Ding</surname> <given-names>Z.</given-names></name>
<name><surname>Jiang</surname> <given-names>W.</given-names></name>
<name><surname>Zhang</surname> <given-names>K.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>The tea buds detection and yield estimation method based on optimized YOLOv8</article-title>. <source>Sci. Hortic.</source> <volume>338</volume>, <elocation-id>113730</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.scienta.2024.113730</pub-id>, PMID: <pub-id pub-id-type="pmid">41737640</pub-id>
</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhu</surname> <given-names>L.</given-names></name>
<name><surname>Zhang</surname> <given-names>Z.</given-names></name>
<name><surname>Lin</surname> <given-names>G.</given-names></name>
<name><surname>Chen</surname> <given-names>P.</given-names></name>
<name><surname>Li</surname> <given-names>X.</given-names></name>
<name><surname>Zhang</surname> <given-names>S.</given-names></name>
</person-group> (<year>2023</year>a). 
<article-title>Detection and localization of tea bud based on improved YOLOv5s and 3D point cloud processing</article-title>. <source>Agronomy</source> <volume>13</volume>, <elocation-id>2412</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy13092412</pub-id>, PMID: <pub-id pub-id-type="pmid">41725453</pub-id>
</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhu</surname> <given-names>B.</given-names></name>
<name><surname>Zhang</surname> <given-names>Y.</given-names></name>
<name><surname>Sun</surname> <given-names>Y.</given-names></name>
<name><surname>Shi</surname> <given-names>Y.</given-names></name>
<name><surname>Ma</surname> <given-names>Y.</given-names></name>
<name><surname>Guo</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2023</year>b). 
<article-title>Quantitative estimation of organ-scale phenotypic parameters of field crops through 3D modeling using extremely low altitude UAV images</article-title>. <source>Comput. Electron. Agric.</source> <volume>210</volume>, <elocation-id>107910</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.107910</pub-id>, PMID: <pub-id pub-id-type="pmid">41737640</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1294519">Yuzhen Lu</ext-link>, Michigan State University, United States</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3091389">Taojie Yu</ext-link>, Zhejiang Sci-Tech University, China</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3161713">Md Shaha Nur Kabir</ext-link>, Hajee Mohammad Danesh Science and Technology University, Bangladesh</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3282353">Parvin Mohammadi</ext-link>, Northwest A&amp;F University, China</p></fn>
</fn-group>
</back>
</article>