<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2025.1492110</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>CTDA: an accurate and efficient cherry tomato detection algorithm in complex environments</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Liang</surname>
<given-names>Zhi</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2835526"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Caihong</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Lin</surname>
<given-names>Zhonglong</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Guoqiang</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Li</surname>
<given-names>Xiaojuan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Zou</surname>
<given-names>Xiangjun</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2158137"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>School of Mechanical Engineering, Xinjiang University</institution>, <addr-line>Urumqi</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Institute of Agricultural Mechanization, Xinjiang Academy of Agricultural Sciences</institution>, <addr-line>Urumqi, Xinjiang</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Pei Wang, Southwest University, China</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Hariharan Shanmugasundaram, Vardhaman College of Engineering, India</p>
<p>Xizhe Fu, Shihezi University, China</p>
<p>Tao Ding, Beijing University of Technology, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Xiaojuan Li, <email xlink:href="mailto:xjli@xju.edu.cn">xjli@xju.edu.cn</email>; Xiangjun Zou, <email xlink:href="mailto:xjzou@scau.edu.cn">xjzou@scau.edu.cn</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>13</day>
<month>03</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>16</volume>
<elocation-id>1492110</elocation-id>
<history>
<date date-type="received">
<day>14</day>
<month>10</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>21</day>
<month>02</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Liang, Zhang, Lin, Wang, Li and Zou</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Liang, Zhang, Lin, Wang, Li and Zou</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>In the natural harvesting conditions of cherry tomatoes, the robotic vision for harvesting faces challenges such as lighting, overlapping, and occlusion among various environmental factors. To ensure accuracy and efficiency in detecting cherry tomatoes in complex environments, the study proposes a precise, real-time, and robust target detection algorithm: the CTDA model, to support robotic harvesting operations in unstructured environments.</p>
</sec>
<sec>
<title>Methods</title>
<p>The model, based on YOLOv8, introduces a lightweight downsampling method to restructure the backbone network, incorporating adaptive weights and receptive field spatial characteristics to ensure that low-dimensional small target features are not completely lost. By using softpool to replace maxpool in SPPF, a new SPPFS is constructed, achieving efficient feature utilization and richer multi-scale feature fusion. Additionally, by incorporating a dynamic head driven by the attention mechanism, the recognition precision of cherry tomatoes in complex scenarios is enhanced through more effective feature capture across different scales.</p>
</sec>
<sec>
<title>Results</title>
<p>CTDA demonstrates good adaptability and robustness in complex scenarios. Its detection accuracy reaches 94.3%, with recall and average precision of 91.5% and 95.3%, respectively, while achieving an mAP@0.5:0.95 of 76.5% and a detection speed of 154.1 frames per second (FPS). Compared to YOLOv8, it improves mAP by 2.9% while maintaining detection speed, with a model size of 6.7M.</p>
</sec>
<sec>
<title>Discussion</title>
<p>Experimental results validate the effectiveness of the CTDA model in cherry tomato detection under complex environments. While improving detection accuracy, the model also enhances adaptability to lighting variations, occlusion, and dense small target scenarios, and can be deployed on edge devices for rapid detection, providing strong support for automated cherry tomato picking.</p>
</sec>
</abstract>
<kwd-group>
<kwd>picking robot</kwd>
<kwd>cherry tomato detection</kwd>
<kwd>deep learning</kwd>
<kwd>YOLO</kwd>
<kwd>multi-scale feature fusion</kwd>
</kwd-group>
<counts>
<fig-count count="14"/>
<table-count count="6"/>
<equation-count count="15"/>
<ref-count count="43"/>
<page-count count="18"/>
<word-count count="8346"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Sustainable and Intelligent Phytoprotection</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>The cherry tomato is a small tomato known for its rich nutritional value and wide-spread market demand, making it one of the important economic crops worldwide (<xref ref-type="bibr" rid="B5">Chen et&#xa0;al., 2021</xref>). Harvesting cherry tomatoes is a critical process in their agricultural production. Current harvesting methods rely primarily on manual labor, which presents problems such as high labor costs, low harvesting efficiency, and other related challenges. In addition, these methods cannot meet the demands of large-scale production, which hinders the sustainable development of the cherry tomato industry (<xref ref-type="bibr" rid="B12">Ishii et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B25">Montoya-Cavero et&#xa0;al., 2022</xref>). In recent years, with the advancement of agricultural mechanization and automation technologies, robotic harvesting has gradually become a viable alternative. The accuracy, adaptability, and real-time capabilities of robotic vision systems are the primary technological supports for reliable robotic harvesting, and they also determine harvesting efficiency (<xref ref-type="bibr" rid="B23">Magalh&#xe3;es et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B16">Li Y. et&#xa0;al., 2024</xref>). Computer vision has a wide range of applications in various fields (<xref ref-type="bibr" rid="B35">Wang H. et al., 2023</xref>; <xref ref-type="bibr" rid="B31">Tang et&#xa0;al., 2024</xref>), such as robotic navigation, 3D imaging (<xref ref-type="bibr" rid="B18">Li et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B17">Li H. et&#xa0;al., 2024</xref>), and remote sensing (<xref ref-type="bibr" rid="B13">Jamal Jumaah et&#xa0;al., 2024</xref>), which are crucial for enhancing the efficiency and accuracy of agricultural tasks like fruit harvesting. Currently, most cherry tomatoes are cultivated in greenhouses. 
Compared to traditional open-field cultivation, greenhouse cultivation adopts relatively standardized practices optimized for mechanized harvesting, such as uniform plant spacing, consistent plant height, and structured plant arrangement, providing an agronomic basis for robotic harvesting. However, under greenhouse conditions, variations in lighting, occlusions caused by the clustering growth of fruits, the small size of the picking targets, and the complexity of the background (including background interference from the intermingling growth of plant branches and leaves, and changes in the color distribution between fruits and leaves) due to the coupling effects of multiple factors, increase the difficulty of visual detection, impacting the precise identification and localization of fruits. Variations in the height between individual plants, irregularity in the arrangement of leaves, the uneven distribution of fruits, and partial or complete occlusion between fruits and leaves are unstructured characteristics that pose numerous challenges to the visual detection of picking robots (<xref ref-type="bibr" rid="B40">Zhang et&#xa0;al., 2024</xref>).</p>
<p>Traditional target detection methods based on image processing are classic approaches to fruit detection, which require extracting target features and learning to classify and recognize them. Morphological and color-based analysis methods have been widely applied to fruit detection. <xref ref-type="bibr" rid="B28">Septiarini et&#xa0;al. (2022)</xref> proposed a tomato image segmentation method, applying K-means clustering for ROI detection, performing RGB to HSV color space conversion in preprocessing, and using the Canny operator for edge detection, successfully achieving tomato feature extraction and detection. <xref ref-type="bibr" rid="B19">Li J. et&#xa0;al. (2024)</xref> proposed single and dual-mode image demodulation, brightness correction, and image segmentation algorithms, utilizing fast average filtering based on integral images to enhance the contrast between rotten areas and fruit backgrounds. Their approach significantly improved the early detection of rotten navel oranges, achieving a recognition accuracy of 97.5% using a two-phase spiral phase transform (SPT) combined with contrast adjustment and watershed segmentation. Feature extraction and classification methods have been widely explored to improve recognition accuracy. <xref ref-type="bibr" rid="B1">Bai et&#xa0;al. (2023)</xref> proposed a tomato recognition method integrating HOG, LBP, and color histogram algorithms. By concatenating shape, texture, and color feature vectors into a combined feature vector and processing it using a Support Vector Machine (SVM), the model achieved 100% accuracy under ideal conditions, with a processing time of less than one second. <xref ref-type="bibr" rid="B22">Liu et&#xa0;al. (2019)</xref> introduced a mature tomato detection algorithm combining HOG features with an SVM classifier, using a coarse-to-fine scanning approach. 
The model was further refined using false color removal (FCR) and non-maximum suppression (NMS), achieving a recall rate of 90.00%, a precision of 94.41%, and an F1 score of 92.15%. <xref ref-type="bibr" rid="B3">Chaivivatrakul and Dailey (2014)</xref> proposed a plant green fruit detection technique based on texture analysis, employing interest point feature extraction, descriptor calculation, SVM classification, candidate fruit point mapping, morphological closure, and fruit region extraction. The model achieved detection rates of 85% and 100% for single images of pineapple and bitter melon, respectively. Traditional image recognition techniques often rely on manual feature design; they achieve fruit recognition in specific scenes but lack an understanding of the overall semantics of the image and are not well adapted to complex and changing unstructured environments (<xref ref-type="bibr" rid="B26">Qi et&#xa0;al., 2022</xref>).</p>
<p>Compared to traditional image processing methods, deep learning employs deep neural networks to automatically extract hierarchical features, reducing reliance on manual feature engineering. Through large-scale data training, it optimizes feature representation, enhances robustness to noise and defects, and improves detection stability (<xref ref-type="bibr" rid="B2">Banerjee et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B14">Kasani et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B38">Zeng et&#xa0;al., 2023</xref>). In fruit detection, deep learning models have been widely used due to their superior feature representation capabilities and increased detection accuracy over conventional image processing techniques (<xref ref-type="bibr" rid="B6">Chen S. et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B24">Meng et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B33">Tang et&#xa0;al., 2023b</xref>). <xref ref-type="bibr" rid="B15">Lawal (2021)</xref> proposed the YOLO-Tomato model for detecting tomatoes under complex environmental conditions by applying the LWYS method with spatial pyramid pooling as well as Mish activation function and integrating the dense architecture into YOLOv3. The mAP reaches more than 98% on small resolution datasets and the detection time is less than 50ms. <xref ref-type="bibr" rid="B43">Zheng et&#xa0;al. (2022)</xref> developed an enhanced method for recognizing cherry tomatoes by improving the YOLOx model. This approach integrates an attention mechanism within the dense network&#x2019;s backbone to enhance overall recognition performance. While these methods enhance detection performance by incorporating additional modules or modifying existing components, they also result in a larger and more complex model. This increased complexity poses challenges for deployment and utilization on edge devices. 
Therefore, to solve the challenges associated with implementing complex models on edge devices for robotic harvesting, researchers are paying more attention to balancing accuracy and model complexity while enhancing model performance. For instance, <xref ref-type="bibr" rid="B10">Gao et&#xa0;al. (2024)</xref> proposed LACTA, a lightweight, high-precision cherry tomato detection algorithm with a model size of only 2.88 M, which can be better deployed to selective harvesting robots. <xref ref-type="bibr" rid="B37">Yang et&#xa0;al. (2023)</xref> proposed an automatic tomato detection method based on an improved YOLOv8s model. The mAP of the enhanced model was increased by 1.5%, and the model size was significantly reduced from 22 M to 16 M. At the same time, a detection speed of 138.8 FPS was achieved, which is a better balance between the model size and detection accuracy. However, most of the lightweight algorithms proposed currently target tomatoes with distinct close-range features, are set in relatively simple background environments and do not consider the environmental impacts experienced by robots during actual harvesting operations, often resulting in somewhat singular data samples (<xref ref-type="bibr" rid="B39">Zhang et&#xa0;al., 2023</xref>). Therefore, further research into cherry tomato detection algorithms in actual working environments, ensuring that deployment on edge devices still maintains satisfactory real-time performance and accuracy, remains a highly challenging issue.</p>
<p>To realize accurate and efficient cherry tomato recognition in complex environments, this paper proposes an accurate, lightweight, real-time, and efficient cherry tomato detection algorithm. Firstly, to enhance the detection model&#x2019;s adaptability to the diversity of fruit features, lighting conditions, and background environments, this study employs a multi-source dataset augmentation strategy, selectively expanding the original dataset and establishing targeted datasets. Secondly, to further optimize and balance the detection efficiency and the capability to extract small target features under complex lighting conditions, the backbone network of the YOLOv8 model is reconstructed. The LAWDarknet53 network is introduced to replace the CBSDarknet53, allowing the model to retain more details while reducing redundant computations when extracting image features from shallow to deep layers. Considering the issues of occlusions, overlaps, and density that occur during the actual harvesting process, the SPPFS network is proposed to better capture subtle feature changes caused by environmental variations. The introduction of a dynamic head detection head focuses on capturing valuable details, enhancing the model&#x2019;s understanding and detection accuracy in complex environments. This algorithm adapts well to unstructured environments under natural conditions, possessing good generalization and robustness, capable of being deployed on edge devices to efficiently and effectively complete detection tasks while ensuring performance.</p>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Data acquisition</title>
<p>Data on cherry tomatoes were collected at the Changji Agricultural Expo Park planting base in Xinjiang. To ensure the consistency of the growing environment and the quality of the fruit, the planting base uses a uniform vertical planting system. In this study, data on cherry tomatoes was collected using a handheld portable camera with a resolution of 1920 &#xd7; 1080. To ensure data diversity and broad coverage, images were collected under various lighting and occlusion conditions to simulate different actual planting environments, such as direct sunlight, shadows, fruit overlap, and occlusions. During the image collection process, the study captured images from multiple angles, including frontal, overhead, oblique, and upward angles, to capture features such as the shape, color, and texture of the cherry tomatoes, and conducted image collection at close, medium, and long distances to obtain fruit images at different scales. During the data collection period, the cherry tomatoes were in the ripening stage, with some of the fruits fully matured and meeting harvesting standards. A total of 2500 images of both mature and immature cherry tomatoes were collected, as shown in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Images of cherry tomatoes captured in a complex greenhouse environment under various conditions: <bold>(a)</bold> natural lighting, <bold>(b)</bold> intense lighting, <bold>(c)</bold> dim lighting, <bold>(d)</bold> shaded areas, <bold>(e)</bold> overlapping clusters, and <bold>(f)</bold> small targets positioned at a relatively greater distance.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1492110-g001.tif"/>
</fig>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Data preprocessing</title>
<p>By increasing the amount and variety of data, data augmentation can simulate different planting environments, including different lighting and occlusion conditions, as well as different shooting angles and distances, improving the robustness of the algorithm and the model&#x2019;s ability to generalize (<xref ref-type="bibr" rid="B32">Tang et&#xa0;al., 2023a</xref>). This study used a combination of offline and online data augmentation to selectively expand the original dataset; offline data augmentation is a preprocessing method applied to the original dataset before training, involving random combinations of brightness adjustment, rotation, translation, and noise to expand the dataset. Considering that excessive offline augmentation might introduce too much noise or inconsistency, potentially degrading model performance, only 500 new training samples are expanded using offline methods. Online data augmentation is the real-time enhancement of the original data during model training, which enhances model diversity while reducing storage resource requirements. During training, techniques such as noise, HSV adjustments, random rotations, scaling, and perspective transformations are used to enhance the training samples, with each method being applied with a 1% probability. It also turns off data augmentation in the last 10 epochs, allowing it to focus on learning from the original data, optimizing and complicating the details of the features (<xref ref-type="bibr" rid="B11">Ge et&#xa0;al., 2021</xref>). The effects of the enhancement are shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>The original data is expanded using several techniques for data augmentation: <bold>(a)</bold> original data, <bold>(b)</bold> brightness, <bold>(c)</bold> translation, <bold>(d)</bold> noise, <bold>(e)</bold> horizontal flipping, <bold>(f)</bold> HSV, <bold>(g)</bold> scaling, <bold>(h)</bold> blurring, and their random combinations.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1492110-g002.tif"/>
</fig>
<p>In the data labeling process, the study used the labeling tool to process the cherry tomato dataset &#x201c;<ext-link ext-link-type="uri" xlink:href="https://github.com/HumanSignal/labelimg">https://github.com/HumanSignal/labelimg</ext-link>&#x201d;. During the data labeling process, the smallest enclosing rectangle of the cherry tomato was used as the true detection frame to minimize the interference of background information with the true detection frame. The samples were categorized into two groups: &#x201c;Immature,&#x201d; representing unripe cherry tomatoes, and &#x201c;Mature,&#x201d; indicating ripe cherry tomatoes. <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref> provides detailed information about the dataset. During the model training process, the training set used 80% of the dataset, the validation set used 10%, and the test set used 10%, guaranteeing that the test set is created using only the original photos and does not include any enhanced images.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Differences between the offline enhanced dataset and the original dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Category</th>
<th valign="middle" align="center">Parameter</th>
<th valign="middle" align="center"/>
<th valign="middle" align="center">Training</th>
<th valign="middle" align="center">Validation</th>
<th valign="middle" align="center">Test</th>
<th valign="middle" align="center">Total</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="4" align="center">Original dataset</td>
<td valign="middle" align="center">Number of images</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">2000</td>
<td valign="middle" align="center">250</td>
<td valign="middle" align="center">250</td>
<td valign="middle" align="center">2500</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="center">Instances</td>
<td valign="middle" align="center">Immature</td>
<td valign="middle" align="center">7261</td>
<td valign="middle" align="center">1052</td>
<td valign="middle" align="center">974</td>
<td valign="middle" align="center"/>
</tr>
<tr>
<td valign="middle" align="center">Mature</td>
<td valign="middle" align="center">6790</td>
<td valign="middle" align="center">956</td>
<td valign="middle" align="center">851</td>
<td valign="middle" align="center"/>
</tr>
<tr>
<td valign="middle" align="center">All</td>
<td valign="middle" align="center">14051</td>
<td valign="middle" align="center">2008</td>
<td valign="middle" align="center">1825</td>
<td valign="middle" align="center">17884</td>
</tr>
<tr>
<td valign="middle" rowspan="4" align="center">Augmented dataset</td>
<td valign="middle" align="center">Number of images</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">2400</td>
<td valign="middle" align="center">300</td>
<td valign="middle" align="center">300</td>
<td valign="middle" align="center">3000</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="center">Instances</td>
<td valign="middle" align="center">Immature</td>
<td valign="middle" align="center">8834</td>
<td valign="middle" align="center">1321</td>
<td valign="middle" align="center">1241</td>
<td valign="middle" align="center"/>
</tr>
<tr>
<td valign="middle" align="center">Mature</td>
<td valign="middle" align="center">8286</td>
<td valign="middle" align="center">1196</td>
<td valign="middle" align="center">1142</td>
<td valign="middle" align="center"/>
</tr>
<tr>
<td valign="middle" align="center">All</td>
<td valign="middle" align="center">17120</td>
<td valign="middle" align="center">2517</td>
<td valign="middle" align="center">2383</td>
<td valign="middle" align="center">22020</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>CTDA model</title>
<p>YOLO (You Only Look Once) was released in 2015 as a fast, accurate, and widely applied real-time object detection algorithm. The basic idea is to treat the target detection task as a regression problem and make predictions in an end-to-end manner. YOLOv8, the latest real-time object detector released in early 2023 as part of the YOLO series, establishes new technical standards for instance segmentation and object detection &#x201c;<ext-link ext-link-type="uri" xlink:href="https://github.com/ultralytics/ultralytics">https://github.com/ultralytics/ultralytics</ext-link>&#x201d;. <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref> illustrates the architectural model of YOLOv8, which includes a backbone network, neck, and detection head. Compared to previous generations, the backbone employs CBSDarknet53 for four times subsampling to extract features, utilizing SPPF for multi-scale feature extraction, which improves the model&#x2019;s ability to identify targets of varying sizes by capturing object and scene information at multiple scales. The neck uses FPN-PANet to merge and aggregate feature maps from different levels for a more global and semantically rich feature representation. The head section uses a decoupled head that separates classification and regression tasks, which effectively reduces the number of model parameters and computational complexity, improving model generalization and robustness.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>YOLOv8 model architecture diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1492110-g003.tif"/>
</fig>
<p>CBSDarknet53 experiences feature information loss, particularly with clustering, occlusion, and distant small targets, which reduces the model&#x2019;s detection accuracy. SPPF, which builds residual networks through maxpool at different levels, increases the risk of feature information loss during downsampling, thereby raising the likelihood of missing detections of partially occluded cherry tomatoes. Decoupled head, focusing on separating the classification and localization tasks, tends to overlook the relationships between targets and local context information, potentially causing false or missed detections in scenarios with dense targets or occlusions. This article improves the structure of the YOLOv8 network model and introduces a new network model to address these issues. Firstly, a new downsampling method, LAWDS, incorporates adaptive weights and receptive field spatial features to preserve important features better and enhance feature representation; it maintains spatial information continuity and avoids disrupting spatial relationships between adjacent pixels. Secondly, softpool replaces maxpool in SPPF; while maxpool activates features by selecting the maximum value, which is simple and efficient, it may lose important information. Softpool uses a weighted sum of softmax activations within the kernel area to optimize activation downsampling, preserving more background information and feature details, thus enhancing feature representation. Finally, the study introduces a dynamic head with attention mechanisms, using a unified attention mechanism for scale perception, spatial awareness, and task awareness within a single structure to enhance object detection performance, effectively improving the representational capability of the detection head. The model structure is shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>CTDA model structure diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1492110-g004.tif"/>
</fig>
<sec id="s2_3_1">
<label>2.3.1</label>
<title>LAWDarknet53</title>
<p>In natural harvesting conditions, cherry tomatoes to be harvested by robots may be at a distance, presenting fewer features in images, which makes it particularly important to improve the model&#x2019;s ability to detect small, distant targets. Additionally, real-world production involves issues such as bright or dim lighting, necessitating improvements in the model&#x2019;s adaptability to different lighting conditions to ensure detection accuracy. In the entire detection network model, the backbone network is a crucial component for feature extraction; its performance directly affects the model&#x2019;s ability to detect and locate targets. Therefore, improving the feature extraction capability of the backbone network is necessary to adapt to the diversity of cherry tomatoes at different distances, sizes, and lighting conditions.</p>
<p>In this application context, CBSDarknet53 exhibits certain limitations; it uses the CBS module for feature extraction and fixed convolution for downsampling, where convolution operations depend on common parameters and are insensitive to changes in position that cause variations in information. Different lighting conditions alter the features of cherry tomato images, and the CBS module fails to effectively adjust its feature extraction strategy, leading to the loss of crucial features. Furthermore, fixed convolution sampling can miss some small target features, and the convolution operations might disrupt spatial relationships between adjacent pixels, resulting in discontinuities in spatial information. To address these issues, the research introduces a new downsampling method called light adaptive weight downsampling (LAWDS). LAWDS incorporates adaptive weights and receptive field spatial features, better preserving essential features and enhancing the representation of features for distant small targets. The formula for its calculation is as shown in <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>:</p>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mtext>output</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>4</mml:mn>
</mml:munderover>
<mml:mo stretchy="false">(</mml:mo>
</mml:mstyle>
<mml:mtext>Conv</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mtext>Softmax</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mtext>AvgPool</mml:mtext>
<mml:mn>2</mml:mn>
<mml:mtext>d</mml:mtext>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
<mml:mtext>Conv</mml:mtext>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Where <inline-formula>
<mml:math display="inline" id="im1">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mtext>output</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the output of the module, <inline-formula>
<mml:math display="inline" id="im2">
<mml:mrow>
<mml:mtext>Conv</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the <italic>i</italic>th feature map obtained through the downsampling convolution operation, and <inline-formula>
<mml:math display="inline" id="im3">
<mml:mrow>
<mml:mtext>Softmax</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mtext>AvgPool</mml:mtext>
<mml:mn>2</mml:mn>
<mml:mtext>d</mml:mtext>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
<mml:mtext>Conv</mml:mtext>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the <italic>i</italic>th attention map channel processed by the softmax function.</p>
<p>The LAWDS module initially employs average pooling operations to extract local features and gather global information. It then utilizes 1 &#xd7; 1 convolution for inter-channel information exchange and feature transformation to further enhance the feature map&#x2019;s expressive capacity. To improve the model&#x2019;s focus on crucial features, a softmax function normalizes the attention map. Additionally, small targets, which typically have smaller sizes and lower pixel densities, can lose detailed information in traditional convolution operations. Compared to the Focus module in YOLOv5 (<xref ref-type="bibr" rid="B42">Zhao et&#xa0;al., 2022</xref>), grouped convolution in LAWDS offers similar effects but is more computationally efficient. This method splits the input feature map into several groups and performs independent convolution operations on each group, enabling quick and efficient extraction of receptive field spatial features, enhancing the perception of small targets, and reducing computational complexity. Finally, the LAWDS module implements weighted fusion and spatial weighting of features, thereby further improving the model&#x2019;s focus on key features and increasing its performance and robustness. The structure of this module is shown in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>LAWDS structure diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1492110-g005.tif"/>
</fig>
</sec>
<sec id="s2_3_2">
<label>2.3.2</label>
<title>SPPFS</title>
<p>SPPF uses multiple pooling kernels to separate the most prominent feature information, and achieve optimal detection results by merging local and global features at the feature map level. However, in natural harvesting conditions, with fruits of different maturities varying in size and overlapping each other, the maxpool operation in SPPF leads to the loss of target feature information under dense occlusion, increasing the likelihood of missed detections of partially occluded cherry tomatoes. Therefore, the study proposes replacing the original SPPF network with the SPPFS network, as shown in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>. This network uses softpool instead of maxpool, preserving more background information and feature details, enabling more efficient feature utilization and richer multi-scale feature fusion, reducing the limitations of maxpool in complex scenes, and better recognizing and differentiating tightly packed or partially occluded fruits, thereby enhancing overall detection performance (<xref ref-type="bibr" rid="B29">Stergiou et&#xa0;al., 2021</xref>). Furthermore, softpool, by more finely processing the activation maps, better captures and preserves subtle feature changes caused by these environmental variations. It strengthens the model&#x2019;s resilience to environmental changes and increases the model&#x2019;s accuracy in identifying cherry tomatoes, ensuring its stability and reliability in practical application scenarios.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Using softpool to replace maxpool to build SPPFS: <bold>(a)</bold> SPPF structure diagram, <bold>(b)</bold> SPPFS structure diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1492110-g006.tif"/>
</fig>
<p>In maxpool, discarding most activations carries the risk of losing important information. Conversely, in avgpool, the equal contribution of activations can significantly reduce the overall intensity of area features. Compared to maxpool and avgpool, softpool adopts an activation method within the kernel that uses softmax exponential weighting. This approach is designed to maintain the functionality of the pooling layer while minimizing information loss during the pooling process, as illustrated in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Schematic diagram of three pooling activation methods: <bold>(a)</bold> maxpool, <bold>(b)</bold> avgpool, <bold>(c)</bold> softpool.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1492110-g007.tif"/>
</fig>
<p>Softpool uses a smooth maximum approximation of the activations within the region R. Each activation with index <italic>i</italic> is assigned a weight calculated as the ratio of the natural exponent of that activation to the sum of the natural exponents of all activations in the neighborhood R. The weight calculation formula is shown in <xref ref-type="disp-formula" rid="eq2">Equation 2</xref>:</p>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>A nonlinear transformation uses the weights and the corresponding activation values. Larger activations are more dominant than smaller ones. Selecting the maximum value, which is the result of a standard summation of all weighted activations in the kernel neighborhood R, is not a balanced approach because most pooling operations are performed in high-dimensional feature spaces. Rather, it is better to highlight activations with greater effect. Its calculation formula is shown in <xref ref-type="disp-formula" rid="eq3">Equation 3</xref>:</p>
<disp-formula id="eq3">
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>a</mml:mi>
<mml:mo>&#x2dc;</mml:mo>
</mml:mover>
<mml:mo>=</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>*</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
<sec id="s2_3_3">
<label>2.3.3</label>
<title>Dynamic head</title>
<p>YOLOv8 employs a decoupled head as its detection head, enabling the model to independently handle object classification and localization tasks. This design aims to increase processing speed and reduce interference between tasks, while simultaneously maintaining high efficiency and enhancing detection accuracy (<xref ref-type="bibr" rid="B36">Xiao et&#xa0;al., 2024</xref>). In the context of cherry tomato detection, where the fruits vary in size and density and may be clustered or partially obscured due to their growth characteristics, the decoupled head tends to overlook the relationships between targets and local context information in tasks involving multiple targets. This oversight can impact the detection effectiveness of cherry tomatoes in dense scenes.</p>
<p>Therefore, this study improves the head part of the original network with dynamic head, which introduces a scale-aware attention mechanism that more effectively captures target features, enabling precise detection of cherry tomatoes of varying scales, shapes, and densities (<xref ref-type="bibr" rid="B8">Dai et&#xa0;al., 2021</xref>). Especially in complex backgrounds with dense and highly overlapping targets, this mechanism aids in the model&#x2019;s focus on key information, reducing the interference from background noise. The use of spatial awareness attention enhances the model&#x2019;s comprehension of the spatial position of target objects, reducing errors in bounding box localization. Unlike the decoupled head, the dynamic head introduces a task-aware attention mechanism that dynamically adjusts its internal structure, including feature extraction and decision layers, based on the features of the input image. This dynamism allows the model to more flexibly handle various detection scenarios, enhancing the model&#x2019;s generalization capability and accuracy.</p>
<p>The scale-aware attention module, which fuses features of different scales based on their semantic importance, is calculated as shown in <xref ref-type="disp-formula" rid="eq4">Equations 4</xref>, <xref ref-type="disp-formula" rid="eq5">5</xref>:</p>
<disp-formula id="eq4">
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c0;</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo>(</mml:mo>
<mml:mi>f</mml:mi>
<mml:mo>(</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mi mathvariant="script">F</mml:mi>
</mml:mstyle>
<mml:mo>)</mml:mo>
<mml:mo>)</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq5">
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
<mml:mo>(</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mo>)</mml:mo>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im4">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is a hard sigmoid function, <inline-formula>
<mml:math display="inline" id="im5">
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is a linear function approximated by 1 &#xd7; 1 convolutional layers, and <italic>H</italic>, <italic>W</italic>, and <italic>C</italic> signify the height, width, and number of channels in the intermediate hierarchy, respectively. The feature tensor is represented by <inline-formula>
<mml:math display="inline" id="im6">
<mml:mi mathvariant="script">F</mml:mi>
</mml:math>
</inline-formula>.</p>
<p>The spatially-aware attention module focuses on the discriminative power of different spatial locations; given the high dimensionality of <italic>S</italic>, the module needs to be decoupled in two steps: first learning sparsification using deformable convolution (<xref ref-type="bibr" rid="B9">Dai et&#xa0;al., 2017</xref>), and then aggregating features across levels at the same spatial location. Its calculation formula is shown in <xref ref-type="disp-formula" rid="eq6">Equation 6</xref>:</p>
<disp-formula id="eq6">
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c0;</mml:mi>
<mml:mi>S</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mi>L</mml:mi>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>L</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>K</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mstyle>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mtext>&#x394;</mml:mtext>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mi>c</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
<mml:mtext>&#x394;</mml:mtext>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Where <italic>L</italic> is the number of layers of the scaled feature pyramid, <italic>k</italic> is the number of sparsely sampled locations, <inline-formula>
<mml:math display="inline" id="im7">
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mtext>&#x394;</mml:mtext>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the location shifted by the self-learned spatial offset <inline-formula>
<mml:math display="inline" id="im8">
<mml:mrow>
<mml:mtext>&#x394;</mml:mtext>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to focus on a discriminative region. <inline-formula>
<mml:math display="inline" id="im9">
<mml:mrow>
<mml:mtext>&#x394;</mml:mtext>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the self-learned importance scalar at location <inline-formula>
<mml:math display="inline" id="im10">
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, both learned from the mid-level input feature <inline-formula>
<mml:math display="inline" id="im11">
<mml:mi mathvariant="script">F</mml:mi>
</mml:math>
</inline-formula>.</p>
<p>The task-aware attention module facilitates collaborative learning and helps generalize different object representations, and it selects different tasks by dynamically turning feature channels on and off. Its calculation formula is shown in <xref ref-type="disp-formula" rid="eq7">Equation 7</xref>:</p>
<disp-formula id="eq7">
<label>(7)</label>
<mml:math display="block" id="M7">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c0;</mml:mi>
<mml:mi>C</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msup>
<mml:mi>&#x3b1;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msup>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msup>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>&#x3b1;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im12">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:msup>
<mml:mi>&#x3b1;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>&#x3b1;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is a hyperfunction that learns to adjust the activation thresholds, and <inline-formula>
<mml:math display="inline" id="im13">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the feature slice of the <italic>c</italic>th channel. <inline-formula>
<mml:math display="inline" id="im14">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is used in a manner similar to dynamic relu (<xref ref-type="bibr" rid="B7">Chen et&#xa0;al., 2020</xref>). To reduce dimensionality, it first performs global mean pooling on the <inline-formula>
<mml:math display="inline" id="im15">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> dimension. Then it uses two fully connected layers and a normalization layer, and finally it applies a shifted sigmoid function to normalize the output.</p>
<p>Dynamic head achieves the unification and synergistic effect of three types of attention mechanisms by sequentially applying scale-aware, spatial-aware, and task-aware attention modules. Additionally, YOLOv8 employs an anchor-free method, complicating the construction of specific task branches by attaching center or keypoint predictions to the classification or regression branches. In contrast, dynamic head simplifies the model structure and enables dynamic adjustments by merely attaching various types of predictions to the end of the head, as depicted in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Schematic diagram of the decoupled head and dynamic head structures.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1492110-g008.tif"/>
</fig>
</sec>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Experimental environment and model evaluation indicators</title>
<p>All experiments in this study were conducted on a server equipped with an NVIDIA GeForce RTX 4090 GPU and an Intel(R) Xeon(R) Platinum 8375C CPU. The operating system was Ubuntu 20.04, and the experiments utilized CUDA version 11.8, python 3.8, and pytorch 2.0.0. The key hyperparameters used during the training process are outlined in <xref ref-type="table" rid="T2">
<bold>Table 2</bold>
</xref>. To verify the feasibility and effectiveness of the proposed improvements, experiments were carried out on the model. It is crucial to mention that these experiments were conducted using a baseline model, concentrating exclusively on validating the enhanced structure without incorporating pretrained weights.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>CTDA model training key hyperparameters.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Parameters</th>
<th valign="top" align="center">Values</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">Image Size</td>
<td valign="top" align="center">640&#xd7;640</td>
</tr>
<tr>
<td valign="top" align="center">Epoch</td>
<td valign="top" align="center">200</td>
</tr>
<tr>
<td valign="top" align="center">Batch</td>
<td valign="top" align="center">16</td>
</tr>
<tr>
<td valign="top" align="center">Optimizer</td>
<td valign="top" align="center">SGD</td>
</tr>
<tr>
<td valign="top" align="center">Initial learning rate</td>
<td valign="top" align="center">0.01</td>
</tr>
<tr>
<td valign="top" align="center">Momentum</td>
<td valign="top" align="center">0.937</td>
</tr>
<tr>
<td valign="top" align="center">Weight decay</td>
<td valign="top" align="center">0.0005</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The study evaluates the model performance of CTDA using precision (P), recall (R), mean average precision (mAP), F1 score, and GFLOPs. In object detection tasks, predictions are classified as positive samples when their intersection with the ground truth labels exceeds a certain threshold. Otherwise, they are identified as negative samples. The formulas for calculating precision and recall are shown in <xref ref-type="disp-formula" rid="eq8">Equations 8</xref>, <xref ref-type="disp-formula" rid="eq9">9</xref>:</p>
<disp-formula id="eq8">
<label>(8)</label>
<mml:math display="block" id="M8">
<mml:mrow>
<mml:mtext>P</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>100</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq9">
<label>(9)</label>
<mml:math display="block" id="M9">
<mml:mrow>
<mml:mtext>R</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>100</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>The mean Average Precision (mAP) represents the mean of the Average Precision (AP) values across multiple categories, and the AP formula is shown in <xref ref-type="disp-formula" rid="eq10">Equation 10</xref>:</p>
<disp-formula id="eq10">
<label>(10)</label>
<mml:math display="block" id="M10">
<mml:mrow>
<mml:mtext>AP</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mstyle displaystyle="true">
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x222b;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>r</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>mAP@0.5:0.95 refers to the average of the average precision calculated using different IoU thresholds between 0.5 and 0.95 in object detection.</p>
<p>The <inline-formula>
<mml:math display="inline" id="im16">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> score is the harmonic mean of precision and recall, which is used to comprehensively evaluate the model performance; it ranges from 0 to 1, with higher values indicating better performance. Its calculation formula is shown in <xref ref-type="disp-formula" rid="eq11">Equation 11</xref>:</p>
<disp-formula id="eq11">
<label>(11)</label>
<mml:math display="block" id="M11">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>&#x22c5;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>GFLOPs refers to one billion floating-point operations and is used to evaluate the computational complexity of the model.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Experimental results</title>
<sec id="s3_1">
<label>3.1</label>
<title>The impact of augmented data on CTDA</title>
<p>Experiments were performed on three datasets to examine the effects of data augmentation techniques on CTDA performance: the original dataset, an offline-augmented dataset, and a dataset utilizing both offline and online augmentation. With the exception of the training dataset, the experimental conditions remained unchanged. The results presented in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref> show that the offline augmented dataset increased precision by 0.7%, recall by 1.3%, and mean average precision by 1.1% over the original dataset. The combined offline and online augmentation strategy further improved precision to 92.2%, recall to 86.7%, and mAP to 92.4%, indicating enhancements across all metrics. Thus, the data augmentation strategy combining offline and online approaches effectively improved the detection performance of CTDA.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Data augmentation ablation experiment results.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">original</th>
<th valign="middle" align="center">Offline Enhancement</th>
<th valign="middle" align="center">Online Enhancement</th>
<th valign="top" align="center">P(%)</th>
<th valign="top" align="center">R(%)</th>
<th valign="top" align="center">mAP(%)</th>
<th valign="top" align="center">mAP@0.5:0.95(%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="top" align="center">91.9</td>
<td valign="top" align="center">85.2</td>
<td valign="top" align="center">90.9</td>
<td valign="top" align="center">69.5</td>
</tr>
<tr>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="top" align="center">92.6</td>
<td valign="top" align="center">86.5</td>
<td valign="top" align="center">92.0</td>
<td valign="top" align="center">69.9</td>
</tr>
<tr>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="top" align="center">92.2</td>
<td valign="top" align="center">86.7</td>
<td valign="top" align="center">92.4</td>
<td valign="top" align="center">70.5</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Comparison between different enhancement mechanisms of CTDA</title>
<p>To improve the model&#x2019;s ability to detect cherry tomatoes in occluded environments, softpool was used to replace maxpool in the SPPF, enhancing the detection of occluded sections. The effectiveness of softpool was assessed by applying maxpool, avgpool, and softpool treatments to the upper input feature maps of SPPF. As observed in <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>, maxpool led to the loss of critical features, which is particularly problematic for cherry tomatoes that inherently have fewer features, resulting in missed detections. While avgpool maintained important features, it reduced the intensity of the overall feature area, weakening the model&#x2019;s feature recognition ability. In comparison, softpool has significantly optimized this processing procedure, not only effectively preserving key features but also ensuring the overall intensity of the feature areas, which significantly increases the expressiveness of the features and improves the model&#x2019;s performance in the cherry tomato detection task.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Visualization results of feature maps processed using different pooling mechanisms: <bold>(a)</bold> original image, <bold>(b)</bold> input feature map, <bold>(c)</bold> maxpool result, <bold>(d)</bold> avgpool result, <bold>(e)</bold> softpool result.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1492110-g009.tif"/>
</fig>
<p>To improve the model&#x2019;s detection ability for densely packed and highly overlapping cherry tomatoes in complex backgrounds, a dynamic head equipped with an attention mechanism was used to improve the head part of the original network. Both decoupled head and dynamic head were tested in dense cherry tomato scenes. <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10</bold>
</xref> shows the heatmap generated using Grad-CAM, highlighting that CTDA primarily focuses on leaves and cherry tomatoes, with the background impacting detection. The study emphasizes distinguishing between foreground and background to focus solely on cherry tomatoes in the foreground. The heatmap indicates various detection heads&#x2019; differing attentiveness to dense cherry tomatoes. Dynamic head precisely targets these areas, reducing background interference. Experimental results show that CTDA based on dynamic head more effectively distinguishes and focuses on cherry tomatoes in dense areas, significantly improving foreground-background differentiation.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Visualization of different detection heads: <bold>(a)</bold> original image, <bold>(b)</bold> decoupled head detection effect, <bold>(c)</bold> dynamic head detection effect.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1492110-g010.tif"/>
</fig>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Ablation experiment</title>
<p>To further test the validity of the proposed strategies for improvement, this study incrementally tested the performance enhancements of the model. The testing process and results of the ablation experiment are shown in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>. Ablation testing showed that the B model, featuring the LAWDarknet53 structure, slightly exceeded baseline precision and recall, reducing the detection model size from 6.0 M to 5.4 M. Building on the B model, the C model incorporated the SPPFS network, significantly improving precision, recall, mAP@0.5, and mAP@0.5:0.95, with a slight increase in parameters. Integrating the dynamic head module into the CTDA model, which builds on the C model, resulted in a slight reduction in detection speed but notably enhanced precision, mAP, and mAP@0.5:0.95. Compared to the baseline model (A), the proposed CTDA model increased precision, recall, mAP@0.5, and mAP@0.5:0.95 by 2.1%, 5.2%, 2.9%, and 6.0%, respectively, achieving values of 94.3%, 91.5%, 95.3%, and 76.5%. Furthermore, with a model size of 6.7M, the FPS of 154.1 decreased by only 1.1%, confirming that the model sustains high efficiency and real-time performance while notably improving accuracy. The results underscored the effectiveness of the improvement strategies, particularly in improving detection accuracy, where the LAWDarknet53 structure, SPPFS component, and dynamic head module had significant impacts on computational parameters, recall, and precision, respectively.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Model improvement ablation experiment results.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Model</th>
<th valign="top" align="center">Baseline</th>
<th valign="top" align="center">LAWDarknet53</th>
<th valign="top" align="center">SPPFS</th>
<th valign="top" align="center">Dyhead</th>
<th valign="top" align="center">P</th>
<th valign="top" align="center">R</th>
<th valign="top" align="center">mAP@0.5</th>
<th valign="top" align="center">mAP@0.5:0.95</th>
<th valign="top" align="center">Size/M</th>
<th valign="top" align="center">FPS</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">A</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="middle" align="center">&#xd7;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">92.2</td>
<td valign="top" align="center">86.7</td>
<td valign="top" align="center">92.4</td>
<td valign="top" align="center">70.5</td>
<td valign="top" align="center">6.0</td>
<td valign="top" align="center">155.8</td>
</tr>
<tr>
<td valign="top" align="center">B</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">92.8</td>
<td valign="top" align="center">88.3</td>
<td valign="top" align="center">93.5</td>
<td valign="top" align="center">74.9</td>
<td valign="top" align="center">5.4</td>
<td valign="top" align="center">154.9</td>
</tr>
<tr>
<td valign="top" align="center">C</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#xd7;</td>
<td valign="top" align="center">93.7</td>
<td valign="top" align="center">90.2</td>
<td valign="top" align="center">94.6</td>
<td valign="top" align="center">75.9</td>
<td valign="top" align="center">6.2</td>
<td valign="top" align="center">156.3</td>
</tr>
<tr>
<td valign="top" align="center">CTDA</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">94.3</td>
<td valign="top" align="center">91.5</td>
<td valign="top" align="center">95.3</td>
<td valign="top" align="center">76.5</td>
<td valign="top" align="center">6.7</td>
<td valign="top" align="center">154.1</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>CTDA network model training and testing</title>
<p>The study conducted training of the CTDA model for 200 epochs on the enhanced dataset, with results depicted in <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11</bold>
</xref>. YOLOv8&#x2019;s loss computation includes classification loss (<inline-formula>
<mml:math display="inline" id="im17">
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>F</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>) and regression loss (<inline-formula>
<mml:math display="inline" id="im18">
<mml:mrow>
<mml:mtext>CIoU</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> loss plus Distribution Focal Loss (<inline-formula>
<mml:math display="inline" id="im19">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>F</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>)), all weighted according to specific ratios. The formulas for these calculations are shown in <xref ref-type="disp-formula" rid="eq12">Equations 12</xref>&#x2013;<xref ref-type="disp-formula" rid="eq15">15</xref>:</p>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>Training results for the CTDA model: <bold>(a)</bold> variations in loss during training, <bold>(b)</bold> comparison of average precision curves between CTDA and YOLOv8 during training at IoU thresholds of 0.5 and 0.5:0.95, <bold>(c)</bold> P-R curve, <bold>(d)</bold> confusion matrix.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1492110-g011.tif"/>
</fig>
<disp-formula id="eq12">
<label>(12)</label>
<mml:math display="block" id="M12">
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>F</mml:mi>
<mml:mi>L</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>q</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mo>{</mml:mo>
<mml:mtable equalrows="true" equalcolumns="true">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>q</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>q</mml:mi>
<mml:mtext>log</mml:mtext>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>+</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>q</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mtext>log</mml:mtext>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mtd>
<mml:mtd>
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mo>&gt;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:msup>
<mml:mi>p</mml:mi>
<mml:mi>&#x3b3;</mml:mi>
</mml:msup>
<mml:mtext>log</mml:mtext>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mtd>
<mml:mtd>
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq13">
<label>(13)</label>
<mml:math display="block" id="M13">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="script">L</mml:mi>
<mml:mrow>
<mml:mtext>CIoU</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mtext>IoU</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mi>&#x3c1;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo stretchy="false">(</mml:mo>
<mml:mtext>b</mml:mtext>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mtext>b</mml:mtext>
<mml:mrow>
<mml:mtext>gt</mml:mtext>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mtext>c</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:mo>+</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq14">
<label>(14)</label>
<mml:math display="block" id="M14">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>F</mml:mi>
<mml:mi>L</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mtext>log</mml:mtext>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>+</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mtext>log</mml:mtext>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq15">
<label>(15)</label>
<mml:math display="block" id="M15">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>q</italic> stands for the label; <inline-formula>
<mml:math display="inline" id="im20">
<mml:mrow>
<mml:mtext>IoU</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> for the intersection over union; b and <inline-formula>
<mml:math display="inline" id="im21">
<mml:mrow>
<mml:msup>
<mml:mtext>b</mml:mtext>
<mml:mrow>
<mml:mtext>gt</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> for the center points of the two rectangular boxes; <italic>&#x3c1;</italic> for the Euclidean distance between them; c for the diagonal distance of the enclosed region of the boxes; <italic>v</italic> for the consistency of their relative proportions; <inline-formula>
<mml:math display="inline" id="im22">
<mml:mi>&#x3b1;</mml:mi>
</mml:math>
</inline-formula> for the weighting coefficient; <italic>y</italic> for the total distribution value; and <italic>i</italic> for the number of entries.</p>
<p>
<xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11</bold>
</xref> displays the loss function curve, mAP curve, P-R curve, and confusion matrix of the CTDA model. According to <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11a</bold>
</xref>, as training epochs rise, all three loss values progressively drop until stabilizing. <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11b</bold>
</xref> shows significant improvements in mAP values for the CTDA model compared to the original YOLOv8 at IoU thresholds of 0.5 and 0.5:0.95. <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11c</bold>
</xref> shows the precision-recall curves for Mature, Immature, and all categories during the training process. The Figure indicates that the area under the precision-recall curve for Mature is larger than that for Immature, indicating that the model performs better in identifying mature cherry tomatoes. This is because immature cherry tomatoes have a color similar to the plant, which interferes with their detection, whereas mature fruits have a clear contrast with the background environment, making them easier to accurately identify.</p>
<p>
<xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11d</bold>
</xref> presents the confusion matrix for the CTDA model, with the vertical axis indicating the predicted labels and the horizontal axis displaying the actual labels. The color of each item represents the likelihood of that entry. Every category&#x2019;s likelihood of being correctly classified is represented by the values along the major diagonal. It can be observed that both mature and immature cherry tomatoes have a classification probability of 92%. Values deviating from the main diagonal indicate model misclassifications. For this experimental result, the frequency of misclassifications is relatively low. Misclassifications mainly occur when the background is recognized as mature (44%) or immature (56%) cherry tomatoes.</p>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>Performance Test of CTDA</title>
<p>The CTDA model&#x2019;s performance under various lighting conditions, as depicted in <xref ref-type="fig" rid="f12">
<bold>Figure&#xa0;12</bold>
</xref>, includes natural lighting, intense lighting, dim lighting, and shaded areas. The detection results clearly show that the CTDA model can precisely identify mature and immature cherry tomatoes under these complex lighting conditions, highlighted with red and orange boxes respectively. This demonstrates the model&#x2019;s high adaptability and robustness to complex lighting variations, essential for cherry tomato picking robots to efficiently and stably perform in unstructured natural environments. Additionally, detection experiments were conducted on cherry tomatoes at near and far distances under four lighting conditions, where the model typically captures larger, more detailed images at closer ranges. However, at greater distances, recognition becomes more challenging due to smaller target images, increased noise, and less distinct features. Thus, the proposed model effectively handles different scales of recognition, maintaining the ability to effectively recognize cherry tomatoes and accurately assess their maturity at long distances, with detection accuracy nearly equal to that of close-range detection.</p>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Effects of model detection in different illumination conditions: <bold>(a)</bold> natural lighting, <bold>(b)</bold> intense lighting, <bold>(c)</bold> dim lighting, <bold>(d)</bold> shaded areas.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1492110-g012.tif"/>
</fig>
<p>Due to cherry tomatoes growing in clusters and the limited spatial range of the robot&#x2019;s visual sensor, there is a high occurrence of overlapping and small, distant targets during harvesting, which significantly impacts the model&#x2019;s detection capabilities. The model is designed for efficient target detection in constrained operational spaces, ensuring it can differentiate between cherry tomatoes in the foreground and background, even when the targets are small or overlapping. <xref ref-type="fig" rid="f13">
<bold>Figure&#xa0;13</bold>
</xref> illustrates the CTDA model&#x2019;s detection results in scenarios involving multiple overlaps and distant small targets, demonstrating the model&#x2019;s ability to precisely detect and individually identify and locate each cherry tomato in overlapping situations. Furthermore, under other background disturbances like reflective mulching, the CTDA model continues to show outstanding detection performance, effectively distinguishing cherry tomatoes from complex backgrounds.</p>
<fig id="f13" position="float">
<label>Figure&#xa0;13</label>
<caption>
<p>CTDA detection effect in different environments: <bold>(a)</bold> overlapping clusters, <bold>(b)</bold> small targets positioned at a relatively greater distance.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1492110-g013.tif"/>
</fig>
<p>To evaluate the CTDA model&#x2019;s detection capabilities in complex scenarios, this study developed a multi-scenario dataset, capturing images in greenhouses under various visual conditions including strong light, weak light, occlusion, and dense settings. The performance tests of the model included detecting both mature and immature cherry tomatoes, as well as determining the overall detection ability for all cherry tomatoes.</p>
<p>The study uses precision, recall, mAP and F1 score to evaluate the performance in different scenarios. As shown in <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref>, the model exhibited its best performance in detecting all cherry tomatoes under low light conditions, achieving the highest precision of 94.1% and an F1 score of 92.7. In the occlusion scenario, it achieved the highest recall rate of 92.9% and an mAP of 95.9%. In low-light and occluded conditions, the model performed better than in strong-light and dense conditions. Further analysis reveals that the model excels in detecting mature cherry tomatoes compared to immature ones. The CTDA model shows good detection ability in scenarios with strong light, low light, occlusion, and dense surroundings; nevertheless, strong light and dense conditions significantly affect its performance.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Detection results in complex scenarios.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Dataset</th>
<th valign="top" align="center">Classes</th>
<th valign="top" align="center">Instances</th>
<th valign="top" align="center">P(%)</th>
<th valign="top" align="center">R(%)</th>
<th valign="top" align="center">mAP(%)</th>
<th valign="top" align="center">F1(%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="3" align="center">Strong light</td>
<td valign="middle" align="center">Immature</td>
<td valign="middle" align="center">244</td>
<td valign="top" align="center">92.9</td>
<td valign="top" align="center">93.2</td>
<td valign="top" align="center">96.4</td>
<td valign="top" align="center">93.0</td>
</tr>
<tr>
<td valign="middle" align="center">Mature</td>
<td valign="middle" align="center">210</td>
<td valign="top" align="center">88.6</td>
<td valign="top" align="center">86.6</td>
<td valign="top" align="center">92.8</td>
<td valign="top" align="center">87.6</td>
</tr>
<tr>
<td valign="middle" align="center">Total</td>
<td valign="middle" align="center">454</td>
<td valign="top" align="center">90.8</td>
<td valign="top" align="center">89.9</td>
<td valign="top" align="center">94.6</td>
<td valign="top" align="center">90.3</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="center">Weak light</td>
<td valign="middle" align="center">Immature</td>
<td valign="middle" align="center">271</td>
<td valign="top" align="center">95.9</td>
<td valign="top" align="center">87.6</td>
<td valign="top" align="center">94.8</td>
<td valign="top" align="center">91.6</td>
</tr>
<tr>
<td valign="middle" align="center">Mature</td>
<td valign="middle" align="center">224</td>
<td valign="top" align="center">92.3</td>
<td valign="top" align="center">95.2</td>
<td valign="top" align="center">96.4</td>
<td valign="top" align="center">93.7</td>
</tr>
<tr>
<td valign="middle" align="center">Total</td>
<td valign="middle" align="center">495</td>
<td valign="top" align="center">94.1</td>
<td valign="top" align="center">91.4</td>
<td valign="top" align="center">95.6</td>
<td valign="top" align="center">92.7</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="center">Occlusion</td>
<td valign="middle" align="center">Immature</td>
<td valign="middle" align="center">406</td>
<td valign="top" align="center">87.3</td>
<td valign="top" align="center">92.2</td>
<td valign="top" align="center">95.1</td>
<td valign="top" align="center">89.7</td>
</tr>
<tr>
<td valign="middle" align="center">Mature</td>
<td valign="middle" align="center">336</td>
<td valign="top" align="center">94.3</td>
<td valign="top" align="center">93.6</td>
<td valign="top" align="center">96.7</td>
<td valign="top" align="center">93.9</td>
</tr>
<tr>
<td valign="middle" align="center">Total</td>
<td valign="middle" align="center">742</td>
<td valign="top" align="center">90.8</td>
<td valign="top" align="center">92.9</td>
<td valign="top" align="center">95.9</td>
<td valign="top" align="center">91.8</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="center">Density</td>
<td valign="middle" align="center">Immature</td>
<td valign="middle" align="center">305</td>
<td valign="top" align="center">86.5</td>
<td valign="top" align="center">88.2</td>
<td valign="top" align="center">91.9</td>
<td valign="top" align="center">87.3</td>
</tr>
<tr>
<td valign="middle" align="center">Mature</td>
<td valign="middle" align="center">223</td>
<td valign="top" align="center">93.5</td>
<td valign="top" align="center">90.7</td>
<td valign="top" align="center">95.3</td>
<td valign="top" align="center">92.1</td>
</tr>
<tr>
<td valign="middle" align="center">Total</td>
<td valign="middle" align="center">528</td>
<td valign="top" align="center">90.0</td>
<td valign="top" align="center">89.5</td>
<td valign="top" align="center">93.6</td>
<td valign="top" align="center">89.7</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_6">
<label>3.6</label>
<title>Robustness evaluation of CTDA in various contexts</title>
<p>In the actual cherry tomato picking process, image quality can be severely impacted by environmental noises such as poor lighting (either insufficient or excessive) and blurring due to movement, thereby reducing the performance of target detection algorithms. To thoroughly evaluate the robustness and adaptability of the proposed CTDA model under different greenhouse conditions, four test datasets were created representing normal lighting, dim lighting, excessive lighting, and blurred images. These datasets, constructed by adjusting the brightness and adding blur to images from the normal lighting dataset, maintain consistency in image count, size, and annotations, with identical categorizations of the targets within each image. The model&#x2019;s performance was visually assessed through visualization of the detection results in these scenarios, as illustrated in <xref ref-type="fig" rid="f14">
<bold>Figure&#xa0;14</bold>
</xref>, using green bounding boxes for correct detections, blue for incorrect ones, and red for misses. Despite some errors and omissions, the CTDA model generally excels in recognizing cherry tomato targets under different conditions. Because the research divides detection targets into mature and immature categories, misclassification of mature as immature results in both a missed detection and an incorrect detection, leading to overlapping bounding boxes in the visual results.</p>
<fig id="f14" position="float">
<label>Figure&#xa0;14</label>
<caption>
<p>Visualization of cherry tomato detection outcomes in various settings: <bold>(a)</bold> under natural lighting, <bold>(b)</bold> intense lighting conditions, <bold>(c)</bold> low-light environments, and <bold>(d)</bold> blurred scenes. Green boxes represent accurate detections, blue boxes represent faulty detections, and red boxes indicate missing detections.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1492110-g014.tif"/>
</fig>
<p>As demonstrated in <xref ref-type="fig" rid="f14">
<bold>Figure&#xa0;14</bold>
</xref>, the CTDA model exhibits substantial stability and detection capabilities under variable lighting conditions, retaining high accuracy with minimal errors and misses under strong lighting, and showing even better performance in weak lighting. However, the model&#x2019;s performance declines in blurred scenarios, with an increased rate of missed detections and a significant drop in detection accuracy, indicating a need for further enhancement in handling such conditions. The study observed that camera movement during the robotic picking process could cause image blurring due to external disturbances. Consequently, the research will focus on enhancing the model&#x2019;s resistance to disturbances in blurred scenarios by augmenting the dataset with more images from such conditions, aiming to boost the model&#x2019;s overall robustness.</p>
</sec>
<sec id="s3_7">
<label>3.7</label>
<title>Comparison of CTDA with the latest detection algorithms</title>
<p>In this study, the CTDA model is evaluated against both classical and state-of-the-art object detection models to further validate the effectiveness of the proposed algorithm, as presented in <xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref>. These comparisons included models like Faster R-CNN (<xref ref-type="bibr" rid="B27">Ren et&#xa0;al., 2017</xref>), RetinaNet (<xref ref-type="bibr" rid="B20">Lin et&#xa0;al., 2017</xref>), EfficientDet (<xref ref-type="bibr" rid="B30">Tan et&#xa0;al., 2020</xref>), YOLOv5n, YOLOx-s (<xref ref-type="bibr" rid="B11">Ge et&#xa0;al., 2021</xref>), YOLOv7 (<xref ref-type="bibr" rid="B34">Wang C. Y. et al., 2023</xref>), Fasternet (<xref ref-type="bibr" rid="B4">Chen J. et&#xa0;al., 2023</xref>), Swin transformer (<xref ref-type="bibr" rid="B21">Liu et&#xa0;al., 2021</xref>) and RT-DETR (<xref ref-type="bibr" rid="B41">Zhao et&#xa0;al., 2023</xref>), encompassing high-precision, lightweight models, and representative detection algorithms. CTDA achieved the highest mAP of 95.3%, surpassing newly released models like YOLOv7 and Swin transformer by significantly reducing parameter counts by 91.1% and 88.1% respectively and increasing mAP by 3.4% and 4.9%. When compared to lightweight models like EfficientDet and YOLOv5n, CTDA showed a slight increase in parameters but far superior accuracy, outperforming them by 23.5% and 4.4% in mAP, respectively. Additionally, against models known for their fast detection speed, like Fasternet, CTDA not only improved mAP by 5.1% but also managed to reduce parameter count and increase FPS by 5.9% to 154.1, striking a good balance between speed and accuracy. This demonstrates CTDA&#x2019;s exceptional overall performance, particularly in real-time capabilities, parameter efficiency, and accuracy, making it well-suited for deployment on edge devices with limited computing resources, thereby supporting efficient, precise, and real-time detection tasks.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Comparative experiment of CTDA with other advanced models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">P</th>
<th valign="middle" align="center">R</th>
<th valign="middle" align="center">mAP@0.5</th>
<th valign="middle" align="center">mAP@0.5:0.95</th>
<th valign="middle" align="center">GFLOPs</th>
<th valign="middle" align="center">FPS</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">Faster R-CNN</td>
<td valign="top" align="center">78.8</td>
<td valign="top" align="center">80.7</td>
<td valign="top" align="center">84.1</td>
<td valign="top" align="center">58.5</td>
<td valign="middle" align="center">369.7</td>
<td valign="middle" align="center">22.6</td>
</tr>
<tr>
<td valign="middle" align="center">RetinaNet</td>
<td valign="top" align="center">86.8</td>
<td valign="top" align="center">85.3</td>
<td valign="top" align="center">88.9</td>
<td valign="top" align="center">59.4</td>
<td valign="middle" align="center">145.7</td>
<td valign="middle" align="center">41.5</td>
</tr>
<tr>
<td valign="middle" align="center">EfficientDet</td>
<td valign="top" align="center">78.7</td>
<td valign="top" align="center">67.1</td>
<td valign="top" align="center">71.8</td>
<td valign="top" align="center">48.9</td>
<td valign="middle" align="center">4.7</td>
<td valign="middle" align="center">23.8</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5n</td>
<td valign="top" align="center">92.8</td>
<td valign="top" align="center">84.1</td>
<td valign="top" align="center">90.9</td>
<td valign="top" align="center">69.4</td>
<td valign="middle" align="center">7.1</td>
<td valign="middle" align="center">144.6</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOx-s</td>
<td valign="top" align="center">93.3</td>
<td valign="top" align="center">86.4</td>
<td valign="top" align="center">88.7</td>
<td valign="top" align="center">68.4</td>
<td valign="middle" align="center">26.76</td>
<td valign="middle" align="center">81.2</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv7</td>
<td valign="top" align="center">93.1</td>
<td valign="top" align="center">84.3</td>
<td valign="top" align="center">91.9</td>
<td valign="top" align="center">68.1</td>
<td valign="middle" align="center">105.1</td>
<td valign="middle" align="center">80.2</td>
</tr>
<tr>
<td valign="top" align="center">Fasternet</td>
<td valign="top" align="center">93.0</td>
<td valign="top" align="center">83.9</td>
<td valign="top" align="center">90.2</td>
<td valign="top" align="center">67.8</td>
<td valign="top" align="center">10.7</td>
<td valign="top" align="center">145.5</td>
</tr>
<tr>
<td valign="top" align="center">Swin transformer</td>
<td valign="top" align="center">92.6</td>
<td valign="top" align="center">83.2</td>
<td valign="top" align="center">90.4</td>
<td valign="top" align="center">67.7</td>
<td valign="top" align="center">79.1</td>
<td valign="top" align="center">46.6</td>
</tr>
<tr>
<td valign="middle" align="center">RT-DETR</td>
<td valign="top" align="center">92.1</td>
<td valign="top" align="center">86.5</td>
<td valign="top" align="center">91.6</td>
<td valign="top" align="center">72.1</td>
<td valign="middle" align="center">56.9</td>
<td valign="middle" align="center">50.8</td>
</tr>
<tr>
<td valign="middle" align="center">CTDA</td>
<td valign="middle" align="center">94.3</td>
<td valign="middle" align="center">91.5</td>
<td valign="middle" align="center">95.3</td>
<td valign="middle" align="center">76.5</td>
<td valign="middle" align="center">9.4</td>
<td valign="middle" align="center">154.1</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s4" sec-type="discussion">
<label>4</label>
<title>Discussion</title>
<p>In unstructured environments, varying lighting conditions, complex backgrounds, and fruit overlap and occlusion pose challenges to the visual detection of picking robots. This study focuses on developing a detection algorithm tailored for use in picking robots. However, during the operation of the picking robot, there will be cherry tomato picking targets at a distance, which provide less feature information in the image, making it difficult to detect these cherry tomatoes effectively. Therefore, in response to these issues, the research proposes a precise, lightweight, and real-time efficient cherry tomato detection algorithm. By using LAWDS to reconstruct the backbone network, the model captures more detailed features, improving detection accuracy and enabling it to retain small target features more effectively. Secondly, the model introduces the SPPFS network, achieving more efficient feature utilization and richer multi-scale feature fusion, better identifying and distinguishing closely arranged or partially occluded fruits. The model applies the dynamic head detection head to more effectively capture target features, achieving accurate detection of cherry tomatoes of different scales, shapes, and densities. Additionally, the CTDA has significantly improved in accuracy, computational complexity, and detection speed while ensuring the model size, making it more suitable for deployment on resource-limited edge devices.</p>
<p>In model testing, an offline and online combined data augmentation strategy was utilized to selectively expand the original dataset, enhancing the model&#x2019;s generalization capabilities. The study tested cherry tomato scenes under various lighting conditions, demonstrating the model&#x2019;s adaptability to changes in lighting and its ability to accurately detect cherry tomatoes in scenarios involving overlap and distant small targets, effectively identifying each fruit even in overlapping states. The CTDA model also excelled in other background disturbances such as reflective mulching, effectively distinguishing foreground cherry tomatoes from complex backgrounds. A quantitative analysis showed minimal errors and missed detections under strong lighting, with better performance under weak lighting. However, blurred scenarios increased missed detections, significantly impacting accuracy, indicating room for improvement in the model&#x2019;s handling of blurred images. External disturbances can cause image blurring during robotic harvesting in greenhouses, negatively impacting detection. Future work will explore optimizing the model to resist dynamic image blurring, possibly through attention mechanisms tailored for blurred target detection or image preprocessing techniques. Additionally, the current dataset primarily includes images of ripe and unripe cherry tomatoes, which limits the model&#x2019;s comprehensive understanding of all growth stages. To improve the model&#x2019;s detection capabilities and develop a more accurate automated picking system, the research will collect more data on cherry tomatoes of varying ripeness and growth stages. By analyzing the impact of different growth stages on model performance, more effective picking strategies can be devised, enhancing efficiency and reducing fruit loss due to incorrect picking.</p>
<p>In conclusion, while the CTDA model has its limitations, it has significantly contributed to improving cherry tomato detection technologies in greenhouse settings, providing essential technical support for the advancement of agricultural automation and intelligent development of harvesting robots. With continuous enhancements, this model is poised for wider future applications. Additionally, due to its adjustability and adaptability to various object features, the CTDA model&#x2019;s framework and methodology could be adapted for other agricultural settings, particularly for fruit harvesting in complex environments, offering substantial technical support for robotic harvesting in unstructured settings. Further studies will also test the model across different crops and growing conditions to assess its utility and performance in a broader range of agricultural applications.</p>
</sec>
<sec id="s5" sec-type="conclusions">
<label>5</label>
<title>Conclusions</title>
<p>To enhance the detection capabilities for cherry tomatoes in complex environments, the study developed the CTDA model based on YOLOv8, tailored for unstructured settings. This model introduces a new downsampling method, LAWDS, to construct the LAWDarknet53 network, enhancing feature extraction capabilities. It also includes the SPPFS network to improve feature fusion, addressing uneven detection issues in tomato occlusion scenarios. Additionally, the dynamic head with an attention mechanism was integrated to boost detection performance by harmonizing scale-aware, space-aware, and task-aware attention mechanisms within a single structure. The improved CTDA model achieved a 95.3% mAP, a 2.9% increase over the original, with significant improvements in recall and precision rates to 91.5% and 94.3%, respectively. To evaluate the effectiveness of the CTDA model in complex situations, datasets were generated that included strong illumination, weak illumination, occlusion, and density conditions. The results showed accuracies of 94.8% and 95.1% in strong and weak illumination, respectively. The CTDA model demonstrates good stability under varying lighting conditions, but the miss rate increases in blurry scenes, affecting detection accuracy. The CTDA model was also compared with the latest detection networks, showing excellent performance in mAP, parameter count, and speed. Weighing 6.7M with a 95.3% mAP and 154.1 FPS, it meets the real-time detection requirements for cherry tomatoes in unstructured environments. Future research will integrate the CTDA model into cherry tomato harvesting robots to facilitate automated picking in greenhouses. Given mechanical vibrations can blur images during picking, reducing detection efficacy, ongoing research will aim to boost the model&#x2019;s interference resistance, enhancing performance in disturbed environments and ensuring reliable visual support for cherry tomato harvesting robots.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>ZL: Conceptualization, Formal Analysis, Methodology, Validation, Writing &#x2013; original draft. CZ: Software, Validation, Writing &#x2013; review &amp; editing. ZLL: Formal Analysis, Methodology, Writing &#x2013; original draft. GW: Software, Validation, Writing &#x2013; review &amp; editing. XL: Conceptualization, Funding acquisition, Resources, Supervision, Writing &#x2013; review &amp; editing. XZ: Conceptualization, Visualization, Writing &#x2013; review &amp; editing.</p>
</sec>
<sec id="s8" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. This study was supported by the Autonomous Region Tianshan Cedar Youth Top Talent Program &#x201c;Research on Human-like Picking Robot Vision Perception and Adaptive Control System&#x201d; Grant No. 20227SYCCX0061; Xinjiang Uyghur Autonomous Region major project &#x201c;Research and Development of Automated, Intelligent Mechanical Equipment for Facility Vegetables in the Tarim Basin&#x201d; Grant No. 2022A02005-5; and the Central Guidance to Local Projects &#x201c;Research and Base Construction of Xinjiang Facility Green Fruit and Vegetable Production and Processing Engineering and Intelligent Equipment Technology&#x201d; Grant No. ZYYD2023B01.</p>
</sec>
<ack>
<title>Acknowledgments</title>
<p>We sincerely thank Guoqiang Wang and Caihong Zhang from the Agricultural Mechanization Institute of Xinjiang Academy of Agricultural Sciences for their guidance and assistance.</p>
</ack>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s10" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bai</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Mao</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>B.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Clustered tomato detection and picking point location using machine learning-aided image analysis for automatic robotic harvesting</article-title>. <source>Precis. Agric.</source> <volume>24</volume>, <fpage>727</fpage>&#x2013;<lpage>743</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11119-022-09972-6</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Banerjee</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Kukreja</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Hariharan</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Jain</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Jindal</surname> <given-names>V.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Predicting tulip leaf diseases: a integrated cnn and random forest approach</article-title>,&#x201d; in <conf-name>2023 World Conference on Communication &amp; Computing (WCONF)</conf-name> (<publisher-name>IEEE</publisher-name>), p. <fpage>1</fpage>&#x2013;<lpage>6</lpage>.</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chaivivatrakul</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Dailey</surname> <given-names>M. N.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Texture-based fruit detection</article-title>. <source>Precis. Agric.</source> <volume>15</volume>, <fpage>662</fpage>&#x2013;<lpage>683</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11119-014-9361-x</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Kao</surname> <given-names>S.</given-names>
</name>
<name>
<surname>He</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhuo</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Wen</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>C. H.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). &#x201c;<article-title>Run, don&#x2019;t walk: chasing higher flops for faster neural networks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>, p. <fpage>12021</fpage>&#x2013;<lpage>12031</lpage>.</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Tan</surname> <given-names>C.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>An improved yolov3 based on dual path network for cherry tomatoes detection</article-title>. <source>J. Food Process Eng.</source> <volume>44</volume>, <elocation-id>e13803</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1111/jfpe.13803</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Xiang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Study on fusion clustering and improved yolov5 algorithm based on multiple occlusion of camellia oleifera fruit</article-title>. <source>Comput. Electron. Agric.</source> <volume>206</volume>, <elocation-id>107706</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.107706</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Dai</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Z.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Dynamic relu</article-title>,&#x201d; in <conf-name>European Conference on Computer Vision</conf-name> (<publisher-name>Springer</publisher-name>), p. <fpage>351</fpage>&#x2013;<lpage>367</lpage>.</citation>
</ref>
<ref id="B8">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Dai</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Xiao</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;<article-title>Dynamic head: unifying object detection heads with attentions</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, p. <fpage>7373</fpage>&#x2013;<lpage>7382</lpage>.</citation>
</ref>
<ref id="B9">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Dai</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Qi</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Xiong</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>H.</given-names>
</name>
<etal/>
</person-group>. (<year>2017</year>). &#x201c;<article-title>Deformable convolutional networks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE international conference on computer vision</conf-name>, p. <fpage>764</fpage>&#x2013;<lpage>773</lpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gao</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Lacta: a lightweight and accurate algorithm for cherry tomato detection in unstructured environments</article-title>. <source>Expert Syst. Appl.</source> <volume>238</volume>, <elocation-id>122073</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.eswa.2023.122073</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ge</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Yolox: exceeding yolo series in 2021</article-title>. <italic>Arxiv Preprint</italic>.</citation>
</ref>
<ref id="B12">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ishii</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Matsuo</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Takemura</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Sonoda</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Nishida</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Yasukawa</surname> <given-names>S.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;<article-title>Tomato-harvesting-robot competition towards smart agriculture</article-title>,&#x201d; in <conf-name>Proceedings of International Conference on Artificial Life &amp; Robotics (ICAROB2021)</conf-name> (<publisher-name>ALife Robotics</publisher-name>), p. <fpage>1</fpage>&#x2013;<lpage>5</lpage>.</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jumaah</surname> <given-names>H. J.</given-names>
</name>
<name>
<surname>Rashid</surname> <given-names>A. A.</given-names>
</name>
<name>
<surname>Saleh</surname> <given-names>S. A. R.</given-names>
</name>
<name>
<surname>Jumaah</surname> <given-names>S. J</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Deep neural remote sensing and Sentinel-2 satellite image processing of Kirkuk City, Iraq for sustainable prospective</article-title>. <source>J. Optics Photonics Res.</source> <volume>00</volume>, <fpage>1</fpage>&#x2013;<lpage>9</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.47852/bonviewJOPR42022920</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kasani</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Yadla</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Rachamalla</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Hariharan</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Devarajula</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Andraju</surname> <given-names>B. P.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Potato crop disease prediction using deep learning</article-title>,&#x201d; in <conf-name>2023 IEEE 12th International Conference on Communication Systems and Network Technologies (CSNT)</conf-name> (<publisher-name>IEEE</publisher-name>), p. <fpage>231</fpage>&#x2013;<lpage>235</lpage>.</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lawal</surname> <given-names>M. O.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Tomato detection based on modified yolov3 framework</article-title>. <source>Sci. Rep.</source> <volume>11</volume>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-021-81216-5</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Peng</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>C.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>Peduncle collision-free grasping based on deep reinforcement learning for tomato harvesting robot</article-title>. <source>Comput. Electron. Agric.</source> <volume>216</volume>, <fpage>108488</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.108488</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Gu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>He</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Mo</surname> <given-names>Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>A lightweight improved YOLOv5s model and its deployment for detecting pitaya fruits in daytime and nighttime light-supplement environments</article-title>. <source>Comput. Electron. Agric.</source> <volume>220</volume>, <fpage>108914</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.108914</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Jie</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Dual-frequency lidar for compressed sensing 3D imaging based on all-phase fast fourier transform</article-title>. <source>J. Optics Photonics Res.</source> <volume>1</volume>, <fpage>74</fpage>&#x2013;<lpage>81</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.47852/bonviewJOPR32021565</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Identification of early decayed oranges using structured-illumination reflectance imaging coupled with fast demodulation and improved image processing algorithms</article-title>. <source>Postharvest Biol. Technol.</source> <volume>207</volume>, <fpage>112627</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.postharvbio.2023.112627</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>T.-Y</given-names>
</name>
<name>
<surname>Goyal</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Focal loss for dense object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE international conference on computer vision</conf-name>, p. <fpage>2980</fpage>&#x2013;<lpage>2988</lpage>.</citation>
</ref>
<ref id="B21">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Cao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;<article-title>Swin transformer: hierarchical vision transformer using shifted windows</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>, p. <fpage>10012</fpage>&#x2013;<lpage>10022</lpage>.</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Mao</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>J. H.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>A mature-tomato detection algorithm using machine learning and color analysis</article-title>. <source>Sensors</source> <volume>19</volume>, <fpage>2023</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s19092023</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Magalh&#xe3;es</surname> <given-names>S. A.</given-names>
</name>
<name>
<surname>Moreira</surname> <given-names>A. P.</given-names>
</name>
<name>
<surname>Santos</surname> <given-names>F. N. D.</given-names>
</name>
<name>
<surname>Dias</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Active perception fruit harvesting robots&#x2014;a systematic review</article-title>. <source>J. Intelligent Robotic Syst.</source> <volume>105</volume>, <fpage>14</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10846-022-01595-3</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Meng</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Qi</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Transforming unmanned pineapple picking with spatio-temporal convolutional neural networks</article-title>. <source>Comput. Electron. Agric.</source> <volume>214</volume>, <fpage>108298</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.108298</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Montoya-Cavero</surname> <given-names>L.</given-names>
</name>
<name>
<surname>D&#xed;az De Le&#xf3;n Torres</surname> <given-names>R.</given-names>
</name>
<name>
<surname>G&#xf3;mez-Espinosa</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Escobedo Cabello</surname> <given-names>J. A.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Vision systems for harvesting robots: produce detection and localization</article-title>. <source>Comput. Electron. Agric.</source> <volume>192</volume>, <elocation-id>106562</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2021.106562</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qi</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Pearson</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Harman</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Shu</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Tea chrysanthemum detection under unstructured environments using the tc-yolo model</article-title>. <source>Expert Syst. Appl.</source> <volume>193</volume>, <elocation-id>116473</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.eswa.2021.116473</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Faster r-cnn: towards real-time object detection with region proposal networks</article-title>. <source>IEEE Trans Pattern Anal Mach Intell.</source> <volume>39</volume>, <fpage>1137</fpage>&#x2013;<lpage>1149</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2016.2577031</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Septiarini</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Hamdani</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Sari</surname> <given-names>S. U.</given-names>
</name>
<name>
<surname>Hatta</surname> <given-names>H. R.</given-names>
</name>
<name>
<surname>Puspitasari</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Hadikurniawati</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Image processing techniques for tomato segmentation applying k-means clustering and edge detection approach</article-title>,&#x201d; in <conf-name>2021 International Seminar on Machine Learning, Optimization, and Data Science (ISMODE)</conf-name> (<publisher-name>IEEE</publisher-name>), p. <fpage>92</fpage>&#x2013;<lpage>96</lpage>.</citation>
</ref>
<ref id="B29">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Stergiou</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Poppe</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Kalliatakis</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Refining activation downsampling with softpool</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>, p. <fpage>10357</fpage>&#x2013;<lpage>10366</lpage>.</citation>
</ref>
<ref id="B30">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Tan</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Pang</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Le</surname> <given-names>Q. V.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Efficientdet: scalable and efficient object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, p. <fpage>10781</fpage>&#x2013;<lpage>10790</lpage>.</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Qi</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Zhuo</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Meng</surname> <given-names>F.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Obstacle avoidance motion in mobile robotics</article-title>. <source>J. System Simulation.</source> <volume>36</volume>, <fpage>1</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.16182/j.issn1004731x.joss.23-1297E</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Qiu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Cao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>K.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>a). <article-title>Optimization strategies of fruit detection to overcome the challenge of unstructured background in field orchard environment: a review</article-title>. <source>Precis. Agric.</source> <volume>24</volume>, <fpage>1183</fpage>&#x2013;<lpage>1219</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11119-023-10009-9</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>b). <article-title>Fruit detection and positioning technology for a camellia oleifera c. Abel orchard based on improved yolov4-tiny model and binocular stereo vision</article-title>. <source>Expert Syst. Appl.</source> <volume>211</volume>, <elocation-id>118573</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.eswa.2022.118573</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C. Y.</given-names>
</name>
<name>
<surname>Bochkovskiy</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H. Y. M.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Yolov7: trainable bag-of-freebies sets new state-of-the-art for real-time object detectors</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, p. <fpage>7464</fpage>&#x2013;<lpage>7475</lpage>.</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>He</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zeng</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Tan</surname> <given-names>R.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Magnetic field sensor using the magnetic fluid-encapsulated long-period fiber grating inscribed in the thin-cladding fiber</article-title>. <source>J. Optics Photonics Res.</source> <volume>1</volume>, <fpage>210</fpage>&#x2013;<lpage>215</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.47852/bonviewJOPR32021689</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiao</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Nguyen</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Yan</surname> <given-names>W. Q.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Fruit ripeness identification using yolov8 model</article-title>. <source>Multimedia Tools Appl.</source> <volume>83</volume>, <fpage>28039</fpage>&#x2013;<lpage>28056</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11042-023-16570-9</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Nie</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>A lightweight yolov8 tomato detection algorithm combining feature enhancement and attention</article-title>. <source>Agronomy</source> <volume>13</volume>, <fpage>1824</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy13071824</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zeng</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Song</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Zhong</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Lightweight tomato real-time detection method based on improved yolo and mobile deployment</article-title>. <source>Comput. Electron. Agric.</source> <volume>205</volume>, <elocation-id>107625</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.107625</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Ali</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Multi-class detection of cherry tomatoes using improved yolov4-tiny model</article-title>. <source>Int. J. Agric. Biol. Eng.</source> <volume>16</volume>, <fpage>225</fpage>&#x2013;<lpage>231</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.25165/j.ijabe.20231602.7744</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Song</surname> <given-names>C.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>Greenhouse tomato detection and pose classification algorithm based on improved yolov5</article-title>. <source>Comput. Electron. Agric.</source> <volume>216</volume>, <elocation-id>108519</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.108519</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Lv</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Dang</surname> <given-names>Q.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). &#x201c;<article-title>Detrs beat yolos on real-time object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, p. <fpage>16965</fpage>&#x2013;<lpage>16974</lpage>.</citation>
</ref>
<ref id="B42">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Shi</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Z.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>The improved yolov5 algorithm and its application in small target detection</article-title>,&#x201d; in <conf-name>international conference on intelligent robotics and applications</conf-name> (<publisher-name>Springer</publisher-name>), p. <fpage>679</fpage>&#x2013;<lpage>688</lpage>.</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zheng</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Yolox-dense-ct: a detection algorithm for cherry tomatoes based on yolox and densenet</article-title>. <source>J. Food Measurement Characterization</source> <volume>16</volume>, <fpage>4788</fpage>&#x2013;<lpage>4799</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11694-022-01553-5</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>