<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2025.1618214</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Ta-YOLO: overcoming target blocked challenges in greenhouse tomato detection and counting</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Zhao</surname>
<given-names>Yun</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Chen</surname>
<given-names>Yijia</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3047487/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Xu</surname>
<given-names>Xing</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1528890/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>He</surname>
<given-names>Yong</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/692559/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Gan</surname>
<given-names>Hao</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/968172/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wu</surname>
<given-names>Na</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Zhechen</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Sun</surname>
<given-names>Xi</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Yali</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3035149/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Skobelev</surname>
<given-names>Petr</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2414822/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Mi</surname>
<given-names>Yanan</given-names>
</name>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>School of Artificial Intelligence and Information Engineering, Zhejiang University of Science and Technology</institution>, <addr-line>Hangzhou</addr-line>,&#xa0;<country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>College of Biosystems Engineering and Food Science, Zhejiang University</institution>, <addr-line>Hangzhou</addr-line>,&#xa0;<country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Department of Biosystems Engineering and Soil Science, University of Tennessee</institution>, <addr-line>Knoxville, TN</addr-line>,&#xa0;<country>United States</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Cardiovascular Medicine, Zhejiang Hospital</institution>, <addr-line>Hangzhou</addr-line>,&#xa0;<country>China</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>Samara Federal Research Scientific Center, Russian Academy of Sciences</institution>, <addr-line>Samara</addr-line>,&#xa0;<country>Russia</country>
</aff>
<aff id="aff6">
<sup>6</sup>
<institution>Department of Business Development, Pegasor Oy</institution>, <addr-line>Tampere</addr-line>,&#xa0;<country>Finland</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Aichen Wang, Jiangsu University, China</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Chao Qi, Jiangsu Academy of Agricultural Sciences (JAAS), China</p>
<p>Lin Jiao, Anhui University, China</p>
<p>Xuan Wei, Fujian Agriculture and Forestry University, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Xing Xu, <email xlink:href="mailto:xuxing3220@163.com">xuxing3220@163.com</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>08</day>
<month>07</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>16</volume>
<elocation-id>1618214</elocation-id>
<history>
<date date-type="received">
<day>25</day>
<month>04</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>14</day>
<month>06</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Zhao, Chen, Xu, He, Gan, Wu, Wang, Sun, Wang, Skobelev and Mi</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Zhao, Chen, Xu, He, Gan, Wu, Wang, Sun, Wang, Skobelev and Mi</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Screening and cultivating healthy small tomatoes, along with accurately predicting their yields, are crucial for sustaining the economy of the tomato industry. However, in field scenarios, counting small tomato fruits is often hindered by environmental factors such as leaf shading. To address this challenge, this study proposed the Ta-YOLO modeling framework, aimed at improving the efficiency and accuracy of small tomato fruit detection. We captured images of small tomatoes at various stages of ripeness in real-world settings and compiled them into datasets for training and testing the model. First, we utilized the Space-to-Depth module to efficiently leverage the implicit features of the images while ensuring a lightweight operation of the backbone network. Next, we developed a novel pyramid pooling module (DASPPF) to capture global information through average pooling, effectively reducing the impact of edge and background noise on detection. We also introduced an additional tiny target detection head alongside the original detection head, enabling multi-scale detection of small tomatoes. To further enhance the model&#x2019;s focus on relevant information and improve its ability to recognize small targets, we designed a multi-dimensional attention structure (CSAM) that generated feature maps with more valuable information. Finally, we proposed the EWDIoU bounding box loss function, which leveraged a 2D Gaussian distribution to enhance the model&#x2019;s accuracy and robustness. The experimental results showed that the number of parameters, FLOPs, and FPS of our designed Ta-YOLO were 10.58M, 14.4G, and 131.58, respectively, and its mean average precision (mAP) reached 84.4%. It can better realize the counting of tomatoes with different maturity levels, which helps to improve the efficiency of the small tomato production and planting process.</p>
</abstract>
<kwd-group>
<kwd>machine vision</kwd>
<kwd>Ta-YOLO</kwd>
<kwd>target detection</kwd>
<kwd>tomato counting</kwd>
<kwd>target blocked</kwd>
</kwd-group>
<counts>
<fig-count count="13"/>
<table-count count="7"/>
<equation-count count="22"/>
<ref-count count="38"/>
<page-count count="21"/>
<word-count count="11038"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Technical Advances in Plant Science</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Small tomatoes are a flavorful, nutritious crop with high economic value and are important in the global vegetable trade. China&#x2019;s small tomato industry has grown rapidly over the past 20 years, with more than 30,000 acres planted nationwide, jumping to the top spot in the world (<xref ref-type="bibr" rid="B7">Guan et&#xa0;al., 2018</xref>). The huge economic benefits have made it economically important to accurately estimate the number of fruits before harvest. On the one hand, early yield estimation can help producers adjust their planting strategies. On the other hand, it can also effectively improve the operators&#x2019; income and operation development strategies. However, estimating the number of small tomatoes is greatly challenged by their own tight growth, dense leaf shade, and short ripening period. Traditional manual methods of counting are not only economically costly, but also time-consuming and easily hindered by human error and subjectivity. These problems can easily compromise the accuracy of the count data. Therefore, it is very important to utilize robotics to achieve an automated and scalable approach to improve the accuracy and speed of fruit detection and counting in agriculture (<xref ref-type="bibr" rid="B36">Zhao et&#xa0;al., 2022</xref>).</p>
<p>In recent years, with the development of deep learning, computer vision technology has become highly integrated with the agricultural industry. The field of computer vision mainly includes a variety of tasks, such as image classification, target detection, instance segmentation, etc. Among them, target detection is able to locate the target in the form of a rectangular box, which has high accuracy and real-time performance (<xref ref-type="bibr" rid="B25">Srinivas et&#xa0;al., 2016</xref>). Therefore, target detection technology is most widely used in agricultural fruit detection and counting, and also provides a new solution for the application of robots in agriculture.</p>
<p>Deep learning based target detection algorithms include single-stage and two-stage algorithms. The single-stage algorithms realize the detection process through a single network branch, eliminating the complex steps such as feature extraction and generation of candidate frames in the two-stage approach. Therefore, single-stage target detection algorithms are famous for their fast detection. Currently, excellent single-stage target detection algorithms include SSD (<xref ref-type="bibr" rid="B17">Liu et&#xa0;al., 2016</xref>), RetinaNet (<xref ref-type="bibr" rid="B15">Lin et al., 2017</xref>), YOLOv5 (<xref ref-type="bibr" rid="B20">Redmon et al., 2016</xref>), YOLOv8, YOLOv9 (<xref ref-type="bibr" rid="B33">Wang et&#xa0;al., 2024</xref>), and the latest YOLOv11 (<xref ref-type="bibr" rid="B12">Khanam and Muhammad, 2024</xref>). Two-stage detection algorithms first generate a large number of candidate regions containing the target object, and then perform further processing such as region classification, bounding box regression, and so on for each candidate region. Classical two-stage detection algorithms include R-CNN (<xref ref-type="bibr" rid="B6">Girshick et&#xa0;al., 2014</xref>), Fast R-CNN (<xref ref-type="bibr" rid="B5">Girshick, 2015</xref>), Faster R-CNN (<xref ref-type="bibr" rid="B21">Ren et al., 2016</xref>), Mask R-CNN (<xref ref-type="bibr" rid="B9">He et&#xa0;al., 2017</xref>), Cascade R-CNN (<xref ref-type="bibr" rid="B1">Cai and Vasconcelos, 2018</xref>), and DetectoRS (<xref ref-type="bibr" rid="B19">Qiao et&#xa0;al., 2021</xref>). However, when facing complex scenes, although two-stage target detection algorithms are able to provide higher accuracy, they have a large computational overhead, which makes them unsuitable for a wide range of scenarios such as real-time detection.</p>
<p>Therefore, researchers must balance the advantages and disadvantages of the two algorithms in light of practical needs, selecting and enhancing them accordingly. These algorithms have been widely used for the recognition of a variety of crops, such as potato (<xref ref-type="bibr" rid="B10">Johnson et&#xa0;al., 2021</xref>), maize (<xref ref-type="bibr" rid="B11">Khaki et&#xa0;al., 2020</xref>), rice (<xref ref-type="bibr" rid="B35">Zhang et&#xa0;al., 2022</xref>), apple (<xref ref-type="bibr" rid="B31">Wang and He, 2022</xref>), and so on. For the detection of small tomato crops, Seo et&#xa0;al. (<xref ref-type="bibr" rid="B24">Seo et&#xa0;al., 2021</xref>) proposed a real-time robotic detection system based on Faster R-CNN for detecting tomato growth and selecting a color model that is robust to external light to develop an image-based ripeness criterion for tomato fruits. Wang et&#xa0;al. (<xref ref-type="bibr" rid="B28">Wang et&#xa0;al., 2022</xref>) designed an improved Faster R-CNN model, MatDet, for tomato ripeness detection to address the difficulty of detecting tomato ripeness in complex scenes by using RoIAlign to obtain more accurate bounding boxes in the feature mapping stage. Wang et&#xa0;al. (<xref ref-type="bibr" rid="B29">Wang et&#xa0;al., 2023</xref>) proposed an R-CNN model for tomato detection and segmentation tasks, using Swin Transformer as the backbone network for better feature extraction; the method can not only effectively recognize tomatoes in cherry tomato varieties, but also differentiate between different ripening stages. The introduction of the YOLO (You Only Look Once) family of models provides the advantage of directly predicting the entire image without generating candidate regions and has also been widely used by researchers. Lawal 
(<xref ref-type="bibr" rid="B13">Lawal, 2021</xref>) used an improved YOLOv3 model to realize the detection of tomato counts in natural scenes, and solved the problem of gradient vanishing during model training by introducing the MixNet backbone network. Miao et&#xa0;al. (<xref ref-type="bibr" rid="B18">Miao et&#xa0;al., 2023</xref>) proposed an algorithm for estimating the ripeness of individual tomato clusters and an integrated method for locating tomato stems based on experimental errors using the YOLOv5 network architecture. Liu et&#xa0;al. (<xref ref-type="bibr" rid="B16">Liu et&#xa0;al., 2020</xref>) proposed a tomato detection model called &#x201c;YOLO-tomato&#x201d; using the improved YOLOv3 architecture, which utilizes a circular bounding box instead of the traditional rectangular bounding box for tomato localization, which reduces the predicted coordinates and thus achieves more accurate tomato matching. In (<xref ref-type="bibr" rid="B4">Ge et&#xa0;al., 2022</xref>), a detection model named &#x201c;YOLO-deepsort&#x201d; is proposed to realize the periodic detection of tomato growth, and the effective features are enhanced by using BiFPN multiscale fusion structure to realize the improvement of detection accuracy. In addition, the combination of robots and inspection algorithms brings a number of significant advantages to the field of tomato inspection. Dai et&#xa0;al. (<xref ref-type="bibr" rid="B3">Dai et&#xa0;al., 2022</xref>) proposed a tomato fruit counting algorithm for greenhouse inspection robots, which tracks the position of tomatoes in the image by the spatial displacement information of the robot, while 3D depth filtering is used to avoid the interference of complex backgrounds on tomato counting. Rong et&#xa0;al. 
(<xref ref-type="bibr" rid="B22">Rong et&#xa0;al., 2023</xref>) proposed an improved tomato cluster counting method based on YOLOv4, which incorporates target detection, multi-target tracking, and region-specific tracking counting in a robot to reduce the problem of tracked tomato cluster offset. Li et&#xa0;al. (<xref ref-type="bibr" rid="B14">Li et&#xa0;al., 2023</xref>) utilized the MHSA attention mechanism in an improved YOLOv8 model to enhance the ability of the network to extract diverse features; the model was also mounted on a robot to realize real-time hierarchical detection and counting in real scenes, achieving good detection results. Ruparelia et&#xa0;al. (<xref ref-type="bibr" rid="B23">Ruparelia et&#xa0;al., 2022</xref>) proposed a deep learning based tomato detection system for distinguishing between healthy, ripe and unripe tomatoes using different versions of the YOLO architecture.</p>
<p>However, practical applications of tomato detection and counting still face significant challenges under occlusion conditions. Fruits obstructed by other fruits, leaves, calyxes, stems, and similar structures can substantially degrade the accuracy of vision-based robotic detection systems. Specifically, the following issues are observed: (1) During the late fruit-setting stage, the extremely small size of tomato fruits increases the risk of missed detections; (2) In the fruiting stage, the dense distribution of small tomatoes combined with extensive occlusion frequently results in undetected instances; (3) Occlusion by branches, leaves, and stems can lead to false positives during the fruiting stage; (4) Leaf shading during fruiting may also cause both false detections and omissions of small tomatoes. To address these challenges, this study explores the integration of feature representations at varying depths across different branching structures to enhance the detection of small tomato targets through the fusion of multi-level feature information.</p>
<p>In summary, this study proposed a small tomato target detection method based on the YOLOv8 network architecture, specifically designed to address the occlusion challenges encountered during the counting of small tomato fruits in large-scale production environments. The main contributions of this research are outlined as follows:</p>
<list list-type="order">
<list-item>
<p>Images of small tomatoes at different maturity stages were collected under large-scale cultivation conditions to construct a real-world small tomato dataset. The tomatoes in each image were annotated and categorized into three distinct maturity levels. To improve the robustness and generalization of the detection model, the dataset was further augmented using a set of simple yet effective data augmentation techniques applied to both the images and their corresponding annotations.</p>
</list-item>
<list-item>
<p>In real-world scenarios, the growth of small tomatoes is often accompanied by dense foliage and branching, leading to challenges such as the loss of fine-grained image features during recognition. To address these issues, this study incorporated a C2f-RepGhost module combined with a Space-to-Depth convolutional structure, enabling the proposed Ta-YOLO model to preserve detailed feature representations while maintaining a lightweight design. Furthermore, an additional detection head was introduced to enhance the model&#x2019;s capacity for small object feature extraction. To further mitigate the impact of peripheral edge information on core feature representation, a Dilated Atrous Spatial Pyramid Pooling Fusion (DASPPF) module was integrated into the architecture.</p>
</list-item>
<list-item>
<p>This study proposed a CSAM attention mechanism, which integrates spatial and channel attention to enhance the model&#x2019;s focus on salient features. By jointly leveraging spatial and channel-wise dependencies, the CSAM module improves the model&#x2019;s sensitivity to occluded regions and enhances its capability to accurately recognize targets under complex occlusion conditions.</p>
</list-item>
<list-item>
<p>Traditional IoU-based loss functions often exhibit substantial bias when handling objects of varying scales. To mitigate this issue, we proposed the EWDIoU bounding box regression loss, which models the distance between the predicted box and the ground truth using a two-dimensional Gaussian distribution. This formulation enhances the model&#x2019;s sensitivity to small target regions, thereby reducing scale-related bias and improving both the recognition accuracy and overall robustness of the detection framework.</p>
</list-item>
<list-item>
<p>The proposed model demonstrates effective detection and counting of small tomatoes in real greenhouse environments. It successfully addresses the occlusion challenges associated with short growth periods and validates the efficacy of the Ta-YOLO architecture in practical agricultural scenarios for accurate and robust small tomato detection and counting.</p>
</list-item>
</list>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Dataset acquisition and processing</title>
<p>Existing tomato datasets primarily consist of images featuring single or multiple tomatoes against relatively clean and unobstructed backgrounds, limiting their applicability to real-world field production scenarios. Therefore, this study collected data from a small tomato cultivation base located at the International Internet Agricultural Expo Park in Wuzhen City, Zhejiang Province, where tomatoes at various growth stages were cultivated for market supply. Data acquisition was conducted over the period from April 3 to May 30, 2024. A SCOUT 2.0 robot equipped with an iPhone 14 Pro mounted horizontally was utilized to capture images of the small tomato plants. A total of 160 plants were arranged in two rows, each extending 20 meters in length. During data collection, the robot moved at a constant speed, photographing each row sequentially from left to right and then returning from right to left to capture images in the opposite direction. The acquired images were subsequently uploaded to a PC for further processing. The overall experimental setup is illustrated in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>, with the right panel depicting the robot in operation.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Schematic of small tomato dataset acquisition and dataset collection tools.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1618214-g001.tif">
<alt-text content-type="machine-generated">Diagram on the left shows a setup with potted plants in rows and columns, each plant monitored by a phone camera linked to a computer. Below, three images depict tomato plants growing in controlled conditions. On the right, a robot navigates between rows of plants in a greenhouse environment.</alt-text>
</graphic>
</fig>
<p>The photographed images were standardized to 640&#xa0;&#xd7;&#xa0;640 pixels and labeled using the LabelImg tool with the following labels: green fruit tomato, red fruit tomato and yellow fruit tomato. Following agronomic standards and harvesting requirements, these three labels correspond to unripe tomatoes, ripe tomatoes, and tomatoes between unripe and ripe stages, respectively. These three classes fit actual production decisions and reduce redundant judgements, while ensuring the efficiency of data annotation and data processing (<xref ref-type="bibr" rid="B27">Wan et&#xa0;al., 2018</xref>). Finally, a total of 661 original small tomato data samples were obtained, which were divided into training set, validation set and test set according to the ratio of 3:1:1. Given that the training set consists of only 535 images with only a small portion of overexposure and blurring under natural light conditions, in order to enhance the generalization of the model, six data enhancement techniques were adopted to process the training set data, including exposure, rotation, blurring, random brightness adjustment, mirroring and noise addition. As shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>, each image is enhanced by taking a random combination of three of the above enhancement methods.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Examples of the four types of blocked tomato images. <bold>(A)</bold> Example of extreme tiny tomatoes image. <bold>(B)</bold> Enhanced extreme tiny tomatoes image. <bold>(C)</bold> Example of mutual shading between classes image. <bold>(D)</bold> Enhanced mutual shading between classes image. <bold>(E)</bold> Example of branch stalk shading image. <bold>(F)</bold> Enhanced branch stalk shading image. <bold>(G)</bold> Example of leaf shading image. <bold>(H)</bold> Enhanced leaf shading image.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1618214-g002.tif">
<alt-text content-type="machine-generated">Series of eight images labeled A to H show tomato plants at various growth stages. Starting from small green tomatoes, the sequence progresses to larger, partially red tomatoes. Each transition is indicated by a blue arrow. The images depict changes in fruit size and color over time.</alt-text>
</graphic>
</fig>
<p>A notable feature of this dataset is the inclusion of complex distractions from real environments, with varying degrees of occlusion problems on each image. Based on the type of occlusion, we grouped the detection difficulties into four categories: extreme tiny tomatoes, mutual shading between classes, branch stalk shading, and leaf shading. The proportion of tiny small tomatoes was the largest, with a more similar amount of interclass shading and leaf shading, and a relatively small amount of branch and stem shading. The number of different maturity categories in the original dataset and the corresponding number for each shading type are shown in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>. In order to mitigate the impact that category imbalance would have on training, we used the CopyPaste method to perform an additional data augmentation operation on red and yellow fruit tomatoes, which is to copy the instances in the image containing red and yellow fruit tomatoes and paste them into another image during the training process, adding instances from fewer categories to generate new training samples.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Number of different maturity categories and the corresponding number for each type of shading.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Categories</th>
<th valign="top" align="left">Instances</th>
<th valign="top" align="left">Tiny</th>
<th valign="top" align="left">Classes shading</th>
<th valign="top" align="left">Stalk shading</th>
<th valign="top" align="left">Leaf shading</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">gtomato</td>
<td valign="top" align="center">9973</td>
<td valign="middle" rowspan="3" align="center">2069</td>
<td valign="middle" rowspan="3" align="center">885</td>
<td valign="middle" rowspan="3" align="center">455</td>
<td valign="middle" rowspan="3" align="center">809</td>
</tr>
<tr>
<td valign="top" align="center">rtomato</td>
<td valign="top" align="center">1853</td>
</tr>
<tr>
<td valign="top" align="center">ytomato</td>
<td valign="top" align="center">1220</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Hardware design</title>
<p>Combined with the growth characteristics of the small tomato itself, during the fruiting period, the growth height of the small tomato ranges from 0.5 to 2.3 meters, and within a relatively short period of time, there is a large span of height change. To work normally at different heights, we have also designed and improved the agricultural robot hardware accordingly. Firstly, the robot stand is built from 1.5&#xa0;mm iron plate, and the overall structure is in the shape of a tower, which is divided into three layers to meet the needs of different sensors and different heights of mounting. Next, electrical adapter devices are fixed on the bottom layer for powering the sensors of each device, and edge computing devices are also fixed on the bottom layer for processing real-time data. To improve the stability of the collected data, the camera head is mounted on the bottom tail, and the shooting camera is mounted on the camera head tilt rotation connector. The middle layer houses the router used by the robot for communication, which facilitates remote operation and control of the robot. The top layer is fitted with LIDAR to prevent other equipment from interfering with the laser. In this work, we ultimately deployed the detection algorithm on an edge device and utilized an agricultural robot to detect and count small tomatoes of different ripeness in a facility greenhouse, overcoming the problem of occlusion during the growing process. The detailed hardware composition as well as the field applications are shown in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Compositional architecture of the robot as well as map construction of the whole scene and real-time detection results in real applications.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1618214-g003.tif">
<alt-text content-type="machine-generated">Diagram showing a robotic system integrated with sensors and processors. Vision, radar, and attitude sensors connect to an edge processor, which communicates wirelessly with a central computer. Images on the right display a green 3D map and plant growth monitoring on a screen.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Small tomatoes detection based on Ta-YOLO</title>
<p>In real production environments, the large-scale planting of small tomatoes has significant advantages in improving production efficiency, but in the growth process of small tomatoes, regularized planting makes the lush branches and leaves obscure the fruits, and changes in the intensity and angle of the sunlight at different moments also significantly change the brightness and contrast of the image, making it more difficult to count the fruits.</p>
<p>This study proposed a Ta-YOLO model for the detection and counting of small tomatoes in a real production environment to address these challenges. The model retained the overall framework of YOLOv8n and adopted C2f-rghost combined with the Space-to-Depth Conv module to reconstruct the backbone structure; at the same time, the DASPPF structure was proposed to enhance the fine-grained representation. In addition, the CSAM multiple attention mechanism was created in the neck structure, and an additional detection head was added to enhance the detection ability at different scales and in occlusion situations. Finally, the EWDIoU loss function was proposed to improve the detection accuracy for small tomatoes. The overall structure of Ta-YOLO is shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>. These improvements are described in detail below.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Overall structure of Ta-YOLO model.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1618214-g004.tif">
<alt-text content-type="machine-generated">Flowchart of a neural network architecture for tomato detection. The network consists of three main sections: Backbone with layers for convolution and CSTD processing, Neck with upsampling and concatenation, and Head with detection layers. The right side shows an output image of a tomato plant in a greenhouse, with detected tomatoes highlighted and labeled with confidence scores.</alt-text>
</graphic>
</fig>
<sec id="s2_3_1">
<label>2.3.1</label>
<title>Lightweight network design</title>
<p>In the YOLOv8 backbone network, Convolutional Neural Networks (CNNs) perform well in different tasks such as classification and detection. However, the pooling layers, strided convolutions, and other similar operations in the CNN architecture allow the model to easily skip over a large amount of redundant pixel information, so it cannot learn a more efficient representation of the features. Therefore, in our model we use the Space-to-Depth Conv (SPDC) module (<xref ref-type="bibr" rid="B26">Sunkara and Luo, 2022</xref>), which consists of a space-to-depth (SPD) layer and a non-strided convolution (Conv) layer (shown in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>). This method alters the image using downsampled feature maps within and across the CNN, allowing the model to reduce the sharp performance degradation when faced with small tomato targets.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Schematic of SPDC module when scale = 2.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1618214-g005.tif">
<alt-text content-type="machine-generated">Diagram illustrating the process of space-to-depth transformation and convolution with stride equals one. An initial block labeled with dimensions \(L \times L \times C_1\) is divided into smaller blocks, each scaled by two. These blocks are combined to form a larger block with dimensions \(L/2 \times L/2 \times 4C_1\), which is further combined with another block of size \(L/2 \times L/2 \times C_2\).</alt-text>
</graphic>
</fig>
<p>For example, we denote the feature map with input size <inline-formula>
<mml:math display="inline" id="im1">
<mml:mrow>
<mml:mtext>L</mml:mtext>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>L</mml:mtext>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as <inline-formula>
<mml:math display="inline" id="im2">
<mml:mrow>
<mml:mtext>X</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>i</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>j</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, and the feature map is sliced according to the scale factor to obtain the feature sub-maps <inline-formula>
<mml:math display="inline" id="im3">
<mml:mrow>
<mml:mtext>x</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>i</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>j</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. When <inline-formula>
<mml:math display="inline" id="im4">
<mml:mrow>
<mml:mtext>scale</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, we get four sub-feature maps, each of which has the shape of <inline-formula>
<mml:math display="inline" id="im5">
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mi>L</mml:mi>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mi>L</mml:mi>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Next, we splice these sub-feature maps along the channel dimensions to get a new feature map <inline-formula>
<mml:math display="inline" id="im6">
<mml:mrow>
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mi>&#xa0;</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, and a non-strided (stride = 1) convolution with <inline-formula>
<mml:math display="inline" id="im7">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> filters is applied to the new feature map <inline-formula>
<mml:math display="inline" id="im8">
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&lt;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Then, the new feature map is further transformed to get <inline-formula>
<mml:math display="inline" id="im9">
<mml:mrow>
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mo>&#x2033;</mml:mo>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, which retains all the discriminative information as much as possible without reducing the feature map.</p>
<p>In target detection tasks, lightweight network structures tend to lower the computational cost and reduce the size of the model. In order to maintain the improved accuracy of small-target tomato detection without introducing additional computational parameters, we try to replace the traditional Bottleneck structure inside the C2f module with GhostBottleneck and RepGhostBottleneck, which in turn form the C2f_Ghost module (<xref ref-type="bibr" rid="B8">Han et&#xa0;al., 2020</xref>) and the C2f_Repghost module (<xref ref-type="bibr" rid="B2">Chen et&#xa0;al., 2022</xref>), as shown in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Schematic of lightweight backbone components. <bold>(A)</bold> Ghost module <bold>(B)</bold> RepGhost module <bold>(C)</bold> Ghost Bottleneck <bold>(D)</bold> RepGhost Bottleneck.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1618214-g006.tif">
<alt-text content-type="machine-generated">Diagrams of neural network modules are labeled (a) to (d). (a) Ghost module using depthwise convolution and ReLU, ending with concatenation. (b) RepGhost module includes batch normalization and addition. (c) Ghost Bottleneck with a bottleneck structure, featuring concatenation and addition. (d) RepGhost Bottleneck, similar to (c) but includes batch normalization. Each module includes input and output data paths.</alt-text>
</graphic>
</fig>
<p>The C2f_Ghost module expands the number of channels by utilizing the underlying residual structure, while reducing the number of channels that need to be shortcut connected. This design not only optimizes the network structure, allowing for a reduction in the amount of computation, but also preserves the necessary feature representation to improve efficiency without an obvious loss of accuracy. Unlike the C2f_Ghost module, the C2f_Repghost module reduces the number of intermediate channels and downsamples the feature maps with a reduced number of channels. This further improves the computational efficiency of the model. At this point, the feature maps also capture the long-distance dependence between pixels in different spatial locations, which enhances the expressive power of the model (<xref ref-type="bibr" rid="B34">Wu et&#xa0;al., 2024</xref>). Especially in resource-constrained environments, this lightweight structural design, by generating a large number of lightweight feature maps, not only enables efficient dissemination of information, but also provides rich feature representations for the subsequent layers. It also avoids the computational bottleneck of traditional convolution and reduces the computational overhead. Thus, it saves memory and ensures that the model reduces the resource consumption of hardware while maintaining high performance. Additionally, it lays a good foundation for subsequent model deployment and migration.</p>
</sec>
<sec id="s2_3_2">
<label>2.3.2</label>
<title>Enhanced feature fusion for CSAM multiple attention structures</title>
<p>In the Neck of YOLOv8, multi-scale feature fusion is usually performed using a feature pyramid network (<xref ref-type="bibr" rid="B37">Zhao et&#xa0;al., 2023</xref>). However, feature map fusion in this part often relies on relatively small convolutional operations, resulting in a limited receptive field. As the depth of the network increases, the desire to acquire a larger range of features leads to a decrease in the learning rate of the model and the transfer of feature information becomes difficult. In order to better fuse meaningful features in the channel and spatial dimensions and increase the network information effectiveness, we propose an innovative CSAM multi-attention structure that combines Non-Local positional attention (<xref ref-type="bibr" rid="B30">Wang et&#xa0;al., 2018</xref>) with the channel attention mechanism to achieve deep aggregation of spatial information in feature mapping. In the CSAM structure, we first halve the number of channels of the input feature map, which not only helps to reduce the computational redundancy and the subsequent computational burden, but also effectively promotes the selective focusing of features, making the subsequent attention mechanism more targeted and efficient. Subsequently, we apply positional attention and channel attention operations on the feature maps whose number of channels has been halved, and use average pooling and maximum pooling operations to gather effective information, which is subsequently fed into the shared MLP to effectively integrate the captured important features, enabling the structure to adaptively weight the features according to the contextual information and expand the receptive field (<xref ref-type="bibr" rid="B38">Zhao et&#xa0;al., 2012</xref>). In particular, it can better enhance the global information when facing the lack of local feature information for small target tomatoes.
Then the number of channels of the processed feature map is restored to the original size, preserving the network&#x2019;s ability to capture high-dimensional features.</p>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c6;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mi>M</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c6;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq3">
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2295;</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<p>
<xref ref-type="disp-formula" rid="eq1">Equations 1</xref>&#x2013;<xref ref-type="disp-formula" rid="eq10">10</xref> represent the CSAM calculation process, where <inline-formula>
<mml:math display="inline" id="im10">
<mml:mi>&#x3b3;</mml:mi>
</mml:math>
</inline-formula> denotes a split operation that halves the number of channels, <inline-formula>
<mml:math display="inline" id="im11">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the feature map at each stage, <inline-formula>
<mml:math display="inline" id="im12">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes the imposition of a channel attention mechanism, <inline-formula>
<mml:math display="inline" id="im13">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes the imposition of a spatial attention mechanism, <inline-formula>
<mml:math display="inline" id="im14">
<mml:mi>&#x3c3;</mml:mi>
</mml:math>
</inline-formula> denotes a sigmoid operation, <inline-formula>
<mml:math display="inline" id="im15">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c6;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the maximum pooling operation, <inline-formula>
<mml:math display="inline" id="im16">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c6;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the average pooling operation, and <inline-formula>
<mml:math display="inline" id="im17">
<mml:mo>&#x2295;</mml:mo>
</mml:math>
</inline-formula> denotes the feature map summation operation.</p>
<p>In order to model the dependence of each position of the feature map on the other positions in the surrounding area, and to expand the range of features obtained, we carry out a Reshape operation on the feature map <inline-formula>
<mml:math display="inline" id="im18">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> outputted from channel attention to obtain <inline-formula>
<mml:math display="inline" id="im19">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> for subsequent matrix operation. Then three linear mappings are performed separately using <inline-formula>
<mml:math display="inline" id="im20">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> convolution, i.e., <inline-formula>
<mml:math display="inline" id="im21">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>CSAM overall structure.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1618214-g007.tif">
<alt-text content-type="machine-generated">Diagram illustrating a neural network architecture with input features processed through channel and position attention mechanisms. The channel attention uses max pooling, average pooling, and shared MLP, followed by a sigmoid function. The position attention involves convolutions and softmax operations. Both pathways combine to form the output features. Arrows indicate the flow of data.</alt-text>
</graphic>
</fig>
<disp-formula id="eq4">
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msup>
<mml:mi>&#x3c9;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq5">
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>q</mml:mi>
</mml:msub>
<mml:mi>x</mml:mi>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq6">
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq7">
<label>(7)</label>
<mml:math display="block" id="M7">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq8">
<label>(8)</label>
<mml:math display="block" id="M8">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mo>&#x2200;</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq9">
<label>(9)</label>
<mml:math display="block" id="M9">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2295;</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq10">
<label>(10)</label>
<mml:math display="block" id="M10">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mo>&#x2295;</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<p>The corresponding linear transformations are denoted as <inline-formula>
<mml:math display="inline" id="im22">
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im23">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im24">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, representing the modulation function, query projection, and key projection, respectively. After applying these transformations, the feature response of a given pixel to all other spatial positions is computed through a similarity-based attention mechanism, typically implemented via a softmax operation, followed by normalization and weighted summation. Specifically, <inline-formula>
<mml:math display="inline" id="im25">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> projects the input into a query representation, while <inline-formula>
<mml:math display="inline" id="im26">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> encodes key features to be compared against the query. The modulation function <inline-formula>
<mml:math display="inline" id="im27">
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is optional and can be designed to incorporate spatial priors or learnable scaling factors. This mechanism enables each spatial location to adaptively aggregate contextual information from the entire feature map, thereby enhancing the network&#x2019;s capacity to capture long-range dependencies. Here, <inline-formula>
<mml:math display="inline" id="im28">
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes the affinity between position <inline-formula>
<mml:math display="inline" id="im29">
<mml:mi>i</mml:mi>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im30">
<mml:mi>j</mml:mi>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im31">
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> extracts content features from position <inline-formula>
<mml:math display="inline" id="im32">
<mml:mi>j</mml:mi>
</mml:math>
</inline-formula>, and <inline-formula>
<mml:math display="inline" id="im33">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> serves as a normalization factor to ensure stability of the attention distribution.</p>
<p>To further strengthen representational capacity, we integrate spatial and channel attention mechanisms. The spatial attention emphasizes &#x201c;where&#x201d; to focus, enhancing the model&#x2019;s sensitivity to informative regions even under partial occlusion. Meanwhile, the channel attention focuses on &#x201c;what&#x201d; to emphasize, selectively enhancing discriminative feature channels. The synergy of both attention types enables the model to infer occluded or ambiguous targets from contextual cues, significantly improving robustness and recognition accuracy in complex agricultural environments.</p>
</sec>
<sec id="s2_3_3">
<label>2.3.3</label>
<title>EWDIoU loss functions</title>
<p>In target detection, IoU is often used to calculate the overlap ratio between the predicted frames and real frames. One issue with this method is that there is a large difference in the sensitivity of IoU when applied to targets of different sizes. For example, for a small target object of <inline-formula>
<mml:math display="inline" id="im34">
<mml:mrow>
<mml:mn>4</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> pixels, a small positional deviation leads to a significant decrease in IoU, whereas for a larger target object of <inline-formula>
<mml:math display="inline" id="im35">
<mml:mrow>
<mml:mn>45</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>45</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> pixels, the change in IoU is smaller for the same positional deviation, as shown in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>. This situation leads to insufficient learning of small-target features by the model or to stagnation of the training process, which prevents the model from being fully optimized. The differing sensitivity of IoU for objects of different sizes mainly stems from the fact that the position of the enclosing box can only change in discrete (pixel-level) steps. To mitigate the significant degradation of IoU in small-target tomato detection, we propose the EWDIoU loss function.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Variation of IoU for targets of different pixel sizes. In the left panel, <bold>(A)</bold> is the real frame of a 4&#xd7;4 pixel target, <bold>(B)</bold> is the predicted frame with a 1-pixel deviation, and <bold>(C)</bold> is the predicted frame with a 3-pixel deviation; in the right panel, <bold>(A)</bold> is the real frame of a 45&#xd7;45 pixel target, <bold>(B)</bold> is the predicted frame with a 1-pixel deviation, and <bold>(C)</bold> is the predicted frame with a 3-pixel deviation.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1618214-g008.tif">
<alt-text content-type="machine-generated">Two images depict tomatoes on a plant with overlapping bounding boxes labeled A, B, and C to demonstrate Intersection over Union (IoU) calculations. The left image shows a low IoU of 0.39 between A and B, and 0.03 between A and C. The right image displays a higher IoU of 0.90 between A and B, and 0.65 between A and C.</alt-text>
</graphic>
</fig>
<p>The original YOLOv8 uses CIoU for loss calculation, which can only reflect the difference in the aspect ratio of the enclosing frames, not the differences in width and height individually. This may hinder the model from optimizing effectively (<xref ref-type="bibr" rid="B36">Zhao et&#xa0;al., 2022</xref>). To address this limitation of CIoU, the EIoU (<xref ref-type="disp-formula" rid="eq11">Equation 11</xref>) adds penalty terms that split the aspect-ratio factor, computing the width and height differences between the target and predicted frames separately (<xref ref-type="bibr" rid="B37">Zhao et&#xa0;al., 2023</xref>).</p>
<disp-formula id="eq11">
<label>(11)</label>
<mml:math display="block" id="M11">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>+</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mi>&#x3b1;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>w</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>h</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:mo>+</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mi>&#x3b1;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>w</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:mo>+</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mi>&#x3b1;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>h</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Here <inline-formula>
<mml:math display="inline" id="im36">
<mml:mi>b</mml:mi>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im37">
<mml:mrow>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denote the centroids of the prediction frame and the real frame, respectively, <inline-formula>
<mml:math display="inline" id="im38">
<mml:mrow>
<mml:msup>
<mml:mi>&#x3b1;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x2016;</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo>&#x2016;</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the Euclidean distance between the two, and <inline-formula>
<mml:math display="inline" id="im39">
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>w</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im40">
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>h</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denote the width and height of the smallest outer bounding box covering the two enclosing frames, respectively. However, the discrete nature of the change in the position of the enclosing box limits the accuracy. Therefore, we adopt a new metric based on the Wasserstein distance (<xref ref-type="bibr" rid="B32">Wang et&#xa0;al., 2021</xref>) to measure the similarity of the enclosing boxes, which replaces part of the EIoU. Two hyperparameters are used to balance the contributions of the two distance terms, as given in <xref ref-type="disp-formula" rid="eq12">Equation 12</xref>. Here, <inline-formula>
<mml:math display="inline" id="im41">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. In this way, the loss mitigates the high sensitivity of the standard IoU for small target tomatoes.</p>
<disp-formula id="eq12">
<label>(12)</label>
<mml:math display="block" id="M12">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mi>W</mml:mi>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>e</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
<mml:mi>M</mml:mi>
</mml:mfrac>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Firstly, we observe that in real planting scenarios, the small tomatoes in our dataset are annotated with rectangular boxes, in which the body of the tomato is concentrated in the center of the bounding box while the background is distributed toward its edges. The weight of a pixel therefore decreases from the center to the edge of the bounding box. Accordingly, we can abstract the horizontal bounding box and use its inscribed ellipse to represent the distribution of pixel weights within the box. Let the horizontal bounding box be <inline-formula>
<mml:math display="inline" id="im42">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>w</mml:mi>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula>
<mml:math display="inline" id="im43">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im44">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represent the horizontal and vertical coordinates of the center of the bounding box, and <italic>w</italic> and <italic>h</italic> represent the width and height of the bounding box, respectively. In this case, <inline-formula>
<mml:math display="inline" id="im45">
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represents the center coordinates of the ellipse, and <inline-formula>
<mml:math display="inline" id="im46">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c1;</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im47">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c1;</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the lengths of the semi-axes of the ellipse along the <inline-formula>
<mml:math display="inline" id="im48">
<mml:mi>x</mml:mi>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im49">
<mml:mi>y</mml:mi>
</mml:math>
</inline-formula> axes, respectively. Correspondingly, <inline-formula>
<mml:math display="inline" id="im50">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im51">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im52">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c1;</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mi>w</mml:mi>
<mml:mn>2</mml:mn>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im53">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c1;</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mi>h</mml:mi>
<mml:mn>2</mml:mn>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>, the corresponding ellipse equation is:</p>
<disp-formula id="eq13">
<label>(13)</label>
<mml:math display="block" id="M13">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x3c1;</mml:mi>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
<mml:mo>+</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x3c1;</mml:mi>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<p>The probability density function for a p-dimensional random vector <inline-formula>
<mml:math display="inline" id="im54">
<mml:mrow>
<mml:mi>&#x3c7;</mml:mi>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>p</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> can be written as <xref ref-type="disp-formula" rid="eq14">Equation 14</xref>:</p>
<disp-formula id="eq14">
<label>(14)</label>
<mml:math display="block" id="M14">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c7;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">|</mml:mo>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3a3;</mml:mi>
</mml:mrow>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c7;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>&#x3c4;</mml:mi>
</mml:msup>
<mml:msup>
<mml:mi>&#x3a3;</mml:mi>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c7;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>p</mml:mi>
</mml:msup>
</mml:mrow>
</mml:msqrt>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo>|</mml:mo>
<mml:mi>&#x3a3;</mml:mi>
<mml:mo>|</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:mfrac>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<p>The distribution defined by this function is the p-dimensional normal distribution, denoted as <inline-formula>
<mml:math display="inline" id="im55">
<mml:mrow>
<mml:mi>&#x3c7;</mml:mi>
<mml:mo>~</mml:mo>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>&#x3bc;</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x3a3;</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula>
<mml:math display="inline" id="im56">
<mml:mrow>
<mml:msup>
<mml:mi>&#x3a3;</mml:mi>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the inverse matrix of <inline-formula>
<mml:math display="inline" id="im57">
<mml:mtext>&#x3a3;</mml:mtext>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im58">
<mml:mrow>
<mml:mrow>
<mml:mo>|</mml:mo>
<mml:mi>&#x3a3;</mml:mi>
<mml:mo>|</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes the determinant of <inline-formula>
<mml:math display="inline" id="im59">
<mml:mtext>&#x3a3;</mml:mtext>
</mml:math>
</inline-formula>, and <inline-formula>
<mml:math display="inline" id="im60">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c7;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>&#x3c4;</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the transpose of the vector <inline-formula>
<mml:math display="inline" id="im61">
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c7;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Based on the Mahalanobis distance we get that when <inline-formula>
<mml:math display="inline" id="im62">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c7;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>&#x3c4;</mml:mi>
</mml:msup>
<mml:msup>
<mml:mi>&#x3a3;</mml:mi>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c7;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, the ellipse in <xref ref-type="disp-formula" rid="eq13">Equation 13</xref> is a density contour of the two-dimensional Gaussian distribution. At this point, the horizontal bounding box <inline-formula>
<mml:math display="inline" id="im63">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>w</mml:mi>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> can be modeled as a two-dimensional Gaussian distribution <inline-formula>
<mml:math display="inline" id="im64">
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>&#x3bc;</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x3a3;</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> with <inline-formula>
<mml:math display="inline" id="im65">
<mml:mrow>
<mml:mtext>&#x3bc;</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im66">
<mml:mrow>
<mml:mtext>&#x3a3;</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mi>w</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
<mml:mn>4</mml:mn>
</mml:mfrac>
</mml:mrow>
</mml:mtd>
<mml:mtd>
<mml:mn>0</mml:mn>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mn>0</mml:mn>
</mml:mtd>
<mml:mtd>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mi>h</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
<mml:mn>4</mml:mn>
</mml:mfrac>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, and the similarity between the bounding boxes A <inline-formula>
<mml:math display="inline" id="im67">
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and B <inline-formula>
<mml:math display="inline" id="im68">
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> can be converted into the distribution distance between two Gaussian distributions. For the two-dimensional Gaussian distributions <inline-formula>
<mml:math display="inline" id="im69">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mtext>m</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3a3;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im70">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mtext>m</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3a3;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> corresponding to A and B, the second-order Wasserstein distance between the two is defined in <xref ref-type="disp-formula" rid="eq15">Equations 15</xref>&#x2013;<xref ref-type="disp-formula" rid="eq18">18</xref>:</p>
<disp-formula id="eq15">
<label>(15)</label>
<mml:math display="block" id="M15">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x2016;</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo>&#x2016;</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>+</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3a3;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>&#x3a3;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>2</mml:mn>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x3a3;</mml:mi>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:mfrac>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mi>&#x3a3;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:msubsup>
<mml:mi>&#x3a3;</mml:mi>
<mml:mn>2</mml:mn>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:mfrac>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:mfrac>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq16">
<label>(16)</label>
<mml:math display="block" id="M16">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x2016;</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo>&#x2016;</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>+</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
<mml:mn>4</mml:mn>
</mml:mfrac>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq17">
<label>(17)</label>
<mml:math display="block" id="M17">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x2016;</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo>&#x2016;</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>+</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x2016;</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x3a3;</mml:mi>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:mfrac>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>&#x3a3;</mml:mi>
<mml:mn>2</mml:mn>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:mfrac>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo>&#x2016;</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>F</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq18">
<label>(18)</label>
<mml:math display="block" id="M18">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x2016;</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>c</mml:mi>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:mfrac>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>T</mml:mi>
</mml:mstyle>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>c</mml:mi>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:mfrac>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>T</mml:mi>
</mml:mstyle>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>&#x2016;</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Where <inline-formula>
<mml:math display="inline" id="im71">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x2016;</mml:mo>
<mml:mo>&#xb7;</mml:mo>
<mml:mo>&#x2016;</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>F</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the Frobenius norm of the matrix. Finally, <inline-formula>
<mml:math display="inline" id="im72">
<mml:mrow>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is normalized to obtain the final metric, which is collated to obtain the final EWDIoU formula as <xref ref-type="disp-formula" rid="eq12">Equation 12</xref>. Here, M is a constant related to the dataset; in our experiments, we compared the effect of different values of M on the results and achieved the best results with <inline-formula>
<mml:math display="inline" id="im73">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1.0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s2_3_4">
<label>2.3.4</label>
<title>Evaluation metrics</title>
<p>This section outlines the evaluation metrics employed to comprehensively assess the performance of the small tomato detection model. The primary metrics include precision (P), recall (R), mean average precision (mAP), number of floating-point operations (FLOPs), number of network parameters, and inference speed.</p>
<disp-formula id="eq19">
<label>(19)</label>
<mml:math display="block" id="M19">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq20">
<label>(20)</label>
<mml:math display="block" id="M20">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq21">
<label>(21)</label>
<mml:math display="block" id="M21">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:mstyle displaystyle="true">
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x222b;</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>R</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq22">
<label>(22)</label>
<mml:math display="block" id="M22">
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>Q</mml:mi>
</mml:msubsup>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>q</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mfrac>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<p>In <xref ref-type="disp-formula" rid="eq19">Equations 19</xref> and <xref ref-type="disp-formula" rid="eq20">20</xref>, the calculation of precision (P) and recall (R) relies on three key metrics: true positives (TP), false positives (FP), and false negatives (FN). When the model successfully identifies a small tomato target, it is recorded as TP, whereas FP and FN represent, respectively, the number of false detections of nonexistent targets and the number of missed detections of actual targets by the model. Precision (P) measures the model&#x2019;s capability to correctly identify small tomato targets among all predicted targets, while recall (R) assesses the proportion of actual targets successfully detected by the model. For the detection performance of small tomatoes in each category, a precision-recall (P-R) curve can be plotted, with the average precision (AP) defined as the area under the curve. The closer the AP value is to 1, the better the model&#x2019;s detection performance for that specific category. The mean average precision (mAP), calculated as the arithmetic mean of the AP values across all categories, is a widely adopted performance evaluation metric in target detection tasks. It provides a visual and comprehensive representation of the model&#x2019;s overall performance, where Q in <xref ref-type="disp-formula" rid="eq22">Equation 22</xref> represents the total number of target categories. Moreover, model complexity is typically quantified by the number of floating-point operations (FLOPs), which represents the computational resources required by the model and serves as a crucial metric for assessing algorithmic efficiency. The speed of target detection is measured in frames per second (FPS), with a higher FPS value indicating superior real-time processing capability. A comprehensive evaluation of these metrics offers a thorough and rigorous assessment of the model&#x2019;s performance, enabling multidimensional comparison and optimization.</p>
</sec>
<sec id="s2_3_5">
<label>2.3.5</label>
<title>Environmental settings</title>
<p>The proposed model was trained and tested using the small tomato dataset in field scenarios with a total of 7332 images. The training environment comprised an Intel(R) Xeon(R) Gold 6248R@3.00GHz processor and an NVIDIA GeForce RTX4090 graphics card. The deep learning framework was PyTorch 2.4.1 with Python 3.8.19 and CUDA 11.7, running on Windows 11. All experiments were trained for 300 epochs with the following hyperparameters: an Adaptive Moment Estimation (Adam) optimizer with a batch size of 4, an initial learning rate of 0.001, a momentum factor of 0.937, and a weight decay of 0.0005. To ensure the fairness and comparability of the results, we used the same parameter settings for both the comparison and ablation experiments wherever possible; the most important hyperparameter settings are shown in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Training parameters settings.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Parameter</th>
<th valign="top" align="left">Value</th>
<th valign="top" align="left">Parameter</th>
<th valign="top" align="left">Value</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Epoch</td>
<td valign="top" align="left">300</td>
<td valign="top" align="left">Initial Learning Rate</td>
<td valign="top" align="center">1 &#xd7; 10<sup>&#x2212;3</sup>
</td>
</tr>
<tr>
<td valign="top" align="left">Batch size</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">Weight-Decay</td>
<td valign="top" align="center">5 &#xd7; 10<sup>&#x2212;4</sup>
</td>
</tr>
<tr>
<td valign="top" align="left">Optimizer</td>
<td valign="top" align="left">Adam</td>
<td valign="top" align="left">Momentum</td>
<td valign="top" align="center">0.937</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
</sec>
<sec id="s3" sec-type="results">
<label>3</label>
<title>Results</title>
<sec id="s3_1">
<label>3.1</label>
<title>Improving test results via data enhancement</title>
<p>To expand the training samples and enhance the model&#x2019;s generalization ability, robustness, and adaptability in real-world applications, we apply data augmentation techniques such as exposure adjustment, rotation, blurring, random brightness variation, and mirroring to simulate diverse scene variations. The experimental results are presented in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Comparison of results before and after data augmentation.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Model</th>
<th valign="top" align="center">Class</th>
<th valign="top" align="center">P(%)</th>
<th valign="top" align="center">R(%)</th>
<th valign="top" align="center">mAP50(%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="4" align="center">YOLOv8n</td>
<td valign="top" align="center">all</td>
<td valign="top" align="center">87.0</td>
<td valign="top" align="center">75.6</td>
<td valign="top" align="center">82.4</td>
</tr>
<tr>
<td valign="top" align="center">gtomato</td>
<td valign="top" align="center">85.4</td>
<td valign="top" align="center">68.7</td>
<td valign="top" align="center">78.1</td>
</tr>
<tr>
<td valign="top" align="center">rtomato</td>
<td valign="top" align="center">85.4</td>
<td valign="top" align="center">78.1</td>
<td valign="top" align="center">82.2</td>
</tr>
<tr>
<td valign="top" align="center">ytomato</td>
<td valign="top" align="center">90.2</td>
<td valign="top" align="center">80.8</td>
<td valign="top" align="center">87.1</td>
</tr>
<tr>
<td valign="middle" rowspan="4" align="center">YOLOv8n(without enhancement)</td>
<td valign="top" align="center">all</td>
<td valign="top" align="center">79.9</td>
<td valign="top" align="center">76.2</td>
<td valign="top" align="center">81.6</td>
</tr>
<tr>
<td valign="top" align="center">gtomato</td>
<td valign="top" align="center">83.4</td>
<td valign="top" align="center">74.7</td>
<td valign="top" align="center">82.0</td>
</tr>
<tr>
<td valign="top" align="center">rtomato</td>
<td valign="top" align="center">78.5</td>
<td valign="top" align="center">84.9</td>
<td valign="top" align="center">88.0</td>
</tr>
<tr>
<td valign="top" align="center">ytomato</td>
<td valign="top" align="center">77.7</td>
<td valign="top" align="center">69.0</td>
<td valign="top" align="center">74.0</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>After data augmentation, the overall mAP@50 showed a noticeable improvement, indicating enhanced detection performance of the model. In addition, the precision values for all categories and the overall precision increased, suggesting a reduction in false positives and more effective feature learning. While the recall of the ytomato class improved, the recall of other categories slightly declined. This is likely due to increased background complexity in the augmented images, which made the model more conservative in its predictions, leading to a higher miss rate for true targets. However, since the ytomato class had relatively few samples before augmentation, the augmented data effectively alleviated the issue of data scarcity and helped the model learn more stable features. Although the mAP of gtomato and rtomato slightly decreased&#x2014;possibly due to distributional shifts or reduced feature stability caused by augmentation&#x2014;the overall improvement in mAP indicates that the model became more balanced and achieved better generalization.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Ablation study</title>
<p>In this section, the fully enhanced model is compared with simplified variants incorporating individual improvements to independently evaluate the effectiveness of each component. All enhancements are built upon the baseline YOLOv8n model, targeting the following aspects: modifications to the YOLOv8n backbone; introduction of a novel feature pyramid pooling structure, DASPPF; incorporation of a lightweight detection head; integration of the CSAM attention mechanism to improve multi-scale feature fusion; and replacement of the traditional loss function with the proposed EWDIoU loss, which leverages a two-dimensional Gaussian distribution to enhance bounding box regression. In <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>, each improvement was incrementally incorporated into the baseline model, and the corresponding performance metrics were evaluated. Specifically, &#x201c;A&#x201d; denotes the backbone enhancement, &#x201c;B&#x201d; refers to the proposed DASPPF module, &#x201c;C&#x201d; indicates the addition of a small object detection head, and &#x201c;D&#x201d; represents the proposed CSAM module. The final model, Ta-YOLO, integrates all these enhancements.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Results of ablation experiments.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Model</th>
<th valign="top" align="center">Class</th>
<th valign="top" align="left">P(%)</th>
<th valign="top" align="left">R(%)</th>
<th valign="top" align="left">mAP50(%)</th>
<th valign="top" align="left">mAP50-95(%)</th>
<th valign="top" align="left">Params(M)</th>
<th valign="top" align="left">FLOPs(G)</th>
<th valign="top" align="left">FPS (frames/s)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="4" align="left">YOLOv8n</td>
<td valign="top" align="center">all</td>
<td valign="top" align="center">87.0</td>
<td valign="top" align="center">75.6</td>
<td valign="top" align="center">82.4</td>
<td valign="top" align="center">45.6</td>
<td valign="middle" rowspan="4" align="center">11.48</td>
<td valign="middle" rowspan="4" align="center">8.1</td>
<td valign="middle" rowspan="4" align="center">120.11</td>
</tr>
<tr>
<td valign="top" align="center">gtomato</td>
<td valign="top" align="center">85.4</td>
<td valign="top" align="center">68.7</td>
<td valign="top" align="center">78.1</td>
<td valign="top" align="center">41.1</td>
</tr>
<tr>
<td valign="top" align="center">rtomato</td>
<td valign="top" align="center">85.4</td>
<td valign="top" align="center">78.1</td>
<td valign="top" align="center">82.2</td>
<td valign="top" align="center">48.5</td>
</tr>
<tr>
<td valign="top" align="center">ytomato</td>
<td valign="top" align="center">90.2</td>
<td valign="top" align="center">80.8</td>
<td valign="top" align="center">87.1</td>
<td valign="top" align="center">47.2</td>
</tr>
<tr>
<td valign="middle" rowspan="4" align="left">YOLOv8+A</td>
<td valign="top" align="center">all</td>
<td valign="top" align="center">87.7</td>
<td valign="top" align="center">73.1</td>
<td valign="top" align="center">81.2</td>
<td valign="top" align="center">43.9</td>
<td valign="middle" rowspan="4" align="center">8.5</td>
<td valign="middle" rowspan="4" align="center">10.2</td>
<td valign="middle" rowspan="4" align="center">200.0</td>
</tr>
<tr>
<td valign="top" align="center">gtomato</td>
<td valign="top" align="center">85.7</td>
<td valign="top" align="center">65.2</td>
<td valign="top" align="center">76.8</td>
<td valign="top" align="center">40.1</td>
</tr>
<tr>
<td valign="top" align="center">rtomato</td>
<td valign="top" align="center">88.7</td>
<td valign="top" align="center">77.4</td>
<td valign="top" align="center">85.7</td>
<td valign="top" align="center">49.8</td>
</tr>
<tr>
<td valign="top" align="center">ytomato</td>
<td valign="top" align="center">88.6</td>
<td valign="top" align="center">76.7</td>
<td valign="top" align="center">81.1</td>
<td valign="top" align="center">41.7</td>
</tr>
<tr>
<td valign="middle" rowspan="4" align="left">YOLOv8+A+B</td>
<td valign="top" align="center">all</td>
<td valign="top" align="center">85.1</td>
<td valign="top" align="center">75.1</td>
<td valign="top" align="center">82.4</td>
<td valign="top" align="center">44.5</td>
<td valign="middle" rowspan="4" align="center">9.96</td>
<td valign="middle" rowspan="4" align="center">10.2</td>
<td valign="middle" rowspan="4" align="center">168.2</td>
</tr>
<tr>
<td valign="top" align="center">gtomato</td>
<td valign="top" align="center">83.4</td>
<td valign="top" align="center">70.1</td>
<td valign="top" align="center">78.6</td>
<td valign="top" align="center">40.9</td>
</tr>
<tr>
<td valign="top" align="center">rtomato</td>
<td valign="top" align="center">86.5</td>
<td valign="top" align="center">82.4</td>
<td valign="top" align="center">88.0</td>
<td valign="top" align="center">50.9</td>
</tr>
<tr>
<td valign="top" align="center">ytomato</td>
<td valign="top" align="center">85.5</td>
<td valign="top" align="center">72.7</td>
<td valign="top" align="center">80.5</td>
<td valign="top" align="center">41.6</td>
</tr>
<tr>
<td valign="middle" rowspan="4" align="left">YOLOv8+A+B+C</td>
<td valign="top" align="center">all</td>
<td valign="top" align="center">85.9</td>
<td valign="top" align="center">75.4</td>
<td valign="top" align="center">83.3</td>
<td valign="top" align="center">45.5</td>
<td valign="middle" rowspan="4" align="center">9.62</td>
<td valign="middle" rowspan="4" align="center">14.2</td>
<td valign="middle" rowspan="4" align="center">156.3</td>
</tr>
<tr>
<td valign="top" align="center">gtomato</td>
<td valign="top" align="center">84.6</td>
<td valign="top" align="center">70.6</td>
<td valign="top" align="center">80.1</td>
<td valign="top" align="center">42.4</td>
</tr>
<tr>
<td valign="top" align="center">rtomato</td>
<td valign="top" align="center">85.4</td>
<td valign="top" align="center">78.9</td>
<td valign="top" align="center">86.4</td>
<td valign="top" align="center">50.7</td>
</tr>
<tr>
<td valign="top" align="center">ytomato</td>
<td valign="top" align="center">87.4</td>
<td valign="top" align="center">76.7</td>
<td valign="top" align="center">83.4</td>
<td valign="top" align="center">43.8</td>
</tr>
<tr>
<td valign="middle" rowspan="4" align="left">YOLOv8+A+B+C+D</td>
<td valign="top" align="center">all</td>
<td valign="top" align="center">85.9</td>
<td valign="top" align="center">76.0</td>
<td valign="top" align="center">84.0</td>
<td valign="top" align="center">47.0</td>
<td valign="middle" rowspan="4" align="center">10.57</td>
<td valign="middle" rowspan="4" align="center">14.3</td>
<td valign="middle" rowspan="4" align="center">153.9</td>
</tr>
<tr>
<td valign="top" align="center">gtomato</td>
<td valign="top" align="center">84.6</td>
<td valign="top" align="center">71.6</td>
<td valign="top" align="center">80.9</td>
<td valign="top" align="center">43.1</td>
</tr>
<tr>
<td valign="top" align="center">rtomato</td>
<td valign="top" align="center">88.4</td>
<td valign="top" align="center">79.5</td>
<td valign="top" align="center">87.9</td>
<td valign="top" align="center">52.4</td>
</tr>
<tr>
<td valign="top" align="center">ytomato</td>
<td valign="top" align="center">84.4</td>
<td valign="top" align="center">77.0</td>
<td valign="top" align="center">83.2</td>
<td valign="top" align="center">45.5</td>
</tr>
<tr>
<td valign="middle" rowspan="4" align="left">Ta-YOLO</td>
<td valign="top" align="center">all</td>
<td valign="top" align="center">86.7</td>
<td valign="top" align="center">76.9</td>
<td valign="top" align="center">84.4</td>
<td valign="top" align="center">45.9</td>
<td valign="middle" rowspan="4" align="center">10.58</td>
<td valign="middle" rowspan="4" align="center">14.3</td>
<td valign="middle" rowspan="4" align="center">131.58</td>
</tr>
<tr>
<td valign="top" align="center">gtomato</td>
<td valign="top" align="center">86.0</td>
<td valign="top" align="center">70.8</td>
<td valign="top" align="center">81.0</td>
<td valign="top" align="center">43.2</td>
</tr>
<tr>
<td valign="top" align="center">rtomato</td>
<td valign="top" align="center">87.5</td>
<td valign="top" align="center">79.0</td>
<td valign="top" align="center">87.2</td>
<td valign="top" align="center">51.5</td>
</tr>
<tr>
<td valign="top" align="center">ytomato</td>
<td valign="top" align="center">86.7</td>
<td valign="top" align="center">75.5</td>
<td valign="top" align="center">84.9</td>
<td valign="top" align="center">43.1</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The results of the comprehensive ablation study are summarized in the table, highlighting the following key findings: (1) Lightweight modifications to the backbone successfully reduced the parameter count and increased inference speed (FPS), albeit at the cost of reduced accuracy. (2) The proposed DASPPF feature pyramid pooling structure significantly enhanced the extraction of salient features, with recall rates for green and red tomatoes reaching 70.1% and 82.4%, respectively&#x2014;improvements of 1.4% and 4.3% over the baseline. Furthermore, the mAP increased by 0.5% and 5.8% compared to the baseline. These results indicate that preserving global contextual information in complex real-world scenes facilitates more accurate target recognition. (3) The addition of the tiny detection head increased the mAP to 83.3%, while simultaneously reducing the parameter count and improving FPS relative to the baseline. However, this enhancement resulted in an increased computational load. These findings indicate an improved multi-scale detection capability, rendering the model more effective for small tomato detection. (4) The CSAM attention mechanism further enhanced recognition accuracy by efficiently integrating multidimensional feature information, particularly benefiting the detection of multiple small tomatoes at image edges or under occlusion. Moreover, the proposed EWDIoU loss function effectively addressed challenges associated with small target detection, yielding superior performance across small tomato categories. Across all evaluated samples, the mAP for heavily shaded green and red tomatoes improved from 78.1% and 82.2% to 81.0% and 87.2%, respectively, demonstrating the targeted effectiveness of our approach in mitigating shading-related challenges. Furthermore, a comparative analysis between the original baseline and Ta-YOLO under complex real-world conditions, including occlusion, is presented in <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>. The results confirm that Ta-YOLO achieves superior detection performance in these challenging scenarios.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Example of detection results under different occlusion scenarios. <bold>(A)</bold> Original images, <bold>(B)</bold> benchmark model, <bold>(C)</bold> Ta-YOLO.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1618214-g009.tif">
<alt-text content-type="machine-generated">Nine-panel image comparing tomato plant detection methods. Each column represents a different method labeled A, B, and C. Images show tomato plants with red boxes highlighting detected areas. Each row displays varying angles and stages of tomato growth, illustrating the effectiveness of each method.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Comparative experiments</title>
<p>The two-stage detection algorithm initially extracts candidate regions from the input image, followed by classification of each candidate. To evaluate the effectiveness of the proposed method, it was compared against the classical two-stage detector Faster R-CNN. Additionally, the enhanced model was benchmarked against several widely used one-stage detection algorithms, including YOLOv5, YOLOv7, YOLOv9, YOLOv11, and HyperYOLO. To ensure fairness and emphasize the effectiveness of the proposed method, comparisons were performed using lightweight variants of the evaluated algorithms. The experimental results, presented in <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref>, indicate that Ta-YOLO achieved recall and mAP values of 76.9% and 84.4%, respectively, outperforming most competing models. Notably, YOLOv9s attained the highest mAP of 85.1% in this comparison. However, YOLOv9s exhibited a parameter count of 37.88 million and a computational complexity of 40.6 GFLOPs, exceeding those of Ta-YOLO by 27.3 million parameters and 26.3 GFLOPs, respectively. Additionally, YOLOv9s demonstrated lower FPS performance compared to Ta-YOLO, underscoring the trade-off between accuracy and computational efficiency. Although YOLOv7&#x2019;s mAP was only 0.4% lower than that of Ta-YOLO, its parameter count was approximately thirteen times greater. Compared to YOLOv11 and HyperYOLO, Ta-YOLO achieves higher FPS with a similar parameter count, while demonstrating superior accuracy.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Comparison results with different target detectors.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Model</th>
<th valign="top" align="center">Class</th>
<th valign="top" align="center">P(%)</th>
<th valign="top" align="center">R(%)</th>
<th valign="top" align="left">mAP50(%)</th>
<th valign="top" align="left">Params(M)</th>
<th valign="top" align="left">FLOPs(G)</th>
<th valign="top" align="left">FPS(frames/s)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="4" align="left">Faster R-CNN</td>
<td valign="top" align="center">all</td>
<td valign="top" align="center">38.0</td>
<td valign="top" align="center">50.4</td>
<td valign="top" align="center">46.89</td>
<td valign="middle" rowspan="4" align="center">136.73</td>
<td valign="middle" rowspan="4" align="center">369.7</td>
<td valign="middle" rowspan="4" align="center">18.5</td>
</tr>
<tr>
<td valign="top" align="center">gtomato</td>
<td valign="top" align="center">35.63</td>
<td valign="top" align="center">61.71</td>
<td valign="top" align="center">45.96</td>
</tr>
<tr>
<td valign="top" align="center">rtomato</td>
<td valign="top" align="center">41.91</td>
<td valign="top" align="center">67.21</td>
<td valign="top" align="center">56.23</td>
</tr>
<tr>
<td valign="top" align="center">ytomato</td>
<td valign="top" align="center">36.73</td>
<td valign="top" align="center">54.27</td>
<td valign="top" align="center">38.47</td>
</tr>
<tr>
<td valign="middle" rowspan="4" align="left">YOLOv5</td>
<td valign="top" align="center">all</td>
<td valign="top" align="center">91.0</td>
<td valign="top" align="center">56.6</td>
<td valign="top" align="center">66.29</td>
<td valign="middle" rowspan="4" align="center">26.81</td>
<td valign="middle" rowspan="4" align="center">16.0</td>
<td valign="middle" rowspan="4" align="center">34.3</td>
</tr>
<tr>
<td valign="top" align="center">gtomato</td>
<td valign="top" align="center">91.07</td>
<td valign="top" align="center">55.60</td>
<td valign="top" align="center">67.0</td>
</tr>
<tr>
<td valign="top" align="center">rtomato</td>
<td valign="top" align="center">85.96</td>
<td valign="top" align="center">63.30</td>
<td valign="top" align="center">68.9</td>
</tr>
<tr>
<td valign="top" align="center">ytomato</td>
<td valign="top" align="center">98.18</td>
<td valign="top" align="center">50.97</td>
<td valign="top" align="center">62.96</td>
</tr>
<tr>
<td valign="middle" rowspan="4" align="left">YOLOv7</td>
<td valign="top" align="center">all</td>
<td valign="top" align="center">88.0</td>
<td valign="top" align="center">77.6</td>
<td valign="top" align="center">84.0</td>
<td valign="middle" rowspan="4" align="center">141.93</td>
<td valign="middle" rowspan="4" align="center">105.1</td>
<td valign="middle" rowspan="4" align="center">111.1</td>
</tr>
<tr>
<td valign="top" align="center">gtomato</td>
<td valign="top" align="center">86.0</td>
<td valign="top" align="center">75.3</td>
<td valign="top" align="center">82.2</td>
</tr>
<tr>
<td valign="top" align="center">rtomato</td>
<td valign="top" align="center">87.4</td>
<td valign="top" align="center">77.4</td>
<td valign="top" align="center">81.7</td>
</tr>
<tr>
<td valign="top" align="center">ytomato</td>
<td valign="top" align="center">93.6</td>
<td valign="top" align="center">79.7</td>
<td valign="top" align="center">88.1</td>
</tr>
<tr>
<td valign="middle" rowspan="4" align="left">YOLOv8n</td>
<td valign="top" align="center">all</td>
<td valign="top" align="center">87.0</td>
<td valign="top" align="center">75.6</td>
<td valign="top" align="center">82.4</td>
<td valign="middle" rowspan="4" align="center">11.48</td>
<td valign="middle" rowspan="4" align="center">8.1</td>
<td valign="middle" rowspan="4" align="center">120.11</td>
</tr>
<tr>
<td valign="top" align="center">gtomato</td>
<td valign="top" align="center">85.4</td>
<td valign="top" align="center">68.7</td>
<td valign="top" align="center">78.1</td>
</tr>
<tr>
<td valign="top" align="center">rtomato</td>
<td valign="top" align="center">85.4</td>
<td valign="top" align="center">78.1</td>
<td valign="top" align="center">82.2</td>
</tr>
<tr>
<td valign="top" align="center">ytomato</td>
<td valign="top" align="center">90.2</td>
<td valign="top" align="center">80.8</td>
<td valign="top" align="center">87.1</td>
</tr>
<tr>
<td valign="middle" rowspan="4" align="left">YOLOv9s</td>
<td valign="top" align="center">all</td>
<td valign="top" align="center">89.5</td>
<td valign="top" align="center">79.2</td>
<td valign="top" align="center">85.1</td>
<td valign="middle" rowspan="4" align="center">37.88</td>
<td valign="middle" rowspan="4" align="center">40.6</td>
<td valign="middle" rowspan="4" align="center">110.7</td>
</tr>
<tr>
<td valign="top" align="center">gtomato</td>
<td valign="top" align="center">85.3</td>
<td valign="top" align="center">75.9</td>
<td valign="top" align="center">82.7</td>
</tr>
<tr>
<td valign="top" align="center">rtomato</td>
<td valign="top" align="center">87.8</td>
<td valign="top" align="center">80.7</td>
<td valign="top" align="center">85.0</td>
</tr>
<tr>
<td valign="top" align="center">ytomato</td>
<td valign="top" align="center">95.3</td>
<td valign="top" align="center">81.0</td>
<td valign="top" align="center">90.2</td>
</tr>
<tr>
<td valign="middle" rowspan="4" align="left">YOLOv11n</td>
<td valign="top" align="center">all</td>
<td valign="top" align="center">86.4</td>
<td valign="top" align="center">73.7</td>
<td valign="top" align="center">81.4</td>
<td valign="middle" rowspan="4" align="center">9.85</td>
<td valign="middle" rowspan="4" align="center">6.3</td>
<td valign="middle" rowspan="4" align="center">303.3</td>
</tr>
<tr>
<td valign="top" align="center">gtomato</td>
<td valign="top" align="center">83.6</td>
<td valign="top" align="center">69.8</td>
<td valign="top" align="center">78.0</td>
</tr>
<tr>
<td valign="top" align="center">rtomato</td>
<td valign="top" align="center">84.4</td>
<td valign="top" align="center">76.4</td>
<td valign="top" align="center">81.1</td>
</tr>
<tr>
<td valign="top" align="center">ytomato</td>
<td valign="top" align="center">91.3</td>
<td valign="top" align="center">74.9</td>
<td valign="top" align="center">85.0</td>
</tr>
<tr>
<td valign="middle" rowspan="4" align="left">HyperYOLO</td>
<td valign="top" align="center">all</td>
<td valign="top" align="center">86.2</td>
<td valign="top" align="center">75.2</td>
<td valign="top" align="center">82.2</td>
<td valign="middle" rowspan="4" align="center">10.38</td>
<td valign="middle" rowspan="4" align="center">7.6</td>
<td valign="middle" rowspan="4" align="center">204.8</td>
</tr>
<tr>
<td valign="top" align="center">gtomato</td>
<td valign="top" align="center">83.2</td>
<td valign="top" align="center">70.4</td>
<td valign="top" align="center">78.1</td>
</tr>
<tr>
<td valign="top" align="center">rtomato</td>
<td valign="top" align="center">83.6</td>
<td valign="top" align="center">78.4</td>
<td valign="top" align="center">81.6</td>
</tr>
<tr>
<td valign="top" align="center">ytomato</td>
<td valign="top" align="center">91.1</td>
<td valign="top" align="center">77.0</td>
<td valign="top" align="center">86.9</td>
</tr>
<tr>
<td valign="middle" rowspan="4" align="left">Ta-YOLO</td>
<td valign="top" align="center">all</td>
<td valign="top" align="center">86.7</td>
<td valign="top" align="center">76.9</td>
<td valign="top" align="center">84.4</td>
<td valign="middle" rowspan="4" align="center">10.58</td>
<td valign="middle" rowspan="4" align="center">14.3</td>
<td valign="middle" rowspan="4" align="center">131.58</td>
</tr>
<tr>
<td valign="top" align="center">gtomato</td>
<td valign="top" align="center">86.0</td>
<td valign="top" align="center">70.8</td>
<td valign="top" align="center">81.0</td>
</tr>
<tr>
<td valign="top" align="center">rtomato</td>
<td valign="top" align="center">87.5</td>
<td valign="top" align="center">79.0</td>
<td valign="top" align="center">87.2</td>
</tr>
<tr>
<td valign="top" align="center">ytomato</td>
<td valign="top" align="center">86.7</td>
<td valign="top" align="center">75.5</td>
<td valign="top" align="center">84.9</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Nevertheless, examination of the table reveals that, despite Ta-YOLO&#x2019;s superior overall performance compared to other detectors, its recall for green tomatoes is below the average recall, indicating the presence of false negatives in green tomato detection. Moreover, this issue is not unique to Ta-YOLO but is prevalent across most detection models. An analysis of the dataset revealed that extensive leaf shading on green tomatoes contributes to erroneous detections. Notably, the dataset was annotated with stringent criteria, including labeling tomatoes even when heavily occluded by foliage, which may further contribute to the detection challenges observed. It is worth noting that YOLOv9 achieves a relatively higher recall for green tomatoes. Our analysis attributes this to YOLOv9&#x2019;s heavier parameterization, which facilitates more precise alignment of feature map edges. Consequently, future work will focus on enhancing edge and texture perception by improving the extraction and representation of edge features.</p>
<p>In this section, four representative challenging cases from the dataset were selected, as illustrated in <xref ref-type="fig" rid="f10">
<bold>Figures&#xa0;10A, D, G, J</bold>
</xref>. In these figures, yellow circles denote missed detections, blue circles indicate false positives, and orange squares mark regions with increased identification difficulty. In <xref ref-type="fig" rid="f10">
<bold>Figures&#xa0;10B, E</bold>
</xref>, the leaves highlighted by blue circles were erroneously classified as green and red tomatoes, respectively. In comparison, the proposed algorithm correctly avoids these misclassifications, as demonstrated by the absence of false positives within the blue dashed circles in <xref ref-type="fig" rid="f10">
<bold>Figures&#xa0;10C, F</bold>
</xref>. In <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10H</bold>
</xref>, occlusion caused by tomato branches adversely affects detection, resulting in the tomato marked by the blue circle being erroneously identified as multiple instances. In contrast, the corresponding region within the blue dashed circle in <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10I</bold>
</xref> is correctly detected by the proposed method. Likewise, in <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10J</bold>
</xref>, extensive occlusion from the tomato petiole leads to a missed detection of the small tomato indicated by the yellow circle in <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10K</bold>
</xref>, whereas <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10L</bold>
</xref> shows successful recognition. The same four challenging cases presented in <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10</bold>
</xref> are used to visualize and compare the detection results between YOLOv8 and Ta-YOLO in <xref ref-type="fig" rid="f11">
<bold>Figures&#xa0;11A, D, G, J</bold>
</xref>. In <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11B</bold>
</xref>, the very small tomatoes indicated by yellow circles were completely missed, whereas those within the yellow dashed circles in <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11C</bold>
</xref> were accurately detected, including occluded instances. In <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11E</bold>
</xref>, the tomato enclosed by the yellow circle was heavily obscured and not correctly identified; however, the improved algorithm presented in this study successfully detected it in <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11F</bold>
</xref>. Similarly, <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11H</bold>
</xref> exhibits the same issue observed in <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10H</bold>
</xref>, where the tomato marked by the blue circle was mistakenly identified as multiple instances, whereas the corresponding region in <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11I</bold>
</xref> within the blue dashed circle was correctly recognized. In <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11J</bold>
</xref>, occlusion caused by the tomato petiole led to a missed detection of the small tomato marked by the yellow circle in <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11K</bold>
</xref>, while <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11L</bold>
</xref> demonstrates its accurate detection. Collectively, these results demonstrate that Ta-YOLO achieves higher accuracy and greater robustness in detecting shaded small tomatoes under real production conditions.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Visual comparison figure of YOLOv7 and Ta-YOLO detection results in four extreme environments. <bold>(A, D, G, J)</bold> showed four different detection situations. <bold>(B, E, H, K)</bold> displayed the YOLOv7 detection results. <bold>(C, F, I, L)</bold> presented the Ta-YOLO detection results.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1618214-g010.tif">
<alt-text content-type="machine-generated">Composite image showing tomato plants with annotations. Images A, D, G, and J highlight marking areas with orange rectangles. Images B, E, and H showcase incorrect detections with blue circles. Images C, F, I, and L display instances of leakage with red rectangles and yellow circles. Each smaller image is connected by arrows to the main images, emphasizing different aspects of detection and annotation on the plants.</alt-text>
</graphic>
</fig>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>Visual comparison figure of YOLOv8 and Ta-YOLO detection results in four extreme environments. <bold>(A, D, G, J)</bold> showed four different detection situations. <bold>(B, E, H, K)</bold> displayed the YOLOv8 detection results. <bold>(C, F, I, L)</bold> presented the Ta-YOLO detection results.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1618214-g011.tif">
<alt-text content-type="machine-generated">Image of multiple photographs showing tomato plants with annotation boxes. Annotations include orange squares for &#x201c;Marking,&#x201d; blue circles for &#x201c;Incorrect,&#x201d; and yellow circles for &#x201c;Leakage.&#x201d; Each main image (A, D, G, J) has arrows pointing to zoomed-in sections (B, C, E, F, H, I, K, L) with annotations highlighting specific areas on the plants.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Comparative experiments on different attention mechanisms</title>
<p>In deep learning, the attention mechanism, which simulates the selective focus of human cognition, has been extensively applied across domains including image processing and natural language processing. Among various attention mechanisms, the Global Attention Module (GAM) stands out as a global attentional approach that effectively preserves the majority of salient information, thereby enhancing feature interaction. The SE attention mechanism emphasizes effective feature extraction by employing a squeeze-and-excitation process that encourages the network to integrate spatial and channel information within the local receptive field. In contrast, the scSE attention mechanism simultaneously combines spatial and channel attention in parallel to enhance feature representation. Similarly, the CBAM attention mechanism facilitates feature interaction through sequential fusion of channel and spatial attention. Each of these attention mechanisms, having demonstrated strong performance across various tasks, was integrated into the Ta-YOLO model for comparative evaluation against CSAM. The results showed that CSAM achieved the highest accuracy, with mAP and recall values of 84.4% and 76.9%, respectively. Among the competing mechanisms, CBAM exhibited the lowest parameter count and computational cost, with 9.62 million parameters and 14.2 GFLOPs, although its mAP and recall were 1% and 2% lower than those of CSAM. GAM, despite having the largest parameter count at 15.88 million, attained an mAP of 83.4%, comparable to CBAM, thus neither surpassing CSAM&#x2019;s performance nor justifying the increased complexity. Additionally, CSAM maintains the same plug-and-play compatibility as these established attention modules. <xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref> presents the detection performance of Ta-YOLO on the real tomato dataset, while <xref ref-type="fig" rid="f12">
<bold>Figure&#xa0;12</bold>
</xref> illustrates heatmap visualizations corresponding to different attention mechanisms. The results clearly indicated that the proposed CSAM module outperforms others by effectively concentrating on heavily occluded and small-sized tomatoes. In summary, the integration of spatial and channel attention within the CSAM module yields superior detection efficacy.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Comparison of 5 different attention mechanisms with metrics of mAP50, mAP50-90, precision, recall, Parameters and FLOPs.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Models</th>
<th valign="top" align="center">P (%)</th>
<th valign="top" align="center">R (%)</th>
<th valign="top" align="center">mAP50 (%)</th>
<th valign="top" align="center">mAP50-90 (%)</th>
<th valign="top" align="center">Params (M)</th>
<th valign="top" align="center">FLOPs (G)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">Ta-YOLO_GAM</td>
<td valign="top" align="center">86.7</td>
<td valign="top" align="center">75.0</td>
<td valign="top" align="center">83.4</td>
<td valign="top" align="center">45.9</td>
<td valign="top" align="center">15.88</td>
<td valign="top" align="center">15.5</td>
</tr>
<tr>
<td valign="top" align="center">Ta-YOLO_SE</td>
<td valign="top" align="center">86.6</td>
<td valign="top" align="center">75.1</td>
<td valign="top" align="center">83.6</td>
<td valign="top" align="center">45.8</td>
<td valign="top" align="center">9.63</td>
<td valign="top" align="center">14.2</td>
</tr>
<tr>
<td valign="top" align="center">Ta-YOLO_scSE</td>
<td valign="top" align="center">86.8</td>
<td valign="top" align="center">74.4</td>
<td valign="top" align="center">82.7</td>
<td valign="top" align="center">45.1</td>
<td valign="top" align="center">9.89</td>
<td valign="top" align="center">14.4</td>
</tr>
<tr>
<td valign="top" align="center">Ta-YOLO_CBAM</td>
<td valign="top" align="center">86.3</td>
<td valign="top" align="center">74.9</td>
<td valign="top" align="center">83.4</td>
<td valign="top" align="center">45.7</td>
<td valign="top" align="center">9.62</td>
<td valign="top" align="center">14.2</td>
</tr>
<tr>
<td valign="top" align="center">Ta-YOLO_CSAM</td>
<td valign="top" align="center">86.7</td>
<td valign="top" align="center">76.9</td>
<td valign="top" align="center">84.4</td>
<td valign="top" align="center">45.9</td>
<td valign="top" align="center">10.58</td>
<td valign="top" align="center">14.3</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Heatmap visualizations of small tomato detection under different attention mechanisms. <bold>(A)</bold> Example of Extremely Small Tomato; <bold>(B)</bold> Example of Occlusion by Stem; <bold>(C)</bold> Example of Occlusion by Leaf; <bold>(D)</bold> Example of Inter-Class Occlusion.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1618214-g012.tif">
<alt-text content-type="machine-generated">A grid of plant images shows the effect of different attention models. Rows labeled A to D depict various plant scenes. Columns from left to right show the original image, then the same image with attention models applied: Without Attention, With GAM, With scSE, With CBAM, With SE, and Ours. Each subsequent column displays color-coded areas highlighting detected features, with changes in focus and intensity across models.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>Comparative experiments on different loss functions</title>
<p>The loss function plays a critical role in object detection tasks by quantifying the discrepancy between model predictions and ground truth labels. This measurement guides model optimization during training, ultimately improving detection performance. In this study, several widely used loss functions were evaluated on the Ta-YOLO model and compared against the proposed EWDIoU loss function to assess its effectiveness. The corresponding experimental results were summarized in <xref ref-type="table" rid="T7">
<bold>Table&#xa0;7</bold>
</xref>. Among the evaluated loss functions, the EWDIoU achieved the highest mAP@50 and recall scores of 84.4% and 76.9%, respectively. Although the GIoU loss attained an identical recall value, its accuracy was lower at 83.0%, representing a 1.4% deficit compared to EWDIoU. The EIoU loss reached a mAP@50 of 83.8%, close to the highest value; however, its precision and recall metrics were inferior to those of the EWDIoU loss function. Although the original CIoU loss used in YOLOv8 achieved the highest precision of 88.2%, its mAP was limited to 82.3%, and recall reached only 75.1%. These results indicated that the model struggled to detect all valid targets, particularly small tomatoes with occluded edges, leading to missed detections. Consequently, this shortcoming contributed to the decline in both recall and mAP. The EWDIoU loss function employs a two-dimensional Gaussian distribution approach to process discretized data, effectively addressing the bounding box insensitivity to small tomato targets and thereby enhancing detection accuracy. The proposed improvements demonstrate notable performance gains, indicating that targeted enhancements can overcome common detection challenges, including performance degradation in complex environments characterized by occlusion and small targets, as well as reducing instances of misdetection and omission.</p>
<table-wrap id="T7" position="float">
<label>Table&#xa0;7</label>
<caption>
<p>Comparison of 6 different loss functions with metrics of mAP50 (<xref ref-type="disp-formula" rid="eq16">Equation 16</xref>), precision (<xref ref-type="disp-formula" rid="eq17">Equation 17</xref>), recall (<xref ref-type="disp-formula" rid="eq19">Equation 19</xref>) and mAP50-90.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Models</th>
<th valign="top" align="center">P(%)</th>
<th valign="top" align="center">R(%)</th>
<th valign="top" align="center">mAP50(%)</th>
<th valign="top" align="center">mAP50-90(%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">Ta-YOLO-CIoU</td>
<td valign="top" align="center">88.2</td>
<td valign="top" align="center">75.1</td>
<td valign="top" align="center">82.3</td>
<td valign="top" align="center">45.7</td>
</tr>
<tr>
<td valign="top" align="center">Ta-YOLO-DIoU</td>
<td valign="top" align="center">87.4</td>
<td valign="top" align="center">76.7</td>
<td valign="top" align="center">82.9</td>
<td valign="top" align="center">45.2</td>
</tr>
<tr>
<td valign="top" align="center">Ta-YOLO-GIoU</td>
<td valign="top" align="center">86.6</td>
<td valign="top" align="center">76.9</td>
<td valign="top" align="center">83.0</td>
<td valign="top" align="center">45.7</td>
</tr>
<tr>
<td valign="top" align="center">Ta-YOLO-EIoU</td>
<td valign="top" align="center">86.4</td>
<td valign="top" align="center">76.0</td>
<td valign="top" align="center">83.8</td>
<td valign="top" align="center">45.6</td>
</tr>
<tr>
<td valign="top" align="center">Ta-YOLO-SIoU</td>
<td valign="top" align="center">85.7</td>
<td valign="top" align="center">73.2</td>
<td valign="top" align="center">83.2</td>
<td valign="top" align="center">46.1</td>
</tr>
<tr>
<td valign="top" align="center">Ta-YOLO-EWDIoU</td>
<td valign="top" align="center">86.7</td>
<td valign="top" align="center">76.9</td>
<td valign="top" align="center">84.4</td>
<td valign="top" align="center">45.9</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_6">
<label>3.6</label>
<title>Experiments with different values of <inline-formula>
<mml:math display="inline" id="im74">
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>&#x3bb;</mml:mi>
</mml:mstyle>
</mml:math>
</inline-formula> in the EWDIoU function</title>
<p>In the proposed EWDIoU loss function, to effectively mitigate the impact of small target tomato bounding boxes on the loss calculation while preserving the detection performance advantages for larger target tomatoes, an adjustable hyperparameter <inline-formula>
<mml:math display="inline" id="im75">
<mml:mi>&#x3bb;</mml:mi>
</mml:math>
</inline-formula> was introduced. This hyperparameter balances the contribution of the IoU in the loss function calculation, allowing for adaptive adjustments across different target scales. On the one hand, <inline-formula>
<mml:math display="inline" id="im76">
<mml:mi>&#x3bb;</mml:mi>
</mml:math>
</inline-formula> suppresses the bias amplification effect caused by the smaller scale of small targets in bounding box errors. On the other hand, it ensures the importance of large targets is preserved in the detection task, thereby achieving a dynamic balance and optimizing the loss function&#x2019;s performance for detecting targets of varying scales. The model&#x2019;s robustness and accuracy in handling multi-scale targets were significantly enhanced. We conducted experiments at intervals of 0.1 while keeping the criterion of <inline-formula>
<mml:math display="inline" id="im77">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> to observe the effects brought by different values on the overall detection results, as shown in <xref ref-type="fig" rid="f13">
<bold>Figure&#xa0;13</bold>
</xref>. A total of 9 groups of experiments were conducted, and the results showed that when <inline-formula>
<mml:math display="inline" id="im78">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is 0.7 and <inline-formula>
<mml:math display="inline" id="im79">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is 0.3, EWDIoU performed best on the small tomato dataset of the field scene, with its total category mAP value reaching 84.4%. In the comparison across different categories, the mAP trends of green-fruited tomatoes and yellow-fruited tomatoes exhibited high consistency with the overall category mAP. At the setting with the highest total category mAP, green-fruited tomatoes and yellow-fruited tomatoes reached mAP values of 81.0% and 84.9%, respectively. Our data analysis showed that green-fruited tomatoes had a higher probability of being obscured in the sample of obscured small tomatoes, and that the number of green-fruited tomatoes in the sample of small targets was relatively large. Therefore, when the mAP of green-fruited tomatoes reached its optimum, the mAP of the total category also reached its peak, which further validated that the EWDIoU loss function is able to effectively address the detection of occluded fruits. In addition, in the practical application scenario, the shading rate of yellow-fruited tomatoes was much lower than that of green-fruited tomatoes, but their mAP still followed the same trend as the total category mAP, which indicated that the loss function has strong robustness and adaptability. The change in mAP of red-fruited tomatoes was relatively smooth, and the fluctuation range of their mAP was controlled within 2.1%. This phenomenon can be primarily attributed to the relatively moderate shading in red-fruited tomato samples, coupled with greater color variability arising from differing ripeness levels. Nonetheless, the consistent detection performance suggests that the EWDIoU loss function does not induce significant errors or overfitting when applied to this category. In summary, EWDIoU demonstrates strong adaptability and stability across varying degrees of occlusion scenarios.</p>
<fig id="f13" position="float">
<label>Figure&#xa0;13</label>
<caption>
<p>mAP values of different classes of small tomatoes at different <inline-formula>
<mml:math display="inline" id="im80">
<mml:mi>&#x3bb;</mml:mi>
</mml:math>
</inline-formula>-values.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-16-1618214-g013.tif">
<alt-text content-type="machine-generated">Bar chart comparing mAP50 percentages for different tomato categories (all, green, red, yellow) across varying lambda values from 0.1 to 0.9. Red tomatoes consistently have the highest values, peaking at 88.1% at lambda 0.5.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="s4" sec-type="discussion">
<label>4</label>
<title>Discussion</title>
<p>Accurate counting of small tomato fruits in real-world production environments poses significant challenges and is of substantial practical importance. It is imperative to reduce the labor costs associated with manual counting while mitigating errors arising from the diminutive size of tomato targets and inter-fruit shading. This study proposed a detection and counting method specifically tailored for densely planted small tomatoes under realistic cultivation conditions. The algorithm maintains the integrity of small tomato targets within unaltered field images&#x2014;without necessitating image zooming or cropping&#x2014;thereby enabling effective and reliable detection.</p>
<p>The algorithm used YOLOv8n as the baseline model and employed the C2f_Repghost module and the SPDC module to adjust the structure of the backbone network of the original algorithm, so that the model could reduce the amount of computation and remain lightweight while enhancing its ability to extract feature information for small targets, thereby coping with the occlusion problem more effectively. In addition, a new DASPPF structure was proposed that uses average pooling to reduce the influence of redundant information on effective features and further improve the quality of feature extraction in the case of occlusion. Meanwhile, the CSAM multiple attention structure was constructed to introduce spatial and channel attention mechanisms after decomposing the input information to realize the deep fusion of feature maps at different scales. Furthermore, a new formulation of the EWDIoU loss function was proposed that utilized a two-dimensional Gaussian distribution function to abstract the original IoU loss function, which solved the problem of insensitivity of the original IoU to small target detection and effectively improved the performance of small tomato detection in the case of occlusion. Finally, an additional small detection head was incorporated into the detector architecture to enhance the extraction of fine-grained features, thereby improving the recognition of small targets. Experimental results demonstrate that the proposed Ta-YOLO model achieves high accuracy and robustness in addressing the occlusion challenges inherent in small tomato counting within real production environments. Compared to the original baseline, Ta-YOLO exhibits significant improvements in both accuracy and recall, alongside enhanced global feature extraction and superior small target detection performance.</p>
<p>Why did we choose YOLO as the baseline model rather than other lightweight models? As a single-stage detector, YOLO is well-suited for real-time video analysis in agricultural environments, where rapid and continuous detection is required. In contrast, models such as MobileNet-SSD offer faster inference but tend to underperform in complex scenes, while Transformer-based detectors like DETR are resource-intensive and less suitable for real-time deployment. Moreover, YOLO benefits from extensive open-source support and compatibility with deployment toolchains (e.g., TensorRT, ONNX), which significantly simplifies engineering implementation. Designing a lightweight model from scratch would introduce challenges such as lack of pre-trained weights and increased risk of overfitting, particularly in data-limited agricultural scenarios. For these reasons, we chose to adopt and tailor YOLO through lightweight modifications, balancing performance, efficiency, and practicality.</p>
<p>Despite its advantages, Ta-YOLO has certain limitations. As shown in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>, while Ta-YOLO attains a high recognition accuracy of 87.2% for red-fruited tomatoes, its accuracy for yellow-fruited tomatoes is 5.3% lower than that of YOLOv9s. This notable gap contributes to an overall detection accuracy that is lower than that of YOLOv9s. A likely cause for this discrepancy is data imbalance; constraints in the actual production environment and the short growth cycle of small tomatoes resulted in fewer images containing yellow-fruited tomatoes during data collection. Consequently, the dataset contained fewer samples of yellow-fruited tomatoes compared to green- and red-fruited varieties. Moreover, inaccuracies in manual annotation during dataset preparation may have led to misclassifications, especially for small tomatoes exhibiting intermediate colors during their developmental stages. Therefore, expanding the dataset and refining the maturity category definitions would be beneficial. In addition, to preserve the natural growth state of small tomatoes and effectively address the influence of leaf shading on counting in actual production, we deliberately avoided regional cropping or other image preprocessing techniques. Instead, the full appearance of small tomatoes as seen in the production environment was retained. Although this approach increased detection difficulty, it enhanced Ta-YOLO&#x2019;s applicability and robustness in real-world agricultural scenarios.</p>
<p>Overall, Ta-YOLO represents a significant advancement for real-world production settings, particularly in the detection and counting of small tomatoes under occlusion. Its demonstrated accuracy, efficiency, and robustness provide a practical solution for improving the commercial productivity of agricultural operations. Moreover, Ta-YOLO effectively balances detection speed and precision, underscoring its potential to supplant labor-intensive manual counting. Future work will aim to further optimize the model and investigate its scalability across other small-target crop species and diverse application contexts.</p>
</sec>
</body>
<back>
<sec id="s5" sec-type="data-availability">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s6" sec-type="author-contributions">
<title>Author contributions</title>
<p>YZ: Writing &#x2013; review &amp; editing. YC: Methodology, Visualization, Writing &#x2013; original draft. XX: Conceptualization, Data curation, Investigation, Writing &#x2013; review &amp; editing. YH: Conceptualization, Project administration, Supervision, Writing &#x2013; review &amp; editing. HG: Supervision, Validation, Writing &#x2013; review &amp; editing. NW: Conceptualization, Supervision, Writing &#x2013; review &amp; editing. ZW: Conceptualization, Data curation, Validation, Writing &#x2013; review &amp; editing. XS: Conceptualization, Data curation, Software, Writing &#x2013; review &amp; editing. YW: Data curation, Investigation, Supervision, Writing &#x2013; review &amp; editing. PS: Conceptualization, Data curation, Investigation, Writing &#x2013; review &amp; editing. YM: Conceptualization, Project administration, Writing &#x2013; review &amp; editing.</p>
</sec>
<sec id="s7" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. This work was supported by the National Natural Science Foundation of China (62476251, 32401708) and the National Key Research and Development Program of China (2019YFE0126100).</p>
</sec>
<sec id="s8" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>Author YM was employed by the company Pegasor Oy.</p>
<p>The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s9" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
</sec>
<sec id="s10" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Cai</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Vasconcelos</surname> <given-names>N.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Cascade r-cnn: Delving into high quality object detection</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition  (CVPR)</source>, <publisher-loc>Salt Lake City, UT, USA</publisher-loc>, <conf-date>18&#x2013;22 June 2018</conf-date>., <fpage>6154</fpage>&#x2013;<lpage>6162</lpage>.</citation></ref>
<ref id="B2">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zeng</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Xiong</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Repghost: A hardware-efficient ghost module via re-parameterization</article-title>. <source>arXiv preprint</source>. arXiv:2211.06088. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2211.06088</pub-id>
</citation></ref>
<ref id="B3">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Dai</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Rong</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Tracking and counting method for tomato fruits scouting robot in greenhouse</article-title>,&#x201d; in <conf-name>Intelligent Robotics and Applications: 15th International Conference, ICIRA 2022</conf-name>, <conf-loc>Harbin, China</conf-loc>, <conf-date>August 1&#x2013;3, 2022</conf-date>, Proceedings, Part I (Vol. <volume>13455</volume>, p. <fpage>60</fpage>).  (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>).</citation></ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ge</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Cheng</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Tracking and counting of tomato at different growth period using an improving YOLO-deepsort network for inspection robot</article-title>. <source>Machines</source> <volume>10</volume>, <fpage>489</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/machines10060489</pub-id>
</citation></ref>
<ref id="B5">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<source>Fast r-cnn</source>.&#x201d; in <conf-name>Proceedings of the IEEE international conference on computer vision (ICCV)</conf-name>, <conf-loc>Santiago, Chile</conf-loc>, <conf-date>7-13 Dec. 2015</conf-date>., pp. <fpage>1440</fpage>&#x2013;<lpage>1448</lpage>.</citation></ref>
<ref id="B6">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Donahue</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Darrell</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Malik</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>Rich feature hierarchies for accurate object detection and semantic segmentation</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR)</source>, <publisher-loc>Columbus, OH, USA</publisher-loc>, 23-28 June 2014., <fpage>580</fpage>&#x2013;<lpage>587</lpage>.</citation></ref>
<ref id="B7">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Guan</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Biswas</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>F.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>The US tomato industry: An overview of production and trade: FE1027, 9/2017</article-title>. <source>EDIS</source> <volume>2018</volume> (<issue>2</issue>). doi: <pub-id pub-id-type="doi">10.32473/edis-fe1027-2017</pub-id>
</citation></ref>
<ref id="B8">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Han</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y. H.</given-names>
</name>
<name>
<surname>Tian</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>J. Y.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>C. J.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Ghostnet: More features from cheap operations</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Conference on&#xa0;Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Seattle, WA, USA</conf-loc>, <conf-date>14&#x2013;19 June&#xa0;2020</conf-date>., pp. <fpage>1580</fpage>&#x2013;<lpage>1589</lpage>.</citation></ref>
<ref id="B9">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Gkioxari</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Mask r-cnn</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE international conference on computer vision (ICCV)</conf-name>, <conf-loc>Venice, Italy</conf-loc>, <conf-date>22-29 Oct. 2017</conf-date>., <fpage>2961</fpage>&#x2013;<lpage>2969</lpage>.</citation></ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Johnson</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Sharma</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Srinivasan</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Masakapalli</surname> <given-names>S. K.</given-names>
</name>
<name>
<surname>Sharma</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Sharma</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>Enhanced field-based detection of potato blight in complex backgrounds using deep learning</article-title>. <source>Plant Phenom</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.34133/2021/9835724</pub-id>, PMID: <pub-id pub-id-type="pmid">34104897</pub-id></citation></ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Khaki</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Pham</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Han</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Kuhl</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Kent</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Convolutional neural networks for image-based corn kernel detection and counting</article-title>. <source>Sensors</source> <volume>20</volume>, <fpage>2721</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s20092721</pub-id>, PMID: <pub-id pub-id-type="pmid">32397598</pub-id></citation></ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Khanam</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Muhammad</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Yolov11: An overview of the key architectural enhancements</article-title>. <source>arXiv preprint. arXiv:2410.17725</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2410.17725</pub-id>
</citation></ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lawal</surname> <given-names>O. M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Development of tomato detection model for robotic platform using deep learning</article-title>. <source>Multimedia Tools Appl.</source> <volume>80</volume>, <fpage>26751</fpage>&#x2013;<lpage>26772</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11042-021-10933-w</pub-id>
</citation></ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Long</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Tomato maturity detection and counting model based on MHSA-YOLOv8</article-title>. <source>Sensors</source> <volume>23</volume>, <fpage>6701</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s23156701</pub-id>, PMID: <pub-id pub-id-type="pmid">37571485</pub-id></citation></ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>T. Y.</given-names>
</name>
<name>
<surname>Goyal</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Focal loss for dense object detection</article-title>.&#x201d; in <source>Proceedings of the IEEE international conference on computer vision</source>., pp. <fpage>2980</fpage>&#x2013;<lpage>2988</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2017.324</pub-id>
</citation></ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Nouaze</surname> <given-names>J. C.</given-names>
</name>
<name>
<surname>Touko Mbouembe</surname> <given-names>P. L.</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>J. H.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>YOLO-tomato: A robust algorithm for tomato detection based on YOLOv3</article-title>. <source>Sensors</source> <volume>20</volume>, <fpage>2145</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s20072145</pub-id>, PMID: <pub-id pub-id-type="pmid">32290173</pub-id></citation></ref>
<ref id="B17">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Anguelov</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Erhan</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Szegedy</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Reed</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>C. Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2016</year>). &#x201c;<article-title>Ssd: Single shot multibox detector</article-title>,&#x201d; in <conf-name>Computer Vision&#x2013;ECCV 2016: 14th European Conference</conf-name>, <conf-date>October 11&#x2013;14, 2016</conf-date>, Proceedings, Part I 14. (<publisher-loc>Amsterdam, The Netherlands</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>),  pp. <fpage>21</fpage>&#x2013;<lpage>37</lpage>. </citation></ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Miao</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>He</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Efficient tomato harvesting robot based on image processing and deep learning</article-title>. <source>Precis. Agric.</source> <volume>24</volume>, <fpage>254</fpage>&#x2013;<lpage>287</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11119-022-09944-w</pub-id>
</citation></ref>
<ref id="B19">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Qiao</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>L. C.</given-names>
</name>
<name>
<surname>Yuille</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Detectors: Detecting objects with recursive feature pyramid and switchable atrous convolution</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, <conf-loc>Nashville, TN, USA</conf-loc>, <conf-date>20-25 June 2021</conf-date>., <fpage>10213</fpage>&#x2013;<lpage>10224</lpage>.</citation></ref>
<ref id="B20">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Divvala</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Farhadi</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>You only look once: Unified, real-time object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</conf-name>. <publisher-loc>Las Vegas, NV, USA</publisher-loc>, <conf-date>27&#x2013;30 June 2016</conf-date>., pp. <fpage>779</fpage>&#x2013;<lpage>788</lpage>. </citation></ref>
<ref id="B21">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Faster R-CNN: Towards real-time object detection with region proposal networks</article-title>,&#x201d; in <conf-name>IEEE transactions on pattern analysis and machine intelligence</conf-name>. (<publisher-loc>10662 Los Vaqueros Circle, PO Box 3014, Los Alamitos</publisher-loc>: <publisher-name>IEEE Computer Society</publisher-name>). <volume>39</volume> (<issue>6</issue>), <fpage>1137</fpage>&#x2013;<lpage>1149</lpage>., PMID: <pub-id pub-id-type="pmid">27295650</pub-id></citation></ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rong</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Tomato cluster detection and counting using improved YOLOv5 based on RGB-D fusion</article-title>. <source>Comput. Electron. Agric.</source> <volume>207</volume>, <fpage>107741</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.107741</pub-id>
</citation></ref>
<ref id="B23">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ruparelia</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Jethva</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Gajjar</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Real-time tomato detection, classification, and counting system using deep learning and embedded systems</article-title>,&#x201d; in <conf-name>Proceedings of the International e-Conference on Intelligent Systems and Signal Processing: e-ISSP 2020</conf-name>. (<publisher-loc>Singapore</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>511</fpage>&#x2013;<lpage>522</lpage>.</citation></ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Seo</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Cho</surname> <given-names>B.-H.</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>K.-C.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Development of monitoring robot system for tomato fruits in hydroponic greenhouses</article-title>. <source>Agronomy</source> <volume>11</volume>, <fpage>2211</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy11112211</pub-id>
</citation></ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Srinivas</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Sarvadevabhatla</surname> <given-names>R. K.</given-names>
</name>
<name>
<surname>Mopuri</surname> <given-names>K. R.</given-names>
</name>
<name>
<surname>Prabhu</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Kruthiventi</surname> <given-names>S. S.</given-names>
</name>
<name>
<surname>Babu</surname> <given-names>R. V.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>A taxonomy of deep convolutional neural nets for computer vision</article-title>. <source>Front. Robotics AI</source> <volume>2</volume>, <elocation-id>36</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/frobt.2015.00036</pub-id>
</citation></ref>
<ref id="B26">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sunkara</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>No more strided convolutions or pooling: A new cnn building block for low-resolution images and small objects</article-title>,&#x201d; in <conf-name>Joint European conference on machine learning and knowledge discovery in databases</conf-name>. (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer Nature Switzerland</publisher-name>), <fpage>443</fpage>&#x2013;<lpage>459</lpage>.</citation></ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wan</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Toudeshki</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Tan</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Ehsani</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>A methodology for fresh tomato maturity detection using computer vision</article-title>. <source>Comput. Electron. Agric.</source> <volume>146</volume>, <fpage>43</fpage>&#x2013;<lpage>50</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2018.01.011</pub-id>
</citation></ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Ling</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Meng</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Nie</surname> <given-names>L.</given-names>
</name>
<name>
<surname>An</surname> <given-names>G.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>An improved Faster R-CNN model for multi-object tomato maturity detection in complex scenarios</article-title>. <source>Ecol. Inf.</source> <volume>72</volume>, <fpage>101886</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ecoinf.2022.101886</pub-id>
</citation></ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>A transformer-based mask R-CNN for tomato detection and segmentation</article-title>. <source>J. Intelligent Fuzzy Syst.</source> (<publisher-loc>Springer Nature Switzerland</publisher-loc>: <publisher-name>Cham</publisher-name>) <volume>44</volume>, <fpage>8585</fpage>&#x2013;<lpage>8595</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3233/JIFS-222954</pub-id>
</citation></ref>
<ref id="B30">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Gupta</surname> <given-names>A.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Non-local neural networks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. <fpage>7794</fpage>&#x2013;<lpage>7803</lpage>.</citation></ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>D.</given-names>
</name>
<name>
<surname>He</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Apple detection and instance segmentation in natural environments using an improved Mask Scoring R-CNN Model</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>, <elocation-id>1016470</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.1016470</pub-id>, PMID: <pub-id pub-id-type="pmid">36531408</pub-id></citation></ref>
<ref id="B32">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A normalized Gaussian Wasserstein distance for tiny object detection</article-title>. <source>arXiv preprint arXiv:2110.13389</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2110.13389</pub-id>
</citation></ref>
<ref id="B33">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C. Y.</given-names>
</name>
<name>
<surname>Yeh</surname> <given-names>I. H.</given-names>
</name>
<name>
<surname>Mark Liao</surname> <given-names>H. Y.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Yolov9: Learning what you want to learn using programmable gradient information</article-title>,&#x201d; in <conf-name>European Conference on Computer Vision</conf-name>.  (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer Nature Switzerland</publisher-name>), pp. <fpage>1</fpage>&#x2013;<lpage>21</lpage>.</citation></ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Weng</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Xiao</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>He</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Rapid and accurate identification of bakanae pathogens carried by rice seeds based on hyperspectral imaging and deep transfer learning</article-title>. <source>Spectrochim. Acta Part A.: Mol. Biomol. Spectrosc.</source> <volume>311</volume>, <fpage>123889</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.saa.2024.123889</pub-id>, PMID: <pub-id pub-id-type="pmid">38340442</pub-id></citation></ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Xiao</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>An algorithm for automatic identification of multiple developmental stages of rice spikes based on improved Faster R-CNN</article-title>. <source>Crop J.</source> <volume>10</volume>, <fpage>1323</fpage>&#x2013;<lpage>1333</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.cj.2022.06.004</pub-id>
</citation></ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>RIC-Net: A plant disease classification model based on the fusion of Inception and residual structure and embedded attention mechanism</article-title>. <source>Comput. Electron. Agric.</source> <volume>193</volume>, <fpage>106644</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2021.106644</pub-id>
</citation></ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Precision detection of crop diseases based on improved YOLOv5 model</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>, <elocation-id>1066835</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.1066835</pub-id>, PMID: <pub-id pub-id-type="pmid">36699833</pub-id></citation></ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>He</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>A novel algorithm for damage recognition on pest-infested oilseed rape leaves</article-title>. <source>Comput. Electron. Agric.</source> <volume>89</volume>, <fpage>41</fpage>&#x2013;<lpage>50</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2012.07.014</pub-id>
</citation></ref>
</ref-list>
</back>
</article>