<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2023.1224884</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Real-time dense small object detection algorithm based on multi-modal tea shoots</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Shuai</surname>
<given-names>Luyu</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2312117"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Chen</surname>
<given-names>Ziao</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Zhiyong</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Hongdan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Boda</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Yuchao</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Mu</surname>
<given-names>Jiong</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2233443"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>College of Information Engineering, Sichuan Agricultural University</institution>, <addr-line>Ya&#x2019;an</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Ya&#x2019;an Digital Agricultural Engineering Technology Research Center, Sichuan Agricultural University</institution>, <addr-line>Ya&#x2019;an</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>College of Law, Sichuan Agricultural University</institution>, <addr-line>Ya&#x2019;an</addr-line>, <country>China</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>College of Mechanical and Electrical Engineering, Sichuan Agricultural University</institution>, <addr-line>Ya&#x2019;an</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Zhenghong Yu, Guangdong Polytechnic of Science and Technology, China</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Sijia Yu, Rutgers, The State University of New Jersey- Busch, United States; Danfeng Hong, Aerospace Information Research Institute (CAS), China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Jiong Mu, <email xlink:href="mailto:jmu@sicau.edu.cn">jmu@sicau.edu.cn</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>18</day>
<month>07</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>14</volume>
<elocation-id>1224884</elocation-id>
<history>
<date date-type="received">
<day>18</day>
<month>05</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>29</day>
<month>06</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Shuai, Chen, Li, Li, Zhang, Wang and Mu</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Shuai, Chen, Li, Li, Zhang, Wang and Mu</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Tea shoot recognition is difficult because it is affected by lighting conditions, because images whose backgrounds are similar in color to the shoots are challenging to segment, and because leaves occlude and overlap one another.</p>
</sec>
<sec>
<title>Methods</title>
<p>To solve the problem of low accuracy of dense small object detection of tea shoots, this paper proposes a real-time dense small object detection algorithm based on multimodal optimization. First, RGB, depth, and infrared images are collected to form a multimodal image set, and a complete shoot object labeling is performed. Then, the YOLOv5 model is improved and applied to dense and tiny tea shoot detection. Next, based on the improved YOLOv5 model, this paper designs two data layer-based multimodal image fusion methods and a feature layer-based multimodal image fusion method; meanwhile, a cross-modal fusion module (FFA) based on frequency domain and attention mechanisms is designed for the feature layer fusion method to adaptively align and focus critical regions in intra- and inter-modal channel and frequency domain dimensions. Finally, an objective-based scale matching method is developed to further improve the detection performance of small dense objects in natural environments with the assistance of transfer learning techniques.</p>
</sec>
<sec>
<title>Results and discussion</title>
<p>The experimental results indicate that the improved YOLOv5 model increases the mAP50 value by 1.7% compared to the benchmark model with fewer parameters and less computational effort. Compared with the single modality, the multimodal image fusion method increases the mAP50 value in all cases, with the method introducing the FFA module obtaining the highest mAP50 value of 0.827. When the pre-training strategy is applied after scale matching, the mAP values can be improved by 1% and 1.4% on the two datasets. The research idea of multimodal optimization in this paper can provide a basis and technical support for dense small object detection.</p>
</sec>
</abstract>
<kwd-group>
<kwd>dense small object detection</kwd>
<kwd>multimodal image fusion</kwd>
<kwd>RGB-D-IR</kwd>
<kwd>scale matching</kwd>
<kwd>frequency domain</kwd>
<kwd>attention mechanism</kwd>
<kwd>tea shoots</kwd>
</kwd-group>
<contract-sponsor id="cn001">Department of Science and Technology of Sichuan Province<named-content content-type="fundref-id">10.13039/501100004829</named-content>
</contract-sponsor>
<counts>
<fig-count count="11"/>
<table-count count="7"/>
<equation-count count="21"/>
<ref-count count="46"/>
<page-count count="20"/>
<word-count count="11365"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Technical Advances in Plant Science</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>In recent years, the aging trend of agricultural labor has significantly intensified, and the difficulty in recruiting and expensive labor has limited the development of the tea industry (<xref ref-type="bibr" rid="B10">Han et&#xa0;al., 2014</xref>). The manual picking of premium tea accounts for about 60% of the labor used for managing the whole tea plantation, while excellent high-grade tea is picked with delicate leaf tips that grow in different positions, postures, and densities, making it difficult for machine picking especially in the unstructured environment with wind and light changes (<xref ref-type="bibr" rid="B45">Xu et&#xa0;al., 2022</xref>). Thus, it is essential to study intelligent tea-picking technology to promote the development of the tea industry. The key to realizing automated tea picking is the accurate identification of tea shoots. In recent years, with the development and application of computer technology, the accurate identification of tea shoots based on image processing has become a research hotspot (<xref ref-type="bibr" rid="B22">Lin et&#xa0;al., 2019</xref>).</p>
<p>Since there are obvious color differences between tea shoots and old leaves and tree trunks, color features can be used to extract shoot regions in the image, so the early research on tea shoot segmentation is mainly based on color features. The primary process of traditional image processing algorithms based on color space involves image pre-processing, color feature selection, segmentation, and other steps (<xref ref-type="bibr" rid="B5">Bojie et&#xa0;al., 2019</xref>). To further address the issue that tea leaf segmentation under natural conditions is easily affected by the external environment, such as old leaves, branches, and soil, and by obscured and overlapping tea leaves, machine learning methods have been introduced for identification by extracting and synthesizing various feature sample data for training, and standard methods for tea shoot identification are developed based on features such as color, texture, and shape, combined with the use of K-means clustering, support vector machine methods, Bayesian discriminant methods, and cascade classifiers. Recognition methods based on traditional machine vision rely on image pre-processing and data conversion, and unreasonable pre-processing will significantly affect the accuracy of the model (<xref ref-type="bibr" rid="B16">Karunasena and Priyankara, 2020</xref>; <xref ref-type="bibr" rid="B17">Li et&#xa0;al., 2021</xref>).</p>
<p>The algorithm based on deep learning has high accuracy, providing a basis for studying intelligent tea shoot-picking equipment in complex backgrounds. To alleviate the influence of a complex environment on the performance of the detection model, (<xref ref-type="bibr" rid="B44">Xiaoxiao et&#xa0;al., 2019</xref>) employed a pre-segmentation method and then used the improved YOLO series of medium and large-scale network models to detect tea shoots with an average accuracy of 84.2%. To promote the deployment of models for detecting tea shoots to picking leaf tips, lightweight models have received much attention from researchers. (<xref ref-type="bibr" rid="B45">Xu et&#xa0;al., 2022</xref>) exploited the fast detection capability of YOLOv3 and the high-precision classification capability of DenseNet201 through a cascaded network to detect tea shoots accurately. Although the above methods have relatively high accuracy, robustness, and generalization performance, they are difficult to detect adequate tea shoots in complex environments on low arithmetic devices in farmland due to the high dependence of deep learning network models on arithmetic power. Thus, researchers have investigated the accuracy, speed, and lightness of model detection simultaneously. (<xref ref-type="bibr" rid="B6">Cao et&#xa0;al., 2022</xref>) proposed a tea shoot detection algorithm that fuses GhostNet and YOLOv5; (<xref ref-type="bibr" rid="B18">Li Y. et&#xa0;al., 2022</xref>) designed a YOLOv3-SPP deep learning algorithm based on channel and layer pruning, which reduced the number of parameters, model size, and inference time while achieving efficient and accurate tea shoot detection. Note that few studies have focused on crop objects that are dense and minutely difficult. However, in the study of small target detection problems, remote sensing image target detection has achieved excellent results. 
(<xref ref-type="bibr" rid="B43">Wu et&#xa0;al., 2019</xref>) presented a detector called ORSIm, which effectively improves the accuracy of small target detection in optical remote sensing images by integrating different channel features, feature learning, and fast image pyramid matching and enhancement strategies. To reduce the difficulty in infrared small target detection, (<xref ref-type="bibr" rid="B42">Wu et&#xa0;al., 2023</xref>) proposed an interactive cross-notice nested U-Net network called UIU-Net. However, UIU-Net models infrared small target detection as a semantic segmentation problem, which increases the cost of labeling. Therefore, this study improves the detection performance of dense and tiny tea shoots by improving the target detection model and adopting migration learning techniques.</p>
<p>The above studies took only RGB images as the input to the network. Nevertheless, in an unstructured environment, a single sensor provides limited information to detect shoot targets under various difficulties, such as different lighting conditions, the similar color of tea shoots to the background, the small size of tea shoots, dense tea shoots, overlapping tea shoots, branch and leaf occlusion, as well as different poses. To overcome these difficulties, the approach of using multimodal data can be adopted since there is a certain complementarity and consistency between multimodal information. Although RGB images can reflect features such as color, brightness, and texture of objects, they can only provide two-dimensional (2D) details. With the further development of image acquisition devices, the availability of multimodal data for object detection in agricultural environments has increased greatly, such as depth images, infrared images, etc. (<xref ref-type="bibr" rid="B34">Sun et&#xa0;al., 2022</xref>). Depth images contain information about the distance from the object to the sensor, which can reflect the depth and three-dimensional (3D) morphology of the object. So, depth images have more unique edge features and shape features that can be exploited to better distinguish between foreground and background. Meanwhile, infrared images collect information about the heat distribution of the object, which can reflect the temperature and thermal radiation characteristics of the object. Most importantly, depth and infrared images are less affected by illumination and viewing angle, and they can be used to perform stable target detection in complex environments. Thus, in recent years, research work has been devoted to using multimodal information to improve the performance of crop detection. 
For instance, (<xref ref-type="bibr" rid="B36">Tao and Zhou, 2017</xref>) extracted improved 3D descriptors (Color-FPFH) that incorporate color features and 3D geometric features from pre-processed point clouds to obtain richer feature information to enhance the accuracy of detecting apples. (<xref ref-type="bibr" rid="B8">Gan et&#xa0;al., 2018</xref>) designed an algorithm for green citrus fruit detection by integrating image alignment, information fusion, fruit classification, and detection into a single step to realize real-time detection. Experimental results indicate that the fusion of color and thermal images can effectively improve the detection of unripe green citrus fruits. Additionally, some studies use depth information to exclude complex backgrounds in agricultural environments to enhance the detection performance of target objects in RGB images. For example, (<xref ref-type="bibr" rid="B22">Lin et&#xa0;al., 2019</xref>) presented a depth filter and Bayesian classifier-based image segmentation method based on red-green-blue-depth (RGB-D) images to remove complex backgrounds. This improves citrus detection and localization accuracy in a natural outdoor orchard environment. (<xref ref-type="bibr" rid="B7">Fu et&#xa0;al., 2020</xref>) developed a faster R-CNN-based apple detection method using RGB images and depth features in a dense leafy wall tree. The background was first eliminated using a depth threshold of 1.2&#xa0;m to obtain the foreground RGB image. Then, the detection results of the original RGB image and the foreground RGB image were compared by using two different pre-trained network architectures (ZFNet and VGG16). The results demonstrated that removing the background tree using the depth filter can improve the fruit detection accuracy by 2.5%.</p>
<p>Methods for effective fusion methods of multimodal information have attracted much attention. In multimodal image target detection, the fusion methods for different information can be usually divided into three types: data layer fusion, feature layer fusion, and decision layer fusion. First, data layer fusion methods treat multimodal data as indistinguishable multichannel data and can exploit the inherent complementarity between different modalities to supplement the incomplete information in the input stage. For instance, (<xref ref-type="bibr" rid="B9">Gen&#xe9;-Mola et&#xa0;al., 2019</xref>) collected RGB images, depth images, and infrared images of apples simultaneously and performed range-correction on the signal intensity to solve the signal attenuation problem. The detection of apples was achieved by applying the Faster R-CNN model to five channels of input images (color (RGB), depth (D), and distance-corrected intensity signal (S)). The results indicate that the F1-score improves by 4.46% when depth and range-corrected intensity channels are added, and an F1-score of 0.898 and an AP of 94.8% are obtained when all channels are used. (<xref ref-type="bibr" rid="B24">Liu et&#xa0;al., 2019</xref>) proposed a method to fuse aligned RGB images, NIR images, and deep convolutional neural networks for kiwifruit detection. In their study, two different fusion methods were investigated: image fusion (fusing RGB and infrared images on the input layer) and feature fusion (combining the feature maps of two VGG16 networks with separate input RGB and NIR images). The results showed that the highest AP value of 90.7% was achieved by using the image fusion method. 
(<xref ref-type="bibr" rid="B31">Rong et&#xa0;al., 2023</xref>) applied a multimodal (RGB images and depth images) data fusion approach to optimize the input of YOLOv5 to reduce the effect of background on false tomato recognition and improved the recall of unripe tomatoes with a detection accuracy of 97.9% by the improved YOLOv5-4D. However, the crude data layer fusion method may result in information redundancy and noise propagation with limited enhancement effect, affecting the quality and accuracy of the fused data. The second type of fusion method, i.e., the feature layer fusion method, inputs multimodal images into parallel branches, extracts independent features at different scales in different modes, and then fuses the features. For instance, (<xref ref-type="bibr" rid="B41">Wu et&#xa0;al., 2021</xref>) developed a new multimodal remote sensing image classification network called CCR-Net. CCR-Net uses features from different modalities obtained by a CNN extractor and fuses them more compactly, allowing better processing and analysis of multimodal remote sensing data. (<xref ref-type="bibr" rid="B13">Hong et&#xa0;al., 2021</xref>) designed a new supervised algorithm for GCNs, called miniGCNs. miniGCNs jointly uses CNNs and GCNs to extract more diverse and differentiated feature representations for hyperspectral image classification tasks. However, both are based on image classification tasks. (<xref ref-type="bibr" rid="B34">Sun et&#xa0;al., 2022</xref>) proposed a noise-tolerant RGB-D feature fusion network for outdoor fruit detection to integrate RGB feature information, depth feature information, and an attention-based fusion module to adaptively fuse multimodal features to remove the adverse effects of depth noise and focus perception on the essential parts of the features. The proposed NT-FFN achieves an AP50 value of 95.4%. 
However, the inappropriate feature fusion approach in the feature layer fusion method may increase the difficulty of model learning and aggravate the imbalance of the network learning modality. The third type of feature fusion method, i.e., the decision layer fusion method, fuses the detection results of the last stage. For example, (<xref ref-type="bibr" rid="B37">Tu et&#xa0;al., 2018</xref>) adopted a faster region-based convolutional neural network (Faster R-CNN) to detect passion fruit for color images and depth images, respectively, and the two detection results based on RGB images and depth images were combined to improve the detection performance. (<xref ref-type="bibr" rid="B21">Lin et&#xa0;al., 2022</xref>) developed a regression network with multi-branch architecture to extract and fuse RGB, depth, and geometric features easily. The proposed post-fusion architecture significantly improved the fresh weight detection accuracy of lettuce shoots at different growth periods. However, the decision-level fusion method may consume a lot of computational resources due to the repeated computation of other multimodal branches, and the process learns the features of individual modalities independently without considering the correlation between different modal information. Therefore, to realize efficient real-time detection of tea shoots in an agricultural intelligent picking environment, this study investigates two data layer-based multimodal information fusion methods and a feature layer-based multimodal information fusion method, respectively. Meanwhile, a lightweight frequency domain attention mechanism module is designed for the feature layer fusion method to effectively fuse feature information across modalities.</p>
<p>To efficiently detect small targets of dense tea shoots in complex environments, this study improves the architecture of the YOLOv5 target detection model. Additionally, to make up for the deficiency of RGB image-based tea shoot detection, this study designs two data layer-based multimodal fusion methods and a feature layer-based multimodal fusion method based on the YOLOv5 model and designs a cross-modal fusion module based on frequency domain and attention mechanism. The main contributions of this study are summarized below:</p>
<list list-type="order">
<list-item>
<p>A tea image dataset of the natural environment is constructed. It contains aligned RGB images, depth images, and infrared images; the RGB images are annotated with tea shoot objects.</p>
</list-item>
<list-item>
<p>The architecture of the YOLOv5 model is modified and adjusted to improve the detection performance of the model for dense and tiny tea shoots.</p>
</list-item>
<list-item>
<p>The scale matching method is optimized based on the object scale. The generalization and robustness of the tea shoot detection model are improved by applying transfer learning techniques.</p>
</list-item>
<list-item>
<p>Two multimodal fusion methods based on the data layer and one multimodal fusion method based on the feature layer are investigated. Meanwhile, a cross-modal fusion module based on frequency domain and attention mechanism is designed to learn complementary information by adaptively focusing key regions in intra- and inter-modal frequency domain dimension and channel dimension to improve the performance of the tea shoot detector.</p>
</list-item>
</list>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Data</title>
<sec id="s2_1_1">
<label>2.1.1</label>
<title>Data acquisition</title>
<p>The dataset used in this study was obtained at the National Tea Tree Breeding Farm, Mengdingshan Tea Modern Agricultural Park, Ya&#x2019;an City, Sichuan Province, China. The images were taken on the evenings of 09/03/2023 and 19/03/2023, the prime time for famous tea harvesting. This study took the Microsoft Kinect v2 as the image acquisition device, which integrates an RGB camera and a depth sensor that works following the TOF principle. The sensor provides three types of data: a color image, a depth image that can generate a 3D point cloud of the scene, and a received infrared backscattered intensity image.</p>
<p>In the data acquisition process, the Microsoft Kinect v2 depth camera was fixed on a triangular stand, with one end of the camera being connected to 220V outdoor mobile power and the other end being connected to a laptop <italic>via</italic> USB 3.0. The depth image, the infrared image, and a low-resolution color image aligned with the depth information were captured simultaneously on the computer by calling the API of PyKinectV2 (<xref ref-type="bibr" rid="B2">Kinect/PyKinect2</xref>). First, a depth image, an infrared image, and an aligned image (RGB) with both color and depth information were captured simultaneously; then, they were resized to 512&#xd7;424 pixels; finally, the images were mirrored and inverted separately and saved. The RGB image was stored in 24 bits, the infrared image in 16 bits, and the depth image in 8 bits. The depth camera was placed vertically, 0.5-1.0&#xa0;m away from the top of the tea. To reduce the effect of bright light on sensor performance under outdoor conditions, all data were captured from 5:00 to 7:00 PM on an overcast day. <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref> presents the parameters and specifications of the equipment used in the data acquisition process.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Acquisition equipment specifications.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Device</th>
<th valign="top" align="center">Specifications</th>
<th valign="top" align="center">Parameter</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" rowspan="6" align="left">RGB-D Sensor</td>
<td valign="top" align="left">Manufacturer and model</td>
<td valign="top" align="left">Microsoft Kinectv2</td>
</tr>
<tr>
<td valign="top" align="left">RGB channel resolution (pixels)</td>
<td valign="top" align="left">1920 &#xd7; 1080</td>
</tr>
<tr>
<td valign="top" align="left">RGB channel field-of-view (FOV)</td>
<td valign="top" align="left">84.1&#xb0; &#xd7; 53.8&#xb0;</td>
</tr>
<tr>
<td valign="top" align="left">IR and Depth channel resolution (pixels)</td>
<td valign="top" align="left">512 &#xd7; 424</td>
</tr>
<tr>
<td valign="top" align="left">IR and Depth channel FOV</td>
<td valign="top" align="left">70&#xb0; &#xd7; 60&#xb0;</td>
</tr>
<tr>
<td valign="top" align="left">Working range (m)</td>
<td valign="top" align="left">0.5&#x2013;8</td>
</tr>
<tr>
<td valign="top" rowspan="3" align="left">Notebook Computer</td>
<td valign="top" align="left">Manufacturer and model</td>
<td valign="top" align="left">ASUS</td>
</tr>
<tr>
<td valign="top" align="left">Processor</td>
<td valign="top" align="left">AMD Ryzen 7 6800H with Radeon Graphics 3.20 GHz</td>
</tr>
<tr>
<td valign="top" align="left">RAM</td>
<td valign="top" align="left">16.0 GB</td>
</tr>
<tr>
<td valign="top" rowspan="4" align="left">Outdoor mobile power</td>
<td valign="top" align="left">Manufacturer and model</td>
<td valign="top" align="left">St. Xinlong</td>
</tr>
<tr>
<td valign="top" align="left">Size</td>
<td valign="top" align="left">255&#xd7;165&#xd7;145mm</td>
</tr>
<tr>
<td valign="top" align="left">Power capacity</td>
<td valign="top" align="left">90000mAh</td>
</tr>
<tr>
<td valign="top" align="left">Output voltage</td>
<td valign="top" align="left">220V</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2_1_2">
<label>2.1.2</label>
<title>Data preparation</title>
<p>A multimodal image dataset consisting of RGB, infrared, and depth images was obtained after data acquisition, each with a resolution of 512&#xd7;424 pixels. The original image schematic is shown in the first row of <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>. Since the depth sensor has a larger vertical field of view than the color camera, the RGB, infrared, and depth images were cropped by removing the top and bottom regions that do not provide RGB information, and the image resolution became 512&#xd7;360 pixels, as shown in the second row of <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>RGB images, IR images, and Depth images are represented from left to right. <bold>(A-C)</bold> captured original image; <bold>(D-F)</bold> cropped image; <bold>(G-I)</bold> annotated image.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-g001.tif"/>
</fig>
<p>In the data annotation process, tea shoots were manually annotated using the COCO Annotator (<xref ref-type="bibr" rid="B33">Stefanics et al., 2022</xref>) online annotation software for RGB images only. To simulate the complexity of tea shoot growth in a natural environment and reflect the effectiveness of the detector, tea shoots with less than 75% occlusion and tiny tea shoots were annotated with absolute pixels larger than 2&#xd7;2 pixels. Each image annotation process took 0.5-0.6 hours, and each image contains 200-400 tea shoot targets with an absolute scale of about 30&#xd7;30 pixels. To achieve a low manual annotation cost and investigate the effect of multimodal images on the performance of tea shoot detection, RGB, infrared, and depth images shared a common set of labels: the annotation result on RGB images. An example of the image after mapping the labeling results to infrared and depth images is shown in the third row in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>.</p>
<p>This study collected 100 sets of multimodal image data on 09/03/2023 and 19/03/2023, respectively, 200 sets in total. Each dataset contains one RGB, infrared, and depth image, as well as the corresponding labels. <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref> shows the distribution of the datasets and example images. Dataset1 and Dataset2 represent the datasets collected on 09/03/2023 and 19/03/2023, respectively. Dataset3 represents the combination of Dataset1 and Dataset2.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Distribution of data sets and image examples.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" colspan="2" align="left">Datasets</th>
<th valign="top" align="center">Collection time</th>
<th valign="top" align="center">Number</th>
<th valign="top" align="center">RGB</th>
<th valign="top" align="center">IR</th>
<th valign="top" align="center">Depth</th>
<th valign="top" align="center">Label</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" rowspan="2" align="left">Dataset3</td>
<td valign="top" align="left">Dataset1</td>
<td valign="top" align="left">2023.03.09</td>
<td valign="top" align="left">100</td>
<td valign="top" align="left">
<inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-i001.tif"/>
</td>
<td valign="top" align="left">
<inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-i002.tif"/>
</td>
<td valign="top" align="left">
<inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-i003.tif"/>
</td>
<td valign="top" align="left">
<inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-i004.tif"/>
</td>
</tr>
<tr>
<td valign="top" align="left">Dataset2</td>
<td valign="top" align="left">2023.03.19</td>
<td valign="top" align="left">100</td>
<td valign="top" align="left">
<inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-i005.tif"/>
</td>
<td valign="top" align="left">
<inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-i006.tif"/>
</td>
<td valign="top" align="left">
<inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-i007.tif"/>
</td>
<td valign="top" align="left">
<inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-i008.tif"/>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Since the color camera has a larger horizontal field of view than the depth sensor, the original high-resolution color image (1920&#xd7;1080 pixels) and the RGB image (521&#xd7;360 pixels) used in this study were unaligned, and this study aimed to investigate the detection method and model for dense small targets in low-resolution images. Therefore, this study only used the low-resolution RGB images and the aligned infrared and depth images as experimental data. In future work, we will explore the problem of image alignment and super-resolution-assisted small target detection based on high-resolution and low-resolution images, and the original high-resolution color images will be used.</p>
</sec>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Methods</title>
<sec id="s2_2_1">
<label>2.2.1</label>
<title>YOLOv5s baseline and improvement architecture</title>
<p>YOLO (You Only Look Once) (<xref ref-type="bibr" rid="B29">Redmon et&#xa0;al., 2016</xref>) is a classic single-stage target detection network. The YOLOv5 (<xref ref-type="bibr" rid="B14">Jocher et&#xa0;al., 2022</xref>) model is widely used in various target detection tasks because of its flexibility and versatility. It uses CSPNet (Cross Stage Partial Network) (<xref ref-type="bibr" rid="B40">Wang et&#xa0;al., 2020</xref>) as the backbone to extract feature information and SPP (Spatial Pyramid Pooling) (<xref ref-type="bibr" rid="B11">He et&#xa0;al., 2015</xref>) to extract multi-scale depth features and then fuse the features at different scales through a feature pyramid constructed by PANet (Path Aggregation Network) (<xref ref-type="bibr" rid="B23">Liu et&#xa0;al., 2018</xref>), and the final results are output through three detection heads P3, P4, and P5. The depth and width of the YOLOv5 model depend on the number of bottleneck layers and the number of convolutional kernels, and the YOLOv5s model has a small size and fast inference speed, which is beneficial for real-time target detection in realistic scenarios. This is the reason why this study chooses YOLOv5s as the baseline. However, since the baseline model is usually designed for detecting medium and large targets, there are some limitations in the detection of small objects. The relevant design choices of YOLOv5s mainly include the Focus layer, the design of the CSP1_n module, the number of stacks, and the PANet architecture. This study will elaborate on their limitations and the corresponding improvement measures for dense and tiny tea shoot detection. <xref ref-type="fig" rid="f2">
<bold>Figures&#xa0;2A, B</bold>
</xref> show the architectures of the YOLOv5s model and our improved YOLOv5s_improve model, respectively, and <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2C</bold>
</xref> shows the detailed construction of the modules that may be included in these two models.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Model architecture diagram and detailed module construction diagram. <bold>(A)</bold> YOLOv5s model architecture diagram; <bold>(B)</bold> YOLOv5s_improve model architecture diagram; <bold>(C)</bold> detailed construction of the modules that may be included in the model.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-g002.tif"/>
</fig>
<p>The limitations and improvements are analyzed as follows:</p>
<list list-type="order">
<list-item>
<p>From Focus to Conv: Focus is a lightweight convolutional layer. To reduce computational cost and speed up network training and inference, the Focus layer divides the input into four parts; convolutional operations are performed on each part separately, and the results are finally stacked to form the output feature map. However, this approach may sacrifice the accuracy of small target detection. Therefore, to better capture the feature information of small targets, this study replaces the Focus layer with a superficial Conv layer to increase the receptive field of the model and the feature representation.</p>
</list-item>
<list-item>
<p>From &#x201c;3693&#x201d; to &#x201c;8833&#x201d;: The backbone of YOLOv5 uses convolution with a stride of 2 in the early stage to halve the feature size. As the network deepens, the feature size retained for multi-scale target detection is much smaller than the size of the original input image. This low-resolution feature map does not contain information that can be used to reliably distinguish tiny objects. <xref ref-type="bibr" rid="B26">Ning et&#xa0;al. (2023)</xref> effectively improved the performance of small object detection by increasing the shallow layers (the convolutional layers in the high-resolution stage) in the ResNet (<xref ref-type="bibr" rid="B12">He et&#xa0;al., 2016</xref>) and HRNet (<xref ref-type="bibr" rid="B35">Sun et&#xa0;al., 2019</xref>), thereby using fewer convolutional layers in the later stages of the network. The experimental results indicated that the early downsampling leads to information loss and difficulty in representing the features of small targets. Similarly, in this study, the number of CSP1_n modules in each phase of the YOLOv5 backbone network is modified to allocate more resources to handle higher-resolution features, and the number of CSP1_n modules in the later stages of the backbone is reduced to avoid introducing additional computational burden. The original YOLOv5 backbone contains four CSP1_n modules, and the number of modules is 3, 6, 9, and 3 in order. Through several experimental adjustments, this study finds that the optimal number of CSP1_n modules is 8, 8, 3, and 3 in order.</p>
</list-item>
<list-item>
<p>From CSP2 to C3_DSConv: In the CSP2 module of the neck, the standard convolution operation may cause the small object model of tea shoots to overfit and introduce an enormous computational burden. <xref ref-type="bibr" rid="B25">Nascimento et&#xa0;al. (2019)</xref> proposed a flexible quantized convolution operator, DSConv, that uses inexpensive integer operations instead of single-precision operations while maintaining the same probability distributions of the kernel weights and outputs. This study replaces the standard convolution in the neck CSP2 module with DSConv to ensure the lightweight and real-time characteristics of the tea shoot detection model.</p>
</list-item>
<list-item>
<p>From PANet to FPN: The main idea of PANet is to obtain higher-level semantic information through aggregation and transfer, but it requires a lot of computational resources and time and may lead to information loss and model overfitting, and PANet focuses on the improvement of detection accuracy of medium and large targets. FPN (Feature Pyramid Network) (<xref ref-type="bibr" rid="B20">Lin et&#xa0;al., 2016</xref>) obtains better scale adaptation and semantic information through feature transfer and fusion, which helps to preserve the delicate features and information required for small object detection and effectively reduces the complexity of the model. Thus, this study replaces the PANet structure with FPN.</p>
</list-item>
</list>
</sec>
<sec id="s2_2_2">
<label>2.2.2</label>
<title>Multimodal object detection architecture</title>
<sec id="s2_2_2_1">
<label>2.2.2.1</label>
<title>Multimodal image object detection</title>
<p>To fully utilize the complementary information between RGB, infrared, and depth images of tea shoots to enhance the ability of the model to detect and localize tea shoots, two data layer-based fusion methods and a feature layer-based fusion method are established in this study. In addition, to improve the quality of intra-modal and inter-modal information fusion, a simple and effective FFA module is designed in this study for the feature layer-based data fusion method. The input and the backbone of the models of the three fusion methods in this study are illustrated in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Three fusion methods for multimodal images. <bold>(A)</bold> Data layer-based fusion method 1; <bold>(B)</bold> Data layer-based fusion method 2; <bold>(C)</bold> Feature layer-based fusion method.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-g003.tif"/>
</fig>
<p>Method 1 uses a simple data layer fusion approach. As shown in method (A) in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>, through several repetitive comparative experiments, the best weighting coefficients are first derived for RGB, infrared, and depth images, and they are 0.6, 0.2, and 0.2, respectively. Secondly, the RGB, infrared, and depth images are fused by simple pixel-level summation with the best weighting coefficients, respectively. Then, the synthesized images are fed into the single-stream object detection backbone for feature extraction. Finally, BP3, BP4, and BP5 features are provided to the model head for detection.</p>
<p>Method 2 uses data layer fusion based on channel mapping. Again, the best weighting coefficients for infrared and depth images are first derived through repeated comparative experiments, and they are 0.5 and 0.5, respectively. Then, the infrared and depth images are fused by simple pixel-level summation with the best weighting coefficients. The obtained image A is taken as the fourth channel of the image to obtain a four-channel RGBA image by stitching it with the color RGB image. Next, the RGBA image is fed into the designed 4-channel single-stream object detection backbone for feature extraction, and finally, BP3, BP4, and BP5 features are provided to the model head for detection. The details are shown in method (B) in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>.</p>
<p>Method 3 uses feature layer fusion. The infrared and depth images are first stitched into a single three-channel image (D_IR_IR) to preserve as much information as possible under each modality; then, the stitched image and the color RGB image are fed into the designed dual-stream object detection backbone to extract features, and finally, BP3, BP4, and BP5 features are provided to the model head for detection. The detailed design of YOLOv5s_Multimodal, a multimodal image fusion architecture based on feature layers, is presented in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3C</bold>
</xref>. In the YOLOv5s_Multimodal model, this study uses YOLOv5s_improve as the backbone of two branches, but the parameters in the two backbones are not shared. The same backbone structure is used to extract features from D_IR_IR and RGB images under each modality. In the intermediate stage of the backbone, the features are fused by the frequency domain-based cross-modal fusion attention module (FFA) to facilitate the interaction and fusion of modalities, and the fused features are fed to the RGB stream and the D_IR_IR stream respectively for feature extraction in depth.</p>
</sec>
<sec id="s2_2_2_2">
<label>2.2.2.2</label>
<title>Cross-modal fusion attention module based on frequency domain</title>
<p>RGB, infrared, and depth images have their strengths and weaknesses, and their information is usually complementary but contains noise. There are better solutions than simply fusing or processing RGB, infrared, and depth images. However, noisy information can be filtered and calibrated using features from another modality, so this study proposes FFA, and its structure is shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Structure of the FFA module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-g004.tif"/>
</fig>
<p>To reduce expensive computations, improve the inference speed of the model and better preserve the spatial and semantic information of the images, this study chooses to filter, enhance, and fuse the information of different modalities in the frequency domain. To resolve the noise and uncertainty in the different modalities and to calibrate and extract the frequency feature information in various modalities, this study infers the attention map along the channel dimension and frequency dimension in turn and then multiplies the attention map with the feature map in the frequency domain to perform adaptive frequency domain feature fusion optimization. To facilitate feature extraction and interaction between modalities, this study enhances the information interaction between the different modalities by simple convolution and cross-fusion.</p>
<p>Spatial domain to frequency domain: feature maps <inline-formula>
<mml:math display="inline" id="im1">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im2">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are respectively converted to <inline-formula>
<mml:math display="inline" id="im3">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im4">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in the frequency domain using FFT. Equations (1-2) show the corresponding 2D FFT.</p>
<disp-formula>
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>=</mml:mo>
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>&#x3c0;</mml:mi>
<mml:mfenced>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mi>M</mml:mi>
</mml:mfrac>
<mml:mo>+</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>=</mml:mo>
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>&#x3c0;</mml:mi>
<mml:mfenced>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mi>M</mml:mi>
</mml:mfrac>
<mml:mo>+</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im5">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mfenced>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> is a feature map of size <inline-formula>
<mml:math display="inline" id="im6">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and equations (1) and (2) are evaluated for the discrete variables <italic>u</italic> and <italic>v</italic> with <inline-formula>
<mml:math display="inline" id="im7">
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>M</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im8">
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>Information fusion and enhancement of channel dimensions: First, global pooling operations are performed on the frequency-domain feature maps <inline-formula>
<mml:math display="inline" id="im9">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im10">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> respectively to obtain global frequency-domain feature information, and both global average pooling and global maximum pooling are used to retain as much information as possible. Then, four resultant vectors are generated and stitched to form a richer frequency-domain feature representation. Next, the frequency-domain feature information is further extracted and fused by the MLP_1 layer. Subsequently, the sigmoid operation is performed to obtain the weights, and the weights are divided into <inline-formula>
<mml:math display="inline" id="im11">
<mml:mrow>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im12">
<mml:mrow>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> by the split operation. Finally, the weights are multiplied with the input frequency-domain feature maps <inline-formula>
<mml:math display="inline" id="im13">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im14">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to obtain the frequency-domain feature maps <inline-formula>
<mml:math display="inline" id="im15">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im16">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, respectively. In this way, the information enhancement and complementation of the channel dimension of RGB and Depth_IR features are realized. The whole process is shown in Equations (3-7).</p>
<disp-formula>
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>(</mml:mo>
<mml:mo>(</mml:mo>
<mml:mi>M</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>)</mml:mo>
<mml:mo>)</mml:mo>
<mml:mo>,</mml:mo>
<mml:mo>(</mml:mo>
<mml:mi>A</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>)</mml:mo>
<mml:mo>)</mml:mo>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>(</mml:mo>
<mml:mo>(</mml:mo>
<mml:mi>M</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>)</mml:mo>
<mml:mo>)</mml:mo>
<mml:mo>,</mml:mo>
<mml:mo>(</mml:mo>
<mml:mi>A</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>)</mml:mo>
<mml:mo>)</mml:mo>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>_</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
<mml:mo>&#x229b;</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(7)</label>
<mml:math display="block" id="M7">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
<mml:mo>&#x229b;</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im17">
<mml:mi>&#x3b4;</mml:mi>
</mml:math>
</inline-formula> represents the Sigmoid operation.</p>
<p>Information fusion and enhancement in the frequency domain: First, the Concat operation is performed on the frequency-domain feature maps <inline-formula>
<mml:math display="inline" id="im18">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im19">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> to obtain a richer frequency-domain feature representation. Then, after MLP_2 layers, which are two 1&#xd7;1 convolution and nonlinear transform ReLU operations, more features are extracted to obtain a complex frequency-domain feature representation. Next, the sigmoid operation is performed to obtain the weights, and the weights are divided into <inline-formula>
<mml:math display="inline" id="im20">
<mml:mrow>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mi>F</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im21">
<mml:mrow>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>F</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> by the split operation. Finally, the weights are multiplied with the input frequency-domain feature maps <inline-formula>
<mml:math display="inline" id="im22">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im23">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> to obtain the frequency-domain feature maps <inline-formula>
<mml:math display="inline" id="im24">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mi>F</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im25">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>F</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, respectively. In this way, the information enhancement and complementarity of the frequency dimension of RGB and Depth_IR features are realized. The whole process is shown in Equations (8-11).</p>
<disp-formula>
<label>(8)</label>
<mml:math display="block" id="M8">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mfenced>
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(9)</label>
<mml:math display="block" id="M9">
<mml:mrow>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mi>F</mml:mi>
</mml:msubsup>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>F</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>E</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(10)</label>
<mml:math display="block" id="M10">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mi>F</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mi>F</mml:mi>
</mml:msubsup>
<mml:mo>&#x229b;</mml:mo>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(11)</label>
<mml:math display="block" id="M11">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>F</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>F</mml:mi>
</mml:msubsup>
<mml:mo>&#x229b;</mml:mo>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im26">
<mml:mi>&#x3b4;</mml:mi>
</mml:math>
</inline-formula> represents the Sigmoid operation.</p>
<p>Frequency domain to spatial domain: IFFT is performed on feature maps <inline-formula>
<mml:math display="inline" id="im27">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mi>F</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im28">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>F</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> to convert them back to feature maps <inline-formula>
<mml:math display="inline" id="im29">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im30">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> in the spatial domain, respectively. The corresponding 2D IFFT is shown in Equations (12-13).</p>
<disp-formula>
<label>(12)</label>
<mml:math display="block" id="M12">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mfenced>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mi>F</mml:mi>
</mml:msubsup>
<mml:mfenced>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>&#x3c0;</mml:mi>
<mml:mfenced>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mi>M</mml:mi>
</mml:mfrac>
<mml:mo>+</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(13)</label>
<mml:math display="block" id="M13">
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mfenced>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mi>F</mml:mi>
</mml:msubsup>
<mml:mfenced>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>&#x3c0;</mml:mi>
<mml:mfenced>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mi>M</mml:mi>
</mml:mfrac>
<mml:mo>+</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im31">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>M</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im32">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>Re-enhancement of purified information: To obtain a better feature representation, two convolution operations are used to enhance the feature information extracted in the above process, and the enhanced information is fed by cross-fusion to the RGB stream and the Depth_IR stream, respectively, for the next stage of feature extraction and fusion. Equations (14-15) show the purified information re-enhancement operation.</p>
<disp-formula>
<label>(14)</label>
<mml:math display="block" id="M14">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
<mml:mo>&#x2295;</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(15)</label>
<mml:math display="block" id="M15">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
<mml:mo>&#x2295;</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
</sec>
<sec id="s2_2_3">
<label>2.2.3</label>
<title>Objective-based scale matching</title>
<p>The influence of uncontrollable factors in the natural environment, such as light, temperature, and humidity, leads to different growth states of tea shoots. In particular, tea shoots proliferate from early March to early April; as shown in Dataset1 and Dataset2, they exhibit large differences in length, volume, posture, and color, although they are only ten days apart. This poses a challenge to the generalizability and robustness of the detection model. <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref> shows the number and relative scale distribution of tea shoot objects in the two datasets.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Distribution of tea shoot objects in Dataset1 and Dataset2 datasets. <bold>(A)</bold> the total number of targets and the relative width and height scales of target boxes in Dataset1; <bold>(B)</bold> the total number of objects and the relative width and height scales of object boxes in Dataset2; <bold>(C)</bold> the relative width and height scales and distribution of objects in Dataset1; <bold>(D)</bold> the relative width and height scales and distribution of objects in Dataset2.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-g005.tif"/>
</fig>
<p>From <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>, the following can be observed. In Dataset1, the total number of tea shoots exceeds 20,000, the distribution of tea shoots is relatively dense, the width, height, and posture of tea shoots are similar, and the relative scale of over 90% of the tea shoots is less than 5%. In Dataset2, the total number of tea shoots is close to 25,000, the distribution of tea shoots is very dense, the width, height, and posture of tea shoots are different, and the relative scale of over 90% of tea shoots is less than 10%. Overall, both datasets are dense, making it challenging to find targets. The difference between them is that Dataset1 has fewer samples and smaller relative scale differences, while Dataset2 has more samples and larger relative scale differences.</p>
<p><xref ref-type="bibr" rid="B46">Yu et&#xa0;al. (2020)</xref> found that the problem of scale mismatch reduces the accuracy of feature representation and detection models, and a smaller dataset may lead to model overfitting. To improve the generalization and robustness of the detector for detecting tea shoots of different periods under the condition of small samples, this study uses a simple scale-matching method combined with transfer learning techniques to improve the detection performance of the model. The targets in Dataset2 are scaled to align with the relative scales of the targets in Dataset1. Then, the best weights obtained from training on the aligned dataset are used as pre-training weights to guide the detection model in fine-tuning its parameters on Dataset1, improving the detection capability of the detector for Dataset1. This facilitates the alignment of feature distributions between the dataset used for pre-training the network and the dataset learned by the detector, enabling the model to better utilize the information at small scales.</p>
<p>The specific procedure is as follows: first, the average scales (s<sub>1</sub>, s<sub>2</sub>) of the two datasets, Dataset1 and Dataset2, and their distributions are calculated by statistical methods, and the scale scaling factors (a<sub>12</sub>, a<sub>21</sub>) between the two datasets are obtained. Then, search, judgment, and scaling operations are performed for all targets in the images. For instance, for Dataset2, if the relative scale of an object is larger than the average scale s<sub>1</sub>, the target object is cropped out according to the label box, scaled according to the scale scaling factor a<sub>21</sub>, and then put back at the original position so that its center position remains unchanged. Additionally, to avoid damaging the contextual structure information of the target object, this study uses the adjacent pixel-based image interpolation method to fill the empty region caused by scaling the target object, and the same processing is conducted for Dataset1. <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref> shows the image comparison effect of the objective-based scale matching method.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Objective-based scale matching method. <bold>(A)</bold> Example image in Dataset1; <bold>(B)</bold> Example image in Dataset2; <bold>(C)</bold> Example image after Dataset1 is aligned to Dataset2 scale; <bold>(D)</bold> Example image after Dataset2 is aligned to Dataset1 scale.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-g006.tif"/>
</fig>
</sec>
<sec id="s2_2_4">
<label>2.2.4</label>
<title>Loss function</title>
<p>The loss function used to detect tea shoots in this paper consists of three components: confidence loss function, classification loss function, and boundary regression prediction loss function, as shown in Equation (16).</p>
<disp-formula>
<label>(16)</label>
<mml:math display="block" id="M16">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>O</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
</sec>
</sec>
<sec id="s3" sec-type="results">
<label>3</label>
<title>Results and discussion</title>
<sec id="s3_1">
<label>3.1</label>
<title>Experimental details</title>
<p>The experiment was conducted on a computer running Windows 10 operating system, and the hardware and software parameters are listed in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>. The official YOLOv5 version 6.1 (<xref ref-type="bibr" rid="B14">Jocher et&#xa0;al., 2022</xref>) codebase was used as the starting point, and the modifications described in Sections 2.2.1 and 2.2.2 were implemented on top of it. Training was performed using the SGD optimizer. The initial learning rate was 1E-2, the final learning rate was 1E-5, and the weight decay was 5E-3. A momentum of 0.8 was used during the first three warm-up epochs, after which it was set to 0.937. The training process was run for 300 epochs with a batch size of 4. Online data augmentation methods, such as horizontal flip, random rotation, color change, and mosaic, were used during training to enhance the sample diversity.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Software and hardware parameters.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Accessories</th>
<th valign="top" align="center">Model</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Operating system</td>
<td valign="top" align="left">Windows 10</td>
</tr>
<tr>
<td valign="top" align="left">CPU</td>
<td valign="top" align="left">Intel(R) Xeon(R) Gold 5218 CPU @ 2.30GHz</td>
</tr>
<tr>
<td valign="top" align="left">RAM</td>
<td valign="top" align="left">128 GB</td>
</tr>
<tr>
<td valign="top" align="left">GPU</td>
<td valign="top" align="left">NVIDIA Quadro RTX 5000</td>
</tr>
<tr>
<td valign="top" align="left">Development environments</td>
<td valign="top" align="left">Python3.8, Pytorch1.10.1, CUDA10.2</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Evaluation metrics</title>
<p>In this study, the number of parameters (Parameter), giga floating-point operations (GFLOPs), precision (Precision), recall (Recall), and mean average precision (mAP) were taken as evaluation metrics for measuring model complexity and performance. The calculation formulas of these metrics are shown in Equations (17-21).</p>
<disp-formula>
<label>(17)</label>
<mml:math display="block" id="M17">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(18)</label>
<mml:math display="block" id="M18">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(19)</label>
<mml:math display="block" id="M19">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mi>N</mml:mi>
</mml:msubsup>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:mfrac>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mi>N</mml:mi>
</mml:msubsup>
<mml:msubsup>
<mml:mo>&#x222b;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mi>P</mml:mi>
<mml:mfenced>
<mml:mi>R</mml:mi>
</mml:mfenced>
<mml:mi>d</mml:mi>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(20)</label>
<mml:math display="block" id="M20">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(21)</label>
<mml:math display="block" id="M21">
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mi>F</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>=</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>2</mml:mn>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msup>
<mml:mi>K</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Parameter denotes the number of parameters of the model. GFLOPs is a metric of the computational cost of the model, and a smaller GFLOPs value indicates that the model has less computational burden and can respond to requests faster. These two metrics directly reflect the complexity of the model. TP, FP, and FN denote the number of correctly detected objects, incorrectly detected objects, and undetected tea shoot objects, respectively. Precision is the proportion of samples predicted as positive that are actually positive, i.e., correctly detected tea shoots among all detections. Recall is the proportion of actual positive samples that are predicted as positive. AP represents the average precision, a combination of precision and recall given by the area under the precision-recall curve. The mAP is the average of the AP values of different categories, where N is the number of categories; in this experiment, there is only one category of tea shoots, so N is 1. In this study, mAP50 and mAP95 refer to the mAP values when the value of IOU is taken at 50% and 95%, respectively.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Ablation and comparison experiments</title>
<p>This section validates the models and methods selected and designed in this study through ablation experiments and comparison experiments. First, a set of comparison experiments was designed to verify the validity of the baseline model selected in this study. Then, a group of ablation experiments based on the modified baseline model was carried out to demonstrate the effectiveness of the improved method adopted in this study. Next, the superiority of the proposed method was verified by designing a set of comparative experiments of multimodal image target detection using different fusion methods and approaches. Finally, a set of ablation experiments was designed to verify the effectiveness of the transfer learning and scale-matching methods.</p>
<sec id="s3_3_1">
<label>3.3.1</label>
<title>Validation of the baseline framework</title>
<p>In this set of experiments, 200 color RGB tea shoot images in Dataset3 were used as the experimental dataset, which was divided into a training set, a validation set, and a test set at the ratio of 8:1:1. The dataset was trained and validated on models of YOLOv3 (<xref ref-type="bibr" rid="B30">Redmon and Farhadi, 2018</xref>), YOLOv4 (<xref ref-type="bibr" rid="B4">Bochkovskiy et&#xa0;al., 2020</xref>), YOLOv5, YOLOv6 (<xref ref-type="bibr" rid="B19">Li C. et&#xa0;al., 2022</xref>), YOLOv7 (<xref ref-type="bibr" rid="B39">Wang et&#xa0;al., 2022</xref>), and YOLOv8 (<xref ref-type="bibr" rid="B15">Jocher et&#xa0;al., 2023</xref>), and the test results and model performance are shown in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>. To ensure fairness, no pre-training weights were used for any of the models in the training process, and the testing environment and configuration were identical during the experiments.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Comparative results of detection capabilities of different YOLO frameworks and baseline models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Method</th>
<th valign="top" align="center">Parameters</th>
<th valign="top" align="center">GFLOPs</th>
<th valign="top" align="center">P</th>
<th valign="top" align="center">R</th>
<th valign="top" align="center">mAP50</th>
<th valign="top" align="center">mAP95</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">YOLOv3</td>
<td valign="top" align="left">8666692</td>
<td valign="top" align="left">12.9</td>
<td valign="top" align="left">0.686</td>
<td valign="top" align="left">0.585</td>
<td valign="top" align="left">0.647</td>
<td valign="top" align="left">0.254</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv4</td>
<td valign="top" align="left">5874116</td>
<td valign="top" align="left">20.0</td>
<td valign="top" align="left">0.636</td>
<td valign="top" align="left">0.608</td>
<td valign="top" align="left">0.799</td>
<td valign="top" align="left">0.377</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv5</td>
<td valign="top" align="left">7012822</td>
<td valign="top" align="left">15.8</td>
<td valign="top" align="left">0.825</td>
<td valign="top" align="left">0.733</td>
<td valign="top" align="left">0.801</td>
<td valign="top" align="left">0.425</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv6</td>
<td valign="top" align="left">18500000</td>
<td valign="top" align="left">45.17</td>
<td valign="top" align="left">0.779</td>
<td valign="top" align="left">0.715</td>
<td valign="top" align="left">0.623</td>
<td valign="top" align="left">0.322</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv7</td>
<td valign="top" align="left">6007596</td>
<td valign="top" align="left">13.0</td>
<td valign="top" align="left">0.814</td>
<td valign="top" align="left">0.733</td>
<td valign="top" align="left">0.703</td>
<td valign="top" align="left">0.326</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv8</td>
<td valign="top" align="left">11125971</td>
<td valign="top" align="left">28.4</td>
<td valign="top" align="left">0.819</td>
<td valign="top" align="left">0.732</td>
<td valign="top" align="left">0.802</td>
<td valign="top" align="left">0.459</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Although YOLOv8 obtained the highest mAP50 value, its number of parameters was 1.5 times larger than that of the YOLOv5s model, and its GFLOPs was 1.8 times higher than that of the YOLOv5s model. YOLOv3, YOLOv4, and YOLOv7, although their number of parameters and GFLOPs were smaller, had relatively low mAP50 values, and especially, YOLOv3 and YOLOv4 had a lower recall. YOLOv6 performed relatively poorly on small targets with dense tea shoots. Overall, YOLOv5s is much smaller and more lightweight than the other models in terms of parameter size and GFLOPs, although its mAP50 value is lower than the highest value. Therefore, YOLOv5s is easier to deploy in practical application scenarios. The above results validate the selection of YOLOv5s as the baseline model in this study.</p>
</sec>
<sec id="s3_3_2">
<label>3.3.2</label>
<title>Validation of baseline model improvements</title>
<p>In this set of experiments, 200 color RGB tea shoot images in Dataset3 were used as the experimental dataset, and they were divided into a training set, a validation set, and a test set at the ratio of 8:1:1. &#x201c;From Focus to conv&#x201d; (NoFocus), &#x201c;From 3693 to 8833&#x201d; (BH), &#x201c;From CSP2 to C3_DSConv&#x201d; (C3_DSConv), and &#x201c;From PANet to FPN&#x201d; (FPN) modular architectures and methods were added to the baseline model, respectively. <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref> presents the experimental results. Note that no pre-training weights were used for all models during training, and the testing environment and configuration were identical during the experiments.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Results of ablation experiments with improved baseline model.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">NoFocus</th>
<th valign="top" align="center">BH</th>
<th valign="top" align="center">FPN</th>
<th valign="top" align="center">C3_DSConv</th>
<th valign="top" align="center">Parameters</th>
<th valign="top" align="center">GFLOPs</th>
<th valign="top" align="center">P</th>
<th valign="top" align="center">R</th>
<th valign="top" align="center">mAP50</th>
<th valign="top" align="center">mAP95</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"/>
<td valign="top" align="left"/>
<td valign="top" align="left"/>
<td valign="top" align="left"/>
<td valign="top" align="left">7012822</td>
<td valign="top" align="left">15.8</td>
<td valign="top" align="left">0.825</td>
<td valign="top" align="left">0.733</td>
<td valign="top" align="left">0.801</td>
<td valign="top" align="left">0.425</td>
</tr>
<tr>
<td valign="top" align="left">&#x221a;</td>
<td valign="top" align="left"/>
<td valign="top" align="left"/>
<td valign="top" align="left"/>
<td valign="top" align="left">7012822</td>
<td valign="top" align="left">15.8</td>
<td valign="top" align="left">0.835</td>
<td valign="top" align="left">0.742</td>
<td valign="top" align="left">0.808</td>
<td valign="top" align="left">0.427</td>
</tr>
<tr>
<td valign="top" align="left"/>
<td valign="top" align="left">&#x221a;</td>
<td valign="top" align="left"/>
<td valign="top" align="left"/>
<td valign="top" align="left">6746326</td>
<td valign="top" align="left">16.4</td>
<td valign="top" align="left">0.841</td>
<td valign="top" align="left">0.750</td>
<td valign="top" align="left">0.814</td>
<td valign="top" align="left">0.444</td>
</tr>
<tr>
<td valign="top" align="left"/>
<td valign="top" align="left"/>
<td valign="top" align="left">&#x221a;</td>
<td valign="top" align="left"/>
<td valign="top" align="left">5979478</td>
<td valign="top" align="left">14.6</td>
<td valign="top" align="left">0.836</td>
<td valign="top" align="left">0.750</td>
<td valign="top" align="left">0.814</td>
<td valign="top" align="left">0.441</td>
</tr>
<tr>
<td valign="top" align="left"/>
<td valign="top" align="left"/>
<td valign="top" align="left"/>
<td valign="top" align="left">&#x221a;</td>
<td valign="top" align="left">7016278</td>
<td valign="top" align="left">13.7</td>
<td valign="top" align="left">0.826</td>
<td valign="top" align="left">0.749</td>
<td valign="top" align="left">0.805</td>
<td valign="top" align="left">0.426</td>
</tr>
<tr>
<td valign="top" align="left">&#x221a;</td>
<td valign="top" align="left">&#x221a;</td>
<td valign="top" align="left">&#x221a;</td>
<td valign="top" align="left">&#x221a;</td>
<td valign="top" align="left">
<bold>5715670</bold>
</td>
<td valign="top" align="left">
<bold>13.5</bold>
</td>
<td valign="top" align="left">
<bold>0.841</bold>
</td>
<td valign="top" align="left">
<bold>0.751</bold>
</td>
<td valign="top" align="left">
<bold>0.818</bold>
</td>
<td valign="top" align="left">
<bold>0.448</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Bold indicates the best experimental results.</p>
<p>The "&#x221a;" symbol indicates the use of the policy, method, or module.</p>
</table-wrap-foot>
</table-wrap>
<p>Overall, the mAP50 of the model was improved after the modules and methods described in Section 2.2.1 were added to the baseline model. Particularly, the recall of tea shoots was significantly enhanced when all the improved methods were used, indicating that our proposed method benefits the detection of tea shoots that are prone to missed detection. Meanwhile, the number of model parameters and GFLOPs was optimized, which is consistent with our original intention to achieve real-time detection of dense and tiny tea shoots through a lightweight model. Note that the accuracy was significantly improved when the BH strategy was used (aggravating the computation of the early stages of the network). Still, the GFLOPs were also increased by introducing more computation. For this reason, this study used C3_DSConv to reduce the computational effort, and it can be seen that the GFLOPs were significantly reduced without affecting the accuracy.</p>
<p>Additionally, this study demonstrates the performance of the YOLOv5s model under other BH strategies. The details are presented in <xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref>. First, it can be seen that relative to the distribution of CSP1_n modules of the original YOLOv5s model, the model detection accuracy and especially the recall were significantly improved by using the method of early calculation of the weighted network. Second, the optimal performance was achieved when the number of CSP1_n modules in the four stages of the backbone was set to 8, 8, 3, and 3, respectively.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Performance demonstration of the YOLOv5s model under other BH strategies.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Number of CSP1_n modules</th>
<th valign="top" align="center">Parameters</th>
<th valign="top" align="center">P</th>
<th valign="top" align="center">R</th>
<th valign="top" align="center">mAP50</th>
<th valign="top" align="center">mAP95</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">3,6,9,3</td>
<td valign="top" align="left">7012822</td>
<td valign="top" align="left">0.825</td>
<td valign="top" align="left">0.733</td>
<td valign="top" align="left">0.801</td>
<td valign="top" align="left">0.425</td>
</tr>
<tr>
<td valign="top" align="left">5,7,6,1</td>
<td valign="top" align="left">6859030</td>
<td valign="top" align="left">0.842</td>
<td valign="top" align="left">0.753</td>
<td valign="top" align="left">0.811</td>
<td valign="top" align="left">0.435</td>
</tr>
<tr>
<td valign="top" align="left">4,8,6,2</td>
<td valign="top" align="left">6889814</td>
<td valign="top" align="left">0.838</td>
<td valign="top" align="left">0.740</td>
<td valign="top" align="left">0.803</td>
<td valign="top" align="left">0.427</td>
</tr>
<tr>
<td valign="top" align="left">7,8,3,1</td>
<td valign="top" align="left">6736022</td>
<td valign="top" align="left">0.844</td>
<td valign="top" align="left">0.745</td>
<td valign="top" align="left">0.813</td>
<td valign="top" align="left">0.442</td>
</tr>
<tr>
<td valign="top" align="left">6,7,4,2</td>
<td valign="top" align="left">6694934</td>
<td valign="top" align="left">0.836</td>
<td valign="top" align="left">0.747</td>
<td valign="top" align="left">0.813</td>
<td valign="top" align="left">0.436</td>
</tr>
<tr>
<td valign="top" align="left">10,6,4,1</td>
<td valign="top" align="left">6705238</td>
<td valign="top" align="left">0.843</td>
<td valign="top" align="left">0.746</td>
<td valign="top" align="left">0.810</td>
<td valign="top" align="left">0.439</td>
</tr>
<tr>
<td valign="top" align="left">9,8,2,2</td>
<td valign="top" align="left">6746326</td>
<td valign="top" align="left">0.839</td>
<td valign="top" align="left">0.746</td>
<td valign="top" align="left">0.813</td>
<td valign="top" align="left">0.443</td>
</tr>
<tr>
<td valign="top" align="left">8,7,3,3</td>
<td valign="top" align="left">6705238</td>
<td valign="top" align="left">0.840</td>
<td valign="top" align="left">0.752</td>
<td valign="top" align="left">0.811</td>
<td valign="top" align="left">0.437</td>
</tr>
<tr>
<td valign="top" align="left">9,6,3,3</td>
<td valign="top" align="left">6705238</td>
<td valign="top" align="left">0.837</td>
<td valign="top" align="left">0.747</td>
<td valign="top" align="left">0.807</td>
<td valign="top" align="left">0.437</td>
</tr>
<tr>
<td valign="top" align="left">9,9,3,1</td>
<td valign="top" align="left">6746326</td>
<td valign="top" align="left">0.839</td>
<td valign="top" align="left">0.753</td>
<td valign="top" align="left">0.811</td>
<td valign="top" align="left">0.441</td>
</tr>
<tr>
<td valign="top" align="left">8,9,2,2</td>
<td valign="top" align="left">6746326</td>
<td valign="top" align="left">0.836</td>
<td valign="top" align="left">0.748</td>
<td valign="top" align="left">0.811</td>
<td valign="top" align="left">0.440</td>
</tr>
<tr>
<td valign="top" align="left">8,8,3,3</td>
<td valign="top" align="left">
<bold>6746326</bold>
</td>
<td valign="top" align="left">
<bold>0.841</bold>
</td>
<td valign="top" align="left">
<bold>0.750</bold>
</td>
<td valign="top" align="left">
<bold>0.814</bold>
</td>
<td valign="top" align="left">
<bold>0.444</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Bold indicates the best experimental results.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3_3_3">
<label>3.3.3</label>
<title>Comparison of multimodal image fusion methods</title>
<p>In this set of experiments, the Dataset3 dataset was used as the experimental dataset, and it was divided into a training set, a validation set, and a test set at the ratio of 8:1:1. However, it is worth noting that the data were preprocessed differently according to different modal fusion methods. This is shown in detail in Section 2.2.2. Also, to further validate the effectiveness and superiority of our proposed baseline model and the multimodal feature fusion model, different experimental models were compared. The performance of the data layer fusion approach was compared on the YOLOv5s baseline and improved models. The performance of the feature layer fusion approach was compared on the CFT model proposed by <xref ref-type="bibr" rid="B28">Qingyun et&#xa0;al. (2021)</xref>, the HINet proposed by <xref ref-type="bibr" rid="B27">Park (2022)</xref>, and the YOLOv5-Multimodal model designed in this study. Besides, to show the impact of the baseline improvement-based approach and the introduction of the FFA model, Without_FFA and Without_Improve were added as the ablation experiments for the YOLOv5-Multimodal model. No pre-training weights were used for all models in the training process, and the test environments and configurations were identical during the experiments. <xref ref-type="table" rid="T7">
<bold>Table&#xa0;7</bold>
</xref> presents the specific comparison results.</p>
<table-wrap id="T7" position="float">
<label>Table&#xa0;7</label>
<caption>
<p>Comparison of experimental results of different fusion methods and different models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Fusion Method</th>
<th valign="top" align="center">Model</th>
<th valign="top" align="center">Parameters</th>
<th valign="top" align="center">P</th>
<th valign="top" align="center">R</th>
<th valign="top" align="center">mAP50</th>
<th valign="top" align="center">mAP95</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" rowspan="2" align="left">Data_Fusion1</td>
<td valign="top" align="left">YOLOv5s_3ch</td>
<td valign="top" align="left">7012822</td>
<td valign="top" align="left">0.833</td>
<td valign="top" align="left">0.740</td>
<td valign="top" align="left">0.804</td>
<td valign="top" align="left">0.446</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv5s_improve_3ch</td>
<td valign="top" align="left">5715670</td>
<td valign="top" align="left">0.848</td>
<td valign="top" align="left">0.754</td>
<td valign="top" align="left">0.820</td>
<td valign="top" align="left">0.446</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">Data_Fusion2</td>
<td valign="top" align="left">YOLOv5s_4ch</td>
<td valign="top" align="left">7013974</td>
<td valign="top" align="left">0.832</td>
<td valign="top" align="left">0.742</td>
<td valign="top" align="left">0.808</td>
<td valign="top" align="left">0.436</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv5s_improve_4ch</td>
<td valign="top" align="left">5722230</td>
<td valign="top" align="left">0.848</td>
<td valign="top" align="left">0.756</td>
<td valign="top" align="left">0.824</td>
<td valign="top" align="left">0.460</td>
</tr>
<tr>
<td valign="top" rowspan="5" align="left">Feature_Fusion</td>
<td valign="top" align="left">GPT(s)</td>
<td valign="top" align="left">44500982</td>
<td valign="top" align="left">0.840</td>
<td valign="top" align="left">0.745</td>
<td valign="top" align="left">0.810</td>
<td valign="top" align="left">0.426</td>
</tr>
<tr>
<td valign="top" align="left">HINet(s)</td>
<td valign="top" align="left">23738982</td>
<td valign="top" align="left">0.821</td>
<td valign="top" align="left">0.731</td>
<td valign="top" align="left">0.794</td>
<td valign="top" align="left">0.413</td>
</tr>
<tr>
<td valign="top" align="left">Without_FFA</td>
<td valign="top" align="left">11261174</td>
<td valign="top" align="left">0.807</td>
<td valign="top" align="left">0.718</td>
<td valign="top" align="left">0.774</td>
<td valign="top" align="left">0.394</td>
</tr>
<tr>
<td valign="top" align="left">Without_Improve</td>
<td valign="top" align="left">26424892</td>
<td valign="top" align="left">0.834</td>
<td valign="top" align="left">0.742</td>
<td valign="top" align="left">0.809</td>
<td valign="top" align="left">0.429</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv5s_Multimodal</td>
<td valign="top" align="left">
<bold>24764092</bold>
</td>
<td valign="top" align="left">
<bold>0.850</bold>
</td>
<td valign="top" align="left">
<bold>0.759</bold>
</td>
<td valign="top" align="left">
<bold>0.827</bold>
</td>
<td valign="top" align="left">
<bold>0.447</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Bold indicates the best experimental results.</p>
</table-wrap-foot>
</table-wrap>
<p>Overall, the detection accuracy of tea shoots was improved after the multimodal fusion method was used, indicating that the information in different modalities is complementary, and our conjecture in Section 2.2.2 is validated. Regarding the various fusion methods, the multimodal image fusion method using channel-based (Data_Fusion1) achieves a more considerable accuracy gain than the multimodal image fusion method using pixel-by-pixel (Data_Fusion2). However, it increases the number of parameters by a smaller amount. Meanwhile, the multimodal image fusion method with a feature layer introduces more parameters than the multimodal image fusion method based on the data layer. Notably, the mAP50 value of the model decreased when HINet was used directly. Because the HINet model extracts high-frequency information in the frequency domain, it loses more of the low-frequency information that guides the detection of small targets. Also, the information is not filtered and aligned in the cross-modal fusion process, thereby introducing some noise that affects the training and convergence of the model. For the GPT model, although the detection accuracy was improved, the use of the multi-head self-attention mechanism (MHSA) (<xref ref-type="bibr" rid="B38">Vaswani et&#xa0;al., 2017</xref>) in the cross-modal fusion module introduces a large number of parameters and computational effort, which is not acceptable in a low-cost agricultural application environment.</p>
<p>In contrast, the model YOLOv5s_Multimodal proposed in this study significantly reduced the number of parameters by purifying, fusing, and enhancing multimodal information in the frequency domain and obtained the best mAP50 value for the tea shoot detection. Meanwhile, by comparing the use of YOLOv5s and YOLOv5s_improve models in different fusion methods, it was found that both YOLOv5s_improve models performed optimally, which again demonstrated the superiority and robustness of the dense and tiny tea shoot detector designed in this study. Note that when the Without_FFA model was used, i.e., directly summing and fusing the features under two modes, the mAP50 value reached the lowest value, which was even lower than that of the unimodal target based on the YOLOv5s model. To analyze this result, the feature maps and 3D surface maps of the first fusion stage of the Without_FFA model and YOLOv5s_Multimodal model are shown in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>First fusion stage feature map visualization. <bold>(A)</bold> input RGB image; <bold>(D)</bold> input Depth image; <bold>(G)</bold> input IR image; <bold>(C)</bold> feature map of the first fusion stage in the Without_FFA model; <bold>(B)</bold> 3D surface map corresponding to the feature map of the first fusion stage in the Without_FFA model; <bold>(F)</bold> feature map of the first fusion stage in the YOLOv5s_Multimodal model; <bold>(E)</bold> 3D surface map corresponding to the feature map of the first fusion stage in the YOLOv5s_Multimodal model.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-g007.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="f7">
<bold>Figures&#xa0;7C, F</bold>
</xref> reveal that when the features extracted in different modalities are directly summed and fused, the resulting feature maps are relatively noisy, and the target edges will be more obvious for the pairs. This is because the coarse and cluttered feature information deteriorates the training and convergence of the model. However, when the FFA module was used to calibrate, purify, and enhance the feature information within and between each modality, the noise in the feature maps was significantly reduced. The tea shoot targets were more prominent, and the edges were more clearly defined. It can be seen from <xref ref-type="fig" rid="f7">
<bold>Figures&#xa0;7B, E</bold>
</xref> that in the 3D image with preserved spatial information, the tea shoots do not show significant gradient differences from the background compared to the direct summation mode of the multimodal feature information. However, after the FFA module was used again, the tea shoots exhibited noticeable gradient differences with the background leaves, which is beneficial for identifying and localizing tea shoots. Also, this demonstrates the effectiveness and superiority of our proposed FFA module on the multimodal tea shoot dataset.</p>
</sec>
<sec id="s3_3_4">
<label>3.3.4</label>
<title>Verification of scale matching</title>
<p>To investigate and validate the effectiveness of the scale-matching-based transfer learning method in tea shoot detection, a set of ablation comparison experiments was designed in this study. In the experiments, the color RGB image datasets in Dataset1 and Dataset2 were used as the experimental datasets, called Tea1 and Tea2, respectively, and they were divided into a training set, a validation set, and a test set at a ratio of 8:1:1, and YOLOv5s and YOLOv5s_improve were used as the experimental models. Firstly, this study compared the performance of the two models on Tea1 and Tea2. Secondly, Tea1 was aligned to the scale of Tea2 according to the scale matching method to obtain Tea1up, and the performance of the two models on Tea1up was compared. Similarly, Tea2 was aligned to the scale of Tea1 according to the scale-matching method to obtain Tea2d, and the performance of the two models on Tea2d was compared. Finally, the best weights obtained by training Tea2 and Tea2d were used as pre-training weights to train the model on Tea1 (denoted as Tea2_Tea1 and Tea2d_Tea1, respectively), and the best weights obtained by training on Tea1 and Tea1up were used as pre-training weights to train the model on Tea2 (denoted as Tea1_Tea2 and Tea1up_Tea2). The specific comparison results are given in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>. Note that the test environment and configuration during the experiments are identical.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Comparison of experimental results of different models using objective-based scale matching and transfer learning. <bold>(A)</bold> indicates the performance on the YOLOv5s model using different scale datasets and training strategies; <bold>(B)</bold> shows the performance on the YOLOv5s_improve model using different scale datasets and training strategies.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-g008.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref> shows that the precision, recall, and mAP50 values of the Tea1 and Tea2 datasets were reduced when their scales were aligned to that of the original dataset. This may be because the difficulty of small object detection was exacerbated by the reduced scale of Tea2. Besides, since Tea1 ignored the small objects in the image edges when increasing the scale, it resulted in fewer small target samples, thus affecting the training and convergence of the model. However, the model accuracy improvement could be stronger when Tea1 and Tea2 were used to guide each other&#x2019;s learning, and the scale mismatch problem may arise. When the scale-aligned datasets Tea2d and Tea1up were used to guide the model to learn on the Tea1 and Tea2 datasets, respectively, the detection accuracy was significantly improved. Additionally, to more clearly compare the performance of different scale datasets and pre-training strategies during model training and validation, the localization loss curve of the YOLOv5s_improve model on the validation set is shown in <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Plots of box loss curves on the YOLOv5s_improve model for different scale datasets and pre-training strategies. <bold>(A)</bold> Box loss profile plots of Tea1 at different scales and pre-training strategies; <bold>(B)</bold> Box loss profile plots of Tea2 at different scales and pre-training strategies.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-g009.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="f9">
<bold>Figures&#xa0;9A, B</bold>
</xref> show that when the pre-training weights were used, the initial values of the localization loss were significantly lower, with relatively small curve oscillations, and the loss converged relatively quickly. However, the localization loss converged best when the corresponding scale was used as the pre-training dataset. This also demonstrates the effectiveness of the target-based scale-matching method used in this study in guiding the small target detection task.</p>
</sec>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Heat map visualization</title>
<p>To more intuitively illustrate the impact of model improvements, explicitly modifying the baseline model for dense and small targets, and the effectiveness of multimodal feature fusion methods, this study used a gradient-weighted class activation mapping (Grad-CAM) (<xref ref-type="bibr" rid="B32">Selvaraju et&#xa0;al., 2016</xref>) to visualize the model considering the target based on tea shoots. Grad-CAM can exploit the gradient of any target concept to flow into the final convolution layer, thereby generating a rough localization map and displaying it in the form of weights, where the weight values are shown in red, yellow, green, and blue colors in decreasing order. The redder the color in the corresponding graph, the more critical the region for tea shoot detection. <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10</bold>
</xref> shows the heat map visualization results for different models under different inspection conditions.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Heat map visualization results for different models with different detection conditions. <bold>(A&#x2013;E)</bold> The input images; <bold>(F&#x2013;J)</bold> The results of YOLOv5s; <bold>(K&#x2013;O)</bold> The results of YOLOv5s_improve; <bold>(P&#x2013;T)</bold> The results of YOLOv5s-Multimodal.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-g010.tif"/>
</fig>
<p>Both YOLOv5s_improve and YOLOv5s-Multimodal models perform better than YOLOv5s in various cases, e.g., the color of tea shoots is similar to the background, tea shoots are relatively sparse, the target scale is rather large, tea shoots are dense and tiny, the color of tea shoots differs from the background, and the target scale is relatively large. Note that when the tea shoot has a similar color to the leaf and its background is difficult, the YOLOv5s model collects minimal information and does not focus on many tiny tea shoot objects. However, YOLOv5s_improve focuses on more tiny tea shoot objects by enhancing the retention and extraction of detailed texture features. However, it is difficult for YOLOv5s and YOLOv5s_improve to focus on the groups of tea shoots with high overlap, especially the tiny tea shoots in the overlap case where the tea shoots are relatively dense and overlapping occlusion occurs. However, the multimodal model YOLOv5s-Multimodal has multi-class information input, so it can find more tea shoots and has better segmentation ability for tea shoot groups with high overlap. Besides, it is no longer limited to the part of the stem tip. The model also considers the related connecting stems, leaves, and stems. This demonstrates the superiority of YOLOv5s-Multimodal for tea shoot detection.</p>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>Visualization of results</title>
<p>To more intuitively compare the performance of different detection models and different fusion methods on the tea shoot detection task in a natural environment, this study performed a comparative analysis of the visualization results of different types of samples after recognition. In this study, YOLOv5s (single modal), YOLOv5s_improve (single modal), YOLOv5s_improve_3ch (multimodal), YOLOv5s_improve_4ch (multimodal), and YOLOv5s-Multimodal (multimodal) were used on the test set of the corresponding experimental dataset. The inference was conducted, and the performance of these models under different detection conditions is shown in <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11</bold>
</xref>.</p>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>Visualization results of different detection models and methods under different detection conditions. <bold>(A&#x2013;E)</bold> The test results of YOLOv5s; <bold>(F&#x2013;J)</bold> The test results of YOLOv5s_improve; <bold>(K&#x2013;O)</bold> The test results of YOLOv5s_improve_3ch; <bold>(P&#x2013;T)</bold> The test results of YOLOv5s_improve_4ch; <bold>(U&#x2013;Y)</bold> The test results of YOLOv5s-Multimodal. The green, blue, and red boxes indicate true positive (TP), false positive (FP), and false negative (FN) predictions, respectively.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-14-1224884-g011.tif"/>
</fig>
<p>In <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11</bold>
</xref>, the first column shows relatively sparse and tiny tea shoot targets. The second column shows rather large and sparse tea shoot objects. The third column shows relatively dense and small tea-shoot objects. The fourth column shows relatively large and thick tea shoot objects, and the fifth column shows rather complex tea shoot backgrounds. Overall, under different challenging conditions, YOLOv5s_improve and multimodal-based fusion methods can substantially reduce false negatives (FN), and there is a significant increase in true positives (TP) of YOLOv5s-Multimodal visualization results, which again demonstrates the superiority and robustness of our proposed method.</p>
</sec>
</sec>
<sec id="s4" sec-type="conclusions">
<label>4</label>
<title>Conclusion</title>
<p>This study aims to improve the detection accuracy of dense and tiny tea shoots in a natural environment and realize real-time object detection. In this paper, a real-time dense and small tea shoot target detection algorithm is designed based on multimodal image data, baseline detection model architecture, multimodal image fusion method, scale matching, and migration learning techniques.</p>
<p>First, to address the difficulty of detecting dense and tiny tea shoots in a complex environment, this paper replaces the Focus layer in the YOLOv5s baseline, which tends to lose detailed information, with a Conv layer. This helps to extract features for tea shoot detection by enhancing the computation of the early stage of the network, while DSConv is used to balance the introduced computation and improve the model&#x2019;s attention to detail texture, and the recall of targets at different scales is enhanced by the FPN structure. The improved model achieves an accuracy of 84.1%, a recall of 75.1%, and a mAP50 value of 81.8% on low-resolution RGB tea shoot images, showing an improvement of 1.6%, 1.8%, and 1.7%, respectively, compared to the original YOLOv5s model.</p>
<p>Second, to make up for the deficiency of RGB image-based tea shoot detection, two data layer-based multimodal fusion methods and one feature layer-based multimodal fusion method are investigated in this paper. Compared with the images based on a single modality, the mAP50 values of Data_Fusion1 and Data_Fusion2 are improved by 1.9% and 2.3%, respectively. Besides, the Feature_Fusion method proposed in this paper achieves the highest mAP50 value of 82.7% at a relatively small number of parameters compared to other feature layer-based multimodal fusion methods. This study mainly introduces a frequency domain-based cross-modal attention fusion module to purify, align, fuse, and enhance multimodal information with minor computational effort and few parameters. Thus, more complementary information beneficial to detecting dense and tiny tea shoots in complex environments is obtained. Although the feature layer-based multimodal fusion approach proposed in this study introduces a larger number of parameters compared with the data layer-based multimodal fusion approach, the former achieves optimal performance, providing a reference for feature layer-based multimodal fusion approaches. In the future, we will continue to explore making the feature layer-based multimodal fusion approach more lightweight.</p>
<p>Finally, to investigate the differences and effects of training at different scales, this study designed comparison experiments on two tea shoot datasets with target scale differences, and their detection results in different periods were compared. It can be found that small-scale target detection is very complex. To improve the accuracy and recall of tea shoot detection in various scales, this study uses migration learning techniques and scale matching to align datasets of different scales and mutually guide the models to learn at the corresponding scales, thereby improving the performance of small target detection.</p>
<p>However, there are still some drawbacks and limitations in this study. First, although about 50,000 tea shoot samples were used for training in this study, the model&#x2019;s generalization still needs to be enhanced because the image dataset is relatively small and does not cover all natural scenes. Second, affected by the data acquisition equipment, there are some voids and noises in the acquired depth maps and infrared images, and in the future, we will consider using techniques such as depth estimation, depth enhancement, and image denoising to obtain high-quality depth images and infrared images. Finally, also affected by the data acquisition equipment, the Kinectv2 device could initially acquire high-resolution RGB images; however, since the color camera has a different field of view from the depth camera, the acquired high-resolution images are not aligned with the depth images and infrared images, and the existing alignment techniques based on traditional image processing have some errors. These errors cannot be neglected in the detection task of dense and small tea shoots. In the future, we will consider introducing a deep learning-based image alignment method and combining it with super-resolution techniques to further improve the detection performance of dense and tiny tea shoots.</p>
</sec>
<sec id="s5" sec-type="data-availability">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found in the article/Supplementary Material.</p>
</sec>
<sec id="s6" sec-type="author-contributions">
<title>Author contributions</title>
<p>LS conceptualized this study, conducted experiments, wrote the original draft, and revised the manuscript. LS and ZL wrote the manuscript and performed the experiments. LS, ZC, and ZL made the experimental plan, supervised the work, and revised the manuscript. BZ and HL performed the data analysis and revised the manuscript. LS and ZC made the experimental plan and revised the manuscript. JM and YW evaluated the developed technique and revised the manuscript. LS designed the experimental plan, supervised the work, and revised the manuscript. All authors have read and agreed to the published version of the manuscript.</p>
</sec>
</body>
<back>
<sec id="s7" sec-type="funding-information">
<title>Funding</title>
<p>This work was funded by the Research and application of key technologies for intelligent spraying based on machine vision (key technology research project) of Sichuan Provincial Department of Science and Technology (grant number 22ZDYF0095).</p>
</sec>
<ack>
<title>Acknowledgments</title>
<p>Thanks to all the partners of AI Studio for their support.</p>
</ack>
<sec id="s8" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s9" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="web">
<source>Jsbroks/coco-annotator: :pencil2: web-based image segmentation tool for object detection, localization, and keypoints</source>. Available at: <uri xlink:href="https://github.com/jsbroks/coco-annotator">https://github.com/jsbroks/coco-annotator</uri> (Accessed <access-date>May 14, 2023</access-date>).</citation>
</ref>
<ref id="B2">
<citation citation-type="web">
<source>Kinect/PyKinect2: wrapper to expose kinect for Windows v2 API in Python</source>. Available at: <uri xlink:href="https://github.com/Kinect/PyKinect2">https://github.com/Kinect/PyKinect2</uri> (Accessed <access-date>May 14, 2023</access-date>).</citation>
</ref>
<ref id="B3">
<citation citation-type="web">
<source>Releases &#xb7; ultralytics/yolov5</source>. Available at: <uri xlink:href="https://github.com/ultralytics/yolov5/releases">https://github.com/ultralytics/yolov5/releases</uri> (Accessed <access-date>May 15, 2023</access-date>).</citation>
</ref>
<ref id="B4">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Bochkovskiy</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>C.-Y.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H.-Y. M.</given-names>
</name>
</person-group> (<year>2020</year>) <article-title>YOLOv4: optimal speed and accuracy of object detection</article-title>. <source>arXiv preprint</source> arXiv:2004.10934. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2004.10934</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bojie</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Weizhong</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Ke</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Research on tea bud identification technology based on HSI/HSV color transformation</article-title>,&#x201d; in <conf-name>2019 6th International Conference on Information Science and Control Engineering (ICISCE)</conf-name>. (<publisher-name>IEEE</publisher-name>), <fpage>511</fpage>&#x2013;<lpage>515</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICISCE48695.2019.00108</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cao</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Cai</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Cao</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>H.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Lightweight tea bud recognition network integrating GhostNet and YOLOv5</article-title>. <source>Math. Biosci. Eng.</source> <volume>19</volume> (<issue>12</issue>), <fpage>12897</fpage>&#x2013;<lpage>12914</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3934/MBE.2022602</pub-id></citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Majeed</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Karkee</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Q.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Faster R&#x2013;CNN&#x2013;based apple detection in dense-foliage fruiting-wall trees using RGB and depth features for robotic harvesting</article-title>. <source>Biosyst. Eng.</source> <volume>197</volume>, <fpage>245</fpage>&#x2013;<lpage>256</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/J.BIOSYSTEMSENG.2020.07.007</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gan</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>W. S.</given-names>
</name>
<name>
<surname>Alchanatis</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Ehsani</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Schueller</surname> <given-names>J. K.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Immature green citrus fruit detection using color and thermal images</article-title>. <source>Comput. Electron Agric.</source> <volume>152</volume>, <fpage>117</fpage>&#x2013;<lpage>125</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/J.COMPAG.2018.07.011</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gen&#xe9;-Mola</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Vilaplana</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Rosell-Polo</surname> <given-names>J. R.</given-names>
</name>
<name>
<surname>Morros</surname> <given-names>J. R.</given-names>
</name>
<name>
<surname>Ruiz-Hidalgo</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Gregorio</surname> <given-names>E.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Multi-modal deep learning for Fuji apple detection using RGB-D cameras and their radiometric capabilities</article-title>. <source>Comput. Electron Agric.</source> <volume>162</volume>, <fpage>689</fpage>&#x2013;<lpage>698</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/J.COMPAG.2019.05.016</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Han</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Xiao</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Qin</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Song</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Ding</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Mei</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Developing situations of tea plucking machine</article-title>. <source>Engineering</source> <volume>06</volume>, <fpage>268</fpage>&#x2013;<lpage>273</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.4236/ENG.2014.66031</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Spatial pyramid pooling in deep convolutional networks for visual recognition</article-title>. <source>IEEE Trans Pattern Anal Mach Intell.</source> <volume>37</volume> (<issue>9</issue>), <fpage>346</fpage>&#x2013;<lpage>361</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-319-10578-9_23</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Deep residual learning for image recognition</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision &amp; Pattern Recognition</conf-name>. <fpage>770</fpage>&#x2013;<lpage>778</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2016.90</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hong</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Yao</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Plaza</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Chanussot</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Graph convolutional networks for hyperspectral image classification</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>59</volume>, <fpage>5966</fpage>&#x2013;<lpage>5978</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2020.3015157</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jocher</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Chaurasia</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Stoken</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Borovec</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Kwon</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Fang</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>ultralytics/yolov5: v6.1 - TensorRT, TensorFlow edge TPU and OpenVINO export and inference</article-title>. <source>Zenodo</source> doi:&#xa0;<pub-id pub-id-type="doi">10.5281/zenodo.1234</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Jocher</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Chaurasia</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Qiu</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). <source>YOLO by Ultralytics (Version 8.0.0)</source>. [Computer software]. <uri xlink:href="https://github.com/ultralytics/ultralytics">https://github.com/ultralytics/ultralytics</uri>. doi:&#xa0;<pub-id pub-id-type="doi">10.5281/zenodo.1234</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Karunasena</surname> <given-names>G. M. K. B.</given-names>
</name>
<name>
<surname>Priyankara</surname> <given-names>H. D. N. S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Tea bud leaf identification by using machine learning and image processing techniques</article-title>. <source>Int. J. Sci. Eng. Res.</source> <volume>11</volume>, <fpage>624</fpage>&#x2013;<lpage>628</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.14299/IJSER.2020.08.02</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>Y.y.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Automatic recognition of tea bud image based on support vector machine</article-title>,&#x201d; in <conf-name>Advanced Hybrid Information Processing: 4th EAI International Conference, ADHIP 2020</conf-name>, <conf-loc>Binzhou, China</conf-loc>. (<publisher-name>Springer International Publishing</publisher-name>) Vol. <volume>348</volume>, <fpage>279</fpage>&#x2013;<lpage>290</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-030-67874-6_26</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>He</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Jia</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lyu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>C.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>High-efficiency tea shoot detection method <italic>via</italic> a compressed deep learning model</article-title>. <source>Int. J. Agric. Biol. Eng.</source> <volume>15</volume>, <fpage>159</fpage>&#x2013;<lpage>166</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.25165/J.IJABE.20221503.6896</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Weng</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Geng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>) <source>YOLOv6: a single-stage object detection framework for industrial applications</source>. Available at: <uri xlink:href="https://arxiv.org/abs/2209.02976v1">https://arxiv.org/abs/2209.02976v1</uri> (Accessed <access-date>May 15, 2023</access-date>).</citation>
</ref>
<ref id="B20">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>T.-Y.</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Hariharan</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Belongie</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2016</year>) <source>Feature pyramid networks for object detection</source>. Available at: <uri xlink:href="https://arxiv.org/abs/1612.03144v2">https://arxiv.org/abs/1612.03144v2</uri> (Accessed <access-date>May 14, 2023</access-date>).</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Zhong</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Ying</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Automatic monitoring of lettuce fresh weight by multi-modal fusion based deep learning</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/FPLS.2022.980581</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Xiong</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>In-field citrus detection and localisation based on RGB-D image analysis</article-title>. <source>Biosyst. Eng.</source> <volume>186</volume>, <fpage>34</fpage>&#x2013;<lpage>44</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/J.BIOSYSTEMSENG.2019.06.019</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Qi</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Qin</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Shi</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Jia</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>) <source>Path aggregation network for instance segmentation</source>. Available at: <uri xlink:href="https://arxiv.org/abs/1803.01534v4">https://arxiv.org/abs/1803.01534v4</uri> (Accessed <access-date>May 14, 2023</access-date>).</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Majeed</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>R.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>Improved kiwifruit detection using pre-trained VGG16 with RGB and NIR information fusion</article-title>. <source>IEEE Access</source> <volume>8</volume>, <fpage>2327</fpage>&#x2013;<lpage>2336</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2019.2962513</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Nascimento</surname> <given-names>M. G.</given-names>
</name>
<name>
<surname>Do Prisacariu</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Fawcett</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>DSConv: efficient convolution operator</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE International Conference on Computer Vision</conf-name>. <fpage>5147</fpage>&#x2013;<lpage>5156</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2019.00525</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ning</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Guan</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Spratling</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Rethinking the backbone architecture for tiny object detection</article-title>. <source>arXiv preprint</source> arXiv:2303.11267. <fpage>103</fpage>&#x2013;<lpage>114</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.5220/0011643500003417</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Park</surname> <given-names>S.-H. J.-S. S. B. S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>High-frequency interchange network for multispectral object detection</article-title>. <source>J. Korea Institute Inf. Commun. Eng.</source> <volume>26</volume>, <fpage>1121</fpage>&#x2013;<lpage>1129</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.6109/JKIICE.2022.26.8.1121</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qingyun</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Dapeng</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhaokui</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Cross-modality fusion transformer for multispectral object detection</article-title>. <source>arXiv preprint</source> arXiv:2111.00273. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2111.00273</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Divvala</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Farhadi</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>You only look once: unified, real-time object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition</conf-name>. <fpage>779</fpage>&#x2013;<lpage>788</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2016.91</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Farhadi</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>) <source>YOLOv3: an incremental improvement</source>. Available at: <uri xlink:href="https://arxiv.org/abs/1804.02767v1">https://arxiv.org/abs/1804.02767v1</uri> (Accessed <access-date>May 15, 2023</access-date>).</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rong</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Tomato cluster detection and counting using improved YOLOv5 based on RGB-D fusion</article-title>. <source>Comput. Electron Agric.</source> <volume>207</volume>, <elocation-id>107741</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/J.COMPAG.2023.107741</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Selvaraju</surname> <given-names>R. R.</given-names>
</name>
<name>
<surname>Cogswell</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Das</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Vedantam</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Parikh</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Batra</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Grad-CAM: Visual explanations from deep networks via gradient-based localization</article-title>. <source>Int. J. Comput. Vis.</source> <volume>128</volume>, <fpage>336</fpage>&#x2013;<lpage>359</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11263-019-01228-7</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Stefanics</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Fox</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>COCO Annotator: Web-Based image segmentation tool for object detection, localization, and keypoints</article-title>. <source>ACM SIGMultimedia Records</source> <volume>13</volume> (<issue>3</issue>), <fpage>1</fpage>&#x2013;<lpage>1</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1145/3578495.3578502</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Chai</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zeng</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Noise-tolerant RGB-D feature fusion network for outdoor fruit detection</article-title>. <source>Comput. Electron Agric.</source> <volume>198</volume>, <elocation-id>107034</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/J.COMPAG.2022.107034</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sun</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Xiao</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Deep high-resolution representation learning for human pose estimation</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition</conf-name>. <fpage>5693</fpage>&#x2013;<lpage>5703</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2019.00584</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Automatic apple recognition based on the fusion of color and 3D feature for robotic fruit picking</article-title>. <source>Comput. Electron Agric.</source> <volume>142</volume>, <fpage>388</fpage>&#x2013;<lpage>396</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/J.COMPAG.2017.09.019</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Xue</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Qi</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wan</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Mao</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Detection of passion fruits and maturity classification using Red-Green-Blue depth images</article-title>. <source>Biosyst. Eng.</source> <volume>175</volume>, <fpage>156</fpage>&#x2013;<lpage>167</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/J.BIOSYSTEMSENG.2018.09.004</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Vaswani</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Shazeer</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Parmar</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Uszkoreit</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Jones</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Gomez</surname> <given-names>A. N.</given-names>
</name>
<etal/>
</person-group>. (<year>2017</year>) <source>Attention is all you need. Adv. Neural Inf. Process. Syst. 2017-December, 5999&#x2013;6009</source>. Available at: <uri xlink:href="https://arxiv.org/abs/1706.03762v5">https://arxiv.org/abs/1706.03762v5</uri> (Accessed <access-date>May 14, 2023</access-date>).</citation>
</ref>
<ref id="B39">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C.-Y.</given-names>
</name>
<name>
<surname>Bochkovskiy</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H.-Y. M.</given-names>
</name>
</person-group> (<year>2022</year>) <source>YOLOv7: trainable bag-of-freebies sets new state-of-the-art for real-time object detectors</source>. Available at: <uri xlink:href="https://arxiv.org/abs/2207.02696v1">https://arxiv.org/abs/2207.02696v1</uri> (Accessed <access-date>May 15, 2023</access-date>).</citation>
</ref>
<ref id="B40">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C. Y.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H. Y. M.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>Y. H.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>P. Y.</given-names>
</name>
<name>
<surname>Hsieh</surname> <given-names>J. W.</given-names>
</name>
<name>
<surname>Yeh</surname> <given-names>I. H.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>CSPNet: a new backbone that can enhance learning capability of CNN</article-title>,&#x201d; in <conf-name>IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops</conf-name>. <fpage>390</fpage>&#x2013;<lpage>391</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPRW50498.2020.00203</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Hong</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Chanussot</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Convolutional neural networks for multimodal remote sensing data classification</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>60</volume>, <fpage>1</fpage>&#x2013;<lpage>10</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2021.3124913</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Hong</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Chanussot</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>UIU-Net: U-Net in U-Net for infrared small object detection</article-title>. <source>IEEE Trans. Image Process.</source> <volume>32</volume>, <fpage>364</fpage>&#x2013;<lpage>376</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TIP.2022.3228497</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Hong</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Tian</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Chanussot</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Tao</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>ORSIm detector: a novel object detection framework in optical remote sensing imagery using spatial-frequency channel features</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>57</volume>, <fpage>5146</fpage>&#x2013;<lpage>5158</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2019.2897139</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiaoxiao</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Shaomin</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Yongyu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zhihao</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Tingting</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Detection algorithm of tea tender buds under complex background based on deep learning</article-title>. <source>J. Hebei University (Natural Sci. Edition)</source> <volume>39</volume>, <fpage>211</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3969/J.ISSN.1000-1565.2019.02.015</pub-id>
</citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Shang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Ding</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Detection and classification of tea buds based on deep learning</article-title>. <source>Comput. Electron Agric.</source> <volume>192</volume>, <elocation-id>106547</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/J.COMPAG.2021.106547</pub-id>
</citation>
</ref>
<ref id="B46">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Yu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Gong</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Ye</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Han</surname> <given-names>Z.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Scale match for tiny person detection</article-title>,&#x201d; in <conf-name>Proceedings - 2020 IEEE Winter Conference on Applications of Computer Vision, WACV 2020</conf-name>. <fpage>1257</fpage>&#x2013;<lpage>1265</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/WACV45572.2020.9093394</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>