<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2024.1492504</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>BHC-YOLOV8 : improved YOLOv8-based BHC target detection model for tea leaf disease and defect in real-world scenarios</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Zhan</surname>
<given-names>BaiShao</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1498517"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Xiong</surname>
<given-names>Xi</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Xiaoli</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1614678"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Luo</surname>
<given-names>Wei</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>School of Electrical and Automation Engineering, East China Jiaotong University</institution>, <addr-line>Nanchang</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>College of Biosystems Engineering and Food Science, Zhejiang University</institution>, <addr-line>Hangzhou</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Yuzhen Lu, Michigan State University, United States</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Jiajun Xu, Michigan State University, United States</p>
<p>Ziwei Lyu, Huazhong Agricultural University, China</p>
<p>Zhiming Zhang, Wuhan Textile University, China</p>
<p>Angelo Cardellicchio, National Research Council (CNR), Italy</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: BaiShao Zhan, <email xlink:href="mailto:3050@ecjtu.edu.cn">3050@ecjtu.edu.cn</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>02</day>
<month>12</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>15</volume>
<elocation-id>1492504</elocation-id>
<history>
<date date-type="received">
<day>09</day>
<month>09</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>28</day>
<month>10</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Zhan, Xiong, Li and Luo</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Zhan, Xiong, Li and Luo</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>The detection efficiency of tea diseases and defects ensures the quality and yield of tea. However, in actual production, on the one hand, the tea plantation has high mountains and long roads, and the safety of inspection personnel cannot be guaranteed; on the other hand, the inspection personnel have factors such as lack of experience and fatigue, resulting in incomplete and slow testing results. Introducing visual inspection technology can avoid the above problems.</p>
</sec>
<sec>
<title>Methods</title>
<p>Firstly, a dynamic sparse attention mechanism (BiFormer) is introduced into the model backbone. It filters out irrelevant key-value pairs at the coarse region level, utilizing sparsity to save computation and memory, and then applies fine-grained token-to-token attention in the remaining candidate regions. Secondly, Haar wavelets are introduced to improve the down sampling module. By processing the input information flow horizontally, vertically, and diagonally, the original image is reconstructed. Finally, a new feature fusion network is designed using a multi-head attention mechanism to decompose the main network into several cascaded stages, each stage comprising a sub-backbone for parallel processing of different features. Simultaneously, skip connections are performed on features from the same layer, and unbounded fusion weight normalization is introduced to constrain the range of each weight value.</p>
</sec>
<sec>
<title>Results</title>
<p>After the above improvements, compared with the current mainstream models, the confidence level increased by 7.1% and mAP0.5 increased by 8%, reaching 94.5%. After conducting ablation experiments and comparing with mainstream models, the feature fusion network proposed in this paper reduced computational complexity by 10.6 GFLOPs, increased confidence by 2.7%, and increased mAP0.5 by 3.2%.</p>
</sec>
<sec>
<title>Discussion</title>
<p>This paper developed a new network based on YOLOv8 to overcome the difficulties of tea diseases and defects such as small target, multiple occlusion and complex background.</p>
</sec>
</abstract>
<kwd-group>
<kwd>BiFormer</kwd>
<kwd>Haar</kwd>
<kwd>down sampling</kwd>
<kwd>skip connections</kwd>
<kwd>YOLOv8</kwd>
<kwd>tea</kwd>
</kwd-group>
<counts>
<fig-count count="9"/>
<table-count count="5"/>
<equation-count count="1"/>
<ref-count count="37"/>
<page-count count="12"/>
<word-count count="5191"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Technical Advances in Plant Science</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Tea leaf defects and diseases significantly impact both the yield and quality of tea. Statistics show that these issues result in an annual loss of nearly 5% of tea production (<xref ref-type="bibr" rid="B6">Chen et&#xa0;al., 2020</xref>). Traditional preventive measures heavily rely on farmers&#x2019; experience and manual inspection, which present various challenges (<xref ref-type="bibr" rid="B2">Atila et&#xa0;al., 2021</xref>). Some tea gardens are located in steep terrains, making timely inspections difficult. Additionally, large areas of tea cultivation mean that manual inspection efficiency is low, posing potential risks (<xref ref-type="bibr" rid="B3">Baranwal et&#xa0;al., 2021</xref>). Given the current production landscape, manual identification methods are insufficient to meet the demands of modern large-scale cultivation (<xref ref-type="bibr" rid="B4">Barburiceanu et&#xa0;al., 2021</xref>).</p>
<p>With the continuous development of image processing technology, traditional manual agriculture in China is transitioning towards computerized, intelligent, and digital agriculture (<xref ref-type="bibr" rid="B11">Dhaka et&#xa0;al., 2021</xref>). Utilizing computer vision (<xref ref-type="bibr" rid="B16">Li et&#xa0;al., 2022</xref>) to prevent tea leaf defects not only reduces economic losses from manual labor but also enhances tea yield and quality (<xref ref-type="bibr" rid="B27">Tiwari et&#xa0;al., 2021</xref>). <xref ref-type="bibr" rid="B25">Sun et&#xa0;al. (2019)</xref> proposed a new method combining simple linear iterative clustering (SLIC) and SVM to achieve accurate extraction of tea tree leaf disease saliency maps against complex backgrounds. With the advancement of deep learning, an increasing number of researchers are exploring its application in detecting crop leaf diseases and pest infestations. The rise of image recognition technologies has particularly highlighted the effectiveness of convolutional neural networks (CNNs) in the automatic classification and identification of plant diseases (<xref ref-type="bibr" rid="B8">Chen et&#xa0;al., 2019</xref>). For example, <xref ref-type="bibr" rid="B8">Chen et&#xa0;al. (2019)</xref> developed a CNN model named LeafNet, designed to automatically extract features from images of tea tree diseases.</p>
<p>While the above methods have performed well in the treatment of crop diseases, they focus solely on either crop disease image identification or classification. In recent years, with the rapid development of chip computing power, deep learning technology relying on computing power has also been applied in the field of image detection and processing. Its advantages are mainly reflected in its powerful feature extraction ability, high accuracy, strong generalization ability, real-time performance, and intelligent processing (<xref ref-type="bibr" rid="B31">Wang et&#xa0;al., 2024b</xref>). Algorithms based on deep learning can learn effective feature representations from massive image data, capturing subtle and complex features, which is crucial for accurate detection; meanwhile, deep learning models can learn advanced features of images and accurately detect and classify new, unseen images. Image detection networks based on deep learning have been categorized into two main types: two-stage and one-stage detection networks (<xref ref-type="bibr" rid="B14">Jiao et&#xa0;al., 2019</xref>). Faster Region-Based Convolutional Neural Networks (Faster R-CNN) stand out as a prominent example of the former. Although Faster R-CNN offers high detection accuracy (<xref ref-type="bibr" rid="B22">Ren et&#xa0;al., 2016</xref>), its slower processing speed fails to meet real-time application demands. In contrast, one-stage detection networks, including You Only Look Once (YOLO) (<xref ref-type="bibr" rid="B20">Redmon et&#xa0;al., 2016</xref>), Single Shot MultiBox Detector (SSD) (<xref ref-type="bibr" rid="B19">Liu et&#xa0;al., 2016</xref>), and RetinaNet (<xref ref-type="bibr" rid="B18">Lin et&#xa0;al., 2017</xref>), are favored for their efficiency. The YOLO family, in particular, has gained significant traction in agriculture due to its ability to deliver both speedy and accurate detections. <xref ref-type="bibr" rid="B26">Tian et&#xa0;al. 
(2019)</xref> employed YOLOv3 to design a system capable of real-time detection of apples at three different growth stages within an orchard. <xref ref-type="bibr" rid="B23">Roy et&#xa0;al. (2022)</xref> enhanced YOLOv4 to create a high-performance, real-time, fine-grained target detection framework adept at navigating challenges such as dense distribution and irregular morphology. <xref ref-type="bibr" rid="B24">Sun et&#xa0;al. (2022)</xref> introduced an innovative approach by integrating the YOLO-v4 deep learning network with computer graphics algorithms for improved segmentation of overlapping tree crowns. Additionally, <xref ref-type="bibr" rid="B10">Dai and Fan (2022)</xref> developed a crop leaf disease detection method named YOLOv5-CAcT, which is based on the latest YOLOv5 model, showcasing the ongoing evolution and application of these networks in agricultural settings. <xref ref-type="bibr" rid="B32">Weihao et&#xa0;al. (2023)</xref> proposed a tea disease identification model based on YOLOv7, achieving a recognition accuracy of 94.2% for five types of tea diseases. However, these methods were trained on single leaf datasets rather than directly captured from tea plants in real production environments, limiting their applicability in practical scenarios.</p>
<p>In production and daily life, drone inspection is a very practical means. However, in order to ensure their own safety, drones need to be 40-100cm away from tea trees, and the captured images will inevitably capture fallen leaves and weeds in the gaps between tea trees (<xref ref-type="bibr" rid="B35">Yuan et&#xa0;al., 2022</xref>), which will seriously interfere with the accuracy of the model. To solve the above problems, this paper inserts the BiFormer attention module into the backbone layer and adds a detection head to improve the detection success rate in complex backgrounds; at the same time, conventional sampling modules cannot distinguish between fallen leaves and pests and diseases. This paper introduces Haar wavelet function to improve the downsampling module, which can identify disease defects without interference from fallen leaves and weeds. Finally, in order to ensure the lightweighting of the model, a new feature fusion network was designed for the entire model to reduce computational complexity and facilitate deployment on mobile devices.</p>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Image data</title>
<p>Due to the lack of authoritative public tea datasets, the data used in this article was collected in April and May at longitude 115&#xb0;8'14.54''E and latitude 32&#xb0;43'47.75''N. These images were captured under natural light using a Huawei Mate60 portable device and a Sony ILME-FX30B camera, with a total of 4000 data samples collected. The pixel resolution of the images is 3024 &#xd7; 4032. Among them, tea farmers and tea experts identified 43 images as red leaf spots, 213 images as algal leaf spots, 324 images as bird eye disease, 1102 images as gray wilt, 43 images as white spots, 75 images as anthracnose, 1213 images as brown wilt, and 987 images as healthy leaves. Due to the limited data collected on tea defects and diseases, and the fact that the images were taken under clear weather conditions, this paper simulated adverse conditions to improve the generalization performance of the model. These simulation conditions include defocused images, partial data loss, heavy rain and snow. Data augmentation simulated conditions such as partial image loss, motion blur, early morning and dusk lighting, as well as fog, rain, snow, and wind. This method not only simulates various situations encountered in actual production, but also improves the generalization performance of the training dataset. After scaling up the original dataset by 2.5 times, a total of 10000 images were obtained. The dataset includes 10000 annotated bounding boxes (BBOX) for all defect types. Among them, 80% is the training set and 20% is the validation set. Each bounding box is manually annotated using open-source annotation tools to ensure that every defect is fully enclosed by its BBOX. <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref> shows a subset of the enhanced dataset.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>A subset of the enhanced tea leaf disease and defect dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1492504-g001.tif"/>
</fig>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>YOLOv8 detection algorithm</title>
<p>The model in this paper adopts an improved CSPDarknet53 as the backbone network (<xref ref-type="bibr" rid="B28">Wang et&#xa0;al., 2023</xref>) for YOLOv8. It conducts down sampling on input features five times, resulting in five different scales of features, denoted as B1 to B5. The structure of the backbone network is illustrated in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2F</bold>
</xref>. The CSP (Cross Stage Partial) module in the original backbone network of previous versions is replaced by the C2f module. The structure of the C2f module is shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2D</bold>
</xref>, where &#x2018;n&#x2019; represents the number of bottlenecks. The C2f module adopts gradient parallel connections, enriching the information flow of the feature extraction network while maintaining a lightweight design. The ConvModule module conducts convolutional operations on input information, followed by batch normalization, and then utilizes the SiLU activation function to obtain the output result, as shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2C</bold>
</xref>. The backbone network concludes by utilizing an improved down sampling module to pool input feature maps into a fixed-size map for adaptive-size output. Compared to the original Spatial Pyramid Pooling - Fast (SPPF) structure, the new connection layers can retain more feature information, as shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2A</bold>
</xref>.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>YOLOv8 architecture diagram. <bold>(A)</bold> Pooling. <bold>(B)</bold> Detect. <bold>(C)</bold> ConvModule. <bold>(D)</bold> C2f. <bold>(E)</bold> Bottleneck. <bold>(F)</bold> backbone. <bold>(G)</bold> backbone. <bold>(H)</bold> Head.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1492504-g002.tif"/>
</fig>
<p>Inspired by PANet, the original YOLOv8 incorporates a PAN-FPN structure at the neck (<xref ref-type="bibr" rid="B28">Wang et&#xa0;al., 2023</xref>). Compared to the neck structures of YOLOv5 and YOLOv7 models, YOLOv8 removes the convolutional operation after up sampling in the PAN structure, as shown in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2E</bold></xref>, achieving model lightweighting while maintaining the original performance. YOLOv8 adopts a top-down and bottom-up network structure to integrate semantic information from deep and shallow features. However, this fusion is superficial. To address this, we designed a new feature fusion network based on the PAN-FPN architecture. Through the analysis of tea leaf defect images, it was determined that spatial positional information of features is not necessary in practical applications. Therefore, part of the feature information flow can be trimmed to reduce computational costs. Simultaneously, feature fusion is achieved by merging different nodes of the same feature layer, retaining more features of tea pests and diseases without increasing computational costs.</p>
<p>The detection part of YOLOv8 adopts a decoupled head structure, as shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2B</bold>
</xref>. This structure employs two independent branches for object classification and bounding box regression prediction, each using different loss functions. For the classification task, binary cross-entropy loss (BCELoss) is used. For the bounding box regression task, Distribution Focal Loss (DFL) and Complete Intersection over Union (CIoU) are employed. This detection structure improves detection accuracy and accelerates model convergence. YOLOv8 is an anchor-free detection model, which simplifies the specification of positive and negative samples. It also utilizes the Task-Aligned Assigner to dynamically assign samples, enhancing the detection accuracy and robustness of the model.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>BiFormer</title>
<p>To focus the detection model on tea leaf defects and diseases while reducing attention on other regions, we introduce a dynamic sparse attention mechanism called BiFormer (<xref ref-type="bibr" rid="B37">Zhu et&#xa0;al., 2023</xref>) into the backbone network of the model. BiFormer utilizes adaptive querying to filter out the least relevant key-value pairs in the coarse-grained regions of the input feature map. It then efficiently identifies the key-value pairs with higher relevance and performs attention computation on them. This significantly reduces computational and storage costs, enhancing the model&#x2019;s ability to perceive the input content. YOLOv8 is a convolutional neural network (CNN) model. The essence of a CNN is local processing, which limits its ability to capture relationships between global features. Compared to traditional CNN models, transformers use an attention mechanism to capture the relationships between different pieces of data, providing a global receptive field. An effective attention mechanism can build robust and powerful data-driven models, making them more flexible when handling complex, large-scale data.</p>
<p>The BiFormer module is designed based on a bi-level routing attention mechanism, as shown in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>. In this block, DW Conv represents depthwise separable convolution, which reduces the number of parameters and the computational load of the model. LN stands for layer normalization, which accelerates training and improves the model&#x2019;s generalization ability. MLP, or multilayer perceptron, further processes and adjusts attention weights, enhancing the model&#x2019;s focus on different features. The addition symbol in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref> represents the concatenation of two feature vectors.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Principle and operational diagram of BiFormer.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1492504-g003.tif"/>
</fig>
<p>The introduction of the BiFormer block into the backbone network in this paper serves two purposes. First, BiFormer considers the limited computational power and storage resources of mobile platforms. Second, the dynamic attention mechanism within this block enhances the model&#x2019;s focus on crucial target information, thereby optimizing the model&#x2019;s detection performance. To fully leverage the efficient attention mechanism of this block, we added the BiFormer block between the model backbone networks B1 and B2.</p>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Down sampling</title>
<p>Down sampling can aggregate local information, expand the receptive field, and reduce computational costs. Conventional down sampling operations mainly involve max-pooling and stride convolution. However, pooling operations on local regions can lead to the loss of important spatial information, which is detrimental to accurate detection. To address this, we introduce down sampling operations based on the Haar (<xref ref-type="bibr" rid="B33">Xu et&#xa0;al., 2023</xref>) wavelet.</p>
<p>The core idea of the new down sampling operation is to use Haar wavelet transformation to reduce the spatial resolution of feature maps while preserving more information. This approach enhances the ability of semantic segmentation and reduces information uncertainty. For 2D image Haar decomposition, it can be seen as performing 1D Haar decomposition separately on all columns and all rows. Depending on the order of decomposing rows and columns, two different decomposition methods can be generated. The specific process is shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>The operational process of the Haar wavelet.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1492504-g004.tif"/>
</fig>
<p>From <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref> it can be seen that the module first preprocesses the input information flow in the horizontal and vertical directions by performing averaging and differencing operations on the information flow separately. Then, down sampling is performed. Next, the processed information flow undergoes diagonal direction processing, where it is averaged and differenced to obtain diagonal subbands. Each of these subbands is then down sampled. This process iteratively repeats for each subband.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Downsampling Module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1492504-g005.tif"/>
</fig>
<p>Finally, an inverse transformation is applied to each subband to reconstruct the original image. These steps constitute the lossless feature encoding module primarily based on the Haar wavelet transform. Subsequently, the output information flow undergoes convolution, normalization, and activation function processing to reduce the number of channels.</p>
</sec>
<sec id="s2_5">
<label>2.5</label>
<title>Feature fusion</title>
<p>YOLOv8 itself uses a simplified FPN-PANet in its neck to perform feature fusion, reducing the loss of information. The core idea of FPN is to construct a feature pyramid at different levels of the image to capture objects at different scales: by up sampling the deep feature maps to match the size of the shallow feature maps (<xref ref-type="bibr" rid="B12">Gong et&#xa0;al., 2021</xref>), and then performing an addition operation. PAN, on the other hand, employs a cascaded operation, which can retain more detailed information, thereby improving detection accuracy (<xref ref-type="bibr" rid="B29">Wang et&#xa0;al., 2019</xref>).</p>
<p>However, the above operations have two drawbacks: first, they do not focus on features at the same level; second, the merging process can introduce delays, leading to suboptimal merging effects. Considering that in tea plantation inspections using drones, multiple photos are taken of the same area, the edge features of a single photo are not our primary concern. At the same time, when the drone takes photos, it is approximately 50 cm away from the top of the tea trees. Each photo contains a large number of tea leaves, which implies there are many instances of defects and diseases.</p>
<p>To address these issues, our approach focuses on refining the feature fusion process to enhance the detection of tea leaf defects and diseases in such scenarios. By prioritizing crucial target features and optimizing the merging process, we aim to achieve more accurate and efficient detection results.</p>
<p>Owing to their limited receptive field, CNNs can only localize regions with distinctiveness. Therefore, as shown in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>, the first step is to use a multi-head attention mechanism to segment the image into patches with distinctive features. Since deep features reflect specific information about objects and require global context, a transformer encoder is used to process deep features to enhance object detection performance.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Arithmetic unit structure diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1492504-g006.tif"/>
</fig>
<p>Next, under the condition of unchanged computational resources, more parameters can be allocated to feature fusion by reducing the backbone layers and expanding the fusion modules. As shown in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>, to achieve this, the backbone network is decomposed into several smaller cascaded stages, generating richer scale features. Each stage consists of a sub-backbone and a transformation module.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>The running diagram in the model.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1492504-g007.tif"/>
</fig>
<p>Performing skip connections on the same feature layer helps preserve more feature information. The Transition block utilizes 1x1 convolutions to align the channel numbers in the sampling points and uses bilinear interpolation to align the spatial sizes of features. The Focal Block, on the other hand, enlarges the convolutional kernel to expand the receptive field, thereby acquiring more feature information.</p>
<p>By implementing these modifications, the model can better handle complex scenes with multiple instances of tea leaf defects and diseases, improving detection accuracy and robustness.</p>
<p>The contributions of features from images with different resolutions are unequal, hence an additional weight is introduced for learning. Building upon Unbounded Fusion, normalization of weights is conducted to constrain the value range of each weight. Unbounded Fusion refers to integrating features from different resolutions without explicit boundaries.</p>
</sec>
</sec>
<sec id="s3" sec-type="results">
<label>3</label>
<title>Results and discussion</title>
<sec id="s3_1">
<label>3.1</label>
<title>Experimental facilities</title>
<p>To verify the positive impact of each module on the model, YOLOv8n was used as the baseline model, and ablation experiments were conducted separately on the BiFormer module, Haar module, and feature fusion module. In order to ensure the accuracy of the experimental results, the parameter settings in each individual module are the same.</p>
<p>At the same time, to ensure that the pre-trained weight structure and the target model structure are the same in the experiment, all three experimental groups undergo weight pre-training before the formal experiment. The weight pre-training uses the dataset described in Section 2, which will not cause overfitting.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Ablation experiment</title>
<sec id="s3_2_1">
<label>3.2.1</label>
<title>The attention mechanism comparative experiment</title>
<p>To verify the superiority of introducing BiFormer, we conducted comparative experiments using BiFormer and some mainstream attention mechanisms on the YOLOv8n baseline model while keeping other training conditions consistent. The experimental results, as shown in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>, indicate that when BiFormer is incorporated into the backbone network of the model, it achieves the best detection performance. Furthermore, the model with the BiFormer attention module incorporated shows a 16.5 percentage-point increase in mAP0.5 (from 72.2% to 88.7%) compared to when no attention module is introduced.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Detection results of different attention mechanisms.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Metrics</th>
<th valign="top" align="center">Precision/%</th>
<th valign="top" align="center">Recall%</th>
<th valign="top" align="center">mAP0.5/%</th>
<th valign="top" align="center">mAP0.5:0.95/%</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">Nothing</td>
<td valign="top" align="center">80.4</td>
<td valign="top" align="center">64.8</td>
<td valign="top" align="center">72.2</td>
<td valign="top" align="center">47.7</td>
</tr>
<tr>
<td valign="top" align="center">SE</td>
<td valign="top" align="center">81.1</td>
<td valign="top" align="center">63.0</td>
<td valign="top" align="center">70.1</td>
<td valign="top" align="center">49.2</td>
</tr>
<tr>
<td valign="top" align="center">CBAM</td>
<td valign="top" align="center">81.0</td>
<td valign="top" align="center">71.1</td>
<td valign="top" align="center">70.2</td>
<td valign="top" align="center">49.5</td>
</tr>
<tr>
<td valign="top" align="center">ECA</td>
<td valign="top" align="center">80.3</td>
<td valign="top" align="center">68.5</td>
<td valign="top" align="center">69.9</td>
<td valign="top" align="center">48.3</td>
</tr>
<tr>
<td valign="top" align="center">ContextAggregation</td>
<td valign="top" align="center">84.9</td>
<td valign="top" align="center">75.2</td>
<td valign="top" align="center">83.3</td>
<td valign="top" align="center">61.5</td>
</tr>
<tr>
<td valign="top" align="center">BiFormer</td>
<td valign="top" align="center">
<bold>89.8</bold>
</td>
<td valign="top" align="center">
<bold>82.3</bold>
</td>
<td valign="top" align="center">
<bold>88.7</bold>
</td>
<td valign="top" align="center">
<bold>65.9</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold indicates the optimal value of the current indicator.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>To achieve optimal performance after adding the BiFormer block, this paper conducted the following comparative experiments. We used YOLOv8n as the baseline model and added BiFormer blocks at different layers of the backbone network. The results are shown in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>. From the experimental results, it can be observed that adding the BiFormer block to deeper layers of the network leads to higher detection performance, but also increases computational complexity. Adding BiFormer to layers B4-B5 raised the computational load to roughly 9.5 times that of layers B1-B2 (168.2G vs. 17.6G), yet the improvements in the various metrics were less than 3 percentage points. In order to balance detection performance and computational requirements, this paper added the BiFormer block to layers B1-B2.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Detection results of different depths of BiFormer module.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Model</th>
<th valign="top" align="center">Precision/%</th>
<th valign="top" align="center">Recall%</th>
<th valign="top" align="center">mAP0.5/%</th>
<th valign="top" align="center">mAP0.5:0.95/%</th>
<th valign="top" align="center">FLOPS</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">B1-B2</td>
<td valign="top" align="center">89.8</td>
<td valign="top" align="center">82.3</td>
<td valign="top" align="center">88.7</td>
<td valign="top" align="center">65.9</td>
<td valign="top" align="center">17.6G</td>
</tr>
<tr>
<td valign="top" align="center">B2-B3</td>
<td valign="top" align="center">89.1</td>
<td valign="top" align="center">81.4</td>
<td valign="top" align="center">86.5</td>
<td valign="top" align="center">62.2</td>
<td valign="top" align="center">35.2G</td>
</tr>
<tr>
<td valign="top" align="center">B3-B4</td>
<td valign="top" align="center">90.2</td>
<td valign="top" align="center">83.3</td>
<td valign="top" align="center">89.1</td>
<td valign="top" align="center">67.4</td>
<td valign="top" align="center">78.9G</td>
</tr>
<tr>
<td valign="top" align="center">B4-B5</td>
<td valign="top" align="center">91.1</td>
<td valign="top" align="center">83.6</td>
<td valign="top" align="center">90.0</td>
<td valign="top" align="center">68.6</td>
<td valign="top" align="center">168.2G</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In the experimental results, we can see that the total computational cost of BiFormer varies greatly at different depths, but the difference in detection results is not significant. This is because the module runs in four stages, each of which reduces the resolution of the input image while increasing the number of channels <italic>C</italic>. The total computational cost is given by the following formula:</p>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>3</mml:mn>
<mml:mi>H</mml:mi>
<mml:mi>W</mml:mi>
<mml:msup>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:mn>3</mml:mn>
<mml:mi>C</mml:mi>
<mml:msup>
<mml:mi>k</mml:mi>
<mml:mrow>
<mml:mfrac>
<mml:mn>2</mml:mn>
<mml:mn>3</mml:mn>
</mml:mfrac>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>H</mml:mi>
<mml:mi>W</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mn>4</mml:mn>
<mml:mn>3</mml:mn>
</mml:mfrac>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>k</italic> is the number of regions each query attends to.</p>
<p>The number of channels in the feature maps increases as the layers become deeper.</p>
<p>BiFormer divides the input sequence into two parts, performing self-attention calculation and cross-attention calculation respectively. The former captures the internal dependencies of the sequence, while the latter captures the dependencies between sequences. Although it performs better when placed at a deeper level, its principle is to filter out key-value pairs that are irrelevant to the query at a coarse-grained level, and adaptively focus on the most relevant key-value pairs at a fine-grained level; placing it into a deeper network can provide it with more detailed information, but shallower networks can also provide the vast majority of key information, so its performance growth is not significant.</p>
</sec>
<sec id="s3_2_2">
<label>3.2.2</label>
<title>Haar wavelet experiment</title>
<p>In convolutional neural networks, pooling layers are used to reduce the spatial size of data, decrease computational complexity, while retaining important features. Commonly used pooling methods include: max pooling, average pooling, and adaptive average pooling.</p>
<p>Pooling layers can easily lose feature data and spatial location information, affecting detection performance. In the baseline model of YOLOv8n, spatial pyramid pooling is used when transitioning from the backbone network to deeper layers. The fundamental unit of spatial pyramid pooling is max pooling. Although it improves upon the drawbacks of max pooling, it still cannot entirely avoid the loss of feature information. This paper introduces a downsampling module based on Haar wavelet functions and compares it with common pooling methods. The results are shown in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>. When inputting the same image, it&#x2019;s evident that Haar wavelet-based pooling can preserve feature and spatial information to a greater extent. From <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>, we can see that Haar has higher precision and mAP than its peers, but its recall rate is not as good as that of Adaptpool. This is because Adaptpool adaptively calculates weights, which increases its computational load. Maxpool is the most commonly used method, with relatively balanced performance but not as high accuracy as Haar. In summary, we ultimately used Haar as the downsampling module.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Comparison of different pooling methods.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1492504-g008.tif"/>
</fig>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Comparison of different pooling methods.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Metrics</th>
<th valign="top" align="center">Precision/%</th>
<th valign="top" align="center">Recall%</th>
<th valign="top" align="center">mAP0.5/%</th>
<th valign="top" align="center">mAP0.5:0.95/%</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">Maxpool</td>
<td valign="top" align="center">66.8</td>
<td valign="top" align="center">55.1</td>
<td valign="top" align="center">63.2</td>
<td valign="top" align="center">37.2</td>
</tr>
<tr>
<td valign="top" align="center">Haar</td>
<td valign="top" align="center">
<bold>70.2</bold>
</td>
<td valign="top" align="center">58.2</td>
<td valign="top" align="center">
<bold>69.5</bold>
</td>
<td valign="top" align="center">
<bold>44.6</bold>
</td>
</tr>
<tr>
<td valign="top" align="center">Avgpool</td>
<td valign="top" align="center">60.5</td>
<td valign="top" align="center">50.6</td>
<td valign="top" align="center">57.1</td>
<td valign="top" align="center">31.4</td>
</tr>
<tr>
<td valign="top" align="center">Adaptpool</td>
<td valign="top" align="center">62.3</td>
<td valign="top" align="center">
<bold>58.9</bold>
</td>
<td valign="top" align="center">61.2</td>
<td valign="top" align="center">36.1</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold indicates the optimal value of the current indicator.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3_2_3">
<label>3.2.3</label>
<title>Feature fusion network</title>
<p>In YOLOv8, the feature fusion network in the backbone is FPN-PANet. To validate the improved feature fusion network proposed in this paper, comparative experiments were conducted using YOLOv8n as the baseline model. Several mainstream feature fusion structures were also compared. The results are shown in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>. From the table, we can observe the following. FPN, as the earliest proposed pyramid network, is the foundation for subsequent multi-scale network design. Its disadvantages are twofold. Firstly, it only adopts a top-down path, resulting in insufficient low-level information; secondly, it lacks dynamic weights, leading to underutilization of some important features. PANet introduced bidirectional paths, increasing the complexity of feature fusion, but its performance was not as expected in complex backgrounds. The NAS-FPN architecture is optimized for specific tasks and datasets, with high search costs and complex structures. BiFPN can learn weight dependencies, but it is prone to getting stuck in local optima, resulting in limited performance improvement. The model proposed in this article considers the characteristics of tea disease detection tasks and takes into account practical application situations, partially introducing bidirectional paths and weight dependencies.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Detection results of different feature fusion networks.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Model</th>
<th valign="top" align="center">Precision/%</th>
<th valign="top" align="center">Recall%</th>
<th valign="top" align="center">mAP0.5/%</th>
<th valign="top" align="center">mAP0.5:0.95/%</th>
<th valign="top" align="center">FLOPS</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">FPN</td>
<td valign="top" align="center">78.9</td>
<td valign="top" align="center">58.1</td>
<td valign="top" align="center">63.1</td>
<td valign="top" align="center">37.2</td>
<td valign="top" align="center">
<bold>7.9G</bold>
</td>
</tr>
<tr>
<td valign="top" align="center">PANet</td>
<td valign="top" align="center">79.2</td>
<td valign="top" align="center">58.3</td>
<td valign="top" align="center">62.4</td>
<td valign="top" align="center">38.4</td>
<td valign="top" align="center">10.4G</td>
</tr>
<tr>
<td valign="top" align="center">NAS-FPN</td>
<td valign="top" align="center">78.5</td>
<td valign="top" align="center">57.6</td>
<td valign="top" align="center">62.6</td>
<td valign="top" align="center">37.6</td>
<td valign="top" align="center">9.3G</td>
</tr>
<tr>
<td valign="top" align="center">FPN-PANet</td>
<td valign="top" align="center">81.0</td>
<td valign="top" align="center">65.8</td>
<td valign="top" align="center">72.2</td>
<td valign="top" align="center">48.1</td>
<td valign="top" align="center">15.2G</td>
</tr>
<tr>
<td valign="top" align="center">BIFPN</td>
<td valign="top" align="center">83.7</td>
<td valign="top" align="center">
<bold>76.3</bold>
</td>
<td valign="top" align="center">83.1</td>
<td valign="top" align="center">62.0</td>
<td valign="top" align="center">22.7G</td>
</tr>
<tr>
<td valign="top" align="center">Ours</td>
<td valign="top" align="center">
<bold>86.4</bold>
</td>
<td valign="top" align="center">75.9</td>
<td valign="top" align="center">
<bold>86.3</bold>
</td>
<td valign="top" align="center">
<bold>63.4</bold>
</td>
<td valign="top" align="center">12.1G</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold indicates the optimal value of the current indicator.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>From the table, we can observe that compared to the baseline model (FPN-PANet), our feature fusion structure exhibits better detection accuracy, with a 19.5% relative increase in mAP0.5 (from 72.2% to 86.3%), while the computational complexity decreases by 20.4% (from 15.2G to 12.1G). Therefore, it can be concluded that our structure preserves more feature information during feature fusion with minimal computational overhead.</p>
</sec>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Comparative experiment</title>
<p>To demonstrate the superiority and effectiveness of the proposed improved algorithm, we conducted comparative experiments. First, we compared various models in the YOLO series: YOLOv3 (<xref ref-type="bibr" rid="B21">Redmon and Farhadi, 2018</xref>) and its lightweight version YOLOv3-tiny (<xref ref-type="bibr" rid="B1">Adarsh et&#xa0;al., 2020</xref>); YOLOv4 (<xref ref-type="bibr" rid="B5">Bochkovskiy et&#xa0;al., 2020</xref>) with the novel backbone network CSPDarknet53; YOLOv5n (<xref ref-type="bibr" rid="B34">Xue et&#xa0;al., 2023</xref>), which improves accuracy using mosaic data augmentation; and YOLOv9s (<xref ref-type="bibr" rid="B30">Wang et&#xa0;al., 2024a</xref>), which introduces new structures based on YOLOv7. We also compared tea-specific detection models: YOLO-Tea (<xref ref-type="bibr" rid="B34">Xue et&#xa0;al., 2023</xref>), the model of <xref ref-type="bibr" rid="B13">Hossain et&#xa0;al. (2018)</xref>, and TSBA-YOLO (<xref ref-type="bibr" rid="B17">Lin et&#xa0;al., 2023</xref>), which can now be applied to the prevention and control of tea diseases and pests.</p>
<p>From <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref>, we can compare the advantages and disadvantages of the models described above. Compared to models such as YOLOv3 and YOLOv5, the later proposed YOLOv8 and YOLOv9 have better performance, with mAP reaching over 70%. However, this is still not an ideal accuracy rate, because these four models are only general frameworks and do not specifically target the characteristics of tea pests, diseases, and defects in images. YOLOv10b and YOLOv11n are improvements based on YOLOv8 and YOLOv9, and still retain similar shortcomings. Therefore, subsequent research mainly focuses on targeted optimization of this drawback, such as the attention mechanism and feature fusion module proposed in this paper, which take into account the characteristics of tea damage and the features captured during drone inspections. After targeted optimization, our model achieved a precision of 92.2% and an mAP of 94.5%, far exceeding similar models.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Test results of different models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Model</th>
<th valign="top" align="center">Precision/%</th>
<th valign="top" align="center">Recall%</th>
<th valign="top" align="center">mAP0.5/%</th>
<th valign="top" align="center">mAP0.5:0.95/%</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">YOLOv3</td>
<td valign="top" align="center">49.5</td>
<td valign="top" align="center">40.3</td>
<td valign="top" align="center">37.2</td>
<td valign="top" align="center">18.6</td>
</tr>
<tr>
<td valign="top" align="center">YOLOv3-tiny</td>
<td valign="top" align="center">39.2</td>
<td valign="top" align="center">31.7</td>
<td valign="top" align="center">29.5</td>
<td valign="top" align="center">17.9</td>
</tr>
<tr>
<td valign="top" align="center">YOLOv4</td>
<td valign="top" align="center">57.8</td>
<td valign="top" align="center">45.4</td>
<td valign="top" align="center">48.4</td>
<td valign="top" align="center">25.5</td>
</tr>
<tr>
<td valign="top" align="center">YOLOv5n</td>
<td valign="top" align="center">71.0</td>
<td valign="top" align="center">64.8</td>
<td valign="top" align="center">65.7</td>
<td valign="top" align="center">37.6</td>
</tr>
<tr>
<td valign="top" align="center">YOLOv8n</td>
<td valign="top" align="center">80.2</td>
<td valign="top" align="center">69.7</td>
<td valign="top" align="center">72.2</td>
<td valign="top" align="center">47.3</td>
</tr>
<tr>
<td valign="top" align="center">YOLOv9s</td>
<td valign="top" align="center">79.8</td>
<td valign="top" align="center">75.0</td>
<td valign="top" align="center">75.2</td>
<td valign="top" align="center">45.0</td>
</tr>
<tr>
<td valign="top" align="center">YOLOv10b</td>
<td valign="top" align="center">81.2</td>
<td valign="top" align="center">77.8</td>
<td valign="top" align="center">83.9</td>
<td valign="top" align="center">68.2</td>
</tr>
<tr>
<td valign="top" align="center">YOLOv11n</td>
<td valign="top" align="center">86.2</td>
<td valign="top" align="center">80.4</td>
<td valign="top" align="center">87.3</td>
<td valign="top" align="center">70.1</td>
</tr>
<tr>
<td valign="top" align="center">Hossain S</td>
<td valign="top" align="center">72.3</td>
<td valign="top" align="center">74.2</td>
<td valign="top" align="center">68.6</td>
<td valign="top" align="center">43.1</td>
</tr>
<tr>
<td valign="top" align="center">TSBA-YOLO</td>
<td valign="top" align="center">67.6</td>
<td valign="top" align="center">81.5</td>
<td valign="top" align="center">71.5</td>
<td valign="top" align="center">51.2</td>
</tr>
<tr>
<td valign="top" align="center">YOLO-Tea</td>
<td valign="top" align="center">85.1</td>
<td valign="top" align="center">85.7</td>
<td valign="top" align="center">86.5</td>
<td valign="top" align="center">64.7</td>
</tr>
<tr>
<td valign="top" align="center">Ours</td>
<td valign="top" align="center">
<bold>92.2</bold>
</td>
<td valign="top" align="center">
<bold>87.1</bold>
</td>
<td valign="top" align="center">
<bold>94.5</bold>
</td>
<td valign="top" align="center">
<bold>71.4</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold indicates the optimal value of the current indicator.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec id="s4" sec-type="conclusions">
<label>4</label>
<title>Conclusions</title>
<p>Due to the texture, shape, and color characteristics of tea leaves, accurately detecting defects and pest damage is challenging. The small size of the leaves, in particular, renders existing models insufficient for our research needs. Therefore, we have enhanced the YOLOV8n model in various ways to improve its detection capabilities for tea leaf defects and diseases.</p>
<p><xref ref-type="bibr" rid="B9">Chen et&#xa0;al. (2024b)</xref> proposed a new ViTNet model, which mainly detects small pest and disease features by introducing a self-attention mechanism and global feature extraction; in addition, the EMA PANet model was introduced to improve the multi-scale information acquisition ability. <xref ref-type="bibr" rid="B7">Chen et&#xa0;al. (2024a)</xref> proposed using transfer learning and freezing core strategies to improve timely detection ability. <xref ref-type="bibr" rid="B15">Li et&#xa0;al. (2024)</xref> proposed embedding the CA attention mechanism into MobileNetV2 together with a multi-branch parallel strategy to extract features, which can adapt to different diseases, and used AutoML for Model Compression (AMC) to compress the computational load. <xref ref-type="bibr" rid="B36">Zhou et&#xa0;al. (2024)</xref> proposed using the GS DeepLabV3 network. Of these, only Chen paid attention to the attention mechanism, which can effectively reduce computational complexity and improve accuracy. However, the adaptive attention mechanism used by Chen calculates global features, which requires a large amount of computation; the EMA PANet model is a feature fusion network based on PANet, which improves performance by adding fusion paths, but this can lead to difficulty in training and slow convergence. Transfer learning and freezing core strategies can lead to poor generalization performance of the model and neglect of underlying features. The multi-branch parallel strategy proposed by Li for feature extraction is an effective method.</p>
<p>Our model combines their strengths and discards their weaknesses. Firstly, because YOLOv8 struggles to focus on small targets such as disease defects, we employed the BiFormer attention mechanism to direct the model&#x2019;s attention towards these areas. BiFormer filters out irrelevant feature information at the upper layers of the network, retaining only a portion of the regions. Within these regions, it then utilizes token-to-token attention for higher precision. The DWconv reduces computational load, and the MLP adjusts the attention weights accordingly (<xref ref-type="bibr" rid="B7">Chen et&#xa0;al., 2024a</xref>).</p>
<p>Secondly, the baseline model&#x2019;s spatial pyramid pooling employs a max pooling module. As shown in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>, the effective information retained by max pooling is not highly sensitive to tea leaf defects and diseases. However, pooling operations using the Haar function can preserve more feature information. The Haar function can retain essential feature information to the greatest extent when transmission channel performance is suboptimal, then reconstruct the image for the next layer of computation. During this process, feature maps computed using the Haar function are able to preserve critical information to the maximum extent.</p>
<p>Finally, the new feature fusion network decomposes the backbone network into sub-backbone networks with distinct features under the transform framework. This leverages the parallel processing advantages of GPUs, thereby accelerating computation speed. When processing single features, the model often exhibits better performance. Additionally, by summing the feature maps of the same layer, more feature information can be retained without increasing computational load.</p>
<p>Through a series of improvements, we ultimately developed the BHC-YOLO model for detecting tea leaf defects and diseases. As shown in <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>, the BHC model outperforms other tea leaf detection models available on the market. Notably, the dataset considers the impact of weather factors on practicality, and the algorithm enhances the original images, thereby increasing the model&#x2019;s generalization capability.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Comparison of several excellent model results.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1492504-g009.tif"/>
</fig>
<p>However, there are still shortcomings and areas for improvement in this model. Firstly, the computational complexity is still relatively high, which places considerable power demands on portable artificial intelligence chipsets and makes the system difficult to deploy on easily carried devices. In subsequent work, we will prune the entire model to further reduce computational complexity. Secondly, the model places high demands on photo quality: in a low-light environment, the accuracy drops sharply. In addition, the recognition rate of sporadic tea pests and diseases is low, and there is still room for improvement.</p>
</sec>
</body>
<back>
<sec id="s5" sec-type="data-availability">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s6" sec-type="author-contributions">
<title>Author contributions</title>
<p>BZ: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. XX: Conceptualization, Data curation, Investigation, Methodology, Software, Supervision, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. XL: Formal analysis, Funding acquisition, Project administration, Resources, Validation, Visualization, Writing &#x2013; review &amp; editing. WL: Project administration, Validation, Writing &#x2013; review &amp; editing.</p>
</sec>
<sec id="s7" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This work was supported by the National Natural Science Foundation of China Regional Science Foundation Project (Approval numbers: 62265007 and 32260622). It also received funding support from the Natural Science Foundation of Jiangxi Province, China (project number 20224BAB212007).</p>
</sec>
<sec id="s8" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s9" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Adarsh</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Rathi</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Kumar</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>YOLO v3-Tiny: Object Detection and Recognition using one stage improved model</article-title>,&#x201d; in <conf-name>2020 6th international conference on advanced computing and communication systems (ICACCS)</conf-name>. (<publisher-loc>Coimbatore, India</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>687</fpage>&#x2013;<lpage>694</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICACCS48705.2020.9074315</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Atila</surname> <given-names>&#xdc;.</given-names>
</name>
<name>
<surname>U&#xe7;ar</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Akyol</surname> <given-names>K.</given-names>
</name>
<name>
<surname>U&#xe7;ar</surname> <given-names>E.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Plant leaf disease classification using EfficientNet deep learning model</article-title>. <source>Ecol. Inf.</source> <volume>61</volume>, <fpage>101182</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ecoinf.2020.101182</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Baranwal</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Arora</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Khandelwal</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Detecting diseases in plant leaves: An optimised deep-learning convolutional neural network approach</article-title>. <source>Int. J. Environ. Sustain. Dev.</source> <volume>20</volume>, <fpage>166</fpage>&#x2013;<lpage>188</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1504/IJESD.2021.114562</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Barburiceanu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Meza</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Orza</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Malutan</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Terebes</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Convolutional neural networks for texture feature extraction. Applications to leaf disease classification in precision agriculture</article-title>. <source>IEEE Access</source> <volume>9</volume>, <fpage>160085</fpage>&#x2013;<lpage>160103</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2021.3131002</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bochkovskiy</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>C.-Y.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H.-Y. M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>YOLOv4: Optimal speed and accuracy of object detection</article-title>. <source>arXiv preprint arXiv:2004.10934</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2004.10934</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Nanehkaran</surname> <given-names>Y. A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Using deep transfer learning for image-based plant disease identification</article-title>. <source>Comput. Electron. Agric.</source> <volume>173</volume>, <fpage>105393</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2020.105393</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>F.</given-names>
</name>
</person-group> (<year>2024</year>a). <article-title>Improved keypoint localization network for tea bud based on YOLO framework</article-title>. <source>Comput. Electrical Eng.</source> <volume>119</volume>, <fpage>109505</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compeleceng.2024.109505</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Visual tea leaf disease recognition using a convolutional neural network model</article-title>. <source>Symmetry</source> <volume>11</volume>, <fpage>343</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/sym11030343</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Bai</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2024</year>b). <article-title>TeaViTNet: tea disease and pest detection model based on fused multiscale attention</article-title>. <source>Agronomy</source> <volume>14</volume>, <fpage>633</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy14030633</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dai</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Fan</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>An industrial-grade solution for crop disease image detection tasks</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>, <elocation-id>921057</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.921057</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dhaka</surname> <given-names>V. S.</given-names>
</name>
<name>
<surname>Meena</surname> <given-names>S. V.</given-names>
</name>
<name>
<surname>Rani</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Sinwar</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Ijaz</surname> <given-names>M. F.</given-names>
</name>
<name>
<surname>Wo&#x17a;niak</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A survey of deep convolutional neural networks applied for prediction of plant leaf diseases</article-title>. <source>Sensors</source> <volume>21</volume>, <fpage>4749</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s21144749</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Gong</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Ding</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Peng</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Han</surname> <given-names>Z.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Effective fusion factor in FPN for tiny object detection</article-title>,&#x201d; in <conf-name>2021 IEEE Winter Conference on Applications of Computer Vision (WACV)</conf-name>. (<publisher-loc>Waikoloa, HI, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1160</fpage>&#x2013;<lpage>1168</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/WACV48630.2021.00120</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hossain</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Mou</surname> <given-names>R. M.</given-names>
</name>
<name>
<surname>Hasan</surname> <given-names>M. M.</given-names>
</name>
<name>
<surname>Chakraborty</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Razzak</surname> <given-names>M. A.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Recognition and detection of tea leaf's diseases using support vector machine</article-title>,&#x201d; in <conf-name>2018 IEEE 14th International Colloquium on Signal Processing &amp; Its Applications (CSPA)</conf-name>. (<publisher-loc>Penang, Malaysia</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>150</fpage>&#x2013;<lpage>154</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CSPA.2018.8368703</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiao</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>Z.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>A survey of deep learning-based object detection</article-title>. <source>IEEE Access</source> <volume>7</volume>, <fpage>128837</fpage>&#x2013;<lpage>128868</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2019.2939201</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Yan</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Yan</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>M.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>). <article-title>Enhancing tea leaf disease identification with lightweight mobileNetV2</article-title>. <source>Computers Materials Continua</source> <volume>80</volume>, <fpage>679</fpage>&#x2013;<lpage>694</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.32604/cmc.2024.051526</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Ensemble of the deep convolutional network for multiclass of plant disease classification using leaf images</article-title>. <source>Int. J. Pattern Recognition Artif. Intell.</source> <volume>36</volume>, <fpage>2250016</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1142/S0218001422500161</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Bai</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>TSBA-YOLO: An improved tea diseases detection model based on attention mechanisms and feature fusion</article-title>. <source>Forests</source> <volume>14</volume>, <fpage>619</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/f14030619</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>T.-Y.</given-names>
</name>
<name>
<surname>Goyal</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Focal loss for dense object detection</article-title>,&#x201d; in <conf-name>2017 IEEE International Conference on Computer Vision (ICCV)</conf-name>. (<publisher-loc>Venice, Italy</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2980</fpage>&#x2013;<lpage>2988</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2017.324</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Anguelov</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Erhan</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Szegedy</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Reed</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>C.-Y.</given-names>
</name>
<etal/>
</person-group>. (<year>2016</year>). &#x201c;<article-title>SSD: Single shot multibox detector</article-title>,&#x201d; in <conf-name>European Conference on Computer Vision 2016</conf-name>, <publisher-loc>Amsterdam, The Netherlands</publisher-loc>: <publisher-name>IEEE</publisher-name>, <conf-date>October 11&#x2013;14, 2016</conf-date>, Vol. <volume>14</volume>. <fpage>21</fpage>&#x2013;<lpage>37</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-319-46448-0_2</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Divvala</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Farhadi</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>You only look once: Unified, real-time object detection</article-title>,&#x201d; in <conf-name>2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. (<publisher-loc>Las Vegas, NV, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>779</fpage>&#x2013;<lpage>788</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2016.91</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Farhadi</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>YOLOv3: An incremental improvement</article-title>. <source>arXiv preprint arXiv:1804.02767</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1804.02767</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Faster R-CNN: Towards real-time object detection with region proposal networks</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>39</volume>, <fpage>1137</fpage>&#x2013;<lpage>1149</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2016.2577031</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Roy</surname> <given-names>A. M.</given-names>
</name>
<name>
<surname>Bose</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Bhaduri</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A fast accurate fine-grain object detection model based on YOLOv4 deep neural network</article-title>. <source>Neural Computing Appl.</source> <volume>34</volume>, <fpage>3895</fpage>&#x2013;<lpage>3921</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s00521-021-06651-x</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>B.</given-names>
</name>
<name>
<surname>An</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Individual tree crown segmentation and crown width extraction from a heightmap derived from aerial laser scanning data using a deep learning framework</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>, <elocation-id>914974</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.914974</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Rao</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>SLIC_SVM based leaf diseases saliency map extraction of tea plant</article-title>. <source>Comput. Electron. Agric.</source> <volume>157</volume>, <fpage>102</fpage>&#x2013;<lpage>109</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2018.12.042</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tian</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Liang</surname> <given-names>Z.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Apple detection during different growth stages in orchards using the improved YOLO-V3 model</article-title>. <source>Comput. Electron. Agric.</source> <volume>157</volume>, <fpage>417</fpage>&#x2013;<lpage>426</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2019.01.012</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tiwari</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Joshi</surname> <given-names>R. C.</given-names>
</name>
<name>
<surname>Dutta</surname> <given-names>M. K.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Dense convolutional neural networks based multiclass plant disease detection and classification using leaf images</article-title>. <source>Ecol. Inf.</source> <volume>63</volume>, <fpage>101289</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ecoinf.2021.101289</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>An</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Hong</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>UAV-YOLOv8: a small-object-detection model based on improved YOLOv8 for UAV aerial photography scenarios</article-title>. <source>Sensors</source> <volume>23</volume>, <fpage>7190</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s23167190</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Liew</surname> <given-names>J. H.</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>PANet: Few-shot image semantic segmentation with prototype alignment</article-title>,&#x201d; in <conf-name>2019 IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name>. (<publisher-loc>Seoul, Korea (South)</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>9197</fpage>&#x2013;<lpage>9206</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2019.00929</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C.-Y.</given-names>
</name>
<name>
<surname>Yeh</surname> <given-names>I.-H.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H.-Y. M.</given-names>
</name>
</person-group> (<year>2024</year>a). <article-title>YOLOv9: learning what you want to learn using programmable gradient information</article-title>. <source>arXiv preprint arXiv:2402.13616</source>. <volume>14350</volume>:<page-range>1&#x2013;16</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2402.13616</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>S. M.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>C. P.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>J. H.</given-names>
</name>
<name>
<surname>Ouyang</surname> <given-names>J. X.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>Z. M.</given-names>
</name>
<name>
<surname>Xuan</surname> <given-names>Y. M.</given-names>
</name>
<etal/>
</person-group>. (<year>2024</year>b). <article-title>Tea yield estimation using UAV images and deep learning</article-title>. <source>Ind. Crops Products</source> <volume>212</volume>, <fpage>118358</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.indcrop.2024.118358</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Weihao</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Wan</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Tao</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Peiwen</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Research and application of lightweight yolov7-TSA network in tea disease detection and identification</article-title>. <source>J. Henan Agric. Sci.</source> <volume>52</volume>, <fpage>162</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-023-33270-4</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>C.</given-names>
</name>
<name>
<surname>He</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Haar wavelet downsampling: A simple but effective downsampling module for semantic segmentation</article-title>. <source>Pattern Recognition</source> <volume>143</volume>, <fpage>109819</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.patcog.2023.109819</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xue</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Bai</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>YOLO-tea: A tea disease detection model improved by YOLOv5</article-title>. <source>Forests</source> <volume>14</volume>, <fpage>415</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/f14020415</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yuan</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>O.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Monitoring Thosea sinensis walker in tea plantations based on UAV multi-spectral image</article-title>. <source>Phyton-International J. Exp. Bot.</source> <volume>92</volume>, <fpage>747</fpage>&#x2013;<lpage>761</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.32604/PHYTON.2023.025502</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Peng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>R.</given-names>
</name>
<name>
<surname>He</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Xiao</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>GS-DeepLabV3+: A mountain tea disease segmentation network based on improved shuffle attention and gated multidimensional feature extraction</article-title>. <source>Crop Prot.</source> <volume>183</volume>, <fpage>106762</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.cropro.2024.106762</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Ke</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Lau</surname> <given-names>R. W.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>BiFormer: Vision transformer with bi-level routing attention</article-title>,&#x201d; in <conf-name>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. (<publisher-loc>Vancouver, BC, Canada</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>10323</fpage>&#x2013;<lpage>10333</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52729.2023.00995</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>