<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2026.1773377</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>A lightweight YOLO-TinyFuse model for small target detection of olive fruits</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Yang</surname><given-names>Xinyu</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3317002/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Lin</surname><given-names>Yichun</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3325917/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Xiao</surname><given-names>Qiwen</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3351495/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Liang</surname><given-names>Ziyao</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Ma</surname><given-names>Luyao</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>ShuoGuo</surname><given-names>Yaxi</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Ade</surname><given-names>Kugu</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Tong</surname><given-names>Zhaoguo</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Chen</surname><given-names>Yu</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Cao</surname><given-names>Ying</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3318845/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>College of Information Engineering, Sichuan Agricultural University</institution>, <city>Ya&#x2019;an</city>,&#xa0;<country country="CN">China</country></aff>
<aff id="aff2"><label>2</label><institution>College of Food Science, Sichuan Agricultural University</institution>, <city>Ya&#x2019;an</city>,&#xa0;<country country="CN">China</country></aff>
<aff id="aff3"><label>3</label><institution>College of Water Conservancy and Hydropower Engineering, Sichuan Agricultural University</institution>, <city>Ya&#x2019;an</city>,&#xa0;<country country="CN">China</country></aff>
<aff id="aff4"><label>4</label><institution>Panxi Crop Improvement Key Laboratory of Sichuan Province, Xichang University</institution>, <city>Xichang</city>, <state>Sichuan</state>,&#xa0;<country country="CN">China</country></aff>
<aff id="aff5"><label>5</label><institution>School of Arts and Media, Sichuan Agricultural University</institution>, <city>Ya&#x2019;an</city>,&#xa0;<country country="CN">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Ying Cao, <email xlink:href="mailto:caoying@sicau.edu.cn">caoying@sicau.edu.cn</email>; Yu Chen, <email xlink:href="mailto:41519@sicau.edu.cn">41519@sicau.edu.cn</email>; Zhaoguo Tong, <email xlink:href="mailto:olivetong@163.com">olivetong@163.com</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-24">
<day>24</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>17</volume>
<elocation-id>1773377</elocation-id>
<history>
<date date-type="received">
<day>22</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>02</day>
<month>02</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>22</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Yang, Lin, Xiao, Liang, Ma, ShuoGuo, Ade, Tong, Chen and Cao.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Yang, Lin, Xiao, Liang, Ma, ShuoGuo, Ade, Tong, Chen and Cao</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-24">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>In response to the challenges posed by the large number of small targets, complex backgrounds and significant computational load involved in detecting olives, this study presents YOLO-TinyFuse, a lightweight detection model developed based on YOLOv8n. This model incorporates the P2 high-resolution feature layer, a ModifiedNeck cross-scale fusion structure (ModifiedNeck) and a bidirectional feature pyramid network (BiFPN) dynamic weighting module within a unified architecture. This architecture simultaneously preserves high-resolution feature representations, enhances bidirectional multi-scale interaction and optimises weighted feature aggregation. This synergistic design substantially improves the recognition of small objects while reducing model complexity further. Evaluations conducted on a multi-scenario olive phenotyping dataset demonstrate that YOLO-TinyFuse achieves an mAP50 of 92.3% and a Recall of 84.5%. This represents improvements of 2.6% and 3.2% respectively over YOLOv8n, while reducing the parameter count by 6.76%. These results confirm that the proposed model provides a deployable, computationally efficient, real-time solution for target recognition on mainstream edge computing platforms in automated olive harvesting scenarios, and offers a reusable, lightweight framework for agricultural small-object detection tasks requiring high performance and optimised computational efficiency.</p>
</abstract>
<kwd-group>
<kwd>BiFPN</kwd>
<kwd>lightweight model design</kwd>
<kwd>ModifiedNeck</kwd>
<kwd>Olea europaea</kwd>
<kwd>P2 layer</kwd>
<kwd>small object detection</kwd>
<kwd>YOLOv8</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was supported by the Youth Project of Sichuan Provincial Natural Science Foundation (Grant No. 2025ZNSFSC1303) and the Ya&#x2019;an Science and Technology Project (Grant No. 2025-CGZH-00013-NC).</funding-statement>
</funding-group>
<counts>
<fig-count count="15"/>
<table-count count="3"/>
<equation-count count="11"/>
<ref-count count="49"/>
<page-count count="22"/>
<word-count count="12933"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Sustainable and Intelligent Phytoprotection</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>The olive is a high-value economic crop that occupies a pivotal position in the global agricultural economy (<xref ref-type="bibr" rid="B19">Jimenez-Lopez et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B36">Romani et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B31">Mazzocchi et&#xa0;al., 2019</xref>). Its fruits are the main source of high-quality edible oil and are widely used in the food, cosmetics and related industries, generating substantial value throughout the industrial chain (<xref ref-type="bibr" rid="B7">Cinardi et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B13">Garc&#xed;a-Serrano et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B8">Difonzo et&#xa0;al., 2021</xref>). Throughout the olive growth cycle, tasks such as fruit detection are critical components of precision orchard management. Accurate and efficient detection enables growers to monitor fruit numbers in real time and provides a scientific basis for determining the optimal harvest period. Furthermore, reliable fruit detection is essential for the development of automated olive harvesting technologies, playing a vital role in promoting intelligent production and supporting the sustainable development of the olive industry. In the field of object detection, the definition of a small object must be based on general criteria as well as scene-specific characteristics. As summarised in mainstream survey literature (<xref ref-type="bibr" rid="B28">Lin et&#xa0;al., 2017</xref>), a small object is typically defined as having a pixel size smaller than 32 &#xd7; 32, occupying less than 0.01% of the image area or presenting feature-map dimensions below 10 &#xd7; 10. However, in the detection of olives, small objects exhibit pronounced scene-specific properties: the mean pixel size of young fruits is often below 30 px, their phenotypic features are weak, and their grayscale contrast with the surrounding branches and leaves is often less than 15. 
These challenges are further compounded by the fine-grained feature loss induced by deep-network downsampling (<xref ref-type="bibr" rid="B30">Mamalis et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B32">Osco-Mamani et&#xa0;al., 2025</xref>), collectively increasing the difficulty of reliable detection. Considering these characteristics, olive fruit detection faces four primary challenges. First, small object features are highly prone to loss, as young fruits contain limited fine detail information that conventional models struggle to capture effectively (<xref ref-type="bibr" rid="B27">Li et&#xa0;al., 2023b</xref>). Secondly, substantial background interference occurs (<xref ref-type="bibr" rid="B16">Hou et&#xa0;al., 2022</xref>), with branch-leaf occlusion rates ranging from 20% to 80%, fruit clusters containing two to six overlapping fruits and frequent extreme illumination fluctuations, all of which reduce detection robustness. Thirdly, there is considerable variation in object scale, with fruit size differing by more than a factor of five from early developmental stages to maturity, which exceeds the adaptation capacity of many traditional models (<xref ref-type="bibr" rid="B32">Osco-Mamani et&#xa0;al., 2025</xref>). Fourthly, deployment conditions are constrained as mountainous orchards often rely on unmanned aerial vehicles (UAVs) and handheld edge devices. This imposes strict requirements on multi-scale feature extraction, cross-layer fusion and lightweight deployment efficiency (<xref ref-type="bibr" rid="B48">Zhu et&#xa0;al., 2025a</xref>; <xref ref-type="bibr" rid="B18">Ji et&#xa0;al., 2025</xref>).</p>
<p>Traditional approaches based on manual expertise or conventional image-processing techniques clearly have limitations in terms of both detection accuracy and cross-scene generalisation. Deep learning has therefore emerged as the dominant solution paradigm, giving rise to three principal technical pathways, each with inherent constraints in terms of adapting to different scenarios. Early convolutional neural networks (CNN) laid the groundwork for hierarchical phenotypic feature extraction, while the YOLO family has become the go-to choice for field applications thanks to its end-to-end architecture and real-time detection capability. More recently, Transformer-based models such as DETR have shown great promise due to their strong semantic association modelling capabilities. However, all three approaches still face practical limitations relating to environmental robustness, lightweight model design and real-time computational efficiency, which restrict their deployment in complex agricultural scenarios.</p>
<p>Firstly, convolutional neural networks (CNN) progressively refine fruit features through stacked convolution and pooling operations, and were among the earliest deep learning approaches applied to the detection of small objects in agriculture. However, existing CNN-based methods still face two fundamental limitations: First, insufficient adaptability to highly variable environmental and scene conditions; second, limited effectiveness in detecting small objects, particularly when fine-grained phenotypic features are weak or are lost easily during downsampling.</p>
<p><xref ref-type="bibr" rid="B5">Cano Marchal et&#xa0;al. (2021)</xref> proposed an infrared-based automated system for detecting defects in olives. This system integrates an iterative active-contour algorithm with a decision-tree classifier and achieves a detection rate of 81.22%. However, the system lacks illumination-adaptive regulation and therefore cannot autonomously adapt to complex outdoor environments; manual parameter readjustment is required under field conditions with variable lighting. <xref ref-type="bibr" rid="B3">Beltr&#xe1;n et&#xa0;al. (2023)</xref> developed an edible olive quality inspection system with an optimized topological design that achieved a detection success rate of 99.8%. Nevertheless, the samples used were limited to ideal conditions, such as white trays, natural indoor lighting and fixed imaging distances, without accounting for practical challenges such as branch-leaf occlusion or uneven illumination. Consequently, the system has not been validated for robustness under real-world field conditions.</p>
<p><xref ref-type="bibr" rid="B20">Jing et&#xa0;al. (2024)</xref> proposed MRD-YOLO, a lightweight detection framework which adopts YOLOv8n as the baseline. It achieves model compression by reducing redundant computations in the backbone, enhancing multi-scale feature fusion efficiency and incorporating a lightweight attention mechanism. The model achieved an mAP50 of 97.4% on a melon dataset constructed in-house. However, as its training samples primarily consisted of clear, medium-sized fruits, the model has limited capacity for effective phenotypic feature extraction from low-resolution, blurred, or extremely small fruits commonly encountered under field conditions. Consequently, the model remains susceptible to missed detections in complex agricultural environments.</p>
<p>Secondly, the YOLO family is characterised by its single-stage, end-to-end detection architecture. This achieves an effective balance between processing speed and accuracy, making it the mainstream solution for real-time detection of small objects in orchards. However, existing YOLO-based improvement strategies still have limitations in terms of lightweight adaptation and robustness under complex field conditions.</p>
<p><xref ref-type="bibr" rid="B32">Osco-Mamani et&#xa0;al. (2025)</xref> used the YOLOv8m model to detect olive fruit and achieved an mAP50 of 94.96%. However, with 25.9 million parameters, the model is not suitable for agricultural field scenarios that require lightweight architectures and optimised computational efficiency. Using a public RGB dataset of rice planthoppers, <xref ref-type="bibr" rid="B18">Ji et&#xa0;al. (2025)</xref> proposed the SwinTYOLOv8n-p2 model, which integrates a Swin Transformer with YOLOv8n-p2 and incorporates SCConv to enhance the C2f module. This results in an mAP50 of 86.8%. Nevertheless, with 65.2 million parameters and a computational cost of 307.4 GFLOPs, the model exceeds the capacity of resource-constrained edge devices such as UAVs and robotic harvesting platforms. Similarly, <xref ref-type="bibr" rid="B37">Sapkota et&#xa0;al. (2024)</xref> introduced the YOLOv9 Gelan-e and Gelan-base models, which achieved an mAP50 of 93.5%. However, their inference latency is approximately eight times that of YOLOv8n, rendering them unsuitable for real-time monitoring in orchard environments.</p>
<p><xref ref-type="bibr" rid="B12">Fu et&#xa0;al. (2024)</xref> proposed the YOLOv5-AT model to overcome the challenges posed by small dataset sizes and the high colour similarity between green fruits and the surrounding foliage. Following architectural optimisation, the model achieved an mAP50 of 84.6%. However, under complex field conditions, such as dense branch-leaf occlusion and severe illumination fluctuations, the model&#x2019;s ability to discriminate fine-grained phenotypic features declines markedly, resulting in substantial degradation of detection performance. <xref ref-type="bibr" rid="B48">Zhu et&#xa0;al. (2025a)</xref> improved an olive fruit maturity detection algorithm based on YOLOv11n by replacing the backbone with an EfficientNet-B0 and integrating LSKA and BiFPN modules to improve lightweight adaptability. Nevertheless, when exposed to extreme orchard conditions, including midday overexposure, reflective water accumulation on leaves after rainfall and complete fruit occlusion by branches, the model&#x2019;s missed detection rate increased sharply to over 15%, indicating insufficient robustness for reliable deployment.</p>
<p>Powered by the Transformer self-attention mechanism, DETR can capture long-range semantic dependencies between fruits and their background. This offers a new research pathway for detecting small objects in agriculture in complex environments. However, current DETR-based approaches still struggle to meet real-time field requirements due to limitations in inference efficiency and insufficient lightweight adaptability. <xref ref-type="bibr" rid="B42">Wang et&#xa0;al. (2021)</xref> introduced the SwinGD model, which enhances feature association capabilities through an optimised Transformer architecture, achieving an mAP50 of 94% for grape cluster detection in complex vineyard conditions. Nevertheless, the model relies heavily on deep semantic computation, resulting in reduced inference speed and limited computational efficiency. Consequently, it remains unsuitable for agricultural automation scenarios such as UAV-based real-time monitoring or synchronised detection in robotic harvesting systems, thereby constraining its practical deployment in orchard environments.</p>
<p>In summary, this study proposes an enhanced model, YOLO-TinyFuse, developed on the YOLOv8n baseline, to address the major technical bottlenecks in field-based olive-fruit detection. These include the loss of fine-grained small-object features during deep downsampling, the low efficiency of feature fusion under complex backgrounds, and the computational and endurance constraints of edge devices. This model is the first to integrate the ModifiedNeck structure (<xref ref-type="bibr" rid="B38">Subedi, 2024</xref>), the BiFPN bidirectional weighted feature fusion module (<xref ref-type="bibr" rid="B2">Ar&#x131;soy and Uysal, 2025</xref>; <xref ref-type="bibr" rid="B43">Wu, 2025</xref>; <xref ref-type="bibr" rid="B26">Li et&#xa0;al., 2024</xref>) and the P2 high-resolution feature layer (<xref ref-type="bibr" rid="B49">Zhu et&#xa0;al., 2025b</xref>; <xref ref-type="bibr" rid="B29">Ma et&#xa0;al., 2023</xref>). Specifically, the ModifiedNeck structure mitigates the issue of unidirectional information loss in traditional FPNs by introducing bidirectional pathways and channel unification strategies. The BiFPN module employs learnable weights to optimise cross-scale feature aggregation, while the P2 layer compensates for the absence of high-resolution representations, which are essential for the reliable detection of small objects. The synergistic interaction of these components establishes a comprehensive framework for phenotypic feature extraction, multi-scale fusion and optimised validation in YOLO-TinyFuse, effectively overcoming the key challenges inherent in olive fruit detection under real-world agricultural conditions.</p>
<p>This model innovatively integrates three core components: the P2 layer operates at a high resolution of 160&#xd7;160 and preserves fine-grained details after 4&#xd7; downsampling to enable reliable small object detection. The enhanced neck module effectively overcomes the inherent unidirectional information loss and redundancy issues of traditional FPN structures through bidirectional paths and channel unification. The bidirectional feature pyramid network module enhances the contribution of occlusion region features through dynamically weighted aggregation. This synergistic multi-module design achieves a 2.6% improvement in mAP50, a 3.2% increase in recall, and a 6.76% reduction in parameters, balancing detection performance, computational efficiency, and deployment costs.</p>
<p>To enhance model robustness under complex field conditions, a comprehensive scene-specific dataset was constructed covering three olive fruit development stages&#x2014;early, enlarged, and mature&#x2014;diverse lighting conditions including sunny, cloudy, foggy, and rainy environments, and extensive occlusion levels such as 20&#x2013;80% foliage coverage and 2&#x2013;6 fruit overlaps per cluster, significantly boosting model generalization.</p>
<p>With a lightweight configuration of only 2.96 million parameters and an inference speed of 18.6 frames per second, YOLO-TinyFuse can be directly deployed on edge computing platforms like the Raspberry Pi 4B and drone systems, meeting end-to-end detection needs across the entire olive production chain. Furthermore, its technical framework exhibits strong reusability, efficiently adapting to other crops including wheat and cherries, small target objects such as bees, and highly occluded crops like mangoes and apples. This drives the evolution of agricultural phenotyping detection from single-crop customization toward multi-crop universal applicability.</p>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Data and preprocessing</title>
<sec id="s2_1_1">
<label>2.1.1</label>
<title>Dataset construction</title>
<p>The images in this dataset were collected in collaboration with the Beihe and Yuehua olive plantations within the Xichang Olive Industry Science and Technology Demonstration Zone in Sichuan, China, at 102.24&#xb0;E and 27.74&#xb0;N. The images were acquired using a Nikon D7200 camera at a resolution of 6000 &#xd7; 4000 pixels in September 2024. The output resolution was set to 300 dpi with a colour depth of 24 bits and the sRGB colour mode. The camera was configured with an f/5.3 aperture, a 1/250 s exposure time and a 90 mm focal length to ensure clear visualisation of the key phenotypic features of the olives. These settings enabled effective separation of fruits from background vegetation under varying illumination conditions, while the controlled depth of field and precise focus highlighted fruit colour, texture, and morphological structure. This minimised detail loss caused by occlusion or rapid light fluctuations. Data collection encompassed three critical developmental stages&#x2014;young fruit, enlargement and maturity&#x2014;and included diverse illumination scenarios, such as midday sunlight, overcast conditions, and low-light evening environments. The dataset also captured representative orchard backgrounds, including dense branch-leaf occlusion, overlapping fruit clusters and single isolated fruits. This comprehensive, multi-scenario coverage avoids the limitations of generalization typically associated with single-condition datasets, providing a high-quality foundation for robust small-object detection and phenotypic feature extraction tasks in olive orchards.</p>
<p>The dataset was segmented using a stratified random sampling strategy. To prevent model overfitting caused by high inter-frame correlation, this strategy strictly adheres to the dual independence principle of &#x201c;tree individuals + scenes.&#x201d; Specifically, all olive trees collected from the Beihe and Yuehua plantations were first assigned unique identification numbers, grouping all images of each tree into independent &#x201c;individual clusters&#x201d;; Simultaneously, a three-dimensional stratification was applied based on &#x201c;growth stage + light conditions + shading level&#x201d; to ensure consistent scene distribution across the training, validation, and test sets. During segmentation execution, independent individual groups were first randomly sampled at a 7:2:1 ratio. Images within each group underwent secondary sampling based on scene dimensions, ultimately retaining 12,259 high-quality images: 8,549 (69.7%) for the training set, 2,442 (19.9%) for the validation set, and 1,268 (10.4%) for the test set. This segmentation method ensures that images from the same plant do not span across datasets, effectively reducing inter-frame correlation and guaranteeing the authenticity and reliability of model generalization capability assessment. <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref> shows representative samples from the dataset.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Representative samples and data processing results. <bold>(A)</bold> Young Fruit Stage. <bold>(B)</bold> Enlargement Stage. <bold>(C)</bold> Maturity Stage. <bold>(D)</bold> Sunlight. <bold>(E)</bold> Overcast. <bold>(F)</bold> Low-light evening. <bold>(G)</bold> Mosaic Enhanced. <bold>(H)</bold> Colour Space Adjustment.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1773377-g001.tif">
<alt-text content-type="machine-generated">Multi-panel figure showing representative samples and data processing. Panel A shows young green olives. Panel B presents enlarging developing olives. Panel C displays mature dark purple to black olives. Panel D illustrates olives in bright sunlight. Panel E depicts olives in cloudy conditions. Panel F shows olives in low-light evening. Panel G demonstrates mosaic augmentation. Panel H presents colour space adjustment in preprocessing.</alt-text>
</graphic></fig>
</sec>
<sec id="s2_1_2">
<label>2.1.2</label>
<title>Annotation protocol and quality control</title>
<p>Image annotation was conducted using the open-source LabelImg 1.8.6 tool, with annotations stored in .txt format. This ensures full compatibility with the YOLO family of detection algorithms and facilitates direct integration into subsequent model training workflows. Following annotation, a dual quality-control mechanism combining manual inspection and automated verification was implemented to ensure the reliability of the annotations. A 10% manual sampling rate was applied, paying particular attention to verifying the completeness and precision of annotations for small objects. Automated validation scripts were used to check the accuracy of the bounding box coordinates and class labels. This two-tiered quality control procedure resulted in final annotations for the dataset, thereby meeting the rigorous quality requirements necessary for high-performance model training.</p>
</sec>
<sec id="s2_1_3">
<label>2.1.3</label>
<title>Data preprocessing and augmentation strategies</title>
<p>To improve the robustness of the model in complex orchard environments while keeping computational costs under control, targeted pre-processing and augmentation procedures were applied to the training dataset. First, all images were uniformly resized to 640 &#xd7; 640 pixels using bilinear interpolation and normalised within the RGB colour space to ensure consistent input characteristics across samples. A series of scene-adaptive augmentation operations were then implemented to increase the model&#x2019;s resilience to variations in illumination, occlusion, and fruit morphology. These operations included Mosaic augmentation, horizontal flipping, geometric transformations, and HSV colour-space adjustments (<xref ref-type="bibr" rid="B20">Jing et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B26">Li et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B47">Zheng et&#xa0;al., 2025</xref>). It is important to note that Mixup augmentation and rotation-based transformations were intentionally excluded. Mixup can blend fruit and background pixels, which could degrade phenotypic feature extraction. Excessive rotation may also distort fruit morphology and introduce annotation misalignment. Both scenarios could adversely affect training stability and reduce detection accuracy. <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref> shows representative outputs of the applied augmentation strategies.</p>
</sec>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Overview of the YOLO-TinyFuse architecture</title>
<p>This study introduces an enhanced detection model, YOLO-TinyFuse, to address the limitations of the original YOLOv8n model, whose framework is depicted in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>. These limitations include a high missed-detection rate for small objects, low efficiency of feature fusion under complex orchard backgrounds, and substantial computational overhead. This new architecture incorporates three essential components for the first time: the ModifiedNeck, the BiFPN and the P2. YOLO-TinyFuse retains the three-stage Backbone-Neck-Head architecture typical of the YOLO series (<xref ref-type="bibr" rid="B35">Redmon et&#xa0;al., 2016</xref>; <xref ref-type="bibr" rid="B40">Ultralytics Team, 2023</xref>), while optimising the flow of feature information and the detection pipeline across three levels.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>YOLO-TinyFuse framework and details. The backbone network adds a P2 high-resolution layer, paired with the C2f module to retain fine textures, and expands the receptive field through SPPF to generate P2-P5 multi-scale features. The neck module first passes through the ModifiedNeck, uses 1x1 convolution to unify the channels, and transmits information through bidirectional cross-scale fusion. Then, BiFPN is used to introduce learnable weights, combined with depth-wise separable convolution to amplify the contribution of each scale and optimize feature aggregation. Detection head: A P2-P5 four-scale decoupled head is adopted, where the classification and regression branches operate independently, and NMS is used to suppress redundant predictions, achieving accurate recognition of small, medium, and large targets. Arrows indicate downsampling upsampling and aggregation flows.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1773377-g002.tif">
<alt-text content-type="machine-generated">Flowchart diagram of a neural network architecture for object detection, labeled with Backbone, Neck, and Head sections, showing sequences of convolution, pooling, addition, concatenation, and detection modules, with color coding and example input and output images of tree branches.</alt-text>
</graphic></fig>
<p>In the backbone, which is derived from the CSPDarknet structure of YOLOv8n, a new P2 feature layer has been introduced to retain more fine-grained details, which are essential for extracting the phenotypic features of small objects. Alongside the P3, P4 and P5 layers, the P2 layer forms a multi-scale feature pyramid that progressively captures information ranging from low-level textures to high-level semantic representations. These multi-scale features are then forwarded to the Neck.</p>
<p>The Neck is an improvement on the simplified FPN used in YOLOv8n, incorporating both the ModifiedNeck and BiFPN modules. The ModifiedNeck module performs preliminary cross-scale fusion through lateral connections, combining top-down and bottom-up information flows. The BiFPN module then conducts more refined feature aggregation based on this output, introducing learnable weighting mechanisms to achieve deep, scale-adaptive fusion and strengthen the expressiveness of the fused feature maps.</p>
<p>In the head, a dedicated P2 detection layer has been added for the specific purpose of detecting small objects, which substantially enhances the accuracy of detecting young olives. The P2, P3, P4 and P5 layers correspond to targets of different sizes. Their outputs are then merged and processed using non-maximum suppression (NMS) to generate the final predictions, which include bounding box coordinates, class labels and confidence scores.</p>
<p>Overall, YOLO-TinyFuse retains the real-time inference capabilities of the YOLO family, while also improving detection performance and model compactness. This dual optimisation makes the model highly suitable for practical field-scale olive-fruit detection in resource-constrained orchard environments.</p>
<sec id="s2_2_1">
<label>2.2.1</label>
<title>Backbone feature extraction: CSPDarknet</title>
<p>YOLO-TinyFuse uses a lightweight backbone based on the CSPDarknet architecture, whose structure is depicted in <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>. This architecture uses stacked multi-stage convolutional blocks and feature-aggregation modules to efficiently extract multi-scale representations from input images. Through progressive downsampling, the network constructs a hierarchical feature pyramid that preserves low-level textural details while simultaneously capturing high-level semantic information. This provides rich, multidimensional feature support for subsequent object-detection stages. The specific feature-extraction workflow is outlined as follows:</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>CSPDarknet structure. This streamlined backbone uses initial convolution to extract base textures. C2f blocks progressively yield P2-P5 multi-scale features. SPPF enlarges the receptive field. Downsampling ratios P2/4/8/16/32 correspond to detail and semantics suitable for different object sizes.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1773377-g003.tif">
<alt-text content-type="machine-generated">Diagram illustrating the CSPDarknet deep learning backbone. An input photo of olives on branches passes through sequential stages labeled P1 through P5, representing feature maps of decreasing spatial size, each indicated by arrows showing processing flow.</alt-text>
</graphic></fig>
<p>1. Initial feature mapping: The input is a 640 &#xd7; 640 &#xd7; 3 RGB image which is first processed by an initial convolutional layer comprising Conv + Batch Normalisation (BN) + Sigmoid Linear Unit (SiLU) activation. This expands the channel dimension from 3 to 16 and performs the first downsampling via stride-2 convolution, resulting in a feature map measuring 320 &#xd7; 320 &#xd7; 16.</p>
<p>2. Shallow Feature Extraction: After a stride-2 convolution operation increases the channel dimension to 32, the resulting feature map is fed into the first C2f module. This module employs a multi-branch parallel bottleneck architecture to refine the feature representation, ultimately generating the P2/4 feature layer (obtained through initial convolutional downsampling and refinement by the C2f module, with a resolution of 160&#xd7;160 pixels and 32 channels, corresponding to 4x downsampling). This 4x downsampling design precisely matches the 640&#xd7;640 input size: for olive fruits averaging less than 30px in size, a 20&#xd7;20 px small target corresponds to 5&#xd7;5 feature points, while an extremely small 10&#xd7;10 px target still corresponds to 2.5&#xd7;2.5 feature points. This effectively avoids the loss of fine-grained information compression caused by traditional 8x downsampling. Simultaneously, the P2 layer exhibits strong responsiveness to small target features, capturing core fine-grained, low-dimensional phenotypic characteristics such as texture and edges. The multi-branch parallel convolution structure of the C2f module further refines these feature representations, providing high-distinctiveness phenotypic information for small object detection. This successfully addresses the detection challenge posed by the low grayscale contrast (<italic>&lt;</italic>15) between olive fruits and surrounding foliage.</p>
<p>3. Progressive Multi-Scale Feature Generation: The feature map is downsampled further via a stride-2 convolution to 80 &#xd7; 80 &#xd7; 64, then processed through a C2f module to produce the P3/8 feature layer (obtained after downsampling via stride-2 convolution on the P2/4 feature layer and refinement by the C2f module; resolution 80&#xd7;80, 64 channels, corresponding to 8x downsampling; core feature representation tailored for medium-scale objects). This layer primarily targets medium-scale object representations. Repeating this sequence of downsampling and C2f refinement generates the P4/16 feature layer (obtained after downsampling via stride-2 convolution on the P3/8 feature layer and refinement through the C2f module; resolution 40&#xd7;40, 128 channels; corresponds to 16x downsampling; core layer for representing mid-level semantic and morphological features of medium-to-large-scale objects) and the P5/32 feature layer (obtained after downsampling via stride-2 convolution on the P4/16 feature layer and refinement through the C2f module; resolution 20&#xd7;20, 256 channels; corresponds to 32x downsampling; focuses on extracting high-dimensional global semantic features for large-scale targets; output fed into the SPPF module to enhance global feature representation). After the P5 layer, the C2f output is fed into the SPPF module (<xref ref-type="bibr" rid="B21">Jocher et&#xa0;al., 2020</xref>), which uses a kernel size of 5 and three consecutive max-pooling operations to perform multi-scale pooling. This process enriches receptive-field diversity and strengthens global semantic feature expression.</p>
<p>Finally, the backbone outputs four feature layers (P2, P3, P4 and P5) with spatial dimensions and channel depths of 160 &#xd7; 160 &#xd7; 32, 80 &#xd7; 80 &#xd7; 64, 40 &#xd7; 40 &#xd7; 128 and 20 &#xd7; 20 &#xd7; 256 respectively. These layers cover downsampling ratios ranging from 4&#xd7; to 32&#xd7;, enabling the learning of dedicated representations for small, medium, and large targets. Together, they provide a hierarchical, structured foundation of features for subsequent multi-scale fusion in the Neck module.</p>
</sec>
<sec id="s2_2_2">
<label>2.2.2</label>
<title>Neck multi-scale feature fusion</title>
<p>This study introduces a two-stage fusion architecture composed of the ModifiedNeck and BiFPN modules to address the limitations of the simplified FPN structure in the original YOLOv8n, specifically insufficient information flow across scales and suboptimal weighting of multi-scale features. This architecture markedly enhances the efficiency and accuracy of cross-scale feature integration through a progressive fusion strategy, thereby improving the model&#x2019;s capability for robust small-object detection in complex orchard environments.</p>
<sec id="s2_2_2_1">
<label>2.2.2.1</label>
<title>ModifiedNeck feature fusion module</title>
<p>The design of the ModifiedNeck module focuses on bidirectional path interaction, channel unification and residual enhancement, whose structure is depicted in <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>. It receives four feature layers from the Backbone, which are processed through a channel unification operation to produce standardised feature maps (P2_out, P3_out, P4_out and P5_out) with a depth of 64 channels (<xref ref-type="bibr" rid="B2">Ar&#x131;soy and Uysal, 2025</xref>). <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref> illustrates the overall structure of the module, and its implementation involves four core steps, which are outlined below:</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>ModifiedNeck structure of the feature fusion module. P2 to P5 channels are unified to 64 via 1x1 convolution. Features are then fused by dual top-down and bottom-up paths. Upsampling passes semantics downward while downsampling feeds details upward. C2f refines the fused maps producing a unified P2 out to P5 out pyramid.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1773377-g004.tif">
<alt-text content-type="machine-generated">Diagram illustrating a neural network architecture called ModifiedNeck, divided into  Channel unification, Top-Down path, and Bottom-Up path. It shows hierarchical feature maps P1 to P5 processed through convolution, upsampling, concatenation, and combination operations, with outputs labeled P2_out to P5_out.</alt-text>
</graphic></fig>
<p>1. Lateral connections and channel unification: To mitigate fusion difficulties arising from substantial discrepancies in channel dimensions across feature layers, the channel depth of each input feature map is first standardised using a 1 &#xd7; 1 convolution. This operation uniformly adjusts the P2, P3, P4 and P5 feature layers to 64 channels. This reduces the computational burden of subsequent fusion operations and ensures dimensional consistency across all feature representations. This process can be expressed mathematically in <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>:</p>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mrow><mml:msubsup><mml:mi>P</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mtext>lateral</mml:mtext></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>P</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mtext>backbone</mml:mtext></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>Among them, <inline-formula>
<mml:math display="inline" id="im1"><mml:mrow><mml:msubsup><mml:mi>P</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mtext>backbone</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> denotes the i-th feature map produced by the Backbone (i=2,3,4,5); <inline-formula>
<mml:math display="inline" id="im2"><mml:mrow><mml:msub><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mo>&#xb7;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> represents the 1 &#xd7; 1 convolution operation; <inline-formula>
<mml:math display="inline" id="im3"><mml:mrow><mml:msubsup><mml:mi>P</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mtext>lateral</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> refers to the lateral feature map obtained after channel unification.</p>
<p>2. Top-Down path: P5 is first upsampled by a factor of two and then fused with the corresponding <inline-formula>
<mml:math display="inline" id="im4"><mml:mrow><mml:msubsup><mml:mi>P</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mtext>lateral</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> through element-wise addition. The fused features are subsequently refined using a C2f module, enabling effective downward propagation of semantic information. This process is formally expressed in <xref ref-type="disp-formula" rid="eq2">Equation 2</xref>:</p>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mrow><mml:msubsup><mml:mi>P</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mtext>td</mml:mtext></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mtext>C</mml:mtext><mml:mn>2</mml:mn><mml:mtext>f</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>P</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mtext>lateral</mml:mtext></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:mtext>Upsample</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mtext>td</mml:mtext></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>Among them, Upsample(&#xb7;) denotes the two-fold bilinear interpolation for upsampling; C2f(&#xb7;) refers to the C2f-based feature enhancement module.</p>
<p>3. Bottom-Up pathway: starting from <inline-formula>
<mml:math display="inline" id="im5"><mml:mrow><mml:msubsup><mml:mi>P</mml:mi><mml:mn>2</mml:mn><mml:mrow><mml:mtext>td</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>, the feature map is downsampled using a stride-2 3&#xd7;3 convolution and then concatenated with the corresponding <inline-formula>
<mml:math display="inline" id="im6"><mml:mrow><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mtext>lateral</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>, resulting in a combined feature map with 128 channels. This fused representation is then enhanced through a C2f module and reduced back to 64 channels, enabling effective upward feedback of fine-grained information. The process is mathematically expressed in <xref ref-type="disp-formula" rid="eq3">Equation 3</xref>:</p>
<disp-formula id="eq3"><label>(3)</label>
<mml:math display="block" id="M3"><mml:mrow><mml:msubsup><mml:mi>P</mml:mi><mml:mi>i</mml:mi><mml:mrow><mml:mtext>out</mml:mtext></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mtext>C</mml:mtext><mml:mn>2</mml:mn><mml:mtext>f</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:mtext>Concat</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mtext>lateral</mml:mtext></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mtext>Downsample</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mtext>out</mml:mtext></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>Among these, Downsample(&#xb7;) denotes the stride-2 3 &#xd7; 3 convolution for downsampling; Concat(&#xb7;) refers to the feature concatenation operation; C2f(&#xb7;) indicates the C2f enhancement and fusion module.</p>
<p>4. Lightweight optimisation: After the bidirectional fusion process is complete, each of the four feature layers is passed through an output projection stage for final refinement. This produces a unified, 64-channel, multi-scale feature pyramid consisting of P2_out, P3_out, P4_out and P5_out.</p>
<p>These optimisations substantially improve the efficiency of feature fusion and enhance the representational capacity of features across different spatial scales through the ModifiedNeck module.</p>
</sec>
<sec id="s2_2_2_2">
<label>2.2.2.2</label>
<title>BiFPN weighted fusion mechanism</title>
<p>The BiFPN module was modified by translating the original P3&#x2013;P7 feature hierarchy into a P2&#x2013;P5 configuration in order to better accommodate the requirements of small-object detection. The adapted BiFPN operates on the 64-channel feature maps produced by the ModifiedNeck, i.e. <inline-formula>
<mml:math display="inline" id="im7"><mml:mrow><mml:msubsup><mml:mi>P</mml:mi><mml:mn>2</mml:mn><mml:mrow><mml:mtext>out</mml:mtext></mml:mrow></mml:msubsup><mml:mo>&#x223c;</mml:mo><mml:msubsup><mml:mi>P</mml:mi><mml:mn>5</mml:mn><mml:mrow><mml:mtext>out</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>, and employs bidirectional iterative fusion together with a learnable weighting mechanism to efficiently aggregate features across different scales (<xref ref-type="bibr" rid="B25">Li et&#xa0;al., 2023a</xref>). The resulting architecture is depicted in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref>, and its key features are summarised below:</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>A comparison diagram of the original and improved BiFPN structures. <bold>(A)</bold> shows the original BiFPN based on P3-P7. <bold>(B)</bold> presents the improved version which shifts to P2-P5. It adds learnable weights and depthwise separable convolutions to reduce computation and enhance high-resolution small-object features achieving more balanced bidirectional fusion.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1773377-g005.tif">
<alt-text content-type="machine-generated">Network diagrams comparing original and improved BiFPN structures. Panel A shows the original BiFPN structure using P3-P7 feature levels. Panel B shows the improved BiFPN structure using P2-P5 feature levels. Circular nodes represent feature levels, while colored circles within dashed outlines denote repeated blocks and their cross-level connections.</alt-text>
</graphic></fig>
<p>1. Core computational logic: For the input feature maps <italic>P<sub>i</sub></italic>, <italic>i</italic> &#x2208; {2, 3, 4, 5}, learnable scalar weights <italic>w<sub>i</sub></italic> are applied and normalised to compute a weighted sum. This adaptively amplifies the contribution of the high-resolution P2 features for small object regions and the deep semantic P5 features for complex background regions (<xref ref-type="bibr" rid="B4">Bonte et&#xa0;al., 2023</xref>). The core computational formulation is expressed in <xref ref-type="disp-formula" rid="eq4">Equation 4</xref>:</p>
<disp-formula id="eq4"><label>(4)</label>
<mml:math display="block" id="M4"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mtext>out</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mo>&#x2211;</mml:mo><mml:mi>i</mml:mi></mml:msub><mml:msub><mml:mi>w</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:msub><mml:mi>P</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mo>&#x2211;</mml:mo><mml:mi>i</mml:mi></mml:msub><mml:msub><mml:mi>w</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:mi>&#x3f5;</mml:mi></mml:mrow></mml:mfrac><mml:mo>,</mml:mo><mml:mtext>&#x2003;</mml:mtext><mml:msub><mml:mi>w</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x2265;</mml:mo><mml:mn>0</mml:mn></mml:mrow></mml:math>
</disp-formula>
<p>Among these, <italic>w<sub>i</sub></italic> denotes learnable scalar weights constrained to be non-negative via ReLU activation during training, thus avoiding negative contributions to feature fusion (<xref ref-type="bibr" rid="B15">He et&#xa0;al., 2016</xref>; <xref ref-type="bibr" rid="B26">Li et&#xa0;al., 2024</xref>); <italic>&#x3f5;</italic> denotes a small constant (set to 1 &#xd7; 10<sup>&#x2212;4</sup> to prevent division by zero and ensure numerical stability); <inline-formula>
<mml:math display="inline" id="im8"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> represents input feature maps at different spatial resolutions; <inline-formula>
<mml:math display="inline" id="im9"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mtext>out</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> denotes the fused output feature map.</p>
<p>2. Bidirectional fusion mechanism: The BiFPN implements top-down and bottom-up pathways to propagate features hierarchically. In the top-down pathway, higher-level feature maps are upsampled using nearest-neighbour interpolation and subsequently fused with intermediate and lower-level feature maps via weighted aggregation. In the bottom-up pathway, lower-level features are downsampled using 3&#xd7;3 max pooling with a stride of 2, and are then subjected to three-way weighted fusion with the corresponding lateral connections and the incoming top-down features. Each fusion node is followed by a depthwise separable convolution and a SiLU activation function (<xref ref-type="bibr" rid="B34">Ramachandran et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B17">Howard et&#xa0;al., 2017</xref>) to refine the fused representations while maintaining a 64-channel output.</p>
<p>3. Advantages of Hierarchical Adjustment: Compared with the original BiFPN design, which spans P3 - P7, the P2 - P5 configuration offers several advantages. Including the P2 layer preserves higher spatial resolution, enabling the retention of fine-grained details of small targets. The removal of P6 and P7 reduces computational overhead while retaining sufficient semantic richness in the P5 layer. Consequently, the four-tier P2 - P5 feature pyramid strikes a better balance between detection accuracy and computational efficiency, making it more suitable for lightweight, small-object detection in complex orchard environments.</p>
<p>4. Lightweight design: Standard convolutional operations have been replaced by depthwise separable convolutions, which substantially reduces the number of parameters. A shared-weight strategy has also been employed to decrease storage overhead and computational redundancy, thereby improving the model&#x2019;s suitability for deployment on resource-constrained edge platforms. The BiFPN produces a four-level, 64-channel, enhanced, multi-scale feature pyramid, which is then sent to the detection head for subsequent bounding-box regression and classification.</p>
</sec>
</sec>
<sec id="s2_2_3">
<label>2.2.3</label>
<title>Head: P2 detection layer</title>
<p>To address the original YOLOv8n model&#x2019;s limited sensitivity to small objects, a dedicated P2 detection layer was introduced, along with a four-scale detection architecture comprising P2 - P3 - P4 - P5, to ensure precise and stratified coverage of targets of different sizes. To improve detection accuracy for young olive fruits, the design of the detection head was decoupled (<xref ref-type="bibr" rid="B23">Li et&#xa0;al., 2022</xref>), whereby each scale-specific output is processed by independent classification and regression branches. Specifically, the feature maps for each detection scale are routed to a classification branch that performs class probability estimation and a regression branch that predicts bounding-box offsets and objectness scores. Both branches employ lightweight convolutional blocks and scale-appropriate anchors to preserve inference efficiency while enhancing localisation and classification performance. The detailed architecture is presented below:</p>
<p>1. Multi-scale detection layer design: The detection head receives four 64-channel feature maps from the BiFPN, with each feature map corresponding to a specific target size range: the P2 layer focuses on ultra-small targets (less than 32&#xd7;32 pixels), the P3 layer processes small-sized targets (32&#xd7;32 pixels to 64&#xd7;64 pixels), the P4 layer is responsible for medium-sized targets (64&#xd7;64 pixels to 128&#xd7;128 pixels), and the P5 layer detects large-sized targets (greater than 128&#xd7;128 pixels). The 160&#xd7;160 high-resolution feature representation provided by the P2 layer significantly enhances the architecture&#x2019;s spatial localization accuracy and feature representation capability for small targets. The collaborative design of the P2 detection layer and the decoupled detection head further ensures the efficient utilization of fine-grained details &#x2014; the regression branch gradually optimizes the localization accuracy of small target bounding boxes through a three-layer convolutional structure, effectively avoiding feature conflicts with the classification task. The classification branch focuses on capturing fine-grained phenotypic features retained by the P2 layer, such as fruit peel texture and local grayscale variations. This enables effective distinction from background vegetation even when the target size is less than 30 pixels.</p>
<p>2. Bounding-box regression branch: An independent regression branch is instantiated for each of the four detection scales to process the 64-channel input feature maps. Each branch uses a three-layer convolutional sequence consisting of a 3 &#xd7; 3 convolution, a second 3&#xa0;&#xd7; 3 convolution and a final 1 &#xd7; 1 convolution. The first 3&#xd7;3 layer preserves the 64-channel dimensionality while extracting spatial features, the second 3&#xd7;3 layer further strengthens the representation of local features, and the final 1&#xd7;1 layer produces a 64 channel tensor that encodes the prediction features of the bounding box. Progressively refining localisation information through stacked convolutions improves the accuracy of bounding-box regression.</p>
<p>3. Classification prediction branch: This operates in parallel with the regression branch and uses a three-layer convolutional architecture to predict classes. The initial 3 &#xd7; 3 convolution increases the number of channels from 64 to 80, and the subsequent 3 &#xd7; 3 convolution further refines the feature representation. A final 1 &#xd7; 1 convolution then produces an 80-channel tensor of class-confidence scores. Having an independent classification branch reduces feature conflict between the classification and regression tasks, thereby enhancing the overall performance of multi-task learning.</p>
<p>4. Advantages of the decoupled head: Compared with conventional shared-convolution architectures, the decoupled detection head implements separate feature-extraction pathways for classification and regression. This enables the model to learn task-specific representations. The regression branch is optimised for precise bounding-box localisation and therefore emphasises spatial sensitivity, whereas the classification branch is optimised for semantic discrimination and thus emphasises feature separability. By isolating these tasks, mutual interference is avoided, leading to improved detection accuracy and faster convergence during training. This design also allows each branch to be optimised for computational efficiency, which is beneficial for deployment on resource-constrained edge platforms.</p>
<p>5. Loss computation and post-processing: Outputs from the regression branch are decoded into bounding-box coordinates using Distribution Focal Loss (DFL) decoding (<xref ref-type="bibr" rid="B24">Li et&#xa0;al., 2020</xref>), whereas outputs from the classification branch are converted into class probabilities via Sigmoid activation. During training, positive and negative samples are assigned using the Task-Aligned Assigner (<xref ref-type="bibr" rid="B11">Feng et&#xa0;al., 2021</xref>), which considers classification confidence and IoU quality together to make more reliable sample selections. During inference, predictions from the four detection scales are decoded in an anchor-free manner (<xref ref-type="bibr" rid="B39">Tian et&#xa0;al., 2019</xref>), and redundant detections are suppressed using non-maximum suppression (NMS) (<xref ref-type="bibr" rid="B10">Felzenszwalb et&#xa0;al., 2009</xref>) to produce the final set of detections. The four-scale detection architecture enables the model to capture targets across multiple spatial resolutions. The inclusion of the high-resolution P2 layer notably improves the Recall and localisation accuracy of small objects.</p>
</sec>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Evaluation indicators</title>
<p>Accurate detection of olives requires a balance of localisation accuracy, completeness of detection, reliability of results and lightweight deployability; therefore, no single metric can comprehensively characterise model performance. Guided by the standard evaluation criteria used in the object detection community, as well as the specific requirements of olive fruit detection &#x2014; namely, the presence of a high proportion of small targets and complex backgrounds, and the need for edge deployment &#x2014; this study adopts seven core metrics: Precision, Recall, the F1 score, mean average precision at 50% (mAP50), parameter count, frames per second (FPS), and giga floating-point operations (GFLOPs). These metrics enable a multifaceted assessment of the proposed YOLO-TinyFuse model across six dimensions: localisation performance, detection accuracy, overall effectiveness, model complexity, real-time capability and computational efficiency. The corresponding mathematical definitions are given in <xref ref-type="disp-formula" rid="eq5">Equations 5</xref>&#x2013;<xref ref-type="disp-formula" rid="eq11">11</xref>:</p>
<disp-formula id="eq5"><label>(5)</label>
<mml:math display="block" id="M5"><mml:mrow><mml:mtext>Precision</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq6"><label>(6)</label>
<mml:math display="block" id="M6"><mml:mrow><mml:mtext>Recall</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq7"><label>(7)</label>
<mml:math display="block" id="M7"><mml:mrow><mml:mtext>F</mml:mtext><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mtext>Score</mml:mtext><mml:mo>=</mml:mo><mml:mn>2</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mfrac><mml:mrow><mml:mtext>Precision</mml:mtext><mml:mo>&#xd7;</mml:mo><mml:mtext>Recall</mml:mtext></mml:mrow><mml:mrow><mml:mtext>Precision</mml:mtext><mml:mo>+</mml:mo><mml:mtext>Recall</mml:mtext></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq8"><label>(8)</label>
<mml:math display="block" id="M8"><mml:mrow><mml:mtext>mAP</mml:mtext><mml:mn>50</mml:mn><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>N</mml:mi></mml:mfrac><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:munderover><mml:mi>A</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mtext>IoU</mml:mtext><mml:mo>=</mml:mo><mml:mn>0.5</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mtext>&#x2003;</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:mi>N</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#xa0;</mml:mo><mml:mtext>Only&#x2004;the&#x2004;category&#x2004;of&#x2004;oil&#x2004;olives</mml:mtext><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq9"><label>(9)</label>
<mml:math display="block" id="M9"><mml:mrow><mml:mtext>Parameters</mml:mtext><mml:mo>=</mml:mo><mml:mtext>out</mml:mtext><mml:mo>_</mml:mo><mml:mtext>channels</mml:mtext><mml:mo>&#xd7;</mml:mo><mml:mtext>in</mml:mtext><mml:mo>_</mml:mo><mml:mtext>channels</mml:mtext><mml:mo>&#xd7;</mml:mo><mml:mi>k</mml:mi><mml:mi>h</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>k</mml:mi><mml:mi>w</mml:mi><mml:mo>+</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mtext>out</mml:mtext><mml:mo>_</mml:mo><mml:mtext>channels&#x2004;if&#x2004;bias</mml:mtext><mml:mo>=</mml:mo><mml:mtext>True&#x2004;else&#x2004;</mml:mtext><mml:mn>0</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq10"><label>(10)</label>
<mml:math display="block" id="M10"><mml:mrow><mml:mtext>FPS</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mtext>total</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>T</mml:mi><mml:mrow><mml:mtext>infer</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq11"><label>(11)</label>
<mml:math display="block" id="M11"><mml:mrow><mml:mtext>GFLOPs</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>l</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>L</mml:mi></mml:msubsup><mml:mtext>FLOP</mml:mtext><mml:msub><mml:mi>s</mml:mi><mml:mi>l</mml:mi></mml:msub></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mn>10</mml:mn></mml:mrow><mml:mn>9</mml:mn></mml:msup></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<p>Here, <italic>TP</italic> denotes the number of instances of olive fruit correctly identified by the model. <italic>FP</italic> denotes the number of negative samples, such as branches, specular highlights or other elements of the orchard background, incorrectly classified as olive fruit. <italic>FN</italic> denotes the number of instances of olive fruit incorrectly classified as non-olive; <inline-formula>
<mml:math display="inline" id="im10"><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mtext>total</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> denotes the total number of frames used for FPS evaluation; <inline-formula>
<mml:math display="inline" id="im11"><mml:mrow><mml:msub><mml:mi>T</mml:mi><mml:mrow><mml:mtext>infer</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> denotes the total inference time required by the model to process <inline-formula>
<mml:math display="inline" id="im12"><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mtext>total</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> frames. GFLOPs are calculated by summing the floating-point operations of all layers&#xa0;in the model, where each convolutional layer contributes 2&#xa0;&#xd7; C_out &#xd7; C_in &#xd7; k_h &#xd7; k_w &#xd7; H_out &#xd7; W_out operations, and dividing by 10<sup>9</sup>. <italic>L</italic> denotes the number of layers in the model. <italic>FLOPs<sub>l</sub></italic> denotes the floating-point operations of the <italic>l</italic>-th layer.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Experimental evaluation</title>
<sec id="s3_1">
<label>3.1</label>
<title>Experimental environment</title>
<p>The experimental setup is shown in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Experimental environment configuration.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Category</th>
<th valign="middle" align="center">Configuration item</th>
<th valign="middle" align="center">Specific parameters</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Hardware Specifications</td>
<td valign="middle" align="center">CPU<break/>RAM<break/>GPU<break/>GPU Memory</td>
<td valign="middle" align="center">Intel (R) Core (TM) i9-13900K<break/>(Base Frequency 3.00GHz)<break/>64GB<break/>NVIDIA GeForce RTX 4090<break/>24GB</td>
</tr>
<tr>
<td valign="middle" align="left">Software Environment</td>
<td valign="middle" align="center">Operating System<break/>Programming Language<break/>Deep Learning Framework<break/>CUDA Version</td>
<td valign="middle" align="center">Linux<break/>Python3.8<break/>PyTorch 2.1.0<break/>11.8</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Algorithm training</title>
<p>To address the issues of loss of fine-scale features, poor occlusion robustness and interference from complex backgrounds encountered in olive fruit detection, an improved YOLOv8n-based model was developed that integrates the ModifiedNeck, BiFPN and a dedicated P2 high-resolution detection layer. This enhanced model was then trained and compared with the original baseline model. The model&#x2019;s hyperparameters were selected to optimise scene adaptability and overall performance. Training was conducted for 100 epochs with a batch size of eight and an input resolution of 640 &#xd7; 640 pixels to balance phenotypic feature retention and computational efficiency (<xref ref-type="bibr" rid="B14">He et&#xa0;al., 2022</xref>). The optimiser was configured in auto mode with an initial learning rate of 0.01, a decay coefficient of 0.01, momentum of 0.937 and weight decay of 0.0005 to promote the efficient learning of features of small objects. Data augmentation settings included a mosaic factor of 1.0, a 50% probability of horizontal flipping, and HSV colour space adjustments. Mixup was disabled (mixup = 0.0) to prevent mixing of fruit and background pixels. These settings reduce confusion of small fruit features and effectively increase the relative representation of small target instances, thereby improving the model&#x2019;s robustness in complex field environments.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Experimental results and analysis</title>
<sec id="s3_3_1">
<label>3.3.1</label>
<title>Comparative experiment</title>
<p>The comparative set comprised several representative detection architectures. DETR-R50 uses a ResNet-50 backbone and its four hierarchical feature maps are reduced in channel dimensionality using 1 &#xd7; 1 convolutions. These are then concatenated to form a single-scale feature map with a stride of 32, which is then input to the Transformer encoder. DETR-R50-DC5 (<xref ref-type="bibr" rid="B6">Carion et&#xa0;al., 2020</xref>) is also based on ResNet-50, but it uses a DC5 modification which removes conv5 x downsampling in order to preserve a single-scale feature map with a stride of 16. YOLOv5n (<xref ref-type="bibr" rid="B21">Jocher et&#xa0;al., 2020</xref>) uses a CSPDarknet backbone, combining CSP modules with an FPN+PAN neck to balance computational cost while achieving multi-scale feature fusion. YOLOv9t (<xref ref-type="bibr" rid="B41">Wang et&#xa0;al., 2024</xref>) further refines feature propagation on an enhanced CSPDarknet backbone by adjusting the width of the channels and the size of the convolutional kernels to balance inference speed and representational capacity. YOLOv11n (<xref ref-type="bibr" rid="B22">Jocher et&#xa0;al., 2024</xref>) uses a modified CSPDarknet backbone with simplified structures to reduce computational overhead. It also uses an optimised FPN+PAN and detection-head hierarchy to improve detection of small objects. Comparative results are presented in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>, which reports Precision, Recall, GFLOPs and mAP50 for each model, and visually demonstrates the relative detection advantages of YOLO-TinyFuse.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Comparison of different object detection models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Model</th>
<th valign="middle" align="center">Precision(%)&#x2191;</th>
<th valign="middle" align="center">Recall(%)&#x2191;</th>
<th valign="middle" align="center">GFLOPs&#x2193;</th>
<th valign="middle" align="center">mAP50(%)&#x2191;</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Yolov5n</td>
<td valign="middle" align="center">85.3</td>
<td valign="middle" align="center">81.0</td>
<td valign="middle" align="center">7.18</td>
<td valign="middle" align="center">89.4</td>
</tr>
<tr>
<td valign="middle" align="left">Yolov9t</td>
<td valign="middle" align="center">84.8</td>
<td valign="middle" align="center">80.4</td>
<td valign="middle" align="center">8.48</td>
<td valign="middle" align="center">88.7</td>
</tr>
<tr>
<td valign="middle" align="left">Yolov11n</td>
<td valign="middle" align="center">85.7</td>
<td valign="middle" align="center">81.3</td>
<td valign="middle" align="center"><bold>6.44</bold></td>
<td valign="middle" align="center">89.7</td>
</tr>
<tr>
<td valign="middle" align="left">DetrR50</td>
<td valign="middle" align="center">34.4</td>
<td valign="middle" align="center">79.3</td>
<td valign="middle" align="center">36.82</td>
<td valign="middle" align="center">68.5</td>
</tr>
<tr>
<td valign="middle" align="left">DetrR50-Dc5</td>
<td valign="middle" align="center">33.4</td>
<td valign="middle" align="center">78.2</td>
<td valign="middle" align="center">62.32</td>
<td valign="middle" align="center">66.3</td>
</tr>
<tr>
<td valign="middle" align="left">YOLO-TinyFuse</td>
<td valign="middle" align="center"><bold>86.9</bold></td>
<td valign="middle" align="center"><bold>84.5</bold></td>
<td valign="middle" align="center">20.41</td>
<td valign="middle" align="center"><bold>92.3</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The bold value means the highest value, the symbol (&#x2191;) means the higher value is better, and the symbol (&#x2193;) means the lower value is better.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>Comparative experiments demonstrate that YOLO-TinyFuse achieves superior performance to DETR-R50-DC5, DETR-R50, YOLOv5n, YOLOv9t and YOLOv11n in terms of Precision, Recall and mAP50 metrics. YOLO-TinyFuse achieves an mAP50 of 0.923, indicating substantially higher detection accuracy at an intersection over union (IoU) of 0.5 than the evaluated baselines. The model achieves a GFLOPs of 20.41, indicating substantially lower computational cost than the two DETR-based baselines, although its computational cost remains higher than that of the lightweight YOLO baselines. The model&#x2019;s Recall is 0.845, reflecting a significant reduction in missed detections of true targets. Meanwhile, its Precision is 0.869, indicating a low false-positive rate, thereby improving the validity and reliability of the detection outputs.</p>
<p>Convergence analysis based on comparative experiments indicates that YOLO-TinyFuse achieves an mAP50 of 0.923. This corresponds to increases of 26.0, 23.8, 2.9, 3.6 and 2.6 percentage points relative to DETR-R50-DC5, DETR-R50, YOLOv5n, YOLOv9t and YOLOv11n respectively. As shown in <xref ref-type="fig" rid="f6"><bold>Figure&#xa0;6</bold></xref>, the mAP50 trajectories recorded during training emphasise YOLO-TinyFuse&#x2019;s superiority in accurate recognition and localisation, and demonstrate a marked improvement in overall detection performance.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>mAP50 comparison of six object detection models. Performance on the olive test set shows YOLO-TinyFuse outperforms DETR-R50, DETR-R50-Dc5, YOLOv5n, YOLOv9t and YOLOv11n. This validates the effectiveness of the proposed modules.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1773377-g006.tif">
<alt-text content-type="machine-generated">Line chart comparing mean Average Precision at fifty percent (mAP50) over one hundred epochs for six object detection models, showing YOLO-TinyFuse with the highest final mAP50 of ninety-two point three percent, followed by Yolov11n, Yolov9t, and Yolov5n, while DetrR50 and DetrR50-Dc5 achieve the lowest values.</alt-text>
</graphic></fig>
</sec>
<sec id="s3_3_2">
<label>3.3.2</label>
<title>Comparison of YOLO-TinyFuse with the YOLOv8n baseline model</title>
<p>The detection performance of the different model variants was evaluated using core metrics on the test set. <xref ref-type="fig" rid="f7"><bold>Figure&#xa0;7</bold></xref> shows how YOLOv8n and YOLO-TinyFuse compare across key indicators, including mAP50, mAP50-95, F1 score and parameter count. YOLO-TinyFuse shows improvements of 2.6% in mAP50, 3.5% in mAP50&#x2013;95 and 3.7% in the F1 score compared with YOLOv8n, while reducing the number of parameters by 6.33%. These results suggest that YOLO-TinyFuse offers superior detection performance and greater model compactness than YOLOv8n.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Performance comparison of YOLO-TinyFuse and YOLOv8n. <bold>(A)</bold> Overall performance metrics. <bold>(B)</bold> Performance by occlusion level and object size. YOLO-TinyFuse improves mAP50 by 2.6%, mAP50&#x2013;95 by 3.5%, and F1 by 3.7% compared to YOLOv8n, while reducing parameters by 6.3%, demonstrating a balance of accuracy and compactness. The analysis of mAP50 across target sizes (<italic>&lt;</italic>16 px, 16&#x2013;32 px, <italic>&gt;</italic>32 px) demonstrates that the P2 layer effectively addresses small object detection challenges. Furthermore, YOLO-TinyFuse achieves higher mAP50 values across all occlusion levels (<italic>&lt;</italic>20%, 20&#x2013;50%, 50&#x2013;80%, <italic>&gt;</italic>80%), which confirms that the BiFPN module enhances detection robustness under varying occlusion conditions.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1773377-g007.tif">
<alt-text content-type="machine-generated">Radar chart comparison of YOLO-TinyFuse and YOLOv8n models on two metrics sets: Chart A shows model metrics (mAP50, mAP50-95, Recall, Precision, F1, Parameters) with YOLO-TinyFuse outperforming YOLOv8n in most categories. Chart B demonstrates performance based on occlusion level and object size, with YOLO-TinyFuse generally achieving higher scores than YOLOv8n across all segments.</alt-text>
</graphic></fig>
<p>In addition, <xref ref-type="fig" rid="f7"><bold>Figure&#xa0;7</bold></xref> further analyzes performance by object size (less than 16 px, 16&#x2013;32 px, more than 32 px) and occlusion level (less than 20%, 20-50%, 50-80%, more than 80%). The results indicate that YOLO-TinyFuse consistently outperforms YOLOv8n across all object size categories, which demonstrates that the P2 layer effectively mitigates the challenges of small object detection. Meanwhile, YOLO-TinyFuse achieves higher mAP50 values across all occlusion levels, which confirms that the BiFPN module enhances detection robustness under varying occlusion conditions.</p>
<p>To provide deeper insights into detection failures and validate the effectiveness of the P2 + BiFPN combination in addressing small object and occlusion challenges, a comprehensive error analysis was conducted following the standard COCO evaluation methodology. As shown in <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8</bold></xref>, the error analysis categorizes detection failures into four types: missed detections, background errors, localization errors, and correct detections. Compared with YOLOv8n, YOLO-TinyFuse achieves optimization in detection error metrics: missed detections reduced from 47.4% to 47.2%, localization errors reduced from 42.7% to 42.5%, and correct detections increased from 1.1% to 1.4%. Although the improvements are modest, the systematic reduction in error rates, particularly in missed detections and localization errors, validates that the P2 + BiFPN combination effectively mitigates small object detection challenges and occlusion problems by enhancing feature representation and cross-scale fusion.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>COCO-style error analysis comparison: YOLOv8n vs YOLO-TinyFuse. The error analysis categorizes detection failures into four types following COCO evaluation methodology: Missed detections, Background errors, Localization errors, Correct detections.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1773377-g008.tif">
<alt-text content-type="machine-generated">Bar chart comparing error types between YOLOv8 and YOLO-TinyFuse object detection models. Missed Detections are highest around forty-seven percent for both, Localization Errors are about forty-two percent, Background Errors are nearly nine percent, and Correct Detections are lowest, about one percent.</alt-text>
</graphic></fig>
</sec>
<sec id="s3_3_3">
<label>3.3.3</label>
<title>Ablation experiment</title>
<p>The experiment was designed to quantify the performance contributions of three modifications: including a P2 high-resolution layer to improve the representation of small objects, incorporating a ModifiedNeck to optimise cross-scale feature fusion and integrating a BiFPN module to strengthen multi-scale feature interaction. A set of ablation experiments was conducted using YOLOv8n as the baseline to evaluate the independent effects and synergistic impact of these components. Each model variant was evaluated using the validation set, and the influence of each module on Precision, Recall, mAP50 and mAP50&#x2013;95 was measured. The quantitative results are presented in <xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref>.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Ablation study results.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">BiFPN</th>
<th valign="middle" align="center">ModifiedNeck</th>
<th valign="middle" align="center">P2</th>
<th valign="middle" align="center">Precision(%)&#x2191;</th>
<th valign="middle" align="center">Recall(%)&#x2191;</th>
<th valign="middle" align="center">mAP50(%)&#x2191;</th>
<th valign="middle" align="center">mAP50-95(%)&#x2191;</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">85.3</td>
<td valign="middle" align="center">81.3</td>
<td valign="middle" align="center">89.7</td>
<td valign="middle" align="center">68.3</td>
</tr>
<tr>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">85.5</td>
<td valign="middle" align="center">83.1</td>
<td valign="middle" align="center">91.1</td>
<td valign="middle" align="center">68.7</td>
</tr>
<tr>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">85.2</td>
<td valign="middle" align="center">82.4</td>
<td valign="middle" align="center">90.6</td>
<td valign="middle" align="center">68.0</td>
</tr>
<tr>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">85.5</td>
<td valign="middle" align="center">82.9</td>
<td valign="middle" align="center">91.0</td>
<td valign="middle" align="center">69.0</td>
</tr>
<tr>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">85.9</td>
<td valign="middle" align="center">84.3</td>
<td valign="middle" align="center">92.0</td>
<td valign="middle" align="center">71.4</td>
</tr>
<tr>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">86.7</td>
<td valign="middle" align="center">84.3</td>
<td valign="middle" align="center">92.2</td>
<td valign="middle" align="center">71.3</td>
</tr>
<tr>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">85.2</td>
<td valign="middle" align="center">82.8</td>
<td valign="middle" align="center">91.0</td>
<td valign="middle" align="center">68.6</td>
</tr>
<tr>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center">&#x2713;</td>
<td valign="middle" align="center"><bold>86.9</bold></td>
<td valign="middle" align="center"><bold>84.5</bold></td>
<td valign="middle" align="center"><bold>92.3</bold></td>
<td valign="middle" align="center"><bold>71.8</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>&#x2713;represents the use of this module in the model. The bold value means the highest value and the symbol (&#x2191;) means the higher value is better.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>Ablation experiments indicate that the YOLOv8n baseline achieved Precision of 0.853, Recall of 0.813, mAP50 of 0.897 and mAP50&#x2013;95 of 0.683. The isolated addition of BiFPN produced modest improvements across all metrics, with Recall increasing by 1.8 percentage points. Notably, the isolated addition of ModifiedNeck leads to a 0.3 percentage point decrease in mAP50-95, which is mainly caused by two key factors. First, the ModifiedNeck is designed to standardize feature channel dimensions via 1&#xd7;1 convolutions and optimize cross-scale alignment through bidirectional propagation, but without the high-resolution spatial details provided by the P2 layer, the bottom-up feedback pathway of ModifiedNeck can only transmit low-quality downsampled features, where fine-grained cues such as edge and texture information of small olive fruits are severely blurred. Second, the bidirectional feature flow of ModifiedNeck introduces additional computational steps without complementary spatial information, which not only increases model complexity but also amplifies noise interference in mid-level semantic features, thereby reducing the detection confidence of low-to-medium confidence targets. In contrast, incorporating the P2 layer alone resulted in significant improvements, with mAP50&#x2013;95 rising by 0.7 percentage points. These observations suggest that augmenting a single module offers limited benefit and reveals inherent limitations in olive-fruit recognition.</p>
<p>For two-module combinations, the ModifiedNeck plus P2 configuration delivered increases of 1.0, 2.4 and 1.4 percentage points in mAP50, mAP50&#x2013;95 and Recall respectively, thereby validating the effectiveness of bidirectional feature propagation. However, a significant negative synergy effect is observed in the combination of BiFPN and ModifiedNeck. Without the P2 layer, although the metrics of this combination are slightly improved compared to using ModifiedNeck or the P2 layer alone, they are all inferior to those of BiFPN used independently. The core reason for this phenomenon lies in the inherent differences in core propagation mechanisms and functional positioning between the two modules, despite both focusing on cross-scale feature fusion: BiFPN centers on dynamic weighted fusion, enhancing the contribution of high-level semantic features through learnable weights and achieving lightweight design via depthwise separable convolutions; ModifiedNeck focuses on channel unification and bidirectional path connectivity, standardizing feature dimensions through 1&#xd7;1 convolutions and optimizing cross-scale alignment of low-level spatial features via bidirectional flows. In the absence of high-resolution spatial cues provided by the P2 layer, the two modules suffer from functional mismatch due to the lack of feature complementarity&#x2014;the detailed information fed back by ModifiedNeck from bottom to top is already insufficient, and BiFPN&#x2019;s weighting mechanism further suppresses mid-level spatial features, leading to a disconnect between fine-grained details and semantic context fusion. Coupled with redundant computations and noise interference caused by repeated bidirectional processing, the detection accuracy of low-to-medium confidence targets is ultimately reduced, resulting in all metrics of the combined model being lower than those of the single BiFPN module.</p>
<p>The three-module integrated system (BiFPN + ModifiedNeck + P2) achieved the best overall performance: Precision is 0.869, Recall is 0.845, mAP50 is 0.923, and mAP50&#x2013;95 is 0.718. Compared with the ModifiedNeck-only variant, the three-module model improved mAP50&#x2013;95 by 3.8 percentage points and Recall by 2.1 percentage points. These results collectively confirm that single-module enhancements are limited, that module combinations provide substantial synergistic gains when paired with the P2 layer to mitigate functional conflicts, and that three-module integration yields optimal performance. This offers a clear direction for optimising small-object detection in agricultural phenotyping.</p>
</sec>
<sec id="s3_3_4">
<label>3.3.4</label>
<title>Visualization analysis</title>
<p>Distributional inconsistencies arising from domain shifts caused by illumination variation and atypical target appearance can degrade detection performance. Representative detection examples under diverse environmental conditions (<xref ref-type="fig" rid="f9"><bold>Figure&#xa0;9</bold></xref>) demonstrate the YOLO-TinyFuse model&#x2019;s robustness across multiple scenarios. Even in challenging circumstances, such as low illumination and blurring caused by occlusion, the model maintains relatively high confidence scores for true targets, demonstrating its strong adaptability and resilience to common field perturbations.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Target detection results in different environments. <bold>(A)</bold> Low-light Environment. <bold>(B)</bold> High-brightness Environment. <bold>(C)</bold> Fruit Obstruction. <bold>(D)</bold> Fruit Blur.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1773377-g009.tif">
<alt-text content-type="machine-generated">Four photographs, labeled A, B, C, and D, display target detection results on olive tree branches in different environments. Panel A shows detection in a low-light environment, Panel B in a high-brightness environment, Panel C under fruit obstruction, and Panel D with fruit blur. Olives are identified by blue bounding boxes with numerical confidence scores, demonstrating model performance across varying lighting, occlusion, and image clarity conditions.</alt-text>
</graphic></fig>
<p>The comparison between manual annotations and model predictions (<xref ref-type="fig" rid="f10"><bold>Figure&#xa0;10</bold></xref>) provides quantitative validation of the detection performance. In the figure, val batch0 labels denotes the ground-truth annotations produced by human annotators. These encompass olive fruits of varying sizes and morphologies and serve as the evaluation benchmark. val batch0 pred denotes the model&#x2019;s predicted detections and reflects its capacity for phenotypic feature representation and object recognition. Direct comparison of these two sets of annotations allows prediction accuracy to be assessed intuitively and further substantiates the model&#x2019;s learning effectiveness and generalisation capability under diverse field conditions.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Comparison chart of manual annotation and model prediction results. <bold>(A)</bold> shows human-annotated ground truth. <bold>(B)</bold> presents model predictions. The side-by-side view highlights alignment on multi-scale olive fruits and differences in misses and false alarms, enabling intuitive accuracy assessment.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1773377-g010.tif">
<alt-text content-type="machine-generated">Side-by-side comparison of manual annotation and model prediction results for olive detection. Panel A shows human-annotated ground truth with blue bounding boxes labeled "olive". Panel B presents model predictions, with blue bounding boxes that include numerical confidence scores. The grid layout highlights alignment on multi-scale olive fruits, as well as differences in misses and false alarms for intuitive accuracy assessment.</alt-text>
</graphic></fig>
<p>To complement the visualisation analysis, <xref ref-type="fig" rid="f11"><bold>Figure&#xa0;11</bold></xref> shows the differences in receptive field distribution between YOLO-TinyFuse and YOLOv8n at the P3 and P2 feature levels. Panels (A) and (B) show the feature activation maps for the backbone of YOLOv8n at the P3 level (80 &#xd7; 80) and YOLO-TinyFuse at the P2 level (160 &#xd7; 160), respectively. These visualisations show that activations at the P2 level in YOLO-TinyFuse are more spatially concentrated and have a stronger response, proving that despite the model being compressed to 2.96 million parameters, the modified backbone can still capture the fine-grained phenotypic features of small olive fruits, thereby improving the accuracy of small object detection.</p>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>Contrast Map of the Receptive Field. <bold>(A)</bold> YOLOv8n at P3 feature layer; <bold>(B)</bold> YOLO-TinyFuse at P2 feature layer. Comparing YOLOv8n at P3 with YOLO-TinyFuse at P2 reveals more focused and stronger activations at P2. This indicates the high-resolution branch better captures small-object details.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1773377-g011.tif">
<alt-text content-type="machine-generated">Side-by-side heatmaps compare receptive fields of YOLOv8n Backbone at 80 by 80 resolution (left) and YOLO-TinyFuse Backbone at 160 by 160 resolution (right), with normalized feature activation color scale from purple to yellow.</alt-text>
</graphic></fig>
<p>The comparative heatmap analysis (<xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12</bold></xref>) further illustrates the efficiency of the lightweight architecture. The visualisations show that YOLO-TinyFuse produces more concentrated, higher-intensity feature responses within the olive-fruit regions. The heatmap is predominantly shifted towards higher activation values, which correspond to increased object-confidence scores. These observations suggest that the model&#x2019;s lightweight design effectively focuses representational capacity on salient regions while suppressing background interference. The model achieved an inference speed of 18.57 frames per second when deployed on a Raspberry Pi 4B (4GB RAM version) as the target edge device. The experimental environment was configured with Raspbian operating system, PyTorch Mobile framework, and OpenCV-based hardware acceleration. This average value was calculated from 1000 consecutive frames, with a standard deviation of &#xb1;0.3 frames per second. This corroborates the accuracy of the model&#x2019;s target localization and the high confidence of its output results, thereby enhancing the detection accuracy and reliability in practical hardware deployment.</p>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Heatmap Comparison Chart. <bold>(A)</bold> YOLOv8n. <bold>(B)</bold> YOLO-TinyFuse. Heatmaps show YOLO-TinyFuse concentrates high-intensity responses on olive regions while suppressing background. This demonstrates that bidirectional weighted fusion and lightweight design enhance saliency and confidence.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1773377-g012.tif">
<alt-text content-type="machine-generated">Side-by-side attention heatmaps compare YOLOv8n and YOLO-TinyFuse detection on a dense foliage scene. Both images show labeled olive detection boxes with confidence scores and color-coded feature responses from purple to yellow, with a legend below indicating attention heatmap intensity from zero to one.</alt-text>
</graphic></fig>
<p>In summary, the comprehensive visualisation framework, which encompasses quantitative detection performance, direct comparison between manual annotations and model predictions, receptive-field analysis and attention/heatmap inspection, provides convergent evidence that YOLO-TinyFuse exhibits strong adaptability, high accuracy and computational efficiency for olive fruit detection. The visual analyses corroborate the model&#x2019;s superior detection accuracy and highlight its suitability for use with hardware that has limited resources, thanks to concentrated feature responses and high-confidence outputs. This supports the practical deployment of the model in agricultural production systems.</p>
</sec>
</sec>
</sec>
<sec id="s4" sec-type="discussion">
<label>4</label>
<title>Discussion</title>
<sec id="s4_1">
<label>4.1</label>
<title>Key technical contributions of YOLO-TinyFuse</title>
<p>The enhanced YOLO-TinyFuse model proposed in this study maximizes parameter simplicity while simultaneously improving recognition accuracy and reducing false negative rates. By integrating a P2 fine-grained detection layer, a Bidirectional Feature Pyramid Network (BiFPN) feature fusion mechanism, a ModifiedNeck cross-scale fusion strategy, and a decoupled detection head, the model&#x2019;s capability to detect small and partially occluded targets is significantly enhanced. The P2 layer extends the detection range to the shallow feature map, which retains richer texture details and spatial location information of small olive fruits&#x2014;information that is easily lost in deep feature downsampling. This mechanism ensures that the model captures fine-grained features of small targets at the early stage of feature extraction, laying a foundation for accurate localization. Furthermore, its lightweight architectural design reduces computational costs while improving detection efficiency, thus meeting the requirements of practical agricultural applications. Experimental results demonstrate that YOLO-TinyFuse outperforms YOLOv8n across multiple key performance metrics, validating the effectiveness of the proposed improvements. Benefiting from the lightweight design and reduced parameter count, the model achieves synchronous improvements in accuracy and computational efficiency. Specifically, the integration of the P2 layer substantially enhances small target detection performance, while the ModifiedNeck module optimizes cross-scale feature fusion within the Feature Pyramid Network (FPN) structure, strengthening feature transmission and reducing interference from non-informative representations. The ModifiedNeck introduces cross-scale attention mechanisms to adaptively weight feature maps of different scales, prioritizing the fusion of discriminative features related to olive fruits and suppressing redundant background information. 
This adaptive weighting mechanism solves the problem of uneven feature contribution in traditional FPN, where shallow and deep features are fused with equal weight regardless of their informativeness. In addition, the BiFPN module significantly improves feature utilization efficiency and amplifies the contribution of salient features through bidirectional feature propagation. Unlike unidirectional FPN, BiFPN establishes bidirectional connections between adjacent feature layers, enabling top-down semantic feature propagation and bottom-up detail feature feedback. This bidirectional interaction enriches the semantic information of shallow detail features and supplements the spatial detail information of deep semantic features, forming a more comprehensive feature representation for occluded olive targets.</p>
<p>Beyond the aforementioned modules, the adoption of a decoupled detection head constitutes another critical technical contribution. Compared with conventional shared convolution architectures, the decoupled detection head designs independent feature extraction pathways for classification and regression tasks, enabling the model to learn task-specific representations. The regression branch is optimized for accurate bounding box localization, thereby emphasizing spatial sensitivity, while the classification branch is optimized for semantic discrimination, thus focusing on feature separability. By isolating these two tasks, mutual interference is avoided, leading to improved detection accuracy and accelerated training convergence. The fundamental mechanism behind this improvement lies in the distinct feature requirements of classification and regression: classification relies on global semantic features to distinguish olive fruits from backgrounds and other objects, while regression depends on local spatial features to precisely locate the bounding box. The shared convolution in traditional heads forces a single feature map to satisfy both requirements, resulting in a trade-off between semantic discrimination and spatial localization. The decoupled design eliminates this trade-off by allocating dedicated pathways, allowing each branch to converge to its optimal feature space. This design also allows independent optimization of each branch for computational efficiency, facilitating deployment on resource-constrained edge devices&#x2014;a core requirement for on-site olive orchard detection.</p>
<p>Ablation experiments further verify the effectiveness of the multi-module collaborative optimization strategy, including the synergistic effect between the decoupled detection head and other modules. The combination of ModifiedNeck and BiFPN mitigates the &#x201c;semantic-detail disconnection&#x201d; commonly existing in traditional FPN architectures. This disconnection arises because traditional FPN only conducts one-way feature transmission, leading to the loss of detail information in deep semantic layers and insufficient semantic information in shallow detail layers. The ModifiedNeck&#x2019;s adaptive weighting and BiFPN&#x2019;s bidirectional propagation work synergistically to bridge this gap: BiFPN provides bidirectional feature flow, and ModifiedNeck optimizes the fusion weight of each feature component, ensuring that semantic and detail information are fully integrated across scales. The synergistic effect of these two modules, coupled with the performance enhancement of the decoupled detection head, increases the mean Average Precision at 50% intersection over union (mAP50) for small target detection by 1.3 percentage points. Tailored to the small size of olive fruits, the P2 layer alleviates the underfitting problem caused by limited small target samples and significantly improves the Recall of small targets. This is because the P2 layer extracts features from the shallowest feature map with the highest resolution, capturing more pixel-level details of small olive fruits. For small target samples with limited quantity, these detailed features provide more discriminative information, reducing the model&#x2019;s reliance on large sample sizes and thus alleviating underfitting. In addition to performance improvements, the model achieves a substantial reduction in size and reduces reliance on high-performance computing hardware, meeting the real-time detection requirements of olive orchards. 
The lightweight design also lowers training costs and inference time, reduces the risk of overfitting, and enhances detection accuracy and stability. Collectively, these improvements&#x2014;including the decoupled detection head, P2 layer, BiFPN, and ModifiedNeck&#x2014;demonstrate that the model strikes a balance between high performance and computational efficiency.</p>
<p>The YOLO-TinyFuse model demonstrates strong adaptability in a variety of scenarios, enabling efficient detection of olives while generalising effectively to a wide range of agricultural and biological tasks involving small objects, including wheat, cherries, bees, mangoes and apples. The datasets for these small-target and high-occlusion crops were obtained from Kaggle, with specific dataset sources referenced as follows: wheat dataset (<xref ref-type="bibr" rid="B9">enddl22, 2024</xref>), cherry dataset (<xref ref-type="bibr" rid="B9">enddl22, 2024</xref>), bee dataset (<xref ref-type="bibr" rid="B1">Andrew, 2024</xref>), mango dataset (<xref ref-type="bibr" rid="B9">enddl22, 2024</xref>), and apple dataset (<xref ref-type="bibr" rid="B33">Projects, 2024</xref>). For each target, the model was trained from scratch on their respective datasets with consistent experimental settings: 100 training epochs, a batch size of 8, and identical hyperparameters including learning rate, weight decay, and momentum. Specifically, wheat, cherries, bees, mangoes, and apples were all subjected to independent training from scratch with the aforementioned uniform configuration. As shown in <xref ref-type="fig" rid="f13"><bold>Figure&#xa0;13</bold></xref>, the model achieves highly precise recognition of dense small targets, with mAP50 values of 91.9% for wheat, 86.7% for cherries, and 95.8% for bees. This meets the demands of real-time detection in field environments characterised by high object density. For crops with substantial occlusion, such as mangoes and apples, the model achieves stable detection performance, with mAP50 values of 87.76% and 93.74% respectively.</p>
<fig id="f13" position="float">
<label>Figure&#xa0;13</label>
<caption>
<p>Changes in mAP50 for different targets during training. Training curves on apple, mango, cherry, wheat, and bee show the single architecture converges quickly. It achieves high mAP50 across diverse small and occluded targets, demonstrating strong reusability.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1773377-g013.tif">
<alt-text content-type="machine-generated">Line chart titled Multiple mAP Curves displays mAP versus training epoch for five categories: apple, mango, cherry, wheat, and bee. Bee category achieves the highest mAP at 95.8 percent, followed by apple at 93.7 percent, wheat at 91.9 percent, mango at 87.8 percent, and cherry at 86.7 percent by epoch one hundred. A legend matches line colors to each label.</alt-text>
</graphic></fig>
<p>To provide a detailed comparative analysis demonstrating the model&#x2019;s superiority over the baseline, cherry detection was selected as a representative case study. Cherries were chosen for this comparison because they exhibit typical characteristics of small agricultural targets&#x2014;including high density, variable occlusion levels, and morphological similarity to background elements&#x2014;while also presenting moderate complexity that allows clear visualization of performance differences between YOLO-TinyFuse and YOLOv8n. As illustrated in <xref ref-type="fig" rid="f14"><bold>Figure&#xa0;14</bold></xref>, YOLO-TinyFuse demonstrates superior detection performance compared to YOLOv8n, with improved precision in identifying small cherry targets and reduced false positives in complex orchard backgrounds. Visual examples of detection results across all evaluated crops, including wheat, cherries, bees, mangoes, and apples, are presented in <xref ref-type="fig" rid="f15"><bold>Figure&#xa0;15</bold></xref>, showcasing the model&#x2019;s consistent performance across diverse agricultural scenarios.</p>
<fig id="f14" position="float">
<label>Figure&#xa0;14</label>
<caption>
<p>Cherry mAP50 comparison: YOLOv8n vs YOLO-TinyFuse. YOLO-TinyFuse achieves higher mAP50 than YOLOv8n, indicating improved performance for cherry detection.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1773377-g014.tif">
<alt-text content-type="machine-generated">Line graph comparing mAP50 performance of YOLOv8n and YOLO-TinyFuse models on a cherry dataset over 100 training epochs. YOLO-TinyFuse reaches 86.70 percent, while YOLOv8n attains 81.11 percent. YOLO-TinyFuse maintains a higher mAP50 than YOLOv8n for most of the training process.</alt-text>
</graphic></fig>
<fig id="f15" position="float">
<label>Figure&#xa0;15</label>
<caption>
<p>Visual detection results of YOLO-TinyFuse on multiple agricultural and biological targets. <bold>(A)</bold> Wheat. <bold>(B)</bold> Cherry. <bold>(C)</bold> Bee. <bold>(D)</bold> Mango. <bold>(E)</bold> Apple.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1773377-g015.tif">
<alt-text content-type="machine-generated">Five labeled photographs display object detection results for different agricultural and biological targets: wheat heads in grass (A), cherries on a branch (B), bees near a hive (C), mangoes hanging from branches (D), and apples on a tree (E). Each detected object is outlined in blue with a corresponding confidence score and class label.</alt-text>
</graphic></fig>
<p>With only 2.96 million parameters, the model can be rapidly adapted to different crop detection tasks without any structural modifications, overcoming the traditional limitation of single-crop-specific models. These results provide an efficient, reusable technical foundation for developing unified, multi-crop detection systems for specialised agricultural production.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Structural comparison with recent agricultural small object detectors</title>
<p>To further highlight the advantages of YOLO-TinyFuse in structural design and practical applicability, this section conducts a systematic comparison with three representative recent agricultural small-object detectors, namely YOLOv8-p2, Slim-BiFPN, and Lightweight Transformer. The comparison focuses on core structural designs, technical focuses, performance trade-offs, and scenario adaptability, without being limited to numerical differences in metrics. The key comparative analysis is as follows:</p>
<p>1. Structural Design and Technical Focus Comparison</p>
<p>YOLOv8-p2 is based on the YOLOv8n backbone, adopts MobileNetV3 for lightweight optimization, integrates BiFPN for feature fusion, and adds a P2 high-resolution detection head to enhance small-target detail capture (<xref ref-type="bibr" rid="B45">Xu et&#xa0;al., 2024</xref>). Its core technical focus is &#x201c;shallow feature retention + multi-scale fusion&#x201d;, but it lacks targeted optimization for cross-scale feature weight adaptation and task decoupling. The EMA attention mechanism embedded in the C2f module mainly enhances local feature extraction, without solving the problem of uneven contribution of multi-scale features.</p>
<p>Slim-BiFPN takes lightweight deployment as the core goal, uses DWConv and GhostConv to reduce computational complexity, simplifies the BiFPN structure to form Slim-BiFPN, and embeds CBAM attention to focus on pest features (<xref ref-type="bibr" rid="B46">Xu et&#xa0;al., 2025</xref>). Its design focuses on &#x201c;computational efficiency + target attention&#x201d;, but the simplified feature fusion structure weakens the interaction between deep and shallow features, and it does not involve high-resolution detection layers, leading to limited performance in detecting ultra-small targets such as tiny olive fruits.</p>
<p>Lightweight Transformer combines Transformer&#x2019;s global attention with CNN&#x2019;s local feature extraction advantages, and adopts GhostNet&#x2019;s lightweight idea to optimize the backbone (<xref ref-type="bibr" rid="B44">Xia et&#xa0;al., 2025</xref>). Its technical focus is &#x201c;global-local feature integration&#x201d;, but Transformer&#x2019;s inherent computational complexity still restricts its real-time performance, and the lack of a dedicated cross-scale fusion module results in insufficient adaptation to multi-scale small targets in complex agricultural backgrounds.</p>
<p>YOLO-TinyFuse (Ours) integrates four key modules&#x2014;the P2 layer, BiFPN, ModifiedNeck, and the decoupled detection head&#x2014;to form a &#x201c;detail retention-fusion optimization-task specialization&#x201d; collaborative framework. Compared with the above models, it not only retains shallow high-resolution features through the P2 layer, similar to YOLOv8-p2, but also solves the problem of uneven multi-scale feature contribution through ModifiedNeck&#x2019;s adaptive attention weighting, making up for the deficiencies of YOLOv8-p2 and Slim-BiFPN. Meanwhile, the decoupled detection head realizes task-specific feature learning&#x2014;a design not involved in the other three models&#x2014;and the lightweight structure ensures deployment feasibility comparable to Slim-BiFPN.</p>
<p>2. Performance and Scenario Adaptability Trade-offs</p>
<p>In terms of accuracy vs. lightweight balance, YOLOv8-p2 achieves an mAP50 of 91.9% with 7.39M parameters, showing high accuracy but relatively large model size, which is not conducive to edge device deployment. Slim-BiFPN reduces the parameter count to 1.0M, but its mAP performance on ultra-small targets is limited due to the lack of a P2 layer. The Lightweight Transformer has a mAP of only 90.7% on tea bud datasets, indicating poor adaptation to dense small targets. YOLO-TinyFuse achieves a balance with 2.96M parameters and high mAP50 (87.76% for mangoes, 93.74% for apples), being 60% lighter than YOLOv8-p2 while maintaining comparable accuracy, and outperforming Slim-BiFPN and Lightweight Transformer in ultra-small and dense target scenarios.</p>
<p>Regarding scenario specialization vs. generalization, YOLOv8-p2 is optimized for UAV agricultural imagery, and Slim-BiFPN is specialized for tiny pest detection, both lacking cross-scenario adaptability. The Lightweight Transformer is designed for tea bud detection, with poor generalization to occluded targets. YOLO-TinyFuse targets universal small-object detection challenges (detail loss, occlusion, and complex backgrounds) rather than crop-specific features, achieving stable performance across olive, wheat, cherry, mango, and apple detection (mAP50 86.7%&#x2013;95.8%), demonstrating superior generalization.</p>
<p>3. Core Advantage of YOLO-TinyFuse</p>
<p>The multi-module collaborative design of YOLO-TinyFuse addresses the limitations of single-module optimization in existing models: 1) The combination of P2 layer and BiFPN makes up for the &#x201c;semantic-detail disconnection&#x201d; in Slim-BiFPN and Lightweight Transformer; 2) ModifiedNeck&#x2019;s adaptive weighting solves the problem of equal-weight fusion in YOLOv8-p2 and Slim-BiFPN; 3) The decoupled detection head eliminates the task interference in shared convolution architectures, which is not considered in the other three models. This structural synergy enables YOLO-TinyFuse to balance accuracy, lightweight, and generalization, making it more suitable for complex agricultural field environments requiring real-time edge deployment.</p>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Limitations</title>
<p>Although the YOLO-TinyFuse model exhibits significant advantages in olive fruit detection tasks, it still has several limitations. Firstly, the definition of &#x201c;real-time&#x201d; in this study is that the model can be deployed on mainstream edge computing platforms or UAV-based systems with an inference speed meeting the practical application requirement of 15 frames per second (FPS). However, in olive orchard scenarios, if the model is deployed on low-end resource-constrained mobile or embedded devices such as low-power microcontrollers and entry-level single-board computers, its inference speed cannot meet the requirements for real-time deployment. When running on such devices, the model is prone to inference latency; when applied in environments with dense fruits or complex backgrounds, the frame rate will further decrease, leading to delayed detection results. Such inference latency and detection delay will not only affect the continuity and accuracy of target tracking but also reduce the detection coverage efficiency of large-scale olive orchard inspections, making it difficult to meet the practical demand for efficient detection in large-scale operation scenarios.</p>
<p>Secondly, the performance of the model during training is limited by the quality and diversity of the available dataset. While the current dataset includes challenging conditions such as low illumination, strong highlights, fruit occlusion and motion blur, it lacks scenarios involving fruit-surface lesions caused by diseases or pests. Consequently, the model&#x2019;s robustness in adverse weather conditions and in the presence of abnormal fruit colouration or other atypical visual appearances remains insufficiently validated, and its applicability to a broader range of real-world scenarios cannot yet be fully ensured.</p>
<p>In addition to the aforementioned limitations, the model also has certain constraints and boundary conditions, which not only define its valid application scope but also reveal the intrinsic defects of its structural design. The model tends to fail under specific extreme scenarios, and heatmap visualization of the P2 layer and deep semantic layers further explains the root causes: when olive fruits are in the early fruiting stage and coexist with small impurities, the model easily misclassifies impurities as small fruits or misses tiny fruits entirely, as the visualization results show that the P2 layer cannot effectively distinguish fine-grained texture differences between targets with similar pixel sizes, leading to confused attention focus. In mixed-crop orchards with interplanted small-fruited plants, the feature fusion mechanism optimized for olives struggles to separate morphologically similar heterologous targets, and visual analysis of ModifiedNeck feature weighting indicates it overemphasizes shape features shared by similar targets, resulting in a sharp increase in false positives. Under extreme weather conditions such as heavy fog, dense haze, or intense backlighting, BiFPN fails to filter noise features, and the blurred visualization results of cross-scale feature transmission confirm invalid noise interference, causing significant degradation in detection accuracy. These failure cases, supported by visual evidence, delineate the model&#x2019;s clear boundary conditions&#x2014;it performs stably only for fruits without external impurities attached, in single-crop orchards, and under moderate environmental conditions. 
Meanwhile, the &#x201c;black box&#x201d; nature of feature transmission and weighting in the model leads to insufficient interpretability: when the fruit occlusion rate is high, visualization cannot clarify whether texture, shape, or color dominates the detection decision, making it difficult to diagnose the root causes of failures and hindering targeted optimization.</p>
<p>Furthermore, the model has limitations when faced with severe fruit occlusion. When olive fruits are heavily obscured or almost completely concealed by branches and foliage, the Recall rate drops significantly, making accurate identification and localisation difficult. This is because the current feature extraction and fusion mechanisms are unable to capture enough discriminative and semantically valid features in cases of extreme occlusion. This results in reduced reliability of detection for highly concealed targets.</p>
</sec>
</sec>
<sec id="s5" sec-type="conclusions">
<label>5</label>
<title>Conclusion</title>
<p>In response to the challenges posed by complex orchard backgrounds, small fruit size, and frequent occlusion in the detection of olives, this study presents YOLO-TinyFuse, an enhanced lightweight detection model. The model achieves simultaneous advances in detection performance and lightweight design by integrating three key enhancements: the P2 high-resolution detection layer; the ModifiedNeck cross-scale fusion module; and the BiFPN bidirectional feature-aggregation mechanism. YOLO-TinyFuse improves recognition accuracy under leaf occlusion and heterogeneous illumination, and enhances hardware adaptability through its reduced parameter count. This delivers a detection framework that balances high Precision with computational efficiency.</p>
<p>Experimental results show that YOLO-TinyFuse achieves an mAP50 of 0.923 and an F1 score of 0.869 on the olive fruit detection test set with only 2.96 million parameters. Not only does the model markedly outperform mainstream detectors such as DetrR50-Dc5, DetrR50, YOLOv5n, YOLOv9t and YOLOv11n, but it also substantially reduces computational redundancy due to its compact architecture. This lightweight design enables direct deployment on edge-computing platforms, such as the Raspberry Pi 4B, and on unmanned aerial vehicles. It overcomes the conventional constraint that achieving high accuracy requires a high computational cost. In terms of practical applicability, YOLO-TinyFuse closely aligns with the full-chain operational requirements of the olive industry. Furthermore, its technical framework is highly reusable, facilitating rapid adaptation to small-object tasks involving wheat, cherries, and bees, as well as crops with high occlusion rates, such as mangoes and apples. This generalisation capability supports the transition from single-crop-specific detection to unified, multi-crop intelligent solutions, thereby reducing costs and improving efficiency in agricultural production.</p>
<p>We recognise the current limitations of the model in terms of inference speed, training data coverage, and detection of highly occluded targets. We regard these aspects as key areas for future improvement. Subsequent work will explore pruning, quantisation and other lightweight optimisation techniques to reduce model parameters and computational cost further, while integrating Transformer-based attention mechanisms to enhance feature capture efficiency. In parallel, the dataset will be expanded to include more challenging scenarios, such as fruit affected by pests or diseases, and severe occlusion. Techniques such as targeted data augmentation, fine-grained annotation and collaborative training will be employed to improve the model&#x2019;s adaptability to adverse weather conditions and atypical fruit appearances. Furthermore, we will refine the feature-fusion and feature-extraction mechanisms to strengthen the model&#x2019;s ability to identify and localise heavily occluded fruit. This will enhance its overall practical utility and generalisation capacity.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p></sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>XY: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Project administration, Resources, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. YL: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Project administration, Resources, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. QX: Conceptualization, Data curation, Investigation, Methodology, Validation, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. ZL: Formal analysis, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. LM: Visualization, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. YS: Data curation, Investigation, Methodology, Writing &#x2013; original draft. KA: Data curation, Investigation, Methodology, Writing &#x2013; original draft. ZT: Conceptualization, Resources, Supervision, Writing &#x2013; review &amp; editing. YCh: Conceptualization, Resources, Supervision, Writing &#x2013; review &amp; editing. YCa: Conceptualization, Funding acquisition, Project administration, Resources, Supervision, Writing &#x2013; review &amp; editing.</p></sec>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s10" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s11" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="web">
<person-group person-group-type="author">
<name><surname>Andrew</surname> <given-names>L. C. A.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Bee image object detection</article-title>. Available online at: <uri xlink:href="https://www.kaggle.com/datasets/andrewlca/bee-image-object-detection">https://www.kaggle.com/datasets/andrewlca/bee-image-object-detection</uri> (Accessed <date-in-citation content-type="access-date">August 13, 2025</date-in-citation>).
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ar&#x131;soy</surname> <given-names>M.</given-names></name>
<name><surname>Uysal</surname> <given-names>I.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Bifpn-enhanced swindat-based cherry variety classification with yolov8n</article-title>. <source>Sci. Rep.</source> <volume>15</volume>, <fpage>5427</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-025-89624-7</pub-id>, PMID: <pub-id pub-id-type="pmid">39948150</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Beltr&#xe1;n</surname> <given-names>A.</given-names></name>
<name><surname>Jim&#xe9;nez</surname> <given-names>A.</given-names></name>
<name><surname>Prieto</surname> <given-names>F.</given-names></name>
<name><surname>Garrido</surname> <given-names>D.</given-names></name>
<name><surname>Mu&#xf1;oz</surname> <given-names>A.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Mask r-cnn based automatic quality control system for table olives</article-title>. <source>Food Bioprocess Technol.</source> <volume>16</volume>, <fpage>2281</fpage>&#x2013;<lpage>2295</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11042-023-14668-8</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Bonte</surname> <given-names>T.</given-names></name>
<name><surname>Philbert</surname> <given-names>M.</given-names></name>
<name><surname>Coleno</surname> <given-names>E.</given-names></name>
<name><surname>Bertrand</surname> <given-names>E.</given-names></name>
<name><surname>Imbert</surname> <given-names>A.</given-names></name>
<name><surname>Walter</surname> <given-names>T.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Learning with minimal effort: Leveraging in silico labeling for cell and nucleus segmentation</article-title>. <source>Proc. Comput. Vision &#x2013; ECCV 2022 Workshops</source> <volume>13804</volume>, <fpage>454</fpage>&#x2013;<lpage>469</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-031-25069-9_28</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Cano Marchal</surname> <given-names>P.</given-names></name>
<name><surname>Satorres Mart&#xed;nez</surname> <given-names>S.</given-names></name>
<name><surname>G&#xf3;mez Ortega</surname> <given-names>J.</given-names></name>
<name><surname>G&#xe1;mez Garc&#xed;a</surname> <given-names>J.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Automatic system for the detection of defects on olive fruits in an oil mill</article-title>. <source>Appl. Sci.</source> <volume>11</volume>, <fpage>8167</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/app11178167</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Carion</surname> <given-names>N.</given-names></name>
<name><surname>Massa</surname> <given-names>F.</given-names></name>
<name><surname>Synnaeve</surname> <given-names>G.</given-names></name>
<name><surname>Usunier</surname> <given-names>N.</given-names></name>
<name><surname>Kirillov</surname> <given-names>A.</given-names></name>
<name><surname>Zagoruyko</surname> <given-names>S.</given-names></name>
</person-group> (<year>2020</year>). <source>End-to-end object detection with transformers</source>. (<publisher-loc>Ithaca, NY, USA</publisher-loc>: 
<publisher-name>arXiv.org</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2005.12872</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Cinardi</surname> <given-names>G.</given-names></name>
<name><surname>D&#x2019;Urso</surname> <given-names>P. R.</given-names></name>
<name><surname>Arcidiacono</surname> <given-names>C.</given-names></name>
<name><surname>Ingrao</surname> <given-names>C.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Accounting for circular economy principles in life cycle assessments of extra-virgin olive oil supply chains &#x2013; findings from a systematic literature review</article-title>. <source>Sci. Total Environ.</source> <volume>945</volume>, <elocation-id>173977</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.scitotenv.2024.173977</pub-id>, PMID: <pub-id pub-id-type="pmid">38879018</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Difonzo</surname> <given-names>G.</given-names></name>
<name><surname>Troilo</surname> <given-names>M.</given-names></name>
<name><surname>Squeo</surname> <given-names>G.</given-names></name>
<name><surname>Pasqualone</surname> <given-names>A.</given-names></name>
<name><surname>Caponio</surname> <given-names>F.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Functional compounds from olive pomace to obtain high-added value foods - a review</article-title>. <source>J. Sci. Food Agric.</source> <volume>101</volume>, <fpage>15</fpage>&#x2013;<lpage>26</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/jsfa.10478</pub-id>, PMID: <pub-id pub-id-type="pmid">32388855</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="web">
<person-group person-group-type="author"><collab>enddl22</collab>
</person-group> (<year>2024</year>). 
<article-title>deepnir 11 fruits annotations</article-title>. Available online at: <uri xlink:href="https://www.kaggle.com/datasets/enddl22/deepnir-11fruits">https://www.kaggle.com/datasets/enddl22/deepnir-11fruits</uri>.
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Felzenszwalb</surname> <given-names>P. F.</given-names></name>
<name><surname>Girshick</surname> <given-names>R. B.</given-names></name>
<name><surname>McAllester</surname> <given-names>D.</given-names></name>
<name><surname>Ramanan</surname> <given-names>D.</given-names></name>
</person-group> (<year>2009</year>). 
<article-title>Object detection with discriminatively trained part-based models</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>32</volume>, <fpage>1627</fpage>&#x2013;<lpage>1645</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2009.167</pub-id>, PMID: <pub-id pub-id-type="pmid">20634557</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Feng</surname> <given-names>C.</given-names></name>
<name><surname>Zhong</surname> <given-names>Y.</given-names></name>
<name><surname>Gao</surname> <given-names>Y.</given-names></name>
<name><surname>Scott</surname> <given-names>M.</given-names></name>
<name><surname>Huang</surname> <given-names>W.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>Tood: Task-aligned one-stage object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name>. (<publisher-loc>Washington, D.C., USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>).
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Fu</surname> <given-names>X.</given-names></name>
<name><surname>Zhao</surname> <given-names>S.</given-names></name>
<name><surname>Wang</surname> <given-names>C.</given-names></name>
<name><surname>Tang</surname> <given-names>X.</given-names></name>
<name><surname>Tao</surname> <given-names>D.</given-names></name>
<name><surname>Li</surname> <given-names>G.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>Green fruit detection with a small dataset under a similar color background based on the improved yolov5-at</article-title>. <source>Foods</source> <volume>13</volume>, <elocation-id>1060</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/foods13071060</pub-id>, PMID: <pub-id pub-id-type="pmid">38611366</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Garc&#xed;a-Serrano</surname> <given-names>P.</given-names></name>
<name><surname>Romero</surname> <given-names>C.</given-names></name>
<name><surname>Brenes</surname> <given-names>M.</given-names></name>
<name><surname>Garcia-Garcia</surname> <given-names>P.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>Enrichment in phenolic compounds of black ripe olives through nano-filtration and vacuum evaporation techniques</article-title>. <source>Innovative Food Sci. Emerging Technol.</source> <volume>51</volume>, <fpage>73</fpage>&#x2013;<lpage>79</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ifset.2018.03.010</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>He</surname> <given-names>J.</given-names></name>
<name><surname>Liu</surname> <given-names>Y.</given-names></name>
<name><surname>Wang</surname> <given-names>K.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Improved yolov5-based method for detecting dense and small fruits from mango canopy images</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.1016589</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>He</surname> <given-names>K.</given-names></name>
<name><surname>Zhang</surname> <given-names>X.</given-names></name>
<name><surname>Ren</surname> <given-names>S.</given-names></name>
<name><surname>Sun</surname> <given-names>J.</given-names></name>
</person-group> (<year>2016</year>). &#x201c;
<article-title>Deep residual learning for image recognition</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. (<publisher-loc>Washington, D.C., USA</publisher-loc>: 
<publisher-name>IEEE Computer Society</publisher-name>) <fpage>770</fpage>&#x2013;<lpage>778</lpage>.
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hou</surname> <given-names>C.</given-names></name>
<name><surname>Zhang</surname> <given-names>X.</given-names></name>
<name><surname>Tang</surname> <given-names>Y.</given-names></name>
<name><surname>Zhuang</surname> <given-names>J.</given-names></name>
<name><surname>Tan</surname> <given-names>Z.</given-names></name>
<name><surname>Huang</surname> <given-names>H.</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). 
<article-title>Detection and localization of citrus fruit based on improved you only look once v5s and binocular vision in the orchard</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.972445</pub-id>, PMID: <pub-id pub-id-type="pmid">35968138</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Howard</surname> <given-names>A. G.</given-names></name>
<name><surname>Zhu</surname> <given-names>M.</given-names></name>
<name><surname>Chen</surname> <given-names>B.</given-names></name>
<name><surname>Kalenichenko</surname> <given-names>D.</given-names></name>
<name><surname>Wang</surname> <given-names>W.</given-names></name>
<name><surname>Weyand</surname> <given-names>T.</given-names></name>
<etal/>
</person-group>. (<year>2017</year>). <source>Mobilenets: Efficient convolutional neural networks for mobile vision applications</source>. (<publisher-loc>Ithaca, NY, USA</publisher-loc>: 
<publisher-name>arXiv.org</publisher-name>). 
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ji</surname> <given-names>X.</given-names></name>
<name><surname>Li</surname> <given-names>J.</given-names></name>
<name><surname>Cai</surname> <given-names>X.</given-names></name>
<name><surname>Ye</surname> <given-names>X.</given-names></name>
<name><surname>Gouda</surname> <given-names>M.</given-names></name>
<name><surname>He</surname> <given-names>Y.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>Driving by a publicly available rgb image dataset for rice planthopper detection and counting by fusing swin transformer and yolov8-p2 architectures in field landscapes</article-title>. <source>Agriculture</source> <volume>15</volume>, <elocation-id>1366</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agriculture15131366</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Jimenez-Lopez</surname> <given-names>C.</given-names></name>
<name><surname>Carpena</surname> <given-names>M.</given-names></name>
<name><surname>Louren&#xe7;o-Lopes</surname> <given-names>C.</given-names></name>
<name><surname>Gallardo-Gomez</surname> <given-names>M.</given-names></name>
<name><surname>Lorenzo</surname> <given-names>J. M.</given-names></name>
<name><surname>Barba</surname> <given-names>F. J.</given-names></name>
<etal/>
</person-group>. (<year>2020</year>). 
<article-title>Bioactive compounds and quality of extra virgin olive oil</article-title>. <source>Foods</source> <volume>9</volume>, <fpage>1014</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/foods9081014</pub-id>, PMID: <pub-id pub-id-type="pmid">32731481</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Jing</surname> <given-names>X.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Li</surname> <given-names>D.</given-names></name>
<name><surname>Pan</surname> <given-names>W.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Melon ripeness detection by an improved object detection algorithm for resource constrained environments</article-title>. <source>Plant Methods</source> <volume>20</volume>, <fpage>127</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s13007-024-01259-3</pub-id>, PMID: <pub-id pub-id-type="pmid">39152496</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Jocher</surname> <given-names>G. R.</given-names></name>
<name><surname>Stoken</surname> <given-names>A.</given-names></name>
<name><surname>Borovec</surname> <given-names>J.</given-names></name>
<name><surname>NanoCode</surname></name>
<name><surname>ChristopherSTAN</surname></name>
<name><surname>Liu</surname> <given-names>C.</given-names></name>
<etal/>
</person-group>. (<year>2020</year>). <source>Ultralytics yolov5</source>. (<publisher-loc>Geneva, Switzerland</publisher-loc>: 
<publisher-name>Zenodo</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.5281/zenodo.7347926</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Jocher</surname> <given-names>G.</given-names></name>
<name><surname>Qiu</surname> <given-names>J.</given-names></name>
<name><surname>Chaurasia</surname> <given-names>A.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Ultralytics YOLO (Version 8.0.0) [Computer software]</article-title>. <source>GitHub Repository</source> <volume>11</volume>. Available online at: <uri xlink:href="https://github.com/ultralytics/ultralytics">https://github.com/ultralytics/ultralytics</uri>.
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>C.</given-names></name>
<name><surname>Li</surname> <given-names>L.</given-names></name>
<name><surname>Jiang</surname> <given-names>H.</given-names></name>
<name><surname>Weng</surname> <given-names>K.</given-names></name>
<name><surname>Geng</surname> <given-names>Y.</given-names></name>
<name><surname>Li</surname> <given-names>L.</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). <source>Yolov6: A single-stage object detection framework for industrial applications</source>. (<publisher-loc>Ithaca, NY, USA</publisher-loc>: 
<publisher-name>arXiv.org</publisher-name>).
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>X.</given-names></name>
<name><surname>Wang</surname> <given-names>W.</given-names></name>
<name><surname>Wu</surname> <given-names>L.</given-names></name>
<name><surname>Chen</surname> <given-names>S.</given-names></name>
<name><surname>Hu</surname> <given-names>X.</given-names></name>
<name><surname>Li</surname> <given-names>J.</given-names></name>
<etal/>
</person-group>. (<year>2020</year>). 
<article-title>Generalized focal loss: Learning qualified and distributed bounding boxes for dense object detection</article-title>. In: <source>Proceedings of the 34th Conference on Neural Information Processing Systems (NeurIPS)</source>. (<publisher-loc>Washington, D.C., USA</publisher-loc>: 
<publisher-name>Neural Information Processing Systems Foundation</publisher-name>). 
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>J.</given-names></name>
<name><surname>Wen</surname> <given-names>Y.</given-names></name>
<name><surname>He</surname> <given-names>L.</given-names></name>
</person-group> (<year>2023</year>a). &#x201c;
<article-title>Scconv: Spatial and channel reconstruction convolution for feature redundancy</article-title>,&#x201d; in <conf-name>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. (<publisher-loc>Washington, D.C., USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>) <fpage>6153</fpage>&#x2013;<lpage>6162</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52729.2023.00596</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>N.</given-names></name>
<name><surname>Ye</surname> <given-names>T.</given-names></name>
<name><surname>Zhou</surname> <given-names>Z.</given-names></name>
<name><surname>Gao</surname> <given-names>C.</given-names></name>
<name><surname>Zhang</surname> <given-names>P.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Enhanced yolov8n with bifpn-simam for precise defect detection in miniature capacitors</article-title>. <source>Appl. Sci.</source> <volume>14</volume>, <elocation-id>429</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/app14010429</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>W.</given-names></name>
<name><surname>Zhang</surname> <given-names>Z.</given-names></name>
<name><surname>Li</surname> <given-names>C.</given-names></name>
<name><surname>Zou</surname> <given-names>J.</given-names></name>
</person-group> (<year>2023</year>b). &#x201c;
<article-title>Small target detection algorithm based on two-stage feature extraction</article-title>,&#x201d; in <conf-name>Proceedings of the 6th International Conference on Software Engineering and Computer Science (CSECS)</conf-name>. (<publisher-loc>Washington, D.C., USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>) <fpage>1</fpage>&#x2013;<lpage>5</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CSECS60003.2023.10428237</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Lin</surname> <given-names>T.-Y.</given-names></name>
<name><surname>Doll&#xe1;r</surname> <given-names>P.</given-names></name>
<name><surname>Girshick</surname> <given-names>R.</given-names></name>
<name><surname>He</surname> <given-names>K.</given-names></name>
<name><surname>Hariharan</surname> <given-names>B.</given-names></name>
<name><surname>Belongie</surname> <given-names>S.</given-names></name>
</person-group> (<year>2017</year>). &#x201c;
<article-title>Feature pyramid networks for object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. (<publisher-loc>Washington, D.C., USA</publisher-loc>: 
<publisher-name>IEEE Computer Society</publisher-name>) <fpage>2117</fpage>&#x2013;<lpage>2125</lpage>.
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ma</surname> <given-names>L.</given-names></name>
<name><surname>Zhao</surname> <given-names>L.</given-names></name>
<name><surname>Wang</surname> <given-names>Z.</given-names></name>
<name><surname>Zhang</surname> <given-names>J.</given-names></name>
<name><surname>Chen</surname> <given-names>G.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Detection and counting of small target apples under complicated environments by using improved yolov7-tiny</article-title>. <source>Agronomy</source> <volume>13</volume>, <elocation-id>1419</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy13051419</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Mamalis</surname> <given-names>M.</given-names></name>
<name><surname>Kalampokis</surname> <given-names>E.</given-names></name>
<name><surname>Kalfas</surname> <given-names>I.</given-names></name>
<name><surname>Tarabanis</surname> <given-names>K.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Deep learning for detecting verticillium fungus in olive trees: Using yolo in uav imagery</article-title>. <source>Algorithms</source> <volume>16</volume>, <elocation-id>343</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/a16070343</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Mazzocchi</surname> <given-names>A.</given-names></name>
<name><surname>Leone</surname> <given-names>L.</given-names></name>
<name><surname>Agostoni</surname> <given-names>C.</given-names></name>
<name><surname>Pali-Sch&#xf6;ll</surname> <given-names>I.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>The secrets of the mediterranean diet. does [only] olive oil matter</article-title>? <source>Nutrients</source> <volume>11</volume>, <fpage>2941</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/nu11122941</pub-id>, PMID: <pub-id pub-id-type="pmid">31817038</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Osco-Mamani</surname> <given-names>E.</given-names></name>
<name><surname>Santana-Carbajal</surname> <given-names>O.</given-names></name>
<name><surname>Chaparro-Cruz</surname> <given-names>I.</given-names></name>
<name><surname>Ochoa-Donoso</surname> <given-names>D.</given-names></name>
<name><surname>Alcazar-Alay</surname> <given-names>S.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>The detection and counting of olive tree fruits using deep learning models in tacna, per&#xfa;</article-title>. <source>AI</source> <volume>6</volume>, <elocation-id>25</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/ai6010025</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="web">
<person-group person-group-type="author"><collab>Projects</collab>
</person-group> (<year>2024</year>). 
<article-title>Applebbch81</article-title>. Available online at: <uri xlink:href="https://www.kaggle.com/datasets/projectlzp201910094/applebbch81">https://www.kaggle.com/datasets/projectlzp201910094/applebbch81</uri>.
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Ramachandran</surname> <given-names>P.</given-names></name>
<name><surname>Zoph</surname> <given-names>B.</given-names></name>
<name><surname>Le</surname> <given-names>Q. V.</given-names></name>
</person-group> (<year>2017</year>). 
<article-title>Swish: a self-gated activation function</article-title>. (<publisher-loc>Ithaca, NY, USA</publisher-loc>: 
<publisher-name>arXiv.org</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1710.05941</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Redmon</surname> <given-names>J.</given-names></name>
<name><surname>Divvala</surname> <given-names>S.</given-names></name>
<name><surname>Girshick</surname> <given-names>R.</given-names></name>
<name><surname>Farhadi</surname> <given-names>A.</given-names></name>
</person-group> (<year>2016</year>). &#x201c;
<article-title>You only look once: Unified, real-time object detection</article-title>,&#x201d; in <conf-name>Proceedings of the 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. (<publisher-loc>Washington, D.C., USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>) <fpage>779</fpage>&#x2013;<lpage>788</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2016.91</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Romani</surname> <given-names>A.</given-names></name>
<name><surname>Ieri</surname> <given-names>F.</given-names></name>
<name><surname>Urciuoli</surname> <given-names>S.</given-names></name>
<name><surname>Noce</surname> <given-names>A.</given-names></name>
<name><surname>Marrone</surname> <given-names>G.</given-names></name>
<name><surname>Nediani</surname> <given-names>C.</given-names></name>
<etal/>
</person-group>. (<year>2019</year>). 
<article-title>Health effects of phenolic compounds found in extra-virgin olive oil, by-products, and leaf of Olea europaea L</article-title>. <source>Nutrients</source> <volume>11</volume>, <fpage>1776</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/nu11081776</pub-id>, PMID: <pub-id pub-id-type="pmid">31374907</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Sapkota</surname> <given-names>R.</given-names></name>
<name><surname>Meng</surname> <given-names>Z.</given-names></name>
<name><surname>Churuvija</surname> <given-names>M.</given-names></name>
<name><surname>Du</surname> <given-names>X.</given-names></name>
<name><surname>Ma</surname> <given-names>Z.</given-names></name>
<name><surname>Karkee</surname> <given-names>M.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Comprehensive performance evaluation of yolov12, yolo11, yolov10, yolov9 and yolov8 on detecting and counting fruitlet in complex orchard environments</article-title>. doi:&#xa0;<pub-id pub-id-type="doi">10.13140/RG.2.2.36524.66845</pub-id>
</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Subedi</surname> <given-names>A.</given-names></name>
</person-group> (<year>2024</year>). <source>Improving generalization performance of yolov8n for camera trap object detection</source>. (<publisher-loc>Cincinnati, OH, USA</publisher-loc>: 
<publisher-name>University of Cincinnati</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2412.14211</pub-id>
</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Tian</surname> <given-names>Z.</given-names></name>
<name><surname>Shen</surname> <given-names>C.</given-names></name>
<name><surname>Chen</surname> <given-names>H.</given-names></name>
<name><surname>He</surname> <given-names>T.</given-names></name>
</person-group> (<year>2019</year>). &#x201c;
<article-title>Fcos: Fully convolutional one-stage object detection</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision</conf-name>. (<publisher-loc>Washington, D.C., USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>) <fpage>9627</fpage>&#x2013;<lpage>9636</lpage>.
</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="web">
<person-group person-group-type="author"><collab>Ultralytics Team</collab>
</person-group> (<year>2023</year>). 
<article-title>Yolov8: Real-time object detection and instance segmentation</article-title>. Available online at: <uri xlink:href="https://docs.ultralytics.com/zh/models/Yolov8/">https://docs.ultralytics.com/zh/models/Yolov8/</uri>.
</mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>C.-Y.</given-names></name>
<name><surname>Yeh</surname> <given-names>I.-H.</given-names></name>
<name><surname>Liao</surname> <given-names>H.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Yolov9: Learning what you want to learn using programmable gradient information</article-title>. <source>arXiv</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2402.13616</pub-id>
</mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>J.</given-names></name>
<name><surname>Zhang</surname> <given-names>Z.</given-names></name>
<name><surname>Luo</surname> <given-names>L.</given-names></name>
<name><surname>Zhu</surname> <given-names>W.</given-names></name>
<name><surname>Chen</surname> <given-names>J.</given-names></name>
<name><surname>Wang</surname> <given-names>W.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Swingd: A robust grape bunch detection model based on swin transformer in complex vineyard environment</article-title>. <source>Horticulturae</source> <volume>7</volume>, <elocation-id>492</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/horticulturae7110492</pub-id>
</mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Wu</surname> <given-names>X.</given-names></name>
</person-group> (<year>2025</year>). &#x201c;
<article-title>Optimization of yolov8n traffic sign object detection based on bifpn feature pyramid and cbam attention module</article-title>,&#x201d; in <conf-name>Proceedings of the 3rd International Conference on Machine Learning and Automation</conf-name>. (<publisher-loc>Aachen, Germany</publisher-loc>: 
<publisher-name>CEUR-WS.org</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.54254/2755-2721/2025.26120</pub-id>
</mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Xia</surname> <given-names>G.</given-names></name>
<name><surname>Guo</surname> <given-names>Y.</given-names></name>
<name><surname>Wei</surname> <given-names>Q.</given-names></name>
<name><surname>Cen</surname> <given-names>Y.</given-names></name>
<name><surname>Feng</surname> <given-names>L.</given-names></name>
<name><surname>Yu</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Yolo-lmtb: A lightweight detection model for multi-scale tea buds in agriculture</article-title>. <source>Sensors</source> <volume>25</volume>, <fpage>6400</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s25206400</pub-id>, PMID: <pub-id pub-id-type="pmid">41157454</pub-id>
</mixed-citation>
</ref>
<ref id="B45">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Xu</surname> <given-names>W.</given-names></name>
<name><surname>Cui</surname> <given-names>C.</given-names></name>
<name><surname>Ji</surname> <given-names>Y.</given-names></name>
<name><surname>Li</surname> <given-names>X.</given-names></name>
<name><surname>Li</surname> <given-names>S.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Yolov8-mpeb small target detection algorithm based on uav images</article-title>. <source>Heliyon</source> <volume>10</volume>, <elocation-id>e29501</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.heliyon.2024.e29501</pub-id>, PMID: <pub-id pub-id-type="pmid">38681580</pub-id>
</mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Xu</surname> <given-names>W.</given-names></name>
<name><surname>Yang</surname> <given-names>R.</given-names></name>
<name><surname>Karthikeyan</surname> <given-names>R.</given-names></name>
<name><surname>Shi</surname> <given-names>Y.</given-names></name>
<name><surname>Su</surname> <given-names>Q.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Gbidc-pest: A novel lightweight model for real-time multiclass tiny pest detection and mobile platform deployment</article-title>. <source>J. Integr. Agric.</source> <volume>24</volume>, <fpage>2749</fpage>&#x2013;<lpage>2769</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jia.2024.12.017</pub-id>
</mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zheng</surname> <given-names>X.</given-names></name>
<name><surname>Bi</surname> <given-names>J.</given-names></name>
<name><surname>Li</surname> <given-names>K.</given-names></name>
<name><surname>Zhang</surname> <given-names>G.</given-names></name>
<name><surname>Jiang</surname> <given-names>P.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Smn-yolo: Lightweight yolov8-based model for small object detection in remote sensing images</article-title>. <source>IEEE Geosci. Remote Sens. Lett.</source> <volume>22</volume>, <fpage>1</fpage>&#x2013;<lpage>5</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/LGRS.2025.3546034</pub-id>
</mixed-citation>
</ref>
<ref id="B48">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhu</surname> <given-names>F.</given-names></name>
<name><surname>Wang</surname> <given-names>S.</given-names></name>
<name><surname>Liu</surname> <given-names>M.</given-names></name>
<name><surname>Wang</surname> <given-names>W.</given-names></name>
<name><surname>Feng</surname> <given-names>W.</given-names></name>
</person-group> (<year>2025</year>a). 
<article-title>A lightweight algorithm for detection and grading of olive ripeness based on improved yolov11n</article-title>. <source>Agronomy</source> <volume>15</volume>, <elocation-id>1030</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy15041030</pub-id>
</mixed-citation>
</ref>
<ref id="B49">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhu</surname> <given-names>Y.</given-names></name>
<name><surname>Zhao</surname> <given-names>Y.</given-names></name>
<name><surname>He</surname> <given-names>Y.</given-names></name>
<name><surname>Wu</surname> <given-names>B.</given-names></name>
<name><surname>Su</surname> <given-names>X.</given-names></name>
</person-group> (<year>2025</year>b). 
<article-title>Yolo-wildasm: An object detection algorithm for protected wildlife</article-title>. <source>Animals</source> <volume>15</volume>, <elocation-id>2699</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/ani15182699</pub-id>, PMID: <pub-id pub-id-type="pmid">41007943</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1444450">Xu Zheng</ext-link>, University of Electronic Science and Technology of China, China</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3239828">Hao Chen</ext-link>, Xi&#x2019;an University of Technology, China</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3332248">Yujie Yang</ext-link>, Chinese Academy of Agricultural Sciences, China</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3332252">Chunya Ma</ext-link>, Chinese Academy of Agricultural Sciences, China</p></fn>
</fn-group>
</back>
</article>