<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2026.1778795</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>MDFE-Net: a multiscale dilated feature enhancement network for small object detection</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Liu</surname><given-names>Tianzhe</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn003"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3351858/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Lin</surname><given-names>Shihang</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="fn003"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3339067/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Zhang</surname><given-names>Jiayi</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="fn003"><sup>&#x2020;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Li</surname><given-names>Bin</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Zhu</surname><given-names>Junyan</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2836352/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Fujian Police College</institution>, <city>Fuzhou</city>,&#xa0;<country country="CN">China</country></aff>
<aff id="aff2"><label>2</label><institution>College of Computer and Information Science, Fujian Agriculture and Forestry University</institution>, <city>Fuzhou</city>, <state>Fujian</state>,&#xa0;<country country="CN">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Bin Li, <email xlink:href="mailto:libin@fafu.edu.cn">libin@fafu.edu.cn</email>; Junyan Zhu, <email xlink:href="mailto:junyanzhu@fafu.edu.cn">junyanzhu@fafu.edu.cn</email></corresp>
<fn fn-type="equal" id="fn003">
<label>&#x2020;</label>
<p>These authors have contributed equally to this work</p></fn>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-24">
<day>24</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>17</volume>
<elocation-id>1778795</elocation-id>
<history>
<date date-type="received">
<day>31</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>05</day>
<month>02</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>30</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Liu, Lin, Zhang, Li and Zhu.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Liu, Lin, Zhang, Li and Zhu</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-24">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Due to the lack of feature information and complex background, the task of small object detection is very challenging. To solve these problems, this paper proposes two small object detection performance enhancement modules for multiple detection tasks and an efficient small object detection network called Multiscale Dilated Feature Enhancement Network (MDFE-Net). MDFE-Net includes two innovative plug-and-play modules: the multi-scale dilated feature aggregation (MDFA) module and the context feature enhancement (CFE) module. MDFA improves the efficiency of multi-scale feature fusion, which is used to capture multi-scale context information and improve the expression of underlying feature information. CFE improves the local feature perception and preserves and extracts the effective information of small image objects to the maximum extent. The network enhances the perception of small objects' feature information and restrains the problem of complex and confusing backgrounds to some extent. We used two public datasets (VisDrone and GTSDB) and a self-built agricultural small object dataset (PSD-Node) to verify the effectiveness of the method. On the above three datasets, the AP50 of MDFE-Net reached 0.304, 0.952, and 0.895, and the AP is 0.172, 0.805, and 0.476, respectively, which exceeded the benchmark model and the current SOTA method. This research presents an innovative small object detection network and provides a reliable technical solution for agricultural small object detection.</p>
</abstract>
<kwd-group>
<kwd>context feature</kwd>
<kwd>dilated convolution</kwd>
<kwd>feature enhancement</kwd>
<kwd>multiscale</kwd>
<kwd>small object detection</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was supported by the Major Scientific Research Project for Technology Promotes Police under Grant 2025YZ040003.</funding-statement>
</funding-group>
<counts>
<fig-count count="8"/>
<table-count count="5"/>
<equation-count count="8"/>
<ref-count count="39"/>
<page-count count="13"/>
<word-count count="7979"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Sustainable and Intelligent Phytoprotection</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>In object detection, small-object detection is an important yet challenging task. In recent years, due to the rapid development and exploration of remote sensing technology (<xref ref-type="bibr" rid="B24">Tong et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B21">Shimoni et&#xa0;al., 2019</xref>) in the fields of UAV aerial photography, traffic monitoring, and smart agriculture, the research on small object detection has made remarkable progress. However, small object detection has two main difficulties: (1) limited feature representation caused by small object size and low pixel count, and (2) frequent occlusion and confusion arising from complex image backgrounds (<xref ref-type="bibr" rid="B20">Ruan et&#xa0;al., 2023</xref>), which leads to additional difficulties in the detection of small objects by models. Therefore, small object detection has always been one of the most challenging tasks in object detection (<xref ref-type="bibr" rid="B31">Yang et&#xa0;al., 2015</xref>).</p>
<p>Recently, numerous studies have contributed significantly to addressing small object detection. Kisantal et&#xa0;al. (<xref ref-type="bibr" rid="B9">Kisantal et&#xa0;al., 2019</xref>) fully analyzed the principle of small object detection and proposed an oversampling image method to improve the small object detection performance of the model. Yuan et&#xa0;al. (<xref ref-type="bibr" rid="B34">Yuan and Zhang, 2021</xref>) introduced an improved coupled network to solve the localization problem of small object detection. Zhang et&#xa0;al. (<xref ref-type="bibr" rid="B36">Zhang et&#xa0;al., 2022</xref>) designed an adaptive dense pyramid network, which achieved excellent performance in dense small object detection tasks.</p>
<p>Motivated by the aforementioned challenges, we propose two novel plug-and-play modules specifically designed for improving small object detection across diverse scenarios. The key to alleviating the problem of insufficient feature information and background confusion lies in feature fusion and feature enhancement. For feature fusion, we combine multi-scale features with dilated convolution to make full use of multi-scale context information to effectively enhance the network&#x2019;s perception of small objects. MDFA module is proposed to enrich the context features, increase the model&#x2019;s receptive field, and promote the aggregation of more abundant underlying feature information so that the network can extract comprehensive information. The detection performance of small objects is improved. In terms of feature enhancement, we use a multi-branch convolution structure to extract richer semantic information and introduce dilated convolution to obtain richer local context information while expanding the receptive field. We propose a CFE module to enhance small object features in the process of network propagation by enhancing feature saturation and expanding the receptive field.</p>
<p>In this paper, we propose MDFE-Net, a multi-scale dilated feature enhancement network that incorporates two plug-and-play modules to improve small-object detection. The main contributions are summarized as follows:</p>
<list list-type="simple">
<list-item>
<p>1. We propose the MDFA module to enrich low-level feature representations by capturing multi-scale contextual information, alleviating the limited feature cues caused by small object sizes and few pixels. The module can be integrated into detection heads to enhance detection capability.</p></list-item>
<list-item>
<p>2. We design the CFE module to mitigate occlusion and background confusion by retaining and extracting informative cues for small objects through receptive-field expansion and multi-branch feature aggregation. The module can be embedded along the feature path from the backbone to the neck to strengthen feature information capture.</p></list-item>
<list-item>
<p>3. By integrating MDFA and CFE into YOLO11N, we build MDFE-Net and evaluate it on two public datasets (VisDrone and GTSDB) and one self-built dataset (PSD-Node). The experimental results show consistent improvements over strong baselines and competitive performance compared with state-of-the-art methods.</p></list-item>
</list>
<p>The rest of this paper is organized as follows: in the Section 2, the related work of small objects detection is introduced. Section 3 details the proposed MDFA and CFE modules and describes the architecture of MDFE-Net. Section 4 provides comprehensive experimental setups, comparative evaluations, and ablation studies demonstrating module effectiveness. Section 5 is the conclusion of this paper.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<sec id="s2_1">
<label>2.1</label>
<title>Small object detection</title>
<p>The definition of small objects is usually divided into two categories: relative size and absolute size. The relative size emphasizes the relationship between the object and the image size and generally emphasizes the proportion of the object to the image area. Chen et&#xa0;al. (<xref ref-type="bibr" rid="B4">Girshick et&#xa0;al., 2014</xref>) made a specific definition of the small object through research: in the same category, the median ratio of the border area of the small object to the entire image area should be between 0.08% and 0.58%. The definition of the absolute size of a small object focuses on the pixel value of the object itself. Among them, the COCO dataset proposes that the pixels of small objects should be less than 32&#xd7;32 pixels (<xref ref-type="bibr" rid="B14">Lin et&#xa0;al., 2014</xref>). In addition, different public datasets have different definitions of the absolute size of small objects, such as the WiderFace dataset (<xref ref-type="bibr" rid="B30">Yang et&#xa0;al., 2016</xref>), which defines a small object with a pixel range of 10 to 50, and the TinyPerson dataset (<xref ref-type="bibr" rid="B33">Yu et&#xa0;al., 2020</xref>), which defines a small object with a pixel range of 20 to 32. Recent works have proposed specialized designs for dense small-object detection. FBRT-YOLO (<xref ref-type="bibr" rid="B28">Xiao et&#xa0;al., 2025</xref>) introduces task-oriented improvements to enhance detection robustness in crowded scenes by strengthening feature representation and detection strategies for small targets. EDSOD (<xref ref-type="bibr" rid="B12">Li et&#xa0;al., 2025</xref>) presents a dedicated small-object detector that improves feature extraction and localization quality under challenging backgrounds, demonstrating competitive performance on public benchmarks. 
Small objects typically occupy only a few pixels in an image, which limits available visual cues and makes small-object detection one of the most challenging tasks in object detection.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Multi-scale feature fusion</title>
<p>Multi-scale means that in the process of deep learning, images or features with different resolutions are input, and different resolutions represent different scales. In object detection tasks, objects often appear in various sizes, and it is difficult to capture the features of all objects effectively with a single scale feature extraction. Therefore, multi-scale methods can better detect objects of different sizes by extracting features at different scales. Feature pyramid is a structure that deals with multi-scale feature information. By using many scale features, the network can extract more comprehensive information, so as to improve the detection effect of the network model on small objects. Feature Pyramid Network (<xref ref-type="bibr" rid="B13">Lin et&#xa0;al., 2017</xref>), as an enhanced feature architecture, is proposed to improve multi-scale problems well, and the performance of network models can be well improved by adding top-down paths to integrate multi-scale features. Feature pyramid-based object detection methods and many applied research methods have also achieved remarkable results in subsequent visual tasks. On the basis of the feature pyramid, PANet (<xref ref-type="bibr" rid="B17">Liu et&#xa0;al., 2018b</xref>) further improves the positioning capability of the feature pyramid by adding additional bottom-up feature paths, which can shorten the information path from the low layer to the high layer to enhance the feature hierarchy. EfficientDet (<xref ref-type="bibr" rid="B22">Tan et&#xa0;al., 2020</xref>) proposed a weighted bidirectional feature pyramid network (BiFPN), which introduced learnable weights for different input features in the fusion process, and a composite feature pyramid network scaling method. 
ASFF (<xref ref-type="bibr" rid="B16">Liu et&#xa0;al., 2019</xref>) research shows that small objects are usually associated with lower-level feature layers, while large objects are usually associated with higher-level feature layers. An adaptive spatial feature fusion method is proposed to promote the fusion of feature information by learning the correlation between different feature maps. SCRDet (<xref ref-type="bibr" rid="B32">Yang et&#xa0;al., 2019</xref>) designed a sampling fusion network, which fused multi-layer features with anchor sampling to improve the detection performance of the network model for small objects.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Feature enhancement</title>
<p>In object detection, the semantic expression of the model can be further enhanced by feature enhancement before the feature fusion. In this process, the feature expression and discrimination ability of the feature are improved through the fine processing of feature maps of different scales, so as to provide more abundant and accurate information for the subsequent feature fusion. Feature enhancement can be achieved through the attention mechanism: Cheng et&#xa0;al. (<xref ref-type="bibr" rid="B2">Cheng et&#xa0;al., 2021</xref>) enhanced features by using the dual attention mechanism before fusion, which enhanced the sensitivity of the network to different features and enhanced the network detection performance. Zhang and Shen (<xref ref-type="bibr" rid="B37">Zhang and Shen, 2022</xref>) combined spatial attention mechanism and channel attention mechanism to form a feature enhancement module to enhance the features of the network. In addition to the attention mechanism, there are also two common feature enhancement methods: multi-branch convolution and transformer encoders (<xref ref-type="bibr" rid="B18">Liu et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B10">Li et&#xa0;al., 2023b</xref>).</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Methods</title>
<sec id="s3_1">
<label>3.1</label>
<title>Overview</title>
<p>This section presents two plug-and-play modules and the overall architecture of MDFE-Net. Specifically, the MDFA module captures multi-scale contextual information, and the CFE module enhances feature representation for small objects. We integrate these two modules together with a P2-level extra detect head (EDH) into the lightweight YOLO11N baseline, resulting in the proposed MDFE-Net.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>MDFA module</title>
<p>In object detection, hierarchical features are responsible for detecting the object of the corresponding size, and small objects are often small in size and lack sufficient features, which is difficult to accurately locate and detect. These problems will affect the accuracy and robustness of its detection. Low-level features contain rich location and local details. Effective use of low-level feature information can improve the localization and detection ability of smaller objects. In view of these problems, recent studies show that effective use of low-level features can significantly improve the detection ability of small objects. Low-level features (such as shallow convolution features) contain abundant location information and local details, which are of great significance for the accurate localization of small objects. However, relying only on low-level features may lack sufficient contextual information and global awareness, so a mechanism needs to be designed to combine low-level features with high-level semantic features to achieve comprehensive capture of multi-scale objects. Inspired by Dilate Former (<xref ref-type="bibr" rid="B5">Jiao et&#xa0;al., 2023</xref>), we proposed a new multi-scale dilated feature aggregation (MDFA) module, as shown in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>, designed to effectively capture multi-scale context and enhance small-object detection performance.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Overall structure of the MDFA module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778795-g001.tif">
<alt-text content-type="machine-generated">Block diagram illustrating a neural network attention module pipeline. Input passes sequentially through CAM and SAM modules with element-wise products, then branches into Q, K, and V streams, each processed by dilatative attention units with different rates, concatenated, and merged to produce the output. Diagram includes legend for element-wise product, concatenation, and dilatative attention.</alt-text>
</graphic></fig>
<p>MDFA module aims to enhance the model&#x2019;s ability to extract small object features by integrating the advantages of attention mechanism and dilated convolution while capturing rich context information in a multi-scale range. This module is divided into two main parts: convolutional block attention module and multi-scale dilated attention module (<xref ref-type="bibr" rid="B27">Woo et&#xa0;al., 2018</xref>), which work together to improve feature expressiveness and multi-scale adaptability. Firstly, <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref> shows the channel attention module and the spatial attention module used in the previous part of the network structure. The channel attention mechanism highlights the more significant feature channels by weighted aggregation on the dimension of feature channels, thus improving the ability of the network to pay attention to important features. Accordingly, the spatial attention mechanism enhances the feature representation of key regions by applying attention weight in the spatial dimension. These two attention mechanisms are combined to form a convolutional block attention module to fully mine local details in features. The work flow of the convolutional block attention module is as follows: Input features are first processed by channel attention and spatial attention to generate two sets of attention weights; Then, by multiplying element by element, the attention weight is combined with the input features to obtain the enhanced feature representation. This process can effectively improve the sensitivity and representation ability of the network when dealing with small objects, especially in the complex background to better capture the key information of small objects. Secondly, we design a multi-scale dilated attention module inspired by the multi-head attention mechanism and dilated convolution. 
The core idea is to combine the characteristics of multi-head attention mechanism and dilated convolution to capture multi-scale context information. Specifically, the module first maps the features of the output of the convolution attention module to the query (Q), key (K), and value (V) spaces by linear projection, based on the principle of multi-head attention mechanism. Then, in order to enhance the expressiveness of features at different scales, we divide the feature channels into four groups and input them into four Dilated Attention heads with dilated convolution with different dilated rates (r=1,2,3,4 respectively). Each Dilated Attention head uses dilated convolution to expand the properties of the receptive field without increasing parameters, extracting features from different scales and context ranges. After processing with a Dilated Attention head, all feature outputs are fused through multi-scale aggregation operations to integrate information from multi-scale contexts.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Overall structure of the CAM module and SAM module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778795-g002.tif">
<alt-text content-type="machine-generated">Diagram illustrating a two-part channel and spatial attention mechanism. The top section (CAM) uses MaxPool and AvgPool followed by MLP, sigmoid, and element-wise product. The bottom section (SAM) applies MaxPool and AvgPool, concatenates the outputs, processes through a three-by-three convolution, sigmoid, and element-wise product. Legends define concatenation and element-wise product operations.</alt-text>
</graphic></fig>
<p>However, relying solely on dilated convolution may have the following problems: On the one hand, the sparse sampling characteristics of dilated convolution may cause some fine-grained features to be ignored; On the other hand, a large dilated rate may result in an uneven distribution of receptive fields, resulting in insufficient attention to the characteristics of some regions. To address these issues, we designed a skip connection to introduce raw features into the aggregation process, further complementing fine-grained information and global consistency. This design not only avoids the loss of feature information but also improves the utilization efficiency of the original feature. The dilated attention mechanism enlarges the receptive field through dilated convolution, thereby demonstrating a greater ability to capture contextual information. In addition, compared with the traditional downsampling operation, this mechanism can retain more spatial details without reducing the resolution of the input image, which makes the model perform better in small object detection tasks. By combining the flexibility of the multi-head attention mechanism and the multi-scale characteristics of dilated convolution, the MDFA module achieves the efficient capture and utilization of multi-scale features, greatly improving the detection accuracy and robustness of the model when dealing with small objects. The formulas processed by the MDFA module are expressed as follows: <xref ref-type="disp-formula" rid="eq1">Equations 1</xref>&#x2013;<xref ref-type="disp-formula" rid="eq4">4</xref>:</p>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mrow><mml:msub><mml:mi>W</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>S</mml:mi><mml:mi>A</mml:mi><mml:mi>M</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mi>A</mml:mi><mml:mi>M</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>F</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x2297;</mml:mo><mml:mi>F</mml:mi><mml:mrow><mml:mo stretchy="false">]</mml:mo><mml:mo>&#x2297;</mml:mo><mml:mo stretchy="false">[</mml:mo></mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mi>A</mml:mi><mml:mi>M</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>F</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x2297;</mml:mo><mml:mi>F</mml:mi></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mrow><mml:mi>K</mml:mi><mml:mo>,</mml:mo><mml:mi>Q</mml:mi><mml:mo>,</mml:mo><mml:mi>V</mml:mi><mml:mo>=</mml:mo><mml:mi>L</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>r</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>W</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq3"><label>(3)</label>
<mml:math display="block" id="M3"><mml:mrow><mml:msub><mml:mi>H</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>D</mml:mi><mml:mi>A</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>Q</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>K</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>r</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2264;</mml:mo><mml:mi>i</mml:mi><mml:mo>&#x2264;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq4"><label>(4)</label>
<mml:math display="block" id="M4"><mml:mrow><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>L</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>r</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>H</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mi>n</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x2295;</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>Where <italic>f<sub>CAM</sub></italic>(.) and <italic>f<sub>SAM</sub></italic>(.) represent the calculation operations of the CAM and SAM modules, respectively, <italic>DA</italic>(.) represents the dilated attention mechanism operation, <italic>Linear</italic> represents the linear feature mapping, <italic>Concat</italic> represents the feature map concatenation operation, &#x2297; represents element-wise multiplication, &#x2295; represents element-wise addition, <italic>F</italic> is the input feature, <italic>W</italic><sub>1</sub> is the output feature map obtained after the CAM and SAM module operations, <italic>K, Q, V</italic> represent the values obtained after a linear mapping operation, <italic>H<sub>i</sub></italic> is the output feature obtained from the dilated attention mechanism with dilation rate <italic>r<sub>i</sub></italic>, and <italic>W<sub>out</sub></italic> is the output feature.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>CFE module</title>
<p>Small objects are usually made up of only a few pixels in an image. The current mainstream object detection network typically consists of three parts: a backbone network, a neck network, and a detection head. The backbone network performs better for the detection of medium and large objects, but its feature extraction ability is limited for detected objects with simple textures and small sizes. In the process of feature extraction, the features extracted by the backbone network often contain less semantic information and are limited by a narrow receptive field, which makes it difficult to distinguish the features of small objects from the occluders in the background, thus affecting the detection accuracy. To solve this problem, inspired by RFB-s (<xref ref-type="bibr" rid="B15">Liu et&#xa0;al., 2018a</xref>) and FFCA-YOLO (<xref ref-type="bibr" rid="B38">Zhang et&#xa0;al., 2024</xref>), we propose the context feature enhancement (CFE) module, as shown in <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Overall structure of the CFE module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778795-g003.tif">
<alt-text content-type="machine-generated">Neural network diagram showing input passing through a one by one convolution, splitting into three parallel branches with convolutions of varying kernel sizes, merging again, passing through another one by one convolution, then combining with the original input to produce the output.</alt-text>
</graphic></fig>
<p>CFE module enhances the expression ability of object features from two aspects: First, it enhances the feature saturation by adopting a multi-branch convolution structure to extract richer semantic information, so as to improve the feature expression ability of small objects. Second, it expands the receptive field of features to obtain more adequate local context information by introducing dilated convolution, so as to enhance the context awareness of small objects. This design can not only significantly improve the feature expression ability of small objects, but also optimize the computational efficiency and reduce the parameters of the network to a certain extent.</p>
<p>In CFE module, we use the combination of multi-branch convolution and dilated convolution to achieve efficient extraction and multi-scale enhancement of small object features. The overall design of the module consists of three main branches, which undertake different feature processing tasks, and finally maximize the effectiveness of features by means of feature aggregation.</p>
<p>Firstly, the design focus of the first branch is to extract rich semantic information using a multi-branch convolutional structure and expand the receptive field in the process, so as to enhance the expression ability of context information. We performed a 1&#xd7;1 convolution operation on the input feature map, which initially adjusted the number of channels for subsequent processing, reduced the computational cost, and laid a foundation for subsequent multi-branch processing. Then, the processed features were fed into three parallel convolution branches, one of which contains only a single 3&#xd7;3 standard convolution. The other two branches, each consisting of a standard strip convolution with a size of 1&#xd7;7 or 7&#xd7;1 followed by a 3&#xd7;3 dilated convolution with a dilation rate of 7, effectively capture the asymmetric and directional information in the features through the extended long-axis receptive field of the strip convolution. Meanwhile, the dilated convolution expands the receptive field without increasing the number of parameters by introducing sparse receptive fields. In this way, the long-distance dependence between the context information and the object is captured; the features from the three branches are then concatenated and processed by a 1&#xd7;1 convolution. The features produced by the first main branch not only contain local details but also retain global context information, and the receptive field is effectively enlarged through the combination of strip convolution and dilated convolution. The multi-scale feature expression capability of the module for small objects is thus enhanced significantly. 
In addition, compared with the method of using large convolution kernel directly, the multi-branch design can significantly reduce the computational complexity and parameters while ensuring the effect of receptive field expansion, so as to realize the lightweight of the module. Second, the second branch is a residual structure composed of 1&#xd7;1 convolution. The residual structure forms an equivalent mapping, and its main function is to directly retain input features through the equivalent mapping mechanism to avoid the loss of key features of small objects in multi-branch convolution operations. The introduction of residual structure not only ensures the integrity of the feature flow but also makes the CFE module better adapt to the feature representation requirements of different scale objects. Through this design, the fine-grained features of small objects are preserved, providing accurate scale information for subsequent feature fusion. Last, the third branch is the input of the original global feature information, which supplements the global information on the basis of local feature enhancement, so as to improve the network&#x2019;s perception of the overall feature of the object. The retention of global features is particularly important for the detection of small objects, because the semantic information of small objects is sparse, and it is easy to be limited by local information. The introduction of global features can effectively improve the model&#x2019;s context-aware ability of small objects, and further enhance the detection robustness. Finally, context information, key information, and global information are added element by element to retain and extract effective information of small objects to the maximum extent. This feature fusion mechanism realizes the effective integration of context information, key feature information and global information, and enhances the object features from three different scales. 
Specifically, the enhancement of context information significantly improves the semantic saturation of small objects, the retention of key feature information ensures the fine-grained description of small objects, and the addition of global information enhances the overall consistency of the object characteristics. The formulas processed by the CFE module are expressed as follows: <xref ref-type="disp-formula" rid="eq5">Equations 5</xref>&#x2013;<xref ref-type="disp-formula" rid="eq8">8</xref>:</p>
<disp-formula id="eq5"><label>(5)</label>
<mml:math display="block" id="M5"><mml:mrow><mml:msub><mml:mi>I</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>F</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq6"><label>(6)</label>
<mml:math display="block" id="M6"><mml:mrow><mml:msub><mml:mi>I</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mi>D</mml:mi><mml:msubsup><mml:mi>C</mml:mi><mml:mrow><mml:mi>r</mml:mi><mml:mo>=</mml:mo><mml:mn>7</mml:mn></mml:mrow><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msubsup><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>7</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mn>7</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>F</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq7"><label>(7)</label>
<mml:math display="block" id="M7"><mml:mrow><mml:msub><mml:mi>I</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mi>D</mml:mi><mml:msubsup><mml:mi>C</mml:mi><mml:mrow><mml:mi>r</mml:mi><mml:mo>=</mml:mo><mml:mn>7</mml:mn></mml:mrow><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msubsup><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mn>7</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#xa0;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>7</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>F</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq8"><label>(8)</label>
<mml:math display="block" id="M8"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>I</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mn>3</mml:mn></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x2295;</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>F</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x2295;</mml:mo><mml:mi>F</mml:mi></mml:mrow></mml:math>
</disp-formula>
<p>Where <inline-formula>
<mml:math display="inline" id="im1"><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>.</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im2"><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>.</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im3"><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>7</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>.</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, and <inline-formula>
<mml:math display="inline" id="im4"><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mn>7</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>.</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> represent the standard convolution operations with kernel sizes of <inline-formula>
<mml:math display="inline" id="im5"><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im6"><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im7"><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>7</mml:mn></mml:mrow></mml:math></inline-formula>, and <inline-formula>
<mml:math display="inline" id="im8"><mml:mrow><mml:mn>7</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:math></inline-formula>, respectively. <inline-formula>
<mml:math display="inline" id="im9"><mml:mrow><mml:mi>D</mml:mi><mml:msubsup><mml:mi>C</mml:mi><mml:mrow><mml:mi>r</mml:mi><mml:mo>=</mml:mo><mml:mn>7</mml:mn></mml:mrow><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> represents the <inline-formula>
<mml:math display="inline" id="im10"><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>.</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> dilated convolution operation with an expansion rate of 7, <inline-formula>
<mml:math display="inline" id="im11"><mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:math></inline-formula> represents feature mapping concatenation operation, <inline-formula>
<mml:math display="inline" id="im12"><mml:mo>&#x2295;</mml:mo></mml:math></inline-formula> represents Element-wise Addition, <inline-formula>
<mml:math display="inline" id="im13"><mml:mi>F</mml:mi></mml:math></inline-formula> is the input feature, <inline-formula>
<mml:math display="inline" id="im14"><mml:mrow><mml:msub><mml:mi>I</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mn>3</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula> represent the output feature maps of the three sub-branches of the first branch after standard convolution and dilated convolution, and <italic>F<sub>out</sub></italic> is the output feature.</p>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Extra detect head</title>
<p>During feature extraction, the object detection model sends the feature images P3, P4, and P5 of three different resolutions obtained by the backbone network into the neck for feature fusion. This is because with the superposition of downsampling or convolution operations, the receptive field gradually expands, and the high-level feature map can capture richer semantic information, which is sufficient for the object detection of general objects. However, for a large number of small objects to be detected, due to little information in small objects, their size, location, and other feature information may be gradually lost with the increasing number of model layers, which is not conducive to accurate object recognition and positioning (<xref ref-type="bibr" rid="B29">Xu et&#xa0;al., 2022</xref>), and the prediction head cannot obtain enough feature information from the feature map, resulting in low recognition accuracy. Shallow feature maps have smaller receptive fields, pay more attention to detail information, and have higher spatial resolution and accurate location information, which is suitable for small object detection tasks that lack feature information and are difficult to pinpoint.</p>
<p>In order to retain more shallow features and small object location information, feature map P2 with the highest resolution is introduced. By reducing downsampling times and retaining more detailed information, feature map P2 extracted through the backbone network is fused with feature maps of other scales to improve the richness of fusion features. Moreover, an additional P2 feature small object prediction head is constructed by using the fused features, so that the model has more location information and feature information of the small object, effectively reducing the location feature loss during feature downsampling, enhancing the context information of the small object, and improving the location detection accuracy of the small object. Combined with the other three prediction heads, it can effectively mitigate the negative effects caused by drastic changes in object scale.</p>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>MDFE-Net</title>
<p>To effectively address small object detection challenges, the proposed innovative modules MDFA and CFE are introduced into the object detection model YOLO11N of the YOLO series, and the lightweight version model YOLO11N is used as the benchmark network framework. An innovative model, the multiscale dilated feature enhancement network (MDFE-Net), is constructed, and the overall framework of MDFE-Net is shown in <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>. CSPDarkNet53 (<xref ref-type="bibr" rid="B19">Mahasin and Dewi, 2022</xref>) is employed as the backbone network to efficiently extract hierarchical multi-scale features from images. The neck structure is used for feature fusion, combining multi-resolution feature maps to improve the awareness of small object context information. The detection head structure is used to classify and locate the object. The CFE module is used to enhance the image context feature information of four different resolutions output by the backbone to improve the feature extraction ability for small objects. Meanwhile, in order to make better use of the details of the underlying features, the MDFA module is used to carry out multi-scale feature aggregation for the underlying features to enhance the attentional expression ability of the underlying features. To further enhance the detection accuracy of the location of small objects, we introduce the underlying feature map P2 into the detection head and build an additional detection head based on it, which effectively improves the classification and location performance of the model for small objects.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Overall structure of the proposed MDFE-Net, which includes baseline model components (CBS, C3k2, and SPPF).</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778795-g004.tif">
<alt-text content-type="machine-generated">Flowchart diagram of a deep learning model architecture divided into three sections: Backbone, Neck, and Head. Backbone contains layers C2PSA, SPPF, multiple C3k2 and CBS blocks. Neck features repeated CFE, upsampling, concatenation, and C3k2 blocks, with interconnections. Head contains detection modules accessed via colored pathways. A legend at the bottom defines abbreviations such as CBS, Conv, BatchNorm, Silu, C3k2, SPPF, and MaxPool.</alt-text>
</graphic></fig>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments</title>
<sec id="s4_1">
<label>4.1</label>
<title>Experimental setup</title>
<sec id="s4_1_1">
<label>4.1.1</label>
<title>Experimental dataset description</title>
<p>We chose three different types of small object datasets, including two public datasets and one self-built dataset.</p>
<list list-type="order">
<list-item>
<p>VisDrone (<xref ref-type="bibr" rid="B1">Cao et&#xa0;al., 2021</xref>): It is a large-scale UAV view dataset in a realistic scene, which contains a large number of small objects, diverse data distribution, and complex detection scenes, which makes the dataset more challenging. The dataset contains 10,209 still images from drones in different areas of 14 cities, covering 10 common object categories in traffic scenarios, including about 540,000 instances.</p></list-item>
<list-item>
<p>GTSDB (<xref ref-type="bibr" rid="B35">Zhang et&#xa0;al., 2020</xref>): The German Traffic Sign Detection Benchmark dataset is a traffic sign detection benchmark dataset in Germany, which contains a total of 900 images of 1360&#xd7;800 pixels and 4 categories of label types, and has a large number of small traffic signs.</p></list-item>
<list-item>
<p>PSD-Node: Plant Seedling Dataset-Node is a dataset for plant seedling node detection, which was collected and labeled by us in an independent seedling image data collection room (as shown in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref>). A total of 1350 original seedling images were collected, including 810 training images, 270 validation images, and 270 test images, which contain tens of thousands of small object labels for seedling nodes. The dataset has the following characteristics: (1) The labels of seedling nodes in PSD-Node belong to small objects under the relative-size definition. (2) PSD-Node contains a large number of seed and seedling node objects, which enables more effective verification of the model&#x2019;s performance in detecting small objects under low-light conditions and leaf occlusion. (3) PSD-Node is a small object dataset in the agricultural field, which has reference value for improving the small object detection performance of models in the agricultural field.</p></list-item>
</list>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Self-built image collection room for plant seedlings.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778795-g005.tif">
<alt-text content-type="machine-generated">Small wilted plant positioned on a round platform inside a reflective light box, illuminated by a circular LED ring overhead, while a perforated handheld device points toward the plant, possibly for analysis or imaging.</alt-text>
</graphic></fig>
</sec>
<sec id="s4_1_2">
<label>4.1.2</label>
<title>Evaluation metrics</title>
<p>We used P (Precision), R (Recall), F1-score, AP (averaged over IoU thresholds from 0.50 to 0.95 with a step of 0.05), and AP50 (average precision at IoU = 0.50) as the main evaluation metrics for the model. In addition to AP and AP50, we report scale-aware metrics APs, APm, and APl to better evaluate performance across different object sizes. These metrics follow the common small, medium, and large scale partition used in standard detection evaluation. In addition, we report GFLOPs and parameter count in <xref ref-type="table" rid="T1"><bold>Tables&#xa0;1</bold></xref>, <xref ref-type="table" rid="T2"><bold>2</bold></xref> to provide an efficiency-related reference under the same input resolution.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Comparisons of MDFE-Net with state-of-the-art algorithms in VisDrone.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Model</th>
<th valign="middle" align="center">P</th>
<th valign="middle" align="center">R</th>
<th valign="middle" align="center">F1</th>
<th valign="middle" align="center">AP50</th>
<th valign="middle" align="center">AP</th>
<th valign="middle" align="center">APs</th>
<th valign="middle" align="center">APm</th>
<th valign="middle" align="center">APl</th>
<th valign="middle" align="center">GFLOPs</th>
<th valign="middle" align="center">Parameter</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">YOLOv5N(2020)</td>
<td valign="middle" align="center">0.379</td>
<td valign="middle" align="center">0.294</td>
<td valign="middle" align="center">0.331</td>
<td valign="middle" align="center">0.266</td>
<td valign="middle" align="center">0.152</td>
<td valign="middle" align="center">0.046</td>
<td valign="middle" align="center">0.154</td>
<td valign="middle" align="center">0.231</td>
<td valign="middle" align="center">7.1</td>
<td valign="middle" align="center">2.18M</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv6N(2022)</td>
<td valign="middle" align="center">0.381</td>
<td valign="middle" align="center">0.289</td>
<td valign="middle" align="center">0.329</td>
<td valign="middle" align="center">0.268</td>
<td valign="middle" align="center">0.148</td>
<td valign="middle" align="center">0.051</td>
<td valign="middle" align="center">0.210</td>
<td valign="middle" align="center">0.316</td>
<td valign="middle" align="center">11.4</td>
<td valign="middle" align="center">4.15M</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv8N(2023)</td>
<td valign="middle" align="center">0.376</td>
<td valign="middle" align="center">0.288</td>
<td valign="middle" align="center">0.326</td>
<td valign="middle" align="center">0.264</td>
<td valign="middle" align="center">0.147</td>
<td valign="middle" align="center">0.059</td>
<td valign="middle" align="center">0.225</td>
<td valign="middle" align="center">0.339</td>
<td valign="middle" align="center">8.1</td>
<td valign="middle" align="center">3.00M</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv9T(2024)</td>
<td valign="middle" align="center">0.385</td>
<td valign="middle" align="center">0.291</td>
<td valign="middle" align="center">0.331</td>
<td valign="middle" align="center">0.270</td>
<td valign="middle" align="center">0.154</td>
<td valign="middle" align="center">0.059</td>
<td valign="middle" align="center">0.221</td>
<td valign="middle" align="center">0.341</td>
<td valign="middle" align="center">10.7</td>
<td valign="middle" align="center">2.00M</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv10N(2024)</td>
<td valign="middle" align="center">0.383</td>
<td valign="middle" align="center">0.292</td>
<td valign="middle" align="center">0.331</td>
<td valign="middle" align="center">0.268</td>
<td valign="middle" align="center">0.149</td>
<td valign="middle" align="center">0.063</td>
<td valign="middle" align="center">0.224</td>
<td valign="middle" align="center">0.292</td>
<td valign="middle" align="center">10.8</td>
<td valign="middle" align="center">2.69M</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv11N(2024)</td>
<td valign="middle" align="center">0.380</td>
<td valign="middle" align="center">0.293</td>
<td valign="middle" align="center">0.331</td>
<td valign="middle" align="center">0.267</td>
<td valign="middle" align="center">0.149</td>
<td valign="middle" align="center">0.058</td>
<td valign="middle" align="center">0.225</td>
<td valign="middle" align="center">0.316</td>
<td valign="middle" align="center">6.3</td>
<td valign="middle" align="center">2.58M</td>
</tr>
<tr>
<td valign="middle" align="left">RT-DETR-L(2024)</td>
<td valign="middle" align="center">0.384</td>
<td valign="middle" align="center">0.285</td>
<td valign="middle" align="center">0.327</td>
<td valign="middle" align="center">0.278</td>
<td valign="middle" align="center">0.150</td>
<td valign="middle" align="center">/</td>
<td valign="middle" align="center">/</td>
<td valign="middle" align="center">/</td>
<td valign="middle" align="center">108.0</td>
<td valign="middle" align="center">32.81M</td>
</tr>
<tr>
<td valign="middle" align="left">Hyper-YOLO-N(2025)</td>
<td valign="middle" align="center">0.402</td>
<td valign="middle" align="center">0.314</td>
<td valign="middle" align="center">0.353</td>
<td valign="middle" align="center">0.292</td>
<td valign="middle" align="center">0.161</td>
<td valign="middle" align="center">0.066</td>
<td valign="middle" align="center">0.240</td>
<td valign="middle" align="center">0.348</td>
<td valign="middle" align="center">10.8</td>
<td valign="middle" align="center">3.94M</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv12N(2025)</td>
<td valign="middle" align="center">0.395</td>
<td valign="middle" align="center">0.299</td>
<td valign="middle" align="center">0.340</td>
<td valign="middle" align="center">0.275</td>
<td valign="middle" align="center">0.156</td>
<td valign="middle" align="center">0.057</td>
<td valign="middle" align="center">0.224</td>
<td valign="middle" align="center">0.346</td>
<td valign="middle" align="center">6.3</td>
<td valign="middle" align="center">2.55M</td>
</tr>
<tr>
<td valign="middle" align="left">FBRT-YOLO-N(2025)</td>
<td valign="middle" align="center">0.393</td>
<td valign="middle" align="center">0.300</td>
<td valign="middle" align="center">0.340</td>
<td valign="middle" align="center">0.277</td>
<td valign="middle" align="center">0.158</td>
<td valign="middle" align="center">0.063</td>
<td valign="middle" align="center">0.230</td>
<td valign="middle" align="center">0.310</td>
<td valign="middle" align="center">6.7</td>
<td valign="middle" align="center">0.85M</td>
</tr>
<tr>
<td valign="middle" align="left">EDSOD(2025)</td>
<td valign="middle" align="center">0.245</td>
<td valign="middle" align="center">0.437</td>
<td valign="middle" align="center">0.314</td>
<td valign="middle" align="center">0.245</td>
<td valign="middle" align="center">0.135</td>
<td valign="middle" align="center">0.073</td>
<td valign="middle" align="center">0.197</td>
<td valign="middle" align="center">0.291</td>
<td valign="middle" align="center">88.1</td>
<td valign="middle" align="center">23.45M</td>
</tr>
<tr>
<td valign="middle" align="left">MDFE-Net(Ours)</td>
<td valign="middle" align="center"><bold>0.410</bold></td>
<td valign="middle" align="center"><bold>0.326</bold></td>
<td valign="middle" align="center"><bold>0.363</bold></td>
<td valign="middle" align="center"><bold>0.304</bold></td>
<td valign="middle" align="center"><bold>0.172</bold></td>
<td valign="middle" align="center"><bold>0.077</bold></td>
<td valign="middle" align="center"><bold>0.268</bold></td>
<td valign="middle" align="center"><bold>0.361</bold></td>
<td valign="middle" align="center">13.6</td>
<td valign="middle" align="center">3.22M</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The bolded values represent the evaluation metric with the best performance after comparing all the models.</p>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Comparisons of MDFE-Net with state-of-the-art algorithms in GTSDB.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Model</th>
<th valign="middle" align="center">P</th>
<th valign="middle" align="center">R</th>
<th valign="middle" align="center">F1</th>
<th valign="middle" align="center">AP50</th>
<th valign="middle" align="center">AP</th>
<th valign="middle" align="center">APs</th>
<th valign="middle" align="center">APm</th>
<th valign="middle" align="center">APl</th>
<th valign="middle" align="center">GFLOPs</th>
<th valign="middle" align="center">Parameter</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">YOLOv5N(2020)</td>
<td valign="middle" align="center">0.953</td>
<td valign="middle" align="center">0.833</td>
<td valign="middle" align="center">0.889</td>
<td valign="middle" align="center">0.925</td>
<td valign="middle" align="center">0.748</td>
<td valign="middle" align="center">0.523</td>
<td valign="middle" align="center">0.783</td>
<td valign="middle" align="center">0.895</td>
<td valign="middle" align="center">7.1</td>
<td valign="middle" align="center">2.18M</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv6N(2022)</td>
<td valign="middle" align="center">0.914</td>
<td valign="middle" align="center">0.854</td>
<td valign="middle" align="center">0.883</td>
<td valign="middle" align="center">0.911</td>
<td valign="middle" align="center">0.728</td>
<td valign="middle" align="center">0.501</td>
<td valign="middle" align="center">0.773</td>
<td valign="middle" align="center">0.879</td>
<td valign="middle" align="center">11.4</td>
<td valign="middle" align="center">4.15M</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv8N(2023)</td>
<td valign="middle" align="center">0.913</td>
<td valign="middle" align="center">0.887</td>
<td valign="middle" align="center">0.900</td>
<td valign="middle" align="center">0.921</td>
<td valign="middle" align="center">0.758</td>
<td valign="middle" align="center">0.534</td>
<td valign="middle" align="center">0.766</td>
<td valign="middle" align="center">0.838</td>
<td valign="middle" align="center">8.1</td>
<td valign="middle" align="center">3.00M</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv9T(2024)</td>
<td valign="middle" align="center">0.923</td>
<td valign="middle" align="center">0.846</td>
<td valign="middle" align="center">0.883</td>
<td valign="middle" align="center">0.905</td>
<td valign="middle" align="center">0.744</td>
<td valign="middle" align="center">0.543</td>
<td valign="middle" align="center">0.788</td>
<td valign="middle" align="center">0.931</td>
<td valign="middle" align="center">10.7</td>
<td valign="middle" align="center">2.00M</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv10N(2024)</td>
<td valign="middle" align="center">0.956</td>
<td valign="middle" align="center">0.846</td>
<td valign="middle" align="center">0.898</td>
<td valign="middle" align="center">0.929</td>
<td valign="middle" align="center">0.767</td>
<td valign="middle" align="center">0.510</td>
<td valign="middle" align="center">0.786</td>
<td valign="middle" align="center">0.883</td>
<td valign="middle" align="center">10.8</td>
<td valign="middle" align="center">2.69M</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv11N(2024)</td>
<td valign="middle" align="center">0.953</td>
<td valign="middle" align="center">0.859</td>
<td valign="middle" align="center">0.905</td>
<td valign="middle" align="center">0.926</td>
<td valign="middle" align="center">0.773</td>
<td valign="middle" align="center">0.541</td>
<td valign="middle" align="center">0.789</td>
<td valign="middle" align="center">0.891</td>
<td valign="middle" align="center">6.3</td>
<td valign="middle" align="center">2.58M</td>
</tr>
<tr>
<td valign="middle" align="left">RT-DETR-L(2024)</td>
<td valign="middle" align="center">0.960</td>
<td valign="middle" align="center">0.874</td>
<td valign="middle" align="center">0.915</td>
<td valign="middle" align="center">0.935</td>
<td valign="middle" align="center">0.778</td>
<td valign="middle" align="center">/</td>
<td valign="middle" align="center">/</td>
<td valign="middle" align="center">/</td>
<td valign="middle" align="center">108.0</td>
<td valign="middle" align="center">32.81M</td>
</tr>
<tr>
<td valign="middle" align="left">Hyper-YOLO-N(2025)</td>
<td valign="middle" align="center">0.911</td>
<td valign="middle" align="center">0.896</td>
<td valign="middle" align="center">0.903</td>
<td valign="middle" align="center">0.939</td>
<td valign="middle" align="center">0.769</td>
<td valign="middle" align="center">0.515</td>
<td valign="middle" align="center">0.787</td>
<td valign="middle" align="center">0.887</td>
<td valign="middle" align="center">10.8</td>
<td valign="middle" align="center">3.94M</td>
</tr>
<tr>
<td valign="middle" align="left">YOLOv12N(2025)</td>
<td valign="middle" align="center">0.958</td>
<td valign="middle" align="center">0.816</td>
<td valign="middle" align="center">0.881</td>
<td valign="middle" align="center">0.921</td>
<td valign="middle" align="center">0.744</td>
<td valign="middle" align="center">0.491</td>
<td valign="middle" align="center">0.812</td>
<td valign="middle" align="center">0.878</td>
<td valign="middle" align="center">6.3</td>
<td valign="middle" align="center">2.55M</td>
</tr>
<tr>
<td valign="middle" align="left">FBRT-YOLO-N(2025)</td>
<td valign="middle" align="center">0.961</td>
<td valign="middle" align="center">0.884</td>
<td valign="middle" align="center">0.921</td>
<td valign="middle" align="center">0.933</td>
<td valign="middle" align="center">0.769</td>
<td valign="middle" align="center">0.577</td>
<td valign="middle" align="center">0.817</td>
<td valign="middle" align="center">0.855</td>
<td valign="middle" align="center">6.7</td>
<td valign="middle" align="center">0.85M</td>
</tr>
<tr>
<td valign="middle" align="left">EDSOD(2025)</td>
<td valign="middle" align="center">0.825</td>
<td valign="middle" align="center"><bold>0.919</bold></td>
<td valign="middle" align="center">0.869</td>
<td valign="middle" align="center">0.825</td>
<td valign="middle" align="center">0.658</td>
<td valign="middle" align="center">0.436</td>
<td valign="middle" align="center">0.762</td>
<td valign="middle" align="center">0.774</td>
<td valign="middle" align="center">88.1</td>
<td valign="middle" align="center">23.45M</td>
</tr>
<tr>
<td valign="middle" align="left">MDFE-Net(Ours)</td>
<td valign="middle" align="center"><bold>0.964</bold></td>
<td valign="middle" align="center">0.898</td>
<td valign="middle" align="center"><bold>0.930</bold></td>
<td valign="middle" align="center"><bold>0.952</bold></td>
<td valign="middle" align="center"><bold>0.805</bold></td>
<td valign="middle" align="center"><bold>0.593</bold></td>
<td valign="middle" align="center"><bold>0.828</bold></td>
<td valign="middle" align="center"><bold>0.910</bold></td>
<td valign="middle" align="center">13.6</td>
<td valign="middle" align="center">3.22M</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The bolded values represent the evaluation metric with the best performance after comparing all the models.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s4_1_3">
<label>4.1.3</label>
<title>Other details</title>
<p>We trained on 1 GPU (NVIDIA GeForce RTX 2080), an Intel(R) Core(TM) i7-8700 CPU, and the Windows 10 operating system, and selected the best performance as the experimental results. We selected Stochastic Gradient Descent (SGD) as the network optimizer. The epochs and batch size were set to 300 and 4, respectively. In the training process, the initial learning rate was set to 0.01, and the cosine annealing strategy was used to reduce the learning rate. Momentum was set to 0.937 and weight decay was set to 0.0005. To ensure a fair comparison, we excluded the use of pre-training and self-distillation strategies for all methods used for comparison, and in addition, recognizing the potential impact of input image size on evaluation, we normalized the input resolution for all data images to 640&#xd7;640, a common choice in the field of object detection. For reproducibility, we additionally report the main software environment: Python 3.8, PyTorch 2.0.1, CUDA 11.7, and cuDNN 8.5.0. Unless otherwise specified, we fix the random seed (e.g., 42) for Python, NumPy, and PyTorch, and enable deterministic settings where applicable.</p>
<p>Training objective and loss: To avoid ambiguity, MDFE-Net follows the same training objective as the YOLO11N baseline. Specifically, we keep the default YOLO11N loss formulation (classification, localization, and objectness terms) and their weights unchanged. Moreover, the label assignment strategy is identical to the baseline, and we do not introduce any additional loss terms, customized matching rules, or auxiliary supervision. Therefore, the performance gains mainly come from the proposed MDFA and CFE modules rather than changes in the training objective.</p>
</sec>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Comparisons with state-of-the-art algorithms</title>
<p>To evaluate MDFE-Net, we selected the current advanced single-stage object detection methods for comparative experiments, including YOLOv5N (<xref ref-type="bibr" rid="B6">Jocher, 2020</xref>), YOLOv6N (<xref ref-type="bibr" rid="B11">Li et&#xa0;al., 2023a</xref>), YOLOv8N (<xref ref-type="bibr" rid="B7">Jocher et&#xa0;al., 2023</xref>), YOLOv9T (<xref ref-type="bibr" rid="B26">Wang et&#xa0;al., 2024b</xref>), YOLOv10N (<xref ref-type="bibr" rid="B25">Wang et&#xa0;al., 2024a</xref>), YOLO11N (<xref ref-type="bibr" rid="B8">Khanam and Hussain, 2024</xref>), Hyper-YOLO-N (<xref ref-type="bibr" rid="B3">Feng et&#xa0;al., 2024</xref>), and YOLOv12N (<xref ref-type="bibr" rid="B23">Tian et&#xa0;al., 2025</xref>). At the same time, RT-DETR-L (<xref ref-type="bibr" rid="B39">Zhao et&#xa0;al., 2024</xref>) based on the end-to-end non-CNN framework was selected for comparative experiments.</p>
<p>1. PSD-Node: <xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref> shows that MDFE-Net achieves the best performance on the PSD-Node dataset in terms of F1, AP50, and AP. Compared with the baseline YOLO11N, MDFE-Net improves F1, AP50, and AP by 3.7%, 4.8%, and 6.8%, respectively. Compared with the recent SOTA Hyper-YOLO-N, MDFE-Net still achieves consistent gains of 1.5%, 1.4%, and 1.4% in F1, AP50, and AP, respectively. Specifically, MDFE-Net achieves 84.8% F1, 89.5% AP50, and 47.6% AP. Although MDFE-Net attains slightly lower precision than FBRT-YOLO-N, it remains the second-highest among all compared methods, while achieving the best results on F1, AP50, and AP. Moreover, MDFE-Net also outperforms other strong detectors included in <xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref>, such as RT-DETR-L, YOLOv12N, and FBRT-YOLO-N, demonstrating the effectiveness of our approach on this challenging dataset. We provide visual examples in <xref ref-type="fig" rid="f6"><bold>Figure&#xa0;6</bold></xref> to compare MDFE-Net with the advanced detectors YOLO11N and YOLOv12N. As shown in the zoomed-in region, YOLO11N and YOLOv12N produce an additional false positive (highlighted in red), whereas MDFE-Net suppresses this false alarm, indicating improved robustness and accuracy for small-object detection.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Comparisons of MDFE-Net with state-of-the-art algorithms in PSD-Node.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">P</th>
<th valign="middle" align="center">R</th>
<th valign="middle" align="center">F1</th>
<th valign="middle" align="center">AP50</th>
<th valign="middle" align="center">AP</th>
<th valign="middle" align="center">Parameter</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">YOLOv5N(2020)</td>
<td valign="middle" align="center">0.858</td>
<td valign="middle" align="center">0.788</td>
<td valign="middle" align="center">0.822</td>
<td valign="middle" align="center">0.863</td>
<td valign="middle" align="center">0.429</td>
<td valign="middle" align="center">2.18M</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv6N(2022)</td>
<td valign="middle" align="center">0.865</td>
<td valign="middle" align="center">0.776</td>
<td valign="middle" align="center">0.818</td>
<td valign="middle" align="center">0.844</td>
<td valign="middle" align="center">0.394</td>
<td valign="middle" align="center">4.15M</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv8N(2023)</td>
<td valign="middle" align="center">0.855</td>
<td valign="middle" align="center">0.775</td>
<td valign="middle" align="center">0.813</td>
<td valign="middle" align="center">0.847</td>
<td valign="middle" align="center">0.395</td>
<td valign="middle" align="center">3.00M</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv9T(2024)</td>
<td valign="middle" align="center">0.863</td>
<td valign="middle" align="center">0.764</td>
<td valign="middle" align="center">0.810</td>
<td valign="middle" align="center">0.835</td>
<td valign="middle" align="center">0.382</td>
<td valign="middle" align="center">2.00M</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv10N(2024)</td>
<td valign="middle" align="center">0.853</td>
<td valign="middle" align="center">0.771</td>
<td valign="middle" align="center">0.810</td>
<td valign="middle" align="center">0.856</td>
<td valign="middle" align="center">0.419</td>
<td valign="middle" align="center">2.69M</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv11N(2024)</td>
<td valign="middle" align="center">0.860</td>
<td valign="middle" align="center">0.768</td>
<td valign="middle" align="center">0.811</td>
<td valign="middle" align="center">0.847</td>
<td valign="middle" align="center">0.408</td>
<td valign="middle" align="center">2.58M</td>
</tr>
<tr>
<td valign="middle" align="center">RT-DETR-L(2024)</td>
<td valign="middle" align="center">0.849</td>
<td valign="middle" align="center">0.798</td>
<td valign="middle" align="center">0.823</td>
<td valign="middle" align="center">0.865</td>
<td valign="middle" align="center">0.437</td>
<td valign="middle" align="center">32.81M</td>
</tr>
<tr>
<td valign="middle" align="center">Hyper-YOLO-N(2025)</td>
<td valign="middle" align="center">0.863</td>
<td valign="middle" align="center">0.805</td>
<td valign="middle" align="center">0.833</td>
<td valign="middle" align="center">0.881</td>
<td valign="middle" align="center">0.462</td>
<td valign="middle" align="center">3.94M</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv12N(2025)</td>
<td valign="middle" align="center">0.868</td>
<td valign="middle" align="center">0.791</td>
<td valign="middle" align="center">0.828</td>
<td valign="middle" align="center">0.866</td>
<td valign="middle" align="center">0.434</td>
<td valign="middle" align="center">2.55M</td>
</tr>
<tr>
<td valign="middle" align="center">FBRT-YOLO-N(2025)</td>
<td valign="middle" align="center"><bold>0.890</bold></td>
<td valign="middle" align="center">0.779</td>
<td valign="middle" align="center">0.831</td>
<td valign="middle" align="center">0.875</td>
<td valign="middle" align="center">0.445</td>
<td valign="middle" align="center">0.85M</td>
</tr>
<tr>
<td valign="middle" align="center">MDFE-Net(Ours)</td>
<td valign="middle" align="center">0.884</td>
<td valign="middle" align="center"><bold>0.815</bold></td>
<td valign="middle" align="center"><bold>0.848</bold></td>
<td valign="middle" align="center"><bold>0.895</bold></td>
<td valign="middle" align="center"><bold>0.476</bold></td>
<td valign="middle" align="center">3.22M</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The bolded values represent the evaluation metric with the best performance after comparing all the models.</p>
</table-wrap-foot>
</table-wrap>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Visualization comparison of YOLO11N, YOLOv12N, and MDFE-Net on the PSD-Node dataset. The bottom row shows a zoomed-in region. Red bounding boxes denote false positives. YOLO11N and YOLOv12N produce an additional false alarm in the enlarged area, whereas MDFE-Net suppresses this false detection.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778795-g006.tif">
<alt-text content-type="machine-generated">Three-panel comparison showing street scenes with AI detection models identifying traffic signs. Each panel highlights detections: YOLO11N marks one sign, YOLOv12N marks another, and MDFE-Net marks two signs, indicating improved detection accuracy.</alt-text>
</graphic></fig>
<p>2. VisDrone: <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref> shows that MDFE-Net outperforms the baseline YOLO11N on the VisDrone dataset, improving F1, AP50, and AP by 3.2%, 3.7%, and 2.3%, respectively. In addition, MDFE-Net achieves better scale-aware performance with APs/APm/APl of 0.077/0.268/0.361, showing clear gains across different object scales, especially for small objects. Compared with recent state-of-the-art lightweight detectors included in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>, MDFE-Net achieves the best overall performance under the same evaluation setting. We also report GFLOPs (computed at 640&#xd7;640 input) together with parameter count to characterize computational cost and provide a more complete view of accuracy-efficiency trade-offs. To further illustrate the effectiveness of MDFE-Net on small-object detection, we provide a qualitative comparison in <xref ref-type="fig" rid="f7"><bold>Figure&#xa0;7</bold></xref> against two strong baselines, YOLO11N and YOLOv12N, on the VisDrone dataset. Different categories are indicated by different colors: purple denotes motorcycles, cyan denotes cars, and blue denotes vans. In this challenging scene with dense small objects, YOLO11N and YOLOv12N exhibit more missed detections, particularly for motorcycles, while MDFE-Net detects more valid instances with fewer misses in the zoomed-in region. These observations are consistent with the quantitative results in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>, where MDFE-Net achieves the best overall performance on VisDrone.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Qualitative comparison of YOLO11N, YOLOv12N, and MDFE-Net on VisDrone. Purple boxes indicate motorcycles, cyan boxes indicate cars, and blue boxes indicate vans. In this dense small-object scene, MDFE-Net detects more valid instances with fewer missed detections in the zoomed-in region compared with YOLO11N and YOLOv12N.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778795-g007.tif">
<alt-text content-type="machine-generated">Three side-by-side photographs show a potted plant with leaves and annotated blue boxes along the stem; each panel has a magnified view highlighting annotation accuracy for YOLO11N, YOLOv12N, and MDFE-Net models, respectively.</alt-text>
</graphic></fig>
<p>3. GTSDB: <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref> shows that MDFE-Net achieves consistent improvements on the GTSDB dataset. Compared with the baseline YOLO11N, MDFE-Net improves F1, AP50, and AP by 2.5%, 2.6%, and 3.2%, respectively. Moreover, MDFE-Net obtains APs/APm/APl of 0.593/0.828/0.910, demonstrating strong performance across different object scales. We further report GFLOPs (computed at 640 <inline-formula>
<mml:math display="inline" id="im15"><mml:mo>&#xd7;</mml:mo></mml:math></inline-formula>640 input) and parameter count to provide an efficiency-related reference under the same evaluation setting. Although the recall of MDFE-Net is slightly lower than that of EDSOD, it remains the second-highest among all compared methods, while MDFE-Net achieves the best results on the other major metrics (F1, AP50, AP, and APs/APm/APl), indicating strong overall detection performance. We further validate the effectiveness of our proposed method in small object detection tasks by providing an example of visualization of the GTSDB dataset in <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8</bold></xref>, comparing MDFE-Net with the state-of-the-art object detectors YOLO11N and YOLOv12N. It is not difficult to see that the proposed method can effectively improve the accuracy of traffic small object detection model, and has better small object detection performance and lower miss rate than other SOTA models.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Comparison of visualization results of YOLO11N, YOLOv12N, and MDFE-Net on GTSDB. Compared with the advanced YOLO11N and YOLOv12N, MDFE-Net has better detection performance for small objects in images.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1778795-g008.tif">
<alt-text content-type="machine-generated">Three side-by-side aerial photos of an urban intersection display vehicle and pedestrian detection outputs from YOLO11N, YOLOv12N, and MDFE-Net models. Colored rectangles highlight detected cars and people, with a zoomed-in region below each main image for detailed comparison.</alt-text>
</graphic></fig>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Ablation studies</title>
<p>As for the MDFA module, the original design intention of the module was to extract more underlying feature information in a larger range of receptive fields to improve the detection performance of small objects through the characteristics of multi-scale dilated convolution, so we placed the module in four feature output paths in YOLO11N for ablation study to verify the placement position of the module in the network. According to <xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref>, the experimental results show that the P2 feature layer is the optimal location for the MDFA module. The main reason is that the P2 feature layer, as the bottom feature output of the network, has the highest feature map resolution and contains the richest detail information and texture data, which is crucial for the detection of small objects. Through the design of multi-scale dilated convolution, the MDFA module can extract global context information from a wider range of receptive fields while maintaining the integrity of high-resolution feature maps, and strengthen the underlying features by combining multi-scale feature aggregation mechanisms. This design enables the module to give full play to the advantages of rich detail information of the P2 layer, and provides more accurate feature support for the classification and positioning of small objects. In contrast, when MDFA modules are placed in mid-to-high level feature output paths (such as P3, P4, or P5), although these layers contain stronger semantic information, due to their lower resolution and sparse object details, it is difficult to fully utilize the characteristics of the MDFA module. At the same time, high-level features pay more attention to the semantic expression of large objects, and there is a certain deviation between the features and the requirements of small object detection tasks, so it will cause interference with small object detection. 
Therefore, adding MDFA modules to layer P3, P4, or P5, or adding MDFA modules to all four layers, is not as significant as adding MDFA modules to layer P2.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Ablation study of MDFA module.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">P5</th>
<th valign="middle" align="center">P4</th>
<th valign="middle" align="center">P3</th>
<th valign="middle" align="center">P2</th>
<th valign="middle" align="center">P</th>
<th valign="middle" align="center">R</th>
<th valign="middle" align="center">F1</th>
<th valign="middle" align="center">AP50</th>
<th valign="middle" align="center">AP</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">0.863</td>
<td valign="middle" align="center">0.783</td>
<td valign="middle" align="center">0.821</td>
<td valign="middle" align="center">0.872</td>
<td valign="middle" align="center">0.444</td>
</tr>
<tr>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">0.879</td>
<td valign="middle" align="center">0.783</td>
<td valign="middle" align="center">0.828</td>
<td valign="middle" align="center">0.880</td>
<td valign="middle" align="center">0.458</td>
</tr>
<tr>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">0.859</td>
<td valign="middle" align="center">0.815</td>
<td valign="middle" align="center">0.836</td>
<td valign="middle" align="center">0.883</td>
<td valign="middle" align="center">0.449</td>
</tr>
<tr>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">0.884</td>
<td valign="middle" align="center">0.815</td>
<td valign="middle" align="center">0.848</td>
<td valign="middle" align="center">0.895</td>
<td valign="middle" align="center">0.476</td>
</tr>
<tr>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">0.882</td>
<td valign="middle" align="center">0.766</td>
<td valign="middle" align="center">0.820</td>
<td valign="middle" align="center">0.870</td>
<td valign="middle" align="center">0.453</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To verify the validity of our proposed method, we conducted ablation study on the PSD-Node validation set and analyzed the impact of introducing various modules into the baseline network, as shown in <xref ref-type="table" rid="T5"><bold>Table&#xa0;5</bold></xref>.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Ablation study of each module in PSD-Node. EDH denotes the extra detect head.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">EDH</th>
<th valign="middle" align="center">MDFA</th>
<th valign="middle" align="center">CFE</th>
<th valign="middle" align="center">P</th>
<th valign="middle" align="center">R</th>
<th valign="middle" align="center">F1</th>
<th valign="middle" align="center">AP50</th>
<th valign="middle" align="center">AP</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">0.860</td>
<td valign="middle" align="center">0.768</td>
<td valign="middle" align="center">0.811</td>
<td valign="middle" align="center">0.847</td>
<td valign="middle" align="center">0.408</td>
</tr>
<tr>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">0.855</td>
<td valign="middle" align="center">0.801</td>
<td valign="middle" align="center">0.827</td>
<td valign="middle" align="center">0.869</td>
<td valign="middle" align="center">0.446</td>
</tr>
<tr>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">0.860</td>
<td valign="middle" align="center">0.805</td>
<td valign="middle" align="center">0.832</td>
<td valign="middle" align="center">0.873</td>
<td valign="middle" align="center">0.450</td>
</tr>
<tr>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">0.864</td>
<td valign="middle" align="center">0.808</td>
<td valign="middle" align="center">0.835</td>
<td valign="middle" align="center">0.875</td>
<td valign="middle" align="center">0.452</td>
</tr>
<tr>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">0.870</td>
<td valign="middle" align="center">0.796</td>
<td valign="middle" align="center">0.831</td>
<td valign="middle" align="center">0.874</td>
<td valign="middle" align="center">0.450</td>
</tr>
<tr>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">0.879</td>
<td valign="middle" align="center">0.808</td>
<td valign="middle" align="center">0.842</td>
<td valign="middle" align="center">0.884</td>
<td valign="middle" align="center">0.458</td>
</tr>
<tr>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">0.879</td>
<td valign="middle" align="center">0.798</td>
<td valign="middle" align="center">0.837</td>
<td valign="middle" align="center">0.890</td>
<td valign="middle" align="center">0.471</td>
</tr>
<tr>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">&#x2714;</td>
<td valign="middle" align="center">0.884</td>
<td valign="middle" align="center">0.815</td>
<td valign="middle" align="center">0.848</td>
<td valign="middle" align="center">0.895</td>
<td valign="middle" align="center">0.476</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>First, introducing the MDFA module can significantly improve performance. It effectively utilizes low-level feature information through a multi-scale attention mechanism, improves the localization and detection of small objects, and better captures multi-scale local feature information. Introducing the MDFA module alone resulted in a 1.6% increase in F1, a 2.2% increase in AP50, and a 3.8% increase in AP. Second, the CFE module alone can extract diverse semantic feature information through its multi-branch convolution structure, while dilated convolution can be used to enlarge the receptive field and obtain richer local context information. With the addition of the CFE module, F1, AP50, and AP improved by 2.1%, 2.6%, and 4.2%, respectively, owing to the rich contextual information extracted by the multi-branch convolutional structure. Thus, introducing the MDFA and CFE modules separately each brings a significant improvement. When the two modules are introduced together, F1, AP50, and AP increase by 2.4%, 2.8%, and 4.6%, respectively, indicating that the combination of the MDFA and CFE modules produces better results than either module alone. Finally, low-level feature maps typically contain richer spatial and local details, which are essential for accurate localization. We therefore introduce the feature map P2 with the highest resolution. By reducing the number of downsampling operations and retaining more detailed information, P2 is fused with feature maps of other scales to increase the sensitivity to small objects, and a detection head is used to output localization and classification results. It can be seen that F1, AP50, and AP increase by 2.0%, 2.7%, and 4.2%, respectively, after the P2-layer feature map is introduced and its detection head is added. When we combine the MDFA and CFE modules with the P2-layer detection head in the network, F1, AP50, and AP increase by 3.7%, 4.8%, and 6.8%, respectively, which gives the model the best detection performance and further demonstrates the effectiveness of our proposed MDFA and CFE modules.</p>
</sec>
</sec>
<sec id="s5" sec-type="conclusions">
<label>5</label>
<title>Conclusion</title>
<p>In this paper, we proposed two innovative plug-and-play modules, the MDFA and CFE modules, tailored specifically for small object detection, and integrated them into the YOLO11N framework to construct an effective small-object detector. The proposed multi-scale dilated feature aggregation (MDFA) module improves multi-scale context modeling via dilated convolutions, thereby enriching the feature information of small objects. In addition, the proposed Context Feature Enhancement (CFE) module improves local feature perception and preserves informative cues for small objects to the greatest extent. It applies dilated convolution to increase the receptive field and enrich the context information, so as to better solve the detection problem of small objects. Finally, we demonstrate through experimental results that MDFE-Net achieves state-of-the-art performance over the current SOTA model in terms of small objects. In future work, we plan to further improve robustness under challenging conditions such as heavy occlusion and background clutter in dense small-object scenes. We also plan to strengthen fine-grained category discrimination for confusing classes. In addition, we will explore deployment-oriented optimization and validation on more domains to enhance generalization.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material. Further inquiries can be directed to the corresponding authors.</p></sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>TL: Investigation, Writing &#x2013; review &amp; editing, Funding acquisition, Data curation, Resources, Conceptualization, Writing &#x2013; original draft, Methodology, Formal analysis. SL: Writing &#x2013; review &amp; editing, Investigation, Software, Formal analysis, Data curation, Project administration, Methodology. JZha: Investigation, Writing &#x2013; review &amp; editing, Methodology, Data curation, Validation, Supervision. BL: Investigation, Formal analysis, Writing &#x2013; review &amp; editing, Project administration, Validation, Supervision, Methodology, Data curation, Conceptualization. JZhu: Investigation, Writing &#x2013; review &amp; editing, Conceptualization, Validation, Methodology, Data curation.</p></sec>
<ack>
<title>Acknowledgments</title>
<p>The authors would like to thank the reviewers for their valuable comments and suggestions.</p>
</ack>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s10" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s11" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Cao</surname> <given-names>Y.</given-names></name>
<name><surname>He</surname> <given-names>Z.</given-names></name>
<name><surname>Wang</surname> <given-names>L.</given-names></name>
<name><surname>Wang</surname> <given-names>W.</given-names></name>
<name><surname>Yuan</surname> <given-names>Y.</given-names></name>
<name><surname>Zhang</surname> <given-names>D.</given-names></name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;
<article-title>Visdrone-det2021: The vision meets drone object detection challenge results</article-title>,&#x201d; in <conf-name>2021 IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name>. (<publisher-loc>Piscataway, NJ, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>) <fpage>2847</fpage>&#x2013;<lpage>2854</lpage>.
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Cheng</surname> <given-names>G.</given-names></name>
<name><surname>Lang</surname> <given-names>C.</given-names></name>
<name><surname>Wu</surname> <given-names>M.</given-names></name>
<name><surname>Xie</surname> <given-names>X.</given-names></name>
<name><surname>Yao</surname> <given-names>X.</given-names></name>
<name><surname>Han</surname> <given-names>J.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Feature enhancement network for object detection in optical remote sensing images</article-title>. <source>J. Remote Sens</source> <volume>2021</volume>, <fpage>1</fpage>&#x2013;<lpage>12</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.34133/2021/9805389</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Feng</surname> <given-names>Y.</given-names></name>
<name><surname>Huang</surname> <given-names>J.</given-names></name>
<name><surname>Du</surname> <given-names>S.</given-names></name>
<name><surname>Ying</surname> <given-names>S.</given-names></name>
<name><surname>Yong</surname> <given-names>J.-H.</given-names></name>
<name><surname>Li</surname> <given-names>Y.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). 
<article-title>Hyper-yolo: When visual object detection meets hypergraph computation</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2024.3451368</pub-id>, PMID: <pub-id pub-id-type="pmid">40030788</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Girshick</surname> <given-names>R.</given-names></name>
<name><surname>Donahue</surname> <given-names>J.</given-names></name>
<name><surname>Darrell</surname> <given-names>T.</given-names></name>
<name><surname>Malik</surname> <given-names>J.</given-names></name>
</person-group> (<year>2014</year>). &#x201c;
<article-title>Rich feature hierarchies for accurate object detection and semantic segmentation</article-title>,&#x201d; in <conf-name>2014 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. (<publisher-loc>Piscataway, NJ, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>580</fpage>&#x2013;<lpage>587</lpage>.
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Jiao</surname> <given-names>J.</given-names></name>
<name><surname>Tang</surname> <given-names>Y.-M.</given-names></name>
<name><surname>Lin</surname> <given-names>K.-Y.</given-names></name>
<name><surname>Gao</surname> <given-names>Y.</given-names></name>
<name><surname>Ma</surname> <given-names>A. J.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<etal/>
</person-group>. (<year>2023</year>). 
<article-title>Dilateformer: Multi-scale dilated transformer for visual recognition</article-title>. <source>IEEE Trans. Multimedia</source> <volume>25</volume>, <fpage>8906</fpage>&#x2013;<lpage>8919</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TMM.2023.3243616</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="web">
<person-group person-group-type="author">
<name><surname>Jocher</surname> <given-names>G.</given-names></name>
</person-group> (<year>2020</year>). <source>Ultralytics YOLOv5</source>. Available online at: <uri xlink:href="https://github.com/ultralytics/yolov5">https://github.com/ultralytics/yolov5</uri> (Accessed <date-in-citation content-type="access-date">February 16, 2026</date-in-citation>).
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="web">
<person-group person-group-type="author">
<name><surname>Jocher</surname> <given-names>G.</given-names></name>
<name><surname>Chaurasia</surname> <given-names>A.</given-names></name>
<name><surname>Qiu</surname> <given-names>J.</given-names></name>
</person-group> (<year>2023</year>). <source>Ultralytics YOLOv8. GitHub repository</source>.
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Khanam</surname> <given-names>R.</given-names></name>
<name><surname>Hussain</surname> <given-names>M.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Yolov11: An overview of the key architectural enhancements</article-title>. <source>arXiv</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2410.11368</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Kisantal</surname> <given-names>M.</given-names></name>
<name><surname>Wojna</surname> <given-names>Z.</given-names></name>
<name><surname>Murawski</surname> <given-names>J.</given-names></name>
<name><surname>Naruniec</surname> <given-names>J.</given-names></name>
<name><surname>Cho</surname> <given-names>K.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>Augmentation for small object detection</article-title>. <source>arXiv</source> <volume>9</volume>, <fpage>29</fpage>&#x2013;<lpage>40</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.5121/csit.2019</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>Y.</given-names></name>
<name><surname>Cheng</surname> <given-names>Z.</given-names></name>
<name><surname>Wang</surname> <given-names>C.</given-names></name>
<name><surname>Zhao</surname> <given-names>J.</given-names></name>
<name><surname>Huang</surname> <given-names>L.</given-names></name>
</person-group> (<year>2023</year>b). 
<article-title>Rcct-asppnet: dual-encoder remote image segmentation based on transformer and aspp</article-title>. <source>Remote Sens.</source> <volume>15</volume>, <fpage>379</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs15020379</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="web">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>C.</given-names></name>
<name><surname>Li</surname> <given-names>L.</given-names></name>
<name><surname>Geng</surname> <given-names>Y.</given-names></name>
<name><surname>Jiang</surname> <given-names>H.</given-names></name>
<name><surname>Cheng</surname> <given-names>M.</given-names></name>
<name><surname>Zhang</surname> <given-names>B.</given-names></name>
<etal/>
</person-group>. (<year>2023</year>a). <source>YOLOv6 v3.0: A full-scale reloading</source>.
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>J.</given-names></name>
<name><surname>Zhou</surname> <given-names>M.</given-names></name>
<name><surname>Cao</surname> <given-names>Z.</given-names></name>
</person-group> (<year>2025</year>). &#x201c;
<article-title>Edsod: An encoder-decoder, diffusion-model, and swin-transformer-based small object detector</article-title>,&#x201d; in <conf-name>2025 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name>. (<publisher-loc>Piscataway, NJ, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>1659</fpage>&#x2013;<lpage>1665</lpage>.
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Lin</surname> <given-names>T.-Y.</given-names></name>
<name><surname>Doll&#xe1;r</surname> <given-names>P.</given-names></name>
<name><surname>Girshick</surname> <given-names>R.</given-names></name>
<name><surname>He</surname> <given-names>K.</given-names></name>
<name><surname>Hariharan</surname> <given-names>B.</given-names></name>
<name><surname>Belongie</surname> <given-names>S.</given-names></name>
</person-group> (<year>2017</year>). &#x201c;
<article-title>Feature pyramid networks for object detection</article-title>,&#x201d; in <conf-name>2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. (<publisher-loc>Piscataway, NJ, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>2117</fpage>&#x2013;<lpage>2125</lpage>.
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Lin</surname> <given-names>T.-Y.</given-names></name>
<name><surname>Maire</surname> <given-names>M.</given-names></name>
<name><surname>Belongie</surname> <given-names>S.</given-names></name>
<name><surname>Hays</surname> <given-names>J.</given-names></name>
<name><surname>Perona</surname> <given-names>P.</given-names></name>
<name><surname>Ramanan</surname> <given-names>D.</given-names></name>
<etal/>
</person-group>. (<year>2014</year>). &#x201c;
<article-title>Microsoft coco: Common objects in context</article-title>,&#x201d; in <conf-name>Computer Vision  - ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6 -12, 2014, Proceedings, Part V</conf-name>, eds. 
<person-group person-group-type="editor">
<name><surname>Fleet</surname> <given-names>D.</given-names></name>
<name><surname>Pajdla</surname> <given-names>T.</given-names></name>
<name><surname>Schiele</surname> <given-names>B.</given-names></name>
<name><surname>Tuytelaars</surname> <given-names>T.</given-names></name>
</person-group>. (<publisher-loc>Heidelberg, Germany</publisher-loc>: 
<publisher-name>Springer</publisher-name>), <fpage>740</fpage>&#x2013;<lpage>755</lpage>.
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>S.</given-names></name>
<name><surname>Huang</surname> <given-names>D.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
</person-group>. (<year>2018</year>a). &#x201c;
<article-title>Receptive field block net for accurate and fast object detection</article-title>,&#x201d; in <conf-name>Computer Vision  - ECCV 2018: 15th European Conference, Munich, Germany, September 8 -14, 2018, Proceedings, Part X</conf-name>. eds. 
<person-group person-group-type="editor">
<name><surname>Ferrari</surname> <given-names>V.</given-names></name>
<name><surname>Hebert</surname> <given-names>M.</given-names></name>
<name><surname>Sminchisescu</surname> <given-names>C.</given-names></name>
<name><surname>Weiss</surname> <given-names>Y.</given-names></name>
</person-group> (<publisher-loc>Cham, Switzerland</publisher-loc>: 
<publisher-name>Springer</publisher-name>), <fpage>385</fpage>&#x2013;<lpage>400</lpage>.
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>S.</given-names></name>
<name><surname>Huang</surname> <given-names>D.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>Learning spatial fusion for single-shot object detection</article-title>. <source>arXiv</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1911.09516</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>S.</given-names></name>
<name><surname>Qi</surname> <given-names>L.</given-names></name>
<name><surname>Qin</surname> <given-names>H.</given-names></name>
<name><surname>Shi</surname> <given-names>J.</given-names></name>
<name><surname>Jia</surname> <given-names>J.</given-names></name>
</person-group> (<year>2018</year>b). &#x201c;
<article-title>Path aggregation network for instance segmentation</article-title>,&#x201d; in <conf-name>2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. (<publisher-loc>Piscataway, NJ, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>8759</fpage>&#x2013;<lpage>8768</lpage>.
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>R.</given-names></name>
<name><surname>Tao</surname> <given-names>F.</given-names></name>
<name><surname>Liu</surname> <given-names>X.</given-names></name>
<name><surname>Na</surname> <given-names>J.</given-names></name>
<name><surname>Leng</surname> <given-names>H.</given-names></name>
<name><surname>Wu</surname> <given-names>J.</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). 
<article-title>Raanet: A residual aspp with attention framework for semantic segmentation of high-resolution remote sensing images</article-title>. <source>Remote Sens.</source> <volume>14</volume>, <fpage>3109</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs14133109</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Mahasin</surname> <given-names>M.</given-names></name>
<name><surname>Dewi</surname> <given-names>I. A.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Comparison of cspdarknet53, cspresnext-50, and efficientnet-b0 backbones on yolo v4 as object detector</article-title>. <source>Int. J. engineering Sci. Inf. Technol.</source> <volume>2</volume>, <fpage>64</fpage>&#x2013;<lpage>72</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.31227/ijesit.v2i2.1128</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ruan</surname> <given-names>H.</given-names></name>
<name><surname>Qian</surname> <given-names>W.</given-names></name>
<name><surname>Zheng</surname> <given-names>Z.</given-names></name>
<name><surname>Peng</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>A decoupled semantic&#x2013;detail learning network for remote sensing object detection in complex backgrounds</article-title>. <source>Electronics</source> <volume>12</volume>, <fpage>3201</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/electronics12143201</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Shimoni</surname> <given-names>M.</given-names></name>
<name><surname>Haelterman</surname> <given-names>R.</given-names></name>
<name><surname>Perneel</surname> <given-names>C.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>Hypersectral imaging for military and security applications: Combining myriad processing and sensing techniques</article-title>. <source>IEEE Geosci. Remote Sens. Magazine</source> <volume>7</volume>, <fpage>101</fpage>&#x2013;<lpage>117</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/MGRS.2019.2902525</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Tan</surname> <given-names>M.</given-names></name>
<name><surname>Pang</surname> <given-names>R.</given-names></name>
<name><surname>Le</surname> <given-names>Q. V.</given-names></name>
</person-group> (<year>2020</year>). &#x201c;
<article-title>Efficientdet: Scalable and efficient object detection</article-title>,&#x201d; in <conf-name>2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. (<publisher-loc>Piscataway, NJ, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>10781</fpage>&#x2013;<lpage>10790</lpage>.
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Tian</surname> <given-names>Y.</given-names></name>
<name><surname>Ye</surname> <given-names>Q.</given-names></name>
<name><surname>Doermann</surname> <given-names>D.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>Yolov12: Attention-centric real-time object detectors</article-title>. <source>arXiv</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2501.09841</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Tong</surname> <given-names>K.</given-names></name>
<name><surname>Wu</surname> <given-names>Y.</given-names></name>
<name><surname>Zhou</surname> <given-names>F.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Recent advances in small object detection based on deep learning: A review</article-title>. <source>Image Vision Computing</source> <volume>97</volume>, <fpage>103910</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.imavis.2020.103910</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>A.</given-names></name>
<name><surname>Chen</surname> <given-names>H.</given-names></name>
<name><surname>Liu</surname> <given-names>L.</given-names></name>
<name><surname>Chen</surname> <given-names>K.</given-names></name>
<name><surname>Lin</surname> <given-names>Z.</given-names></name>
<name><surname>Han</surname> <given-names>J.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>a). 
<article-title>Yolov10: Real-time end-to-end object detection</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>37</volume>, <fpage>107984</fpage>&#x2013;<lpage>108011</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.52202/079017</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>C.-Y.</given-names></name>
<name><surname>Yeh</surname> <given-names>I.-H.</given-names></name>
<name><surname>Mark Liao</surname> <given-names>H.-Y.</given-names></name>
</person-group> (<year>2024</year>b). &#x201c;
<article-title>Yolov9: Learning what you want to learn using programmable gradient information</article-title>,&#x201d; in <conf-name>Computer Vision  - ECCV 2024: 18th European Conference, Milan, Italy, September 29  - October 4, 2024, Proceedings, Part XXVIII</conf-name>, eds. 
<person-group person-group-type="editor">
<name><surname>Leal-Taix&#xe9;</surname> <given-names>L.</given-names></name>
<name><surname>Brox</surname> <given-names>T.</given-names></name>
<name><surname>Ferrari</surname> <given-names>V.</given-names></name>
<name><surname>Pollefeys</surname> <given-names>M.</given-names></name>
</person-group> (<publisher-loc>Cham, Switzerland</publisher-loc>: 
<publisher-name>Springer</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>21</lpage> (Accessed <date-in-citation content-type="access-date">October 4, 2024</date-in-citation>).
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Woo</surname> <given-names>S.</given-names></name>
<name><surname>Park</surname> <given-names>J.</given-names></name>
<name><surname>Lee</surname> <given-names>J.-Y.</given-names></name>
<name><surname>Kweon</surname> <given-names>I. S.</given-names></name>
</person-group> (<year>2018</year>). &#x201c;
<article-title>Cbam: Convolutional block attention module</article-title>,&#x201d; in <conf-name>Computer Vision  - ECCV 2018: 15th European Conference, Munich, Germany, September 8 -14, 2018, Proceedings, Part VII</conf-name>, eds. 
<person-group person-group-type="editor">
<name><surname>Ferrari</surname> <given-names>V.</given-names></name>
<name><surname>Hebert</surname> <given-names>M.</given-names></name>
<name><surname>Sminchisescu</surname> <given-names>C.</given-names></name>
<name><surname>Weiss</surname> <given-names>Y.</given-names></name>
</person-group>. (<publisher-loc>Cham, Switzerland</publisher-loc>: 
<publisher-name>Springer</publisher-name>), <fpage>3</fpage>&#x2013;<lpage>19</lpage>.
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Xiao</surname> <given-names>Y.</given-names></name>
<name><surname>Xu</surname> <given-names>T.</given-names></name>
<name><surname>Xin</surname> <given-names>Y.</given-names></name>
<name><surname>Li</surname> <given-names>J.</given-names></name>
</person-group> (<year>2025</year>). &#x201c;
<article-title>Fbrt-yolo: Faster and better for real-time aerial image detection</article-title>,&#x201d; in <conf-name>Proceedings of the 39th AAAI Conference on Artificial Intelligence (AAAI 2025)</conf-name>, Vol. <volume>39</volume> (<publisher-loc>Palo Alto, CA, USA</publisher-loc>: 
<publisher-name>AAAI Press</publisher-name>), <fpage>8673</fpage>&#x2013;<lpage>8681</lpage>.
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Xu</surname> <given-names>X.</given-names></name>
<name><surname>Jiang</surname> <given-names>Y.</given-names></name>
<name><surname>Chen</surname> <given-names>W.</given-names></name>
<name><surname>Huang</surname> <given-names>Y.</given-names></name>
<name><surname>Zhang</surname> <given-names>Y.</given-names></name>
<name><surname>Sun</surname> <given-names>X.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Damo-yolo: A report on real-time object detection design</article-title>. <source>arXiv</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2211.15444</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>S.</given-names></name>
<name><surname>Luo</surname> <given-names>P.</given-names></name>
<name><surname>Loy</surname> <given-names>C.-C.</given-names></name>
<name><surname>Tang</surname> <given-names>X.</given-names></name>
</person-group> (<year>2016</year>). &#x201c;
<article-title>Wider face: A face detection benchmark</article-title>,&#x201d; in <conf-name>2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. (<publisher-loc>Piscataway, NJ, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>5525</fpage>&#x2013;<lpage>5533</lpage>.
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>C.</given-names></name>
<name><surname>Ma</surname> <given-names>J.</given-names></name>
<name><surname>Qi</surname> <given-names>S.</given-names></name>
<name><surname>Tian</surname> <given-names>J.</given-names></name>
<name><surname>Zheng</surname> <given-names>S.</given-names></name>
<name><surname>Tian</surname> <given-names>X.</given-names></name>
</person-group> (<year>2015</year>). 
<article-title>Directional support value of gaussian transformation for infrared small target detection</article-title>. <source>Appl. Optics</source> <volume>54</volume>, <fpage>2255</fpage>&#x2013;<lpage>2265</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1364/AO.54.002255</pub-id>, PMID: <pub-id pub-id-type="pmid">25968508</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>X.</given-names></name>
<name><surname>Yang</surname> <given-names>J.</given-names></name>
<name><surname>Yan</surname> <given-names>J.</given-names></name>
<name><surname>Zhang</surname> <given-names>Y.</given-names></name>
<name><surname>Zhang</surname> <given-names>T.</given-names></name>
<name><surname>Guo</surname> <given-names>Z.</given-names></name>
<etal/>
</person-group>. (<year>2019</year>). 
<article-title>Scrdet: Towards more robust detection for small, cluttered and rotated objects</article-title>. <source>Proc. IEEE/CVF Int. Conf. Comput. vision.</source> <volume>1</volume>, <fpage>8232</fpage>&#x2013;<lpage>8241</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2019.00832</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Yu</surname> <given-names>X.</given-names></name>
<name><surname>Gong</surname> <given-names>Y.</given-names></name>
<name><surname>Jiang</surname> <given-names>N.</given-names></name>
<name><surname>Ye</surname> <given-names>Q.</given-names></name>
<name><surname>Han</surname> <given-names>Z.</given-names></name>
</person-group> (<year>2020</year>). &#x201c;
<article-title>Scale match for tiny person detection</article-title>,&#x201d; in <conf-name>2020 IEEE Winter Conference on Applications of Computer Vision (WACV)</conf-name>. (<publisher-loc>Piscataway, NJ, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>1257</fpage>&#x2013;<lpage>1265</lpage>.
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yuan</surname> <given-names>Y.</given-names></name>
<name><surname>Zhang</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Olcn: An optimized low coupling network for small objects detection</article-title>. <source>IEEE Geosci. Remote Sens. Lett.</source> <volume>19</volume>, <fpage>1</fpage>&#x2013;<lpage>5</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/LGRS.2021.3119457</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>H.</given-names></name>
<name><surname>Qin</surname> <given-names>L.</given-names></name>
<name><surname>Li</surname> <given-names>J.</given-names></name>
<name><surname>Guo</surname> <given-names>Y.</given-names></name>
<name><surname>Zhou</surname> <given-names>Y.</given-names></name>
<name><surname>Zhang</surname> <given-names>J.</given-names></name>
<etal/>
</person-group>. (<year>2020</year>). 
<article-title>Real-time detection method for small traffic signs based on yolov3</article-title>. <source>IEEE Access</source> <volume>8</volume>, <fpage>64145</fpage>&#x2013;<lpage>64156</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2020.2984554</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>R.</given-names></name>
<name><surname>Shao</surname> <given-names>Z.</given-names></name>
<name><surname>Huang</surname> <given-names>X.</given-names></name>
<name><surname>Wang</surname> <given-names>J.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Li</surname> <given-names>D.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Adaptive dense pyramid network for object detection in uav imagery</article-title>. <source>Neurocomputing</source> <volume>489</volume>, <fpage>377</fpage>&#x2013;<lpage>389</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.neucom.2022.03.033</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>K.</given-names></name>
<name><surname>Shen</surname> <given-names>H.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Multi-stage feature enhancement pyramid network for detecting objects in optical remote sensing images</article-title>. <source>Remote Sens.</source> <volume>14</volume>, <fpage>579</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs14030579</pub-id>
</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>Y.</given-names></name>
<name><surname>Ye</surname> <given-names>M.</given-names></name>
<name><surname>Zhu</surname> <given-names>G.</given-names></name>
<name><surname>Liu</surname> <given-names>Y.</given-names></name>
<name><surname>Guo</surname> <given-names>P.</given-names></name>
<name><surname>Yan</surname> <given-names>J.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Ffca-yolo for small object detection in remote sensing images</article-title>. <source>IEEE Trans. Geosci. Remote Sens.</source> <volume>62</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TGRS.2024.3363057</pub-id>
</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Zhao</surname> <given-names>Y.</given-names></name>
<name><surname>Lv</surname> <given-names>W.</given-names></name>
<name><surname>Xu</surname> <given-names>S.</given-names></name>
<name><surname>Wei</surname> <given-names>J.</given-names></name>
<name><surname>Wang</surname> <given-names>G.</given-names></name>
<name><surname>Dang</surname> <given-names>Q.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). &#x201c;
<article-title>Detrs beat yolos on real-time object detection</article-title>,&#x201d; in <conf-name>2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. (<publisher-loc>Piscataway, NJ, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>16965</fpage>&#x2013;<lpage>16974</lpage>.
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1886836">Zhenghong Yu</ext-link>, Guangdong Polytechnic of Science and Technology, China</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2248489">Shengyong Xu</ext-link>, Huazhong Agricultural University, China</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3342396">Mengchu Zhou</ext-link>, New Jersey Institute of Technology, Instructive Biomaterials and Additive Manufacturing Laboratory, United States</p></fn>
</fn-group>
</back>
</article>