<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="methods-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Comput. Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Computer Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Comput. Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2624-9898</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fcomp.2026.1763780</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Methods</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>PrecisionMicro-DETR: enhancing small pulmonary nodule detection in CT scans with multi-scale feature fusion and lightweight design</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Chen</surname>
<given-names>Jianle</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2648026"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhu</surname>
<given-names>Jianyu</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2408613"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Lin</surname>
<given-names>YuYan</given-names>
</name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3370813"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Deng</surname>
<given-names>Fuqin</given-names>
</name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Fu</surname>
<given-names>Lanhui</given-names>
</name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2541172"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Liao</surname>
<given-names>Huilian</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2882979"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Shunde Hospital of Guangzhou University of Chinese Medicine</institution>, <city>Foshan</city>, <country country="CN">China</country></aff>
<aff id="aff2"><label>2</label><institution>School of Business, Macau University of Science and Technology</institution>, <city>Taipa, Macao SAR</city>, <country country="CN">China</country></aff>
<aff id="aff3"><label>3</label><institution>School of Electronic and Information Engineering, The Wuyi University</institution>, <city>Jiangmen</city>, <country country="CN">China</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Huilian Liao, <email xlink:href="mailto:liaohuilian@gzucm.edu.cn">liaohuilian@gzucm.edu.cn</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-11">
<day>11</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>8</volume>
<elocation-id>1763780</elocation-id>
<history>
<date date-type="received">
<day>09</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>26</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>27</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Chen, Zhu, Lin, Deng, Fu and Liao.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Chen, Zhu, Lin, Deng, Fu and Liao</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-11">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>To address the common issue of insufficient accuracy in existing detection models when dealing with morphologically complex and minute pulmonary nodules, this study proposes an enhanced detection model called PrecisionMicro-DETR based on the RT-DETR architecture. The model introduces a feature enhancement fusion module tailored for small targets in the detection head to strengthen the feature extraction capability for subtle structures (Strengthen the integration of small target features, SSTF). It also incorporates a Modulation Fusion Module (MFM) to effectively improve discriminative performance in areas with blurred boundaries between lesions and normal tissues. Additionally, a lightweight neck network based on SNI-GSConvE is introduced to optimize computational load while maintaining high accuracy. Experimental evaluation shows that PrecisionMicro-DETR achieves a mean average precision (mAP) of 94.9% on the publicly available Tianchi dataset. Its robustness and generalization ability in real diagnostic environments are further validated through clinical CT images from hospital PACS systems. This study provides a high-precision and efficient solution for CT pulmonary nodule detection, contributing positively to advancing the clinical application of intelligent assisted diagnostic systems.</p>
</abstract>
<kwd-group>
<kwd>CT images</kwd>
<kwd>multi-scale features</kwd>
<kwd>object detection</kwd>
<kwd>pulmonary nodule detection</kwd>
<kwd>RT-DETR</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="11"/>
<table-count count="4"/>
<equation-count count="16"/>
<ref-count count="28"/>
<page-count count="16"/>
<word-count count="9645"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Computer Vision</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<p>Lung cancer is the leading cause of cancer-related deaths worldwide, with the highest incidence and mortality rates among all cancer types (<xref ref-type="bibr" rid="ref5">Bray et al., 2018</xref>). According to data from the World Health Organization, approximately 1.8 million people die from lung cancer annually, accounting for a significant 18% of global cancer-related deaths each year (<xref ref-type="bibr" rid="ref25">Zhang et al., 2024</xref>). Studies have confirmed that early screening can increase the five-year survival rate of lung cancer patients by approximately 60% (<xref ref-type="bibr" rid="ref8">Henschke, 2001</xref>). Pulmonary nodules are a common early manifestation of lung cancer. Medically, they are defined as round or irregularly shaped lung lesions with a diameter not exceeding 3 centimeters (<xref ref-type="bibr" rid="ref11">Karki, 2017</xref>). Clinical studies have demonstrated that the emergence of symptoms such as cough, chest pain, and hemoptysis often indicates malignant progression or cancerous transformation of pulmonary nodules (<xref ref-type="bibr" rid="ref18">Mazzone and Lam, 2022</xref>). Therefore, achieving early detection and precise management at the pulmonary nodule stage is regarded as a primary strategy for halting the progression of lung cancer and reducing its incidence rate (<xref ref-type="bibr" rid="ref10">Jin et al., 2023</xref>). Clinical data confirm that the earlier standardized treatment is initiated, the more beneficial it is for extending patients&#x2019; overall survival and improving their quality of life (<xref ref-type="bibr" rid="ref15">Li et al., 2022</xref>). At present, the clinical detection of pulmonary nodules primarily relies on radiologists&#x2019; visual assessment of lung CT scan images. However, this method exhibits significant limitations. A single lung CT scan can generate 100&#x2013;200 images with varying slice thicknesses. 
With the rapid increase in screening demands, physicians are required to process massive volumes of imaging data within limited time frames. Sustained high-intensity workloads not only exacerbate physical and mental stress among medical professionals but also elevate the risk of nodule misdiagnosis or oversight due to visual fatigue and cognitive overload. Furthermore, given the complexity and multi-layered nature of CT imaging, complete reliance on manual slice-by-slice interpretation is inefficient, prolongs diagnostic cycles, and may also delay optimal treatment opportunities for patients. Consequently, developing a novel approach capable of assisting physicians in achieving accurate and efficient pulmonary nodule detection holds substantial clinical value and promising application prospects. This initiative also aligns with the national policy advocated by the National Health Commission to promote the integration of &#x201C;artificial intelligence with healthcare.&#x201D;</p>
<p>The advancement of deep learning technology has provided robust technical support for medical image analysis (<xref ref-type="bibr" rid="ref7">Haq, 2022</xref>). Deep learning technology is fundamentally built upon a multi-layered neural network architecture. The core strength of this architecture lies in its ability to automatically extract high-level features and abstract representations from large-scale datasets, thereby eliminating the dependence on manual feature engineering required by conventional methodologies (<xref ref-type="bibr" rid="ref19">Milletari et al., 2016</xref>). Convolutional Neural Networks (CNNs) have demonstrated exceptional performance in the field of image recognition and have been widely adopted as fundamental network architectures, thereby laying a solid foundation for subsequent research. <xref ref-type="bibr" rid="ref17">Marques et al. (2021)</xref> proposed a convolutional neural network-based method for classifying the malignancy degree of pulmonary nodules. <xref ref-type="bibr" rid="ref28">Zuo et al. (2019)</xref> proposed a multi-resolution convolutional neural network (CNN) for classifying candidate pulmonary nodules. The task of pulmonary nodule detection faces challenges due to the small target volume, morphological variability, and blurred boundaries. Traditional convolutional neural networks (CNNs) exhibit certain performance limitations in handling such multi-scale features and small object detection. In contrast, the Transformer architecture, leveraging its powerful capability to model global dependencies, has demonstrated significant advantages in the field of computer vision and has gradually achieved prominent results in object detection tasks, thereby offering a new technological pathway for pulmonary nodule detection (<xref ref-type="bibr" rid="ref1">Carion et al., 2020</xref>). Transformer architecture has demonstrated outstanding performance across various visual tasks. 
However, in the specific application scenario of pulmonary nodule detection, its practical effectiveness remains constrained by several inherent challenges. Due to the typically small size, diverse morphology, and blurred boundaries of pulmonary nodules, existing Transformer models still exhibit deficiencies in effectively capturing multi-scale contextual information and processing small targets. Furthermore, inherent limitations in computational complexity and inference speed make it difficult for such models to meet the stringent real-time requirements of clinical practice while maintaining high detection accuracy.</p>
<p>A multi-module collaboratively optimized architecture is proposed to address key challenges in detecting minute pulmonary nodules. To improve the accuracy of existing detection models in handling pulmonary nodules with complex morphology, small size, and ambiguous boundaries, this study proposed the PrecisionMicro-DETR model based on real-time detection Transformer model RT-DETR (<xref ref-type="bibr" rid="ref26">Zhao et al., 2024</xref>). Its core contribution lies in the integration of three tailored modules, each designed to tackle a specific difficulty: (1) The Small-target-oriented Strengthened Feature Fusion module (SSTF) (<xref ref-type="bibr" rid="ref21">Sunkara and Luo, 2022</xref>) helps alleviate the loss of fine-grained features in deep networks through lossless downsampling and high-resolution feature fusion; (2) The Modulation Fusion Module (MFM) (<xref ref-type="bibr" rid="ref4">Deng et al., 2025</xref>) employs a dynamic weight allocation mechanism to adaptively fuse multi-scale features, thereby improving localization in regions with blurred boundaries between lesions and normal tissues; (3) The lightweight SNI-GSConvE Neck (<xref ref-type="bibr" rid="ref14">Li et al., 2024</xref>; <xref ref-type="bibr" rid="ref13">Li, 2024</xref>) reduces computational cost while maintaining feature alignment quality via soft nearest-neighbor interpolation and efficient convolutional units, addressing the practical need for real-time performance and lower resource overhead in clinical settings. Together, these modules form an end-to-end collaborative optimization framework, offering a balanced and effective approach for achieving both high detection accuracy and computational efficiency in medical imaging applications.</p>
</sec>
<sec id="sec2">
<label>2</label>
<title>Related work</title>
<sec id="sec3">
<label>2.1</label>
<title>Detection methods based on convolutional neural networks</title>
<p>With the rapid development of deep learning technology, convolutional neural networks (CNNs) have achieved remarkable success in the field of medical image processing, promoting the application of various CNN-based object detection models in pulmonary nodule detection tasks. Among them, the YOLO series models have attracted widespread attention in pulmonary nodule detection research due to their ability to balance high accuracy and real-time performance. For example, the YOLOv5-CASP model proposed by <xref ref-type="bibr" rid="ref9">Ji et al. (2023)</xref> enhances feature extraction and multi-scale fusion capabilities for small pulmonary nodules by introducing the CBAM attention mechanism, improving the ASPP module, and replacing standard convolutions with CoT modules, thereby significantly improving detection accuracy. However, the added modules also substantially increase computational complexity without fully considering model efficiency. The plug-and-play pulmonary nodule detection solution proposed by <xref ref-type="bibr" rid="ref23">Tang et al. (2025)</xref> enhances the model&#x2019;s perception of nodules of different sizes by constructing a multi-scale dual-branch attention mechanism, and designs a cross-layer aggregation module to mitigate detail loss during feature transmission, significantly improving the localization capability of small nodules while maintaining detection accuracy. The YOLO-MSRF model proposed by <xref ref-type="bibr" rid="ref24">Wu et al. (2024)</xref> effectively improves the detection accuracy of small pulmonary nodules by introducing three key enhancements: a small-target detection layer, a multi-scale receptive field module, and efficient omnidirectional convolution.</p>
<p>Beyond the aforementioned end-to-end detection models, another widely adopted technical paradigm involves using pre-trained deep convolutional neural networks as feature extractors, combined with traditional machine learning classifiers for final diagnosis. For instance, <xref ref-type="bibr" rid="ref12">Lanjewar et al. (2023)</xref> proposed a modified DenseNet201 model, which incorporates pooling and Dropout layers for a lightweight design to extract high-level features from CT images. These features are then refined using feature selection methods such as ETC and MRMR before being fed into various machine learning classifiers, reportedly achieving high classification accuracy on a specific dataset. Such hybrid approaches leverage the strengths of deep learning in feature representation and the high efficiency of classical machine learning algorithms. However, the feature extraction and classification stages are disconnected, preventing true end-to-end optimization. More importantly, their performance heavily depends on the effectiveness of feature selection, and they still suffer from issues such as feature loss and inadequate representation when dealing with morphologically complex and minute pulmonary nodules. Furthermore, most of these studies are validated on single, small-scale datasets, leaving their generalization capability and clinical interpretability insufficiently substantiated.</p>
<p>Although current research has made significant progress in improving the accuracy of pulmonary nodule detection, there remains a notable shortcoming in the synergistic optimization of accuracy and computational efficiency, making it difficult to meet the stringent computational resource requirements of clinical practice. Particularly given the characteristics of pulmonary nodule detection tasks&#x2014;where small targets and multi-scale distribution coexist&#x2014;existing methods still lack a comprehensive solution capable of simultaneously addressing both detection accuracy and operational efficiency.</p>
<p>Therefore, a major focus and ongoing challenge in current research is how to develop a detection model capable of achieving end-to-end collaborative optimization while possessing high detection accuracy, strong generalization capability, good clinical interpretability, and high computational efficiency. Bridging this gap is of significant importance for advancing the clinical adoption of intelligent pulmonary nodule detection technologies.</p>
</sec>
<sec id="sec4">
<label>2.2</label>
<title>Transformer-based detection methods</title>
<p>The Transformer architecture leverages self-attention mechanisms to capture global dependencies in parallel, overcoming limitations in long-range modeling. Its exceptional scalability has established it as a foundational technology in both natural language processing and computer vision. With ongoing technological evolution, Transformer-based approaches are gradually being applied in the medical field, demonstrating potential to surpass traditional methods, particularly in medical image analysis (<xref ref-type="bibr" rid="ref1">Carion et al., 2020</xref>). To address the morphological and positional complexity of pulmonary nodules in medical images, researchers have begun introducing adaptable modules such as deformable convolutions into the DETR framework to enhance its perception and localization capabilities for irregular targets (<xref ref-type="bibr" rid="ref6">Han et al., 2023</xref>). <xref ref-type="bibr" rid="ref22">Tang et al. (2025)</xref> proposed an enhanced pulmonary nodule detection algorithm based on RT-DETR, named LN-DETR. The proposed algorithm improves cross-scale feature fusion via newly-designed deep and shallow detail fusion layers, optimizes the computational load of the backbone network to reduce model size, and enhances contextual information by using efficient downsampling methods. <xref ref-type="bibr" rid="ref27">Zhou et al. (2025)</xref> proposed a Transformer-based pulmonary nodule detection model named LN-DETR, which innovatively integrates three core modules: the PC-EMA module enhances feature extraction capability while optimizing computational efficiency through multi-scale attention and partial convolution mechanisms; the GS-CCFM module facilitates effective cross-scale feature fusion using grouped shuffle convolution; and the CTrans module further improves overall feature fusion through cross-channel attention. This model significantly enhances computational efficiency while maintaining detection accuracy.</p>
<p>Building upon existing Transformer models, this study systematically addresses key issues in pulmonary nodule detection, such as inadequate multi-scale feature fusion and inefficient channel interaction, by designing multi-scale attention and cross-layer fusion modules, thereby achieving simultaneous improvements in detection accuracy and computational efficiency. However, the method still requires enhancement in capturing features of very small nodules, and its adaptability to nodules with complex morphologies needs further validation. More experiments are needed to support its practicality and robustness in real clinical environments.</p>
</sec>
</sec>
<sec sec-type="materials|methods" id="sec5">
<label>3</label>
<title>Materials and methods</title>
<p>In the task of CT pulmonary nodule detection, the features of tiny targets are highly susceptible to being lost in deep networks, which is the primary cause of missed detection of small nodules. The goal is to significantly reduce computational complexity while maintaining feature alignment quality, ensuring that the model can still output accurate nodule boundary information under limited medical computing resources. To this end, PrecisionMicro-DETR was constructed, with its structure illustrated in <xref ref-type="fig" rid="fig1">Figure 1</xref>. In the head, the SPDConv module is introduced, which utilizes a lossless downsampling mechanism from space to depth. This maintains the high resolution of feature maps while expanding the receptive field, significantly enhancing the model&#x2019;s ability to retain features of small pulmonary nodules and effectively addressing the feature loss problem of small targets in convolutional networks. In response to the diverse morphology of pulmonary nodules and their complex associations with surrounding tissues, traditional single-scale convolutions struggle to effectively capture multi-form features. The CSP-OmniKernel module is employed, leveraging a parallel architecture of local branches, large receptive field branches, and global attention branches. This enables multi-scale collaborative perception of the nodule&#x2019;s own texture, associations with surrounding blood vessels, and overall regional context, improving recognition accuracy for irregular nodules and complex cases. During multi-scale feature fusion, irrelevant tissue information from features at different levels can easily lead to semantic conflicts. The MFM module utilizes a dynamic weight allocation mechanism to adaptively evaluate the importance of each feature branch. 
This suppresses background interference and enhances lesion features during the fusion process, significantly improving nodule localization accuracy in low-contrast lung tissue backgrounds. To address the conflict between feature alignment distortion and computational efficiency, the SNI-GSConvE module employs soft nearest-neighbor interpolation to calibrate feature responses and incorporates a lightweight dual-path convolutional unit.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>PrecisionMicro-DETR.</p>
</caption>
<graphic xlink:href="fcomp-08-1763780-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Block diagram illustrating a deep learning architecture with three main components: Backbone (left), Improved Hybrid Encoder (center), and Decoder (right). The Backbone features stacked ConvNormLayers, MaxPool2d, BasicBlocks, and Conv layers. The Improved Hybrid Encoder integrates modules such as SNI, SPDConv, GSConvE, CSPOmniKernel, RepC3, and AIFI, with blue circles labeled MFM indicating Modulation Fusion Module connections. The Decoder includes IoU-aware query selection and a Decoder &#x0026; Head stage. Color-coded boxes and directional arrows clarify data flow between modules.</alt-text>
</graphic>
</fig>
<sec id="sec6">
<label>3.1</label>
<title>Strengthen the integration of small target features module</title>
<p>To address the issue of insufficient feature representation capability in the baseline RT-DETR model for small target detection tasks, this study systematically improves the model from two dimensions: feature pyramid construction and cross-scale feature fusion. The existing model&#x2019;s strategy of only fusing features from the P3-P5 levels results in the ineffective utilization of high-resolution features containing critical details. To overcome this limitation, this paper introduces a Space-to-Depth Convolution (SPDConv) module to perform deep enhancement on the P2-level features, which are rich in spatial details. By reorganizing spatial dimensional information into channel dimensions, this module expands the receptive field while maintaining the high spatial resolution of the feature maps, thereby enhancing semantic representation capability while preserving fine spatial structures and significantly improving the feature discriminability of small targets. The enhanced P2 features are fused into the P3 level through the lateral connection mechanism of the feature pyramid, enabling the model to obtain high-quality small target representations in the early stages of feature extraction.</p>
<sec id="sec7">
<label>3.1.1</label>
<title>SPDConv module</title>
<p>Since pulmonary nodules generally appear as small-scale targets in CT images, coupled with their morphological diversity and boundary ambiguity, traditional detection methods often suffer from insufficient feature extraction and missed detection issues. To address this limitation, this study adopts SPDConv (Space-to-Depth Convolution) (<xref ref-type="bibr" rid="ref21">Sunkara and Luo, 2022</xref>), a network architecture specifically designed to optimize low-resolution inputs and small object detection tasks. The core structure of the SPDConv module consists of a space-to-depth layer followed by a non-strided convolutional layer in sequential order. The working mechanism of SPDConv involves the space-to-depth layer restructuring spatial dimensional information of the feature map into the channel dimension, thereby achieving a lossless downsampling. Subsequently, the non-strided convolutional layer compresses the number of channels while utilizing its learnable parameters to efficiently integrate and transform these features. <xref ref-type="fig" rid="fig2">Figure 2</xref> illustrates the process of handling an intermediate feature map <inline-formula>
<mml:math id="M1">
<mml:mi mathvariant="normal">X</mml:mi>
</mml:math>
</inline-formula>(<inline-formula>
<mml:math id="M2">
<mml:mi>s</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math id="M3">
<mml:mi>s</mml:mi>
</mml:math>
</inline-formula> is the spatial dimension&#x2014;height and width, and <inline-formula>
<mml:math id="M4">
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:math>
</inline-formula> is the number of channels.) of arbitrary size using interval sampling with a scale factor of 2, resulting in four sub-feature maps <inline-formula>
<mml:math id="M5">
<mml:mi>f</mml:mi>
</mml:math>
</inline-formula>: <inline-formula>
<mml:math id="M6">
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>,<inline-formula>
<mml:math id="M7">
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>,<inline-formula>
<mml:math id="M8">
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M9">
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>, each with a shape of <inline-formula>
<mml:math id="M10">
<mml:mo stretchy="true">(</mml:mo>
<mml:mfrac>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>. To create the new feature map <inline-formula>
<mml:math id="M11">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo stretchy="true">(</mml:mo>
<mml:mfrac>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mn>4</mml:mn>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>, all sub-feature maps are concatenated along the channel axis. Subsequently, <inline-formula>
<mml:math id="M12">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mo>&#x2033;</mml:mo>
</mml:msup>
<mml:mo stretchy="true">(</mml:mo>
<mml:mfrac>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula> is obtained by applying a non-strided convolution with <inline-formula>
<mml:math id="M13">
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:math>
</inline-formula> filters. The transformation from <inline-formula>
<mml:math id="M14">
<mml:mi>X</mml:mi>
<mml:mspace width="0.25em"/>
</mml:math>
</inline-formula> to <inline-formula>
<mml:math id="M15">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mo>&#x2033;</mml:mo>
</mml:msup>
</mml:math>
</inline-formula>, involving changes in both size and the number of channels, is conceptually equivalent to performing a strided convolution with altered channel dimensions on the original feature map. However, the key distinction lies in the fact that no pixel information is lost throughout this process.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Architecture of the SPDConv module. <bold>(A)</bold> The original feature map has dimensions of <italic>s</italic>&#x202F;&#x00D7;&#x202F;<italic>s</italic>. <bold>(B)</bold> Based on the stride, sampling is performed along the row and column directions to generate multiple sub-feature maps. <bold>(C)</bold> After two-fold downsampling, the feature map is divided into four sub-feature maps, each with dimensions of <inline-formula>
<mml:math id="M16">
<mml:mfrac>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mo>&#x00D7;</mml:mo>
<mml:mfrac>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mo>&#x00D7;</mml:mo>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:math>
</inline-formula>. <bold>(D)</bold> When the four sub-feature maps are stacked along the channel dimension, the output feature map has dimensions of <inline-formula>
<mml:math id="M17">
<mml:mfrac>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mo>&#x00D7;</mml:mo>
<mml:mfrac>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>4</mml:mn>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:math>
</inline-formula>. <bold>(E)</bold> Using a convolution operation with a unit stride, a feature map with dimensions of <inline-formula>
<mml:math id="M18">
<mml:mfrac>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mo>&#x00D7;</mml:mo>
<mml:mfrac>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mo>&#x00D7;</mml:mo>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:math>
</inline-formula> is generated.</p>
</caption>
<graphic xlink:href="fcomp-08-1763780-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Diagram illustrating the space-to-depth operation in convolutional neural networks. It progresses from a feature map cube with axes labeled x, y, and z (a), to colored blocks showing spatial reorganization (b), to grouping by color in smaller blocks of size s divided by two (c), merging into a stack with four times the original channels (d), and finally a convolution operation producing a red output feature map with new spatial and channel dimensions (e).</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec8">
<label>3.1.2</label>
<title>CSP-OmniKernel</title>
<p>The CSP-OmniKernel module (<xref ref-type="bibr" rid="ref3">Cui et al., 2024</xref>) introduced in this study adopts a dual-path architecture that integrates feature preservation and feature enhancement. One branch performs deep feature transformation through the OmniKernel module, while the other branch preserves the original features. Both branches are adjusted via 1&#x202F;&#x00D7;&#x202F;1 convolutions before fusion, achieving a balance between maintaining original information and enhancing features. After preprocessing with a 1&#x202F;&#x00D7;&#x202F;1 convolution, the OmniKernel module processes features in parallel through three distinct branches. The local branch employs depthwise separable convolution for local modulation. The large branch utilizes multi-scale depthwise separable convolutions (1&#x202F;&#x00D7;&#x202F;31, 31&#x202F;&#x00D7;&#x202F;1, 31&#x202F;&#x00D7;&#x202F;31) to capture strip-shaped contextual information. The global branch achieves cross-domain global modeling through dual-domain channel attention and frequency-domain spatial attention modules. After summing and fusing the outputs from each branch, feature modulation is finalized via a 1&#x202F;&#x00D7;&#x202F;1 convolution. The structure is illustrated in <xref ref-type="fig" rid="fig3">Figure 3</xref>. Specifically, for the input feature map <inline-formula>
<mml:math id="M19">
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mtext mathvariant="italic">input</mml:mtext>
</mml:msub>
</mml:math>
</inline-formula>, a 1&#x202F;&#x00D7;&#x202F;1 convolutional layer is first applied to perform channel adjustment, resulting in <inline-formula>
<mml:math id="M20">
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:math>
</inline-formula>. As shown in <xref ref-type="disp-formula" rid="E1">Equation 1</xref>:</p>
<disp-formula id="E1">
<mml:math id="M21">
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mo>&#x00D7;</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mtext mathvariant="italic">input</mml:mtext>
</mml:msub>
</mml:math>
<label>(1)</label>
</disp-formula>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>CSP-OmniKernel.</p>
</caption>
<graphic xlink:href="fcomp-08-1763780-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart illustrating a neural network architecture with branches for global, large, and local feature extraction using convolutional modules and specialized blocks labeled FSAM and DCAM, involving FFT and IFFT operations.</alt-text>
</graphic>
</fig>
<p>Here, <inline-formula>
<mml:math id="M22">
<mml:mspace width="0.25em"/>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mtext mathvariant="italic">input</mml:mtext>
</mml:msub>
</mml:math>
</inline-formula> is the input feature map; <inline-formula>
<mml:math id="M23">
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mspace width="0.25em"/>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M24">
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:math>
</inline-formula> are the weights of the 1&#x202F;&#x00D7;&#x202F;1 convolutions.</p>
<p>Next, the feature map is divided into two parts: one part undergoes processing through the OmniKernel module (OKM), while the other retains the original information. The two are concatenated with weights <italic>&#x03B1;</italic> and 1&#x2212;<italic>&#x03B1;</italic>, then fused through a second 1&#x202F;&#x00D7;&#x202F;1 convolution <inline-formula>
<mml:math id="M25">
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mspace width="0.25em"/>
</mml:math>
</inline-formula> to output the final feature map <inline-formula>
<mml:math id="M26">
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mtext mathvariant="italic">output</mml:mtext>
</mml:msub>
</mml:math>
</inline-formula>, as shown in <xref ref-type="disp-formula" rid="E2">Equation 2</xref>:</p>
<disp-formula id="E2">
<mml:math id="M27">
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mtext mathvariant="italic">output</mml:mtext>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi mathvariant="italic">Cat</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi mathvariant="italic">OKM</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>&#x03B1;</mml:mi>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>,</mml:mo>
<mml:mo stretchy="true">(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x03B1;</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
<label>(2)</label>
</disp-formula>
<p>Here, <inline-formula>
<mml:math id="M28">
<mml:mi>&#x03B1;</mml:mi>
</mml:math>
</inline-formula> is the feature split ratio; OKM is the OmniKernel module, used to enhance local and global information; Cat denotes the concatenation operation along the channel dimension; and <inline-formula>
<mml:math id="M29">
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mtext mathvariant="italic">output</mml:mtext>
</mml:msub>
</mml:math>
</inline-formula> is the final output feature map of the module.</p>
<p>The Dual-Domain Channel Attention Module (DCAM) enhances feature representation through dual-path processing in both the spatial and frequency domains. The spatial path extracts channel statistics using global pooling, while the frequency path performs spectral analysis via FFT/IFFT transformations, combined with 1&#x202F;&#x00D7;&#x202F;1 convolutions for feature restructuring. The Frequency Domain Spatial Attention Module (FSAM) then applies dual convolutional transformations to the DCAM output, strengthening key frequency components through spectral weighting, and finally restores the features to the spatial domain to improve detail reconstruction quality.</p>
</sec>
</sec>
<sec id="sec9">
<label>3.2</label>
<title>Modulation fusion module (MFM)</title>
<p>In the field of medical image analysis, challenges such as blurred boundaries and significant scale differences between lesion areas and normal tissues often arise, manifested as the dilution of semantic information during fusion and the difficulty of fixed fusion coefficients in adapting to dynamically changing imaging scenarios. To address these challenges, this paper introduces a lightweight Modulation Fusion Module (MFM) (<xref ref-type="bibr" rid="ref4">Deng et al., 2025</xref>), whose structure is illustrated in <xref ref-type="fig" rid="fig4">Figure 4</xref>. This module dynamically allocates weights to different input branches during the feature fusion process, enhancing the interaction capability among multi-scale features and thereby enabling efficient integration of cross-level semantic information. This method demonstrates strong adaptability and robustness in detecting low-resolution, small-scale targets such as micro-lesions in CT images. First, the module uses a 1&#x202F;&#x00D7;&#x202F;1 convolution to align the features of each branch with the target dimension <italic>C</italic>. Specifically, for the input feature map<inline-formula>
<mml:math id="M30">
<mml:mspace width="0.25em"/>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211D;</mml:mi>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> with <inline-formula>
<mml:math id="M31">
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> input channels: If <inline-formula>
<mml:math id="M32">
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> &#x2260; <italic>C</italic>, a single 1&#x202F;&#x00D7;&#x202F;1 convolution projects it into the C-dimensional space. Otherwise, an identity mapping is applied to preserve the original features and avoid redundant computations. All aligned feature maps are concatenated along the channel dimension to form the tensor <inline-formula>
<mml:math id="M33">
<mml:mi>F</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211D;</mml:mi>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2217;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>. It is then reshaped into the tensor <inline-formula>
<mml:math id="M34">
<mml:mi>F</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211D;</mml:mi>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> to preserve the independent semantics of each branch. Here, &#x211D; indicates that each element in the tensor belongs to the real number domain. This is the most common case, as feature values and computations in deep learning are typically based on floating-point numbers. <inline-formula>
<mml:math id="M35">
<mml:mi>B</mml:mi>
</mml:math>
</inline-formula> is the batch size, representing the number of samples (such as images) processed simultaneously in one forward pass. This is crucial for parallel computation and efficient training. <inline-formula>
<mml:math id="M36">
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> is the number of input channels, denoting the depth of the feature map. For intermediate feature maps in the network, it represents the number of features extracted by that layer. <inline-formula>
<mml:math id="M37">
<mml:mi>H</mml:mi>
</mml:math>
</inline-formula> is the height, i.e., the number of pixels or feature points in the vertical direction of the feature map. <inline-formula>
<mml:math id="M38">
<mml:mi>W</mml:mi>
</mml:math>
</inline-formula> is the width, i.e., the number of pixels or feature points in the horizontal direction of the feature map.</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Schematic diagram of the MFM structure.</p>
</caption>
<graphic xlink:href="fcomp-08-1763780-g004.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Diagram showing a neural network module that combines two sets of input features using global average pooling, multi-layer perceptron, softmax attention, convolution, and concatenation before producing an output feature map.</alt-text>
</graphic>
</fig>
<p>For the input feature map <inline-formula>
<mml:math id="M39">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> <inline-formula>
<mml:math id="M40">
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211D;</mml:mi>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>, it is first projected to the target channel number <italic>C</italic> through a 1&#x202F;&#x00D7;&#x202F;1 convolution. The aligned features from all branches are concatenated along the channel dimension to form a tensor <inline-formula>
<mml:math id="M41">
<mml:mi>F</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211D;</mml:mi>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2217;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>, which is then reshaped into <inline-formula>
<mml:math id="M42">
<mml:mi>F</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211D;</mml:mi>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> to preserve the semantics of each branch. Subsequently, the global contextual representation <inline-formula>
<mml:math id="M43">
<mml:mi>g</mml:mi>
</mml:math>
</inline-formula> is computed, as shown in <xref ref-type="disp-formula" rid="E3">Equation 3</xref>:</p>
<disp-formula id="E3">
<mml:math id="M44">
<mml:mi>g</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="italic">GAP</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>n</mml:mi>
</mml:msubsup>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211D;</mml:mi>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:math>
<label>(3)</label>
</disp-formula>
<p>Here, <italic>GAP</italic> stands for Global Average Pooling, and<inline-formula>
<mml:math id="M45">
<mml:mspace width="0.25em"/>
<mml:mi>g</mml:mi>
<mml:mspace width="0.25em"/>
</mml:math>
</inline-formula>represents the aggregated global context representation.</p>
<p>The weight vector <inline-formula>
<mml:math id="M46">
<mml:mi>A</mml:mi>
</mml:math>
</inline-formula> for each branch is generated by a two-layer MLP (performing dimensionality reduction followed by expansion). This facilitates the adaptive learning of the relative importance of features from each branch according to the current context. The detailed computation is provided in <xref ref-type="disp-formula" rid="E4">Equation 4</xref>.</p>
<disp-formula id="E4">
<mml:math id="M47">
<mml:mi>A</mml:mi>
<mml:mo>=</mml:mo>
<mml:mtext mathvariant="italic">Softmax</mml:mtext>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi mathvariant="italic">Con</mml:mi>
<mml:msubsup>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi mathvariant="italic">up</mml:mi>
</mml:msubsup>
<mml:mo stretchy="true">(</mml:mo>
<mml:mtext mathvariant="italic">ReLU</mml:mtext>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi mathvariant="italic">Con</mml:mi>
<mml:msubsup>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mtext mathvariant="italic">down</mml:mtext>
</mml:msubsup>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211D;</mml:mi>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:math>
<label>(4)</label>
</disp-formula>
<p>Here, <inline-formula>
<mml:math id="M48">
<mml:mi>A</mml:mi>
<mml:mspace width="0.25em"/>
</mml:math>
</inline-formula>represents the adaptive weight matrix. The <inline-formula>
<mml:math id="M49">
<mml:mi mathvariant="italic">Con</mml:mi>
<mml:msubsup>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mtext mathvariant="italic">down</mml:mtext>
</mml:msubsup>
<mml:mspace width="0.25em"/>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M50">
<mml:mi mathvariant="italic">Con</mml:mi>
<mml:msubsup>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi mathvariant="italic">up</mml:mi>
</mml:msubsup>
</mml:math>
</inline-formula> represent dimensionality reduction and dimensionality enhancement, respectively.</p>
<p>Finally, the MFM module achieves effective fusion of multi-scale semantics by performing a weighted summation of the weights <inline-formula>
<mml:math id="M51">
<mml:mi>A</mml:mi>
</mml:math>
</inline-formula> with the aligned features <inline-formula>
<mml:math id="M52">
<mml:mi>F</mml:mi>
</mml:math>
</inline-formula>. The fused feature <inline-formula>
<mml:math id="M53">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mtext mathvariant="italic">out</mml:mtext>
</mml:msub>
</mml:math>
</inline-formula> is obtained through a weighted summation. As shown in <xref ref-type="disp-formula" rid="E5">Equation 5</xref>:</p>
<disp-formula id="E5">
<mml:math id="M54">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mtext mathvariant="italic">out</mml:mtext>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:munderover>
<mml:mo movablelimits="false">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>n</mml:mi>
</mml:munderover>
<mml:mi>A</mml:mi>
<mml:mo stretchy="true">[</mml:mo>
<mml:mo>:</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo stretchy="true">]</mml:mo>
<mml:mo>&#x2217;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mo stretchy="true">[</mml:mo>
<mml:mo>:</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo stretchy="true">]</mml:mo>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211D;</mml:mi>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
<label>(5)</label>
</disp-formula>
</sec>
<sec id="sec10">
<label>3.3</label>
<title>SNI-GSConvE neck</title>
<p>To enhance the accuracy and robustness of CT pulmonary nodule detection under constrained real-time computational budgets, this study redesigns the neck structure of the model, with a focus on improving two core components: upsampling and feature fusion (<xref ref-type="bibr" rid="ref14">Li et al., 2024</xref>; <xref ref-type="bibr" rid="ref13">Li, 2024</xref>). These improvements are integrated into a novel module termed the Rethinking Features-Fused-Pyramid-Neck (RFPN). Traditional methods exhibit notable limitations: nearest-neighbor upsampling, by simply replicating low-frequency semantic information and directly overlaying it with high-frequency shallow textures, tends to induce feature misalignment and noise amplification. Meanwhile, commonly used CBS modules suffer from weak channel interaction capabilities and inefficient receptive field expansion under limited computational budgets, thereby constraining the quality of multi-scale feature fusion.</p>
<p>To address the upsampling stage, this paper introduces a soft nearest-neighbor interpolation method, whose mathematical expression is given by <xref ref-type="disp-formula" rid="E6">Equation 6</xref>:</p>
<disp-formula id="E6">
<mml:math id="M55">
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mtext mathvariant="italic">out</mml:mtext>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:msup>
<mml:mi>f</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mfrac>
<mml:mo>&#x22C5;</mml:mo>
<mml:msub>
<mml:mi>U</mml:mi>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mtext mathvariant="italic">nearest</mml:mtext>
</mml:msub>
</mml:msub>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
<label>(6)</label>
</disp-formula>
<p><inline-formula>
<mml:math id="M56">
<mml:msub>
<mml:mi>U</mml:mi>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mtext mathvariant="italic">nearest</mml:mtext>
</mml:msub>
</mml:msub>
</mml:math>
</inline-formula> is the nearest neighbor upsampling operation. <italic>X</italic> is the input feature map. <inline-formula>
<mml:math id="M57">
<mml:mi>f</mml:mi>
</mml:math>
</inline-formula> is the upsampling factor, and the scaling factor <inline-formula>
<mml:math id="M58">
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:msup>
<mml:mi>f</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mfrac>
</mml:math>
</inline-formula> is used to calibrate feature responses to avoid cross-layer feature imbalance.</p>
<p>This method applies a soft scaling factor related to the upsampling factor after upsampling, performing regional normalization on high-level semantic features to calibrate their feature responses in a smooth manner. This strategy effectively mitigates cross-layer feature imbalance without introducing any trainable parameters, significantly improving the recall rate and boundary segmentation quality of small-sized pulmonary nodules while preserving detailed textures. The structural design is illustrated in <xref ref-type="fig" rid="fig5">Figure 5</xref>.</p>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>Structure diagram of the SNI module. Nearest-neighbor upsampling is performed with scale <inline-formula>
<mml:math id="M59">
<mml:mspace width="0.25em"/>
<mml:mi>s</mml:mi>
<mml:mspace width="0.25em"/>
</mml:math>
</inline-formula>, followed by the soft calibration factor <inline-formula>
<mml:math id="M60">
<mml:mi>&#x03B1;</mml:mi>
</mml:math>
</inline-formula>; the calibrated feature is the output <inline-formula>
<mml:math id="M61">
<mml:mi>Y</mml:mi>
</mml:math>
</inline-formula>.</p>
</caption>
<graphic xlink:href="fcomp-08-1763780-g005.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart diagram showing Input X passed to a Nearest Neighbor module with scale equals s, then multiplied by &#x03B1;, resulting in Output Y, with each step represented by labeled blue shapes.</alt-text>
</graphic>
</fig>
<p>During the feature fusion stage, this paper adopts the lightweight aggregation unit GSConvE-I, whose structure is illustrated in <xref ref-type="fig" rid="fig6">Figure 6</xref>. This module takes the output X from the previous stage as input and initially performs channel compression and alignment through a 1&#x202F;&#x00D7;&#x202F;1 convolution, as shown in <xref ref-type="disp-formula" rid="E7">Equation 7</xref>:</p>
<disp-formula id="E7">
<mml:math id="M62">
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mtext mathvariant="italic">Conv</mml:mtext>
<mml:mrow>
<mml:mo stretchy="true">(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo stretchy="true">)</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
<label>(7)</label>
</disp-formula>
<fig position="float" id="fig6">
<label>Figure 6</label>
<caption>
<p>GSConvE-I lightweight fusion module. By employing parallel processing of the identity path and the enhancement path, combined with channel shuffling, this module effectively improves the efficiency of cross-scale feature aggregation and the preservation quality of edge features at a low computational cost.</p>
</caption>
<graphic xlink:href="fcomp-08-1763780-g006.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Neural network diagram labeled GSConvE-I shows data flow from an input through a convolution layer, parallel paths performing linear fusion, GELU activation, channel concatenation, and shuffling, resulting in a final stacked output. A legend explains symbols for each operation.</alt-text>
</graphic>
</fig>
<p>Subsequently, the feature flow is split into two parallel processing paths: one path preserves the main information flow, while the other sequentially extracts deep features through 3&#x202F;&#x00D7;&#x202F;3 standard convolution, depthwise convolution, and the GELU activation function, as shown in <xref ref-type="disp-formula" rid="E8">Equation 8</xref>:</p>
<disp-formula id="E8">
<mml:math id="M63">
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext mathvariant="italic">GELU</mml:mtext>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mtext mathvariant="italic">DWConv</mml:mtext>
</mml:msub>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mtext mathvariant="italic">StdConv</mml:mtext>
</mml:msub>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
<label>(8)</label>
</disp-formula>
<p>Finally, cross-path information interaction is achieved through channel concatenation and shuffling, as illustrated in <xref ref-type="disp-formula" rid="E9">Equation 9</xref>:</p>
<disp-formula id="E9">
<mml:math id="M64">
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mtext mathvariant="italic">out</mml:mtext>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext mathvariant="italic">Shuffle</mml:mtext>
<mml:mo stretchy="true">(</mml:mo>
<mml:mo stretchy="true">[</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo stretchy="true">]</mml:mo>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
<label>(9)</label>
</disp-formula>
<p>Here, <inline-formula>
<mml:math id="M65">
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mtext mathvariant="italic">DWConv</mml:mtext>
</mml:msub>
</mml:math>
</inline-formula> represents depthwise separable convolution. <inline-formula>
<mml:math id="M66">
<mml:mtext mathvariant="italic">Shuffle</mml:mtext>
</mml:math>
</inline-formula> represents the channel shuffle operation.</p>
<p>This design significantly enhances channel interaction capability and effective receptive field while maintaining low computational complexity. It effectively suppresses feature aliasing and imaging artifacts, providing more discriminative multi-scale feature representations for the detection head.</p>
</sec>
</sec>
<sec id="sec11">
<label>4</label>
<title>Experiments</title>
<p>Based on the Tianchi dataset, this study conducts a comprehensive comparison between the proposed detection method and existing algorithms, including mAP curve analysis and visual comparison of detection results. Subsequently, through systematic ablation experiments, the performance contributions of each core module of the model are validated. Furthermore, k-fold cross-validation experiments are conducted to reduce the bias and variance introduced by inappropriate data partitioning. Finally, real-world medical images from hospital databases are retrieved for testing.</p>
<sec id="sec12">
<label>4.1</label>
<title>Dataset and preprocessing</title>
<p>In this study, to evaluate the effectiveness of the model, we utilized a widely recognized public dataset for pulmonary nodule detection: the Tianchi Pulmonary Nodule Dataset (<xref ref-type="bibr" rid="ref2">Cloud, 2017</xref>). The data were divided into training set (70%), validation set (15%), and test set (15%). Stratified sampling was employed to ensure that key dimensions such as nodule size, pathological type, and imaging characteristics maintained distributions consistent with the original dataset. The partitioning was conducted at the patient level to ensure that slices from the same patient did not span across subsets, thereby preventing data leakage and enhancing reproducibility. To validate the model&#x2019;s practical application value in real clinical environments, this study specifically selected CT imaging data from 50 confirmed pulmonary nodule patients randomly retrieved from the hospital PACS system, ensuring data accuracy and reliability.</p>
</sec>
<sec id="sec13">
<label>4.2</label>
<title>Experimental setting and evaluation metrics</title>
<sec id="sec14">
<label>4.2.1</label>
<title>Experimental setting</title>
<p>During the model training phase, the experiments were conducted on a Windows 11 system using Python 3.11 and the PyTorch 1.30 framework. The hardware configuration consisted of an Intel(R) Core(TM) i5-14600KF CPU, an NVIDIA GeForce RTX 5060Ti GPU, and 16&#x202F;GB of RAM. The network model parameters in this experiment were set as follows: the training epoch number was set to 300, and the batch size was set to 16.</p>
<p>Several commonly used evaluation metrics in the fields of medical imaging and object detection were employed to assess the performance of the proposed object detection method. These metrics include Precision (P), Recall (R), mean Average Precision (mAP), and the number of model parameters. The formulas for &#x201C;Precision&#x201D; and &#x201C;Recall&#x201D; are provided in <xref ref-type="disp-formula" rid="E10">Equations 10</xref> and <xref ref-type="disp-formula" rid="E11">11</xref>:</p>
<disp-formula id="E10">
<mml:math id="M67">
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mi mathvariant="italic">TP</mml:mi>
<mml:mrow>
<mml:mi mathvariant="italic">TP</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi mathvariant="italic">FP</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(10)</label>
</disp-formula>
<disp-formula id="E11">
<mml:math id="M68">
<mml:mi>R</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mi mathvariant="italic">TP</mml:mi>
<mml:mrow>
<mml:mi mathvariant="italic">TP</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi mathvariant="italic">FN</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(11)</label>
</disp-formula>
<p><inline-formula>
<mml:math id="M69">
<mml:mi mathvariant="italic">TP</mml:mi>
</mml:math>
</inline-formula> is the positive sample correctly detected by the model. <inline-formula>
<mml:math id="M70">
<mml:mi mathvariant="italic">FP</mml:mi>
</mml:math>
</inline-formula> is a negative sample incorrectly detected as positive by the model. <inline-formula>
<mml:math id="M71">
<mml:mi mathvariant="italic">FN</mml:mi>
</mml:math>
</inline-formula> (False Negative) is a positive sample that the model fails to detect.</p>
<p>AP (Average Precision) is an indicator used to measure the detection accuracy of the model. It reflects the average performance accuracy across different categories by calculating the area under the Precision&#x2013;Recall (P&#x2013;R) curve. The calculation formula for Average Precision (AP) is shown in <xref ref-type="disp-formula" rid="E12">Equation 12</xref>:</p>
<disp-formula id="E12">
<mml:math id="M72">
<mml:mi mathvariant="italic">AP</mml:mi>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mo>&#x222B;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mi mathvariant="italic">pre</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi mathvariant="italic">rec</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi mathvariant="italic">rec</mml:mi>
</mml:math>
<label>(12)</label>
</disp-formula>
<p>Here, Pre is Precision, which refers to the proportion of samples predicted as positive by the model that are truly positive. Rec is Recall, which refers to the proportion of all true positive samples correctly predicted by the model.</p>
<p>The mAP (mean Average Precision) is the average of the AP values over all categories, obtained by summing the individual AP values and dividing by the number of categories. The calculation formula is shown in <xref ref-type="disp-formula" rid="E13">Equation 13</xref>:</p>
<disp-formula id="E13">
<mml:math id="M73">
<mml:mi mathvariant="italic">mAP</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="italic">sum</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi mathvariant="italic">AP</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
</mml:mrow>
<mml:mi>n</mml:mi>
</mml:mfrac>
</mml:math>
<label>(13)</label>
</disp-formula>
<p>To further analyze the characteristics of the model, this study evaluates it from two dimensions: computational complexity and structural efficiency. The parameter count (Parameters) reflects the model capacity, while the number of Giga Floating-Point Operations (GFLOPs) represents the computational cost. The formulas for calculating model parameters and GFLOPs are provided in <xref ref-type="disp-formula" rid="E14">Equations 14</xref> and <xref ref-type="disp-formula" rid="E15">15</xref>, respectively:</p>
<disp-formula id="E14">
<mml:math id="M74">
<mml:mtext mathvariant="italic">Params</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mo stretchy="true">[</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>o</mml:mi>
<mml:mo stretchy="true">]</mml:mo>
<mml:mo>+</mml:mo>
<mml:mi>o</mml:mi>
</mml:math>
<label>(14)</label>
</disp-formula>
<disp-formula id="E15">
<mml:math id="M75">
<mml:mtext mathvariant="italic">GFLOPs</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mtext mathvariant="italic">Params</mml:mtext>
</mml:math>
<label>(15)</label>
</disp-formula>
<p>where <inline-formula>
<mml:math id="M76">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula> is the convolutional kernel size, <inline-formula>
<mml:math id="M77">
<mml:mi>i</mml:mi>
</mml:math>
</inline-formula> is the number of input channels, <inline-formula>
<mml:math id="M78">
<mml:mi>o</mml:mi>
</mml:math>
</inline-formula> is the number of output channels, and <inline-formula>
<mml:math id="M79">
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:math>
</inline-formula> represents the spatial dimensions of the output feature map.</p>
<p>The F1 Score is the harmonic mean of Precision and Recall, calculated as <xref ref-type="disp-formula" rid="E16">Equation 16</xref>:</p>
<disp-formula id="E16">
<mml:math id="M80">
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mn>1</mml:mn>
<mml:mtext mathvariant="italic">score</mml:mtext>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mtext mathvariant="italic">Precision</mml:mtext>
<mml:mo>&#x00D7;</mml:mo>
<mml:mtext mathvariant="italic">Recall</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext mathvariant="italic">Precision</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext mathvariant="italic">Recall</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(16)</label>
</disp-formula>
</sec>
</sec>
<sec id="sec15">
<label>4.3</label>
<title>Performance evaluation and loss analysis of the trained model</title>
<p><xref ref-type="fig" rid="fig7">Figure 7</xref> illustrates the performance evolution of the PrecisionMicro-DETR model over 300 training epochs. The model achieved its best performance on the mAP<sub>50</sub> metric, rapidly converging to 0.9 and maintaining stability, demonstrating its outstanding capability in identifying positive samples.</p>
<fig position="float" id="fig7">
<label>Figure 7</label>
<caption>
<p>Curves of the training.</p>
</caption>
<graphic xlink:href="fcomp-08-1763780-g007.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Four line graphs show the training progress of a model called PrecisionMicro-DETR, plotted by epoch on the x-axes. Precision, recall, and mAP_0.5 increase rapidly before stabilizing near one; mAP_0.5:0.95 rises steadily to about 0.67.</alt-text>
</graphic>
</fig>
<p>The loss convergence characteristics of PrecisionMicro-DETR during the training process are shown in <xref ref-type="fig" rid="fig8">Figure 8</xref>. The curve indicates that the loss value declined rapidly in the initial training phase and then stabilized. The overall convergence trajectory is smooth and free of significant fluctuations, indicating a stable and well-optimized model training process.</p>
<fig position="float" id="fig8">
<label>Figure 8</label>
<caption>
<p>Loss analysis chart of the training process.</p>
</caption>
<graphic xlink:href="fcomp-08-1763780-g008.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Six-panel line graph displays train and validation losses for giou, cls, and l1 metrics across three hundred epochs. All losses generally decrease over time, with each panel labeled for clarity.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec16">
<label>4.4</label>
<title>Ablation experiments on PrecisionMicro-DETR</title>
<p>Based on the analysis of ablation experiments, as shown in <xref ref-type="table" rid="tab1">Table 1</xref>, the proposed PrecisionMicro-DETR architecture achieves synergistic optimization of detection accuracy and computational efficiency in the task of CT pulmonary nodule detection. Regarding the core metric mAP<sub>50</sub> for pulmonary nodule detection, the MFM module demonstrates a significant improvement in the localization accuracy of small nodules, increasing the baseline model&#x2019;s performance from 0.932 to 0.947, thereby validating its effectiveness in enhancing feature representation capability within complex lung tissue backgrounds. The fully integrated PrecisionMicro-DETR further elevates mAP<sub>50</sub> to 0.949, highlighting the synergistic enhancement effect of multiple modules in pulmonary nodule detection.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Results of the ablation study.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model</th>
<th align="center" valign="top">mAP<sub>50</sub></th>
<th align="center" valign="top">Recall</th>
<th align="center" valign="top">F1-score</th>
<th align="center" valign="top">Parameters</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Baseline</td>
<td align="char" valign="middle" char=".">0.932</td>
<td align="char" valign="middle" char=".">0.884</td>
<td align="char" valign="middle" char=".">0.920</td>
<td align="center" valign="middle">31,106,233</td>
</tr>
<tr>
<td align="left" valign="middle">+MFM</td>
<td align="char" valign="middle" char=".">0.947</td>
<td align="char" valign="middle" char=".">0.904</td>
<td align="char" valign="middle" char=".">0.940</td>
<td align="center" valign="middle">19,709,204</td>
</tr>
<tr>
<td align="left" valign="middle">+RFPN</td>
<td align="char" valign="middle" char=".">0.944</td>
<td align="char" valign="middle" char=".">0.886</td>
<td align="char" valign="middle" char=".">0.923</td>
<td align="center" valign="middle"><bold>19,580,180</bold></td>
</tr>
<tr>
<td align="left" valign="middle">+SSTF</td>
<td align="char" valign="middle" char=".">0.94</td>
<td align="char" valign="middle" char=".">0.895</td>
<td align="char" valign="middle" char=".">0.932</td>
<td align="center" valign="middle">20,488,980</td>
</tr>
<tr>
<td align="left" valign="middle">+SSTF+RFPN</td>
<td align="char" valign="middle" char=".">0.936</td>
<td align="char" valign="middle" char=".">0.935</td>
<td align="char" valign="middle" char=".">0.932</td>
<td align="center" valign="middle">20,196,116</td>
</tr>
<tr>
<td align="left" valign="middle">+SSTF+MFM</td>
<td align="char" valign="middle" char=".">0.934</td>
<td align="char" valign="middle" char=".">0.927</td>
<td align="char" valign="middle" char=".">0.926</td>
<td align="center" valign="middle">20,302,356</td>
</tr>
<tr>
<td align="left" valign="middle">+SSTF+MFM+RFPN(This paper: PrecisionMicro-DETR)</td>
<td align="char" valign="middle" char="."><bold>0.949</bold></td>
<td align="char" valign="middle" char="."><bold>0.942</bold></td>
<td align="char" valign="middle" char="."><bold>0.946</bold></td>
<td align="center" valign="middle">20,009,492</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Bolded text indicates the most advantageous value for that metric.</p>
</table-wrap-foot>
</table-wrap>
<p>From the perspective of comprehensive detection performance, the trend of F1-Score clearly reflects the continuous optimization of the model in the completeness of pulmonary nodule detection. The MFM module increases the F1 value from the baseline of 0.920 to 0.940, primarily due to its enhancement in the recall capability for small nodules, with recall rising from 0.884 to 0.904. When the three modules are fully integrated, the F1-Score peaks at 0.9468, while recall significantly improves to 0.942. These results indicate that the model effectively reduces the rate of missed detection of pulmonary nodules while maintaining high-precision recognition, which holds significant value for clinical diagnosis.</p>
<p>In this ablation experiment, the F1 metric, as the harmonic mean of precision and recall, systematically reveals the contribution of each module to the comprehensive performance of the model. The continuous improvement in the F1 score&#x2014;from 0.920 of the baseline model to 0.946 of the fully integrated architecture&#x2014;validates the effectiveness of the module integration strategy. Among these, the MFM module demonstrates the most significant individual improvement effect, elevating the F1 value to 0.9403, primarily due to its multi-scale feature fusion capability enhancing the detection of small pulmonary nodules. Notably, when all three modules are fully integrated, the F1 score reaches its peak of 0.9468, while recall improves to 0.942. This indicates that the model significantly enhances nodule detection ability while maintaining high precision, which is of great importance for reducing missed diagnosis rates in clinical practice.</p>
<p>The experimental data further reveal the synergistic mechanisms among the modules. The combination of MFM and RFPN yields superior results compared to other dual-module configurations, indicating functional complementarity between the two in feature fusion and feature pyramid optimization. In contrast, the relatively lower F1 score observed with the SSTF and MFM combination suggests that careful design is required for module compatibility. Regarding model efficiency, all improved schemes maintain steady growth in F1-Score while reducing the parameter count by over 35%, demonstrating that this optimization approach not only ensures model lightweighting but also enhances overall performance through more refined architectural design.</p>
<p>The experimental results of this study demonstrate that PrecisionMicro-DETR, through the organic integration of multiple modules, successfully overcomes the trade-off between accuracy and computational complexity that traditional models face in CT pulmonary nodule detection tasks. By significantly reducing model parameters while comprehensively improving detection performance, this characteristic provides a new technical pathway for developing clinically applicable pulmonary nodule-assisted diagnostic systems, holding substantial clinical application value.</p>
<p><xref ref-type="fig" rid="fig9">Figure 9</xref> visualizes the attention regions of the models in the ablation study through heatmaps, where warm tones (red) indicate high attention and cool tones (blue) indicate low attention. The analysis reveals that: (a) The attention of the baseline model RT-DETR-R34 is the most dispersed, with numerous highlighted yet non-lesion responses, indicating that its attention is easily distracted by surrounding tissues; (b) After introducing partial improvement modules, the model&#x2019;s attention to lesion regions becomes more concentrated, but significant distracting responses still persist in the background, suggesting that the feature fusion and selection mechanisms are not yet fully developed; (c) The complete PrecisionMicro-DETR model (integrating modules such as SSTF and MFM) demonstrates highly focused attention on the lesion core and its edges, with background noise effectively suppressed. This demonstrates that the proposed multi-module collaborative mechanism can guide the model to allocate limited computational resources more precisely to discriminative lesion features, thereby achieving systematic improvement in detection performance while controlling computational complexity.</p>
<fig position="float" id="fig9">
<label>Figure 9</label>
<caption>
<p>Heatmap.</p>
</caption>
<graphic xlink:href="fcomp-08-1763780-g009.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Four-panel medical illustration showing a chest CT scan with a nodule outlined as ground truth, followed by three columns labeled PrecisionMicro-DETR, MFM, and RTDETR-R34. Each model column displays a CT scan with detected nodule location and confidence score above a corresponding heatmap visualization for detection activation. PrecisionMicro-DETR panel shows strong, focused activation near the nodule, while MFM and RTDETR-R34 heatmaps are more diffuse.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec17">
<label>4.5</label>
<title>Comparison with state-of-the-art detectors</title>
<p>To comprehensively evaluate the performance of PrecisionMicro-DETR, this study conducted a horizontal comparison with multiple mainstream detection models. The baseline model in this study is RT-DETR-R34. In the comparative experiments, the same dataset split and training parameters were adopted to ensure fairness. PrecisionMicro-DETR demonstrated comprehensive competitive advantages, with the experimental results summarized in <xref ref-type="table" rid="tab2">Table 2</xref>.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Comparative results of different models.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model</th>
<th align="center" valign="top">Precision</th>
<th align="center" valign="top">Recall</th>
<th align="center" valign="top">mAP<sub>50</sub></th>
<th align="center" valign="top">mAP<sub>50-95</sub></th>
<th align="center" valign="top">Parameters</th>
<th align="center" valign="top">GFLOPs</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">YOLOv6</td>
<td align="char" valign="middle" char=".">0.768</td>
<td align="char" valign="middle" char=".">0.711</td>
<td align="char" valign="middle" char=".">0.766</td>
<td align="char" valign="middle" char=".">0.494</td>
<td align="center" valign="middle">4,233,843</td>
<td align="center" valign="middle">11.8</td>
</tr>
<tr>
<td align="left" valign="middle">YOLOv8</td>
<td align="char" valign="middle" char=".">0.905</td>
<td align="char" valign="middle" char=".">0.877</td>
<td align="char" valign="middle" char=".">0.906</td>
<td align="char" valign="middle" char=".">0.66</td>
<td align="center" valign="middle">3,005,843</td>
<td align="center" valign="middle">8.1</td>
</tr>
<tr>
<td align="left" valign="middle">YOLOv9</td>
<td align="char" valign="middle" char=".">0.9061</td>
<td align="char" valign="middle" char=".">0.85</td>
<td align="char" valign="middle" char=".">0.89</td>
<td align="char" valign="middle" char=".">0.55</td>
<td align="center" valign="middle">60,797,222</td>
<td align="center" valign="middle">266.1</td>
</tr>
<tr>
<td align="left" valign="middle">YOLOv10</td>
<td align="char" valign="middle" char=".">0.843</td>
<td align="char" valign="middle" char=".">0.852</td>
<td align="char" valign="middle" char=".">0.878</td>
<td align="char" valign="middle" char=".">0.68</td>
<td align="center" valign="middle">2,694,806</td>
<td align="center" valign="middle">8.2</td>
</tr>
<tr>
<td align="left" valign="middle">YOLO11</td>
<td align="char" valign="middle" char=".">0.902</td>
<td align="char" valign="middle" char=".">0.843</td>
<td align="char" valign="middle" char=".">0.908</td>
<td align="char" valign="middle" char=".">0.613</td>
<td align="center" valign="middle">2,582,347</td>
<td align="center" valign="middle">6.3</td>
</tr>
<tr>
<td align="left" valign="middle">YOLO12</td>
<td align="char" valign="middle" char=".">0.876</td>
<td align="char" valign="middle" char=".">0.794</td>
<td align="char" valign="middle" char=".">0.84</td>
<td align="char" valign="middle" char=".">0.607</td>
<td align="center" valign="middle">2,508,539</td>
<td align="center" valign="middle">5.8</td>
</tr>
<tr>
<td align="left" valign="middle">YOLOv13</td>
<td align="char" valign="middle" char=".">0.794</td>
<td align="char" valign="middle" char=".">0.668</td>
<td align="char" valign="middle" char=".">0.741</td>
<td align="char" valign="middle" char=".">0.469</td>
<td align="center" valign="middle">2,448,090</td>
<td align="center" valign="middle">6.2</td>
</tr>
<tr>
<td align="left" valign="middle">Faster R-CNN</td>
<td align="char" valign="middle" char=".">0.837</td>
<td align="char" valign="middle" char=".">0.765</td>
<td align="char" valign="middle" char=".">0.8090</td>
<td align="char" valign="middle" char=".">0.42</td>
<td align="center" valign="middle">&#x2013;</td>
<td align="center" valign="middle">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="middle">SSD</td>
<td align="char" valign="middle" char=".">0.634</td>
<td align="char" valign="middle" char=".">0.530</td>
<td align="char" valign="middle" char=".">0.582</td>
<td align="char" valign="middle" char=".">0.19</td>
<td align="center" valign="middle">&#x2013;</td>
<td align="center" valign="middle">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="middle">RT-DETR-EfficientViT (<xref ref-type="bibr" rid="ref16">Liu et al., 2023</xref>)</td>
<td align="char" valign="middle" char=".">0.8909</td>
<td align="char" valign="middle" char=".">0.835</td>
<td align="char" valign="middle" char=".">0.859</td>
<td align="char" valign="middle" char=".">0.504</td>
<td align="center" valign="middle">10,702,612</td>
<td align="char" valign="middle" char=".">27.2</td>
</tr>
<tr>
<td align="left" valign="middle">RT-DETR -MobileNetV4 (<xref ref-type="bibr" rid="ref20">Qin et al., 2024</xref>)</td>
<td align="char" valign="middle" char=".">0.944</td>
<td align="char" valign="middle" char=".">0.896</td>
<td align="char" valign="middle" char=".">0.934</td>
<td align="char" valign="middle" char=".">0.678</td>
<td align="center" valign="middle">11,310,292</td>
<td align="char" valign="middle" char=".">39.5</td>
</tr>
<tr>
<td align="left" valign="top">RT-DETR-R50</td>
<td align="char" valign="top" char=".">0.927</td>
<td align="char" valign="top" char=".">0.911</td>
<td align="char" valign="top" char=".">0.93</td>
<td align="char" valign="top" char=".">0.678</td>
<td align="center" valign="top">41,956,163</td>
<td align="center" valign="top">129.5</td>
</tr>
<tr>
<td align="left" valign="top">RT-DETR-R34</td>
<td align="char" valign="top" char=".">0.96</td>
<td align="char" valign="top" char=".">0.884</td>
<td align="char" valign="top" char=".">0.932</td>
<td align="char" valign="top" char=".">0.687</td>
<td align="center" valign="top">31,106,233</td>
<td align="center" valign="top">88.8</td>
</tr>
<tr>
<td align="left" valign="middle">RT-DETR-R18</td>
<td align="char" valign="middle" char=".">0.917</td>
<td align="char" valign="middle" char=".">0.916</td>
<td align="char" valign="middle" char=".">0.9351</td>
<td align="char" valign="middle" char=".">0.7631</td>
<td align="center" valign="middle">20,083,028</td>
<td align="center" valign="middle">58.3</td>
</tr>
<tr>
<td align="left" valign="middle">PrecisionMicro-DETR(This paper)</td>
<td align="char" valign="middle" char=".">0.952</td>
<td align="char" valign="middle" char=".">0.942</td>
<td align="char" valign="middle" char=".">0.949</td>
<td align="char" valign="middle" char=".">0.698</td>
<td align="center" valign="middle">20,009,492</td>
<td align="center" valign="middle">62.4</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Compared to the YOLO series, the proposed method significantly outperformed YOLOv8 (0.906), YOLOv9 (0.89), and YOLOv10 (0.878) in terms of the mAP<sub>50</sub> metric, while also surpassing YOLOv9, which has 60.79 million parameters, in computational efficiency. In comparison with traditional detection algorithms, PrecisionMicro-DETR achieved a 17.3% improvement over Faster R-CNN (0.809) and a 63.1% improvement over SSD (0.582) in mAP<sub>50</sub>, highlighting the advantages of the Transformer-based detection framework in the field of medical imaging.</p>
<p>When compared to models within the same series, PrecisionMicro-DETR exhibited a unique balance of performance. Although RT-DETR R18 achieved a relatively high mAP<sub>50-95</sub> score of 0.763, its mAP<sub>50</sub> was 0.935, which is lower than the 0.949 achieved by the proposed method. More importantly, while maintaining high precision and a parameter count comparable to RT-DETR-R18, PrecisionMicro-DETR improved the recall rate from 0.916 to 0.942, which holds significant clinical importance for reducing the missed diagnosis rate of pulmonary nodules.</p>
<p>A comprehensive analysis indicates that PrecisionMicro-DETR successfully achieves a synergistic optimization of precision and efficiency. The overall improvement in key metrics such as mAP<sub>50</sub> and recall rate, combined with a substantial reduction in parameter count and computational load, demonstrates that the proposed architectural enhancements effectively address the challenge of balancing small target recognition and computational efficiency in medical image detection. Compared to existing mainstream algorithms, the proposed method significantly reduces computational resource requirements while maintaining competitive detection accuracy, offering a more viable solution for the clinical deployment of CT pulmonary nodule detection.</p>
<p>The value of this study lies in proposing a detection framework that balances precision and efficiency, providing new technical insights for the field of medical image analysis. Future work will focus on further optimizing the model structure and validating its generalization capability on more medical image datasets.</p>
</sec>
<sec id="sec18">
<label>4.6</label>
<title>K-fold cross-validation experiments</title>
<p>K-fold cross-validation systematically evaluates model performance across different data distributions by partitioning the dataset into multiple complementary subsets, providing more robust evaluation results compared to a single train-test split. This method maximizes the utilization of limited data resources while ensuring the independence of the test set, allowing each sample to participate once in testing and K-1 times in training&#x2014;making it particularly suitable for application scenarios with limited data scales, such as medical imaging. The results of this K-fold validation are presented in <xref ref-type="table" rid="tab3">Table 3</xref>, which indicates that Fold-1 achieved the best performance, with precision (95.2%), recall (94.2%), and mAP<sub>50</sub> (94.9%) all reaching high levels. In contrast, Fold-4 exhibited relatively weaker performance, with significant declines across all metrics. Although the PrecisionMicro-DETR model demonstrated outstanding performance under optimal conditions (e.g., Fold-1), its performance stability requires further improvement. The relatively poor performance of Fold-4 may be attributed to data quality issues or class imbalance, revealing potential problems in the model and providing direction for subsequent optimization.</p>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>K-fold cross-validation results on the TianChi dataset.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">K-fold</th>
<th align="center" valign="top">Precision</th>
<th align="center" valign="top">Recall</th>
<th align="center" valign="top">mAP<sub>50</sub></th>
<th align="center" valign="top">F1-score</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Fold-1</td>
<td align="char" valign="top" char=".">0.952</td>
<td align="char" valign="top" char=".">0.942</td>
<td align="char" valign="top" char=".">0.949</td>
<td align="char" valign="top" char=".">0.946</td>
</tr>
<tr>
<td align="left" valign="top">Fold-2</td>
<td align="char" valign="top" char=".">0.865</td>
<td align="char" valign="top" char=".">0.855</td>
<td align="char" valign="top" char=".">0.846</td>
<td align="char" valign="top" char=".">0.860</td>
</tr>
<tr>
<td align="left" valign="top">Fold-3</td>
<td align="char" valign="top" char=".">0.878</td>
<td align="char" valign="top" char=".">0.858</td>
<td align="char" valign="top" char=".">0.867</td>
<td align="char" valign="top" char=".">0.868</td>
</tr>
<tr>
<td align="left" valign="top">Fold-4</td>
<td align="char" valign="top" char=".">0.838</td>
<td align="char" valign="top" char=".">0.802</td>
<td align="char" valign="top" char=".">0.781</td>
<td align="char" valign="top" char=".">0.819</td>
</tr>
<tr>
<td align="left" valign="top">Fold-5</td>
<td align="char" valign="top" char=".">0.916</td>
<td align="char" valign="top" char=".">0.911</td>
<td align="char" valign="top" char=".">0.925</td>
<td align="char" valign="top" char=".">0.913</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To investigate the reasons for the relatively lower performance of Fold-4 (mAP 78.1%), this paper additionally trained several mainstream detection models as baseline comparisons on the same data split (i.e., Fold-4 of the Tianchi dataset), including YOLOv5, RT-DETR-R18, R34, and R50. All models were trained using identical train-validation-test splits and hyperparameter settings to ensure fair comparison. The experimental results, as shown in <xref ref-type="table" rid="tab4">Table 4</xref>, indicate that the mAP of all compared models on Fold-4 is significantly lower than their respective average performance on other folds. Under this specific split, this paper&#x2019;s model (78.1%) still consistently outperforms all compared models. These results strongly suggest that the performance fluctuation in Fold-4 is primarily due to the inherent challenges of this particular data subset (Fold-4), rather than unique shortcomings of our model. Further analysis of the dataset reveals that this fold may contain more small and indistinct nodules, or nodules distributed in more challenging locations (such as adjacent to the pleura or blood vessels), which inherently pose greater difficulties for any detection model.</p>
<table-wrap position="float" id="tab4">
<label>Table 4</label>
<caption>
<p>Fold-4 cross-validation results on the TianChi dataset.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model</th>
<th align="center" valign="top">Precision</th>
<th align="center" valign="top">Recall</th>
<th align="center" valign="top">mAP<sub>50</sub></th>
<th align="center" valign="top">F1-score</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">RT-DETR-R18</td>
<td align="char" valign="top" char=".">0.807</td>
<td align="char" valign="top" char=".">0.797</td>
<td align="char" valign="top" char=".">0.750</td>
<td align="char" valign="top" char=".">0.802</td>
</tr>
<tr>
<td align="left" valign="top">RT-DETR-R34</td>
<td align="char" valign="top" char=".">0.812</td>
<td align="char" valign="top" char=".">0.811</td>
<td align="char" valign="top" char=".">0.760</td>
<td align="char" valign="top" char=".">0.811</td>
</tr>
<tr>
<td align="left" valign="top">RT-DETR-R50</td>
<td align="char" valign="top" char=".">0.835</td>
<td align="char" valign="top" char=".">0.771</td>
<td align="char" valign="top" char=".">0.724</td>
<td align="char" valign="top" char=".">0.802</td>
</tr>
<tr>
<td align="left" valign="top">YOLOv5</td>
<td align="char" valign="top" char=".">0.725</td>
<td align="char" valign="top" char=".">0.612</td>
<td align="char" valign="top" char=".">0.679</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="top">This paper (Fold-4)</td>
<td align="char" valign="top" char="."><bold>0.838</bold></td>
<td align="char" valign="top" char=".">0.802</td>
<td align="char" valign="top" char="."><bold>0.781</bold></td>
<td align="char" valign="top" char="."><bold>0.819</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Bolded text indicates the most advantageous value for that metric.</p>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec sec-type="discussion" id="sec19">
<label>5</label>
<title>Discussion</title>
<p>To systematically evaluate the clinical application value of PrecisionMicro-DETR detection results, this study further explores the advantages of this method through quantitative morphological analysis of detected nodules. Clinical imaging data randomly selected from the PACS system of Shunde Hospital, Guangzhou University of Chinese Medicine were used in the experiment to ensure the reliability of the detection results.</p>
<p>As shown in <xref ref-type="fig" rid="fig10">Figure 10</xref>, the proposed method accurately identifies nodules distributed across various regions of the lungs, including challenging locations such as those near the parietal pleura, with annotation regions highly consistent with the actual lesion morphology. In contrast, other comparative models exhibited varying degrees of missed or false detections in these complex cases.</p>
<fig position="float" id="fig10">
<label>Figure 10</label>
<caption>
<p>Comparison chart of different detection models.</p>
</caption>
<graphic xlink:href="fcomp-08-1763780-g010.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Comparison figure presenting six columns of lung CT scan slices. Columns include Origin, Truly Exist with annotated nodules, and four algorithm results&#x2014;PrecisionMicro-DETR, RT-DETR-34, YOLOv13, and YOLOv8&#x2014;showing detected nodules marked with confidence scores in red or blue text.</alt-text>
</graphic>
</fig>
<p>According to the local enlarged view in <xref ref-type="fig" rid="fig11">Figure 11</xref>, PrecisionMicro-DETR achieved a detection confidence of 0.87 for small nodules, demonstrating the model&#x2019;s excellent performance in pulmonary nodule detection tasks. Compared to other advanced detection models, the proposed method outputs significantly higher confidence scores for suspicious lesion regions. Such performance not only reflects the accuracy of the model&#x2019;s judgments but also indicates its higher reliability in assisting clinical decision-making. However, the generalization ability of the proposed method remains to be validated. Subsequent studies will further evaluate the model&#x2019;s practical generalization performance on multi-center datasets.</p>
<fig position="float" id="fig11">
<label>Figure 11</label>
<caption>
<p>Detection results of PrecisionMicro-DETR.</p>
</caption>
<graphic xlink:href="fcomp-08-1763780-g011.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Three-panel comparison of chest CT scans displays the original scan, a manually labeled nodule with a yellow box and annotation, and an automated detection with a red box labeled &#x201C;nodule 0.90&#x201D; indicating high confidence.</alt-text>
</graphic>
</fig>
<p>The performance of the model largely depends on the distribution of the training data. For example, in our K-fold cross-validation (Section 4.6, <xref ref-type="table" rid="tab3">Table 3</xref>), we observed that when a particular fold (e.g., Fold-4) contains more small and indistinct nodules or nodules in special locations (such as a large number of subpleural nodules), the performance of all compared models (including baseline models) declines simultaneously. This clearly demonstrates that the inherent imbalance and challenging nature of the data are the primary factors contributing to performance fluctuations. Although our model still maintains a relative advantage in such cases, reducing sensitivity to specific data distributions remains key to improving robustness.</p>
<p>While PrecisionMicro-DETR achieves superior detection performance, particularly for small nodules, its parameter count warrants a balanced discussion regarding the trade-off between capability and efficiency. The design ethos intentionally prioritizes diagnostic sensitivity (recall) over minimalist design, given the clinical imperative to minimize missed nodules. Compared to the baseline RT-DETR-R34, our model reduces parameters by 35% while significantly improving all key metrics (<xref ref-type="table" rid="tab1">Table 1</xref>), demonstrating a targeted and effective rather than bloated design. The introduced modules (SSTF, MFM, SNI-GSConvE) are themselves lightweight innovations that address specific weaknesses in small object detection. We argue that for a life-critical diagnostic aid operating on standard clinical hardware, this level of complexity is justified by the substantial gain in reliability. Nevertheless, we acknowledge several limitations that point to future improvements: (1) Model Compression Potential: While efficient, the architecture may still accommodate advanced compression techniques such as pruning or quantization for further streamlining. (2) Inference Speed in Real-Time Workflows: Although designed for clinical hardware, extensive latency benchmarks in end-to-end diagnostic pipelines are needed to ensure seamless integration. Future work will actively explore these directions to enhance practicality without compromising the hard-won clinical efficacy.</p>
</sec>
<sec sec-type="conclusions" id="sec20">
<label>6</label>
<title>Conclusion</title>
<p>This study proposes a PrecisionMicro-DETR model designed to assist in detecting small lesions and alleviating the workload of clinicians. Through comparative experiments with different models, ablation studies, K-fold cross-validation, and validation with real CT images from the PACS system of Shunde Hospital, Guangzhou University of Chinese Medicine, the following conclusions are drawn:</p>
<list list-type="order">
<list-item>
<p>Ablation experiments on the Tianchi dataset show that after integrating the small-target feature fusion module SSTF, the MFM module, and the SNI-GSConv Neck structure, the PrecisionMicro-DETR model achieved mAP<sub>50</sub>, Recall, and F1-Score values of 0.949, 0.942, and 0.946, respectively, representing improvements of 1.8, 6.6, and 2.9% over the baseline model. The gradual introduction of each module significantly enhanced the model&#x2019;s ability to detect small pulmonary nodules, demonstrating that multi-module collaborative optimization is a key mechanism for performance improvement.</p>
</list-item>
<list-item>
<p>The proposed PrecisionMicro-DETR model for CT pulmonary nodule detection demonstrates comprehensive advantages in both accuracy and efficiency through comparative experiments. The experimental results indicate that the model achieved an mAP<sub>50</sub> of 0.949 and a recall of 0.942, outperforming existing mainstream detection models, which holds significant value for reducing clinical missed diagnosis rates. In terms of model efficiency, the method requires only 20 million parameters, achieving comprehensive superiority in key metrics while maintaining a model complexity comparable to RT-DETR-R18. Its parameter count is only 32.9% of that of YOLOv9, demonstrating excellent computational efficiency. Comprehensive analysis confirms that the proposed architectural improvements effectively address the challenge of balancing small-target detection and computational resources in medical imaging.</p>
</list-item>
<list-item>
<p>K-fold cross-validation revealed certain performance fluctuations, with precision ranging from 83.8 to 95.2% and recall ranging from 80.2 to 94.2%. These fluctuations reflect the inherent complexity of medical imaging data and suggest room for improvement in the model&#x2019;s adaptability to different data distributions. Despite these fluctuations, the model maintained stable high performance in most folds, such as in Fold-5, where all metrics remained above 91%, indicating robust baseline performance. By employing K-fold cross-validation, this study effectively addressed the challenge of limited medical imaging data scale, maximizing data utilization while providing a more robust evaluation of model performance.</p>
</list-item>
</list>
<p>Future work will strive to collect and construct a multi-center, multi-device clinical CT image dataset, and employ domain adaptation or test-time augmentation techniques to enable the model to robustly adapt to varying imaging conditions across different hospitals.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec21">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found in the article/supplementary material.</p>
</sec>
<sec sec-type="ethics-statement" id="sec22">
<title>Ethics statement</title>
<p>Ethical approval was not required for the study involving humans in accordance with the local legislation and institutional requirements. Written informed consent to participate in this study was not required from the participants or the participants&#x2019; legal guardians/next of kin in accordance with the national legislation and the institutional requirements.</p>
</sec>
<sec sec-type="author-contributions" id="sec23">
<title>Author contributions</title>
<p>JC: Project administration, Supervision, Methodology, Conceptualization, Visualization, Validation, Software, Writing &#x2013; original draft, Formal analysis, Writing &#x2013; review &#x0026; editing, Resources, Funding acquisition, Investigation, Data curation. JZ: Writing &#x2013; review &#x0026; editing, Software. YL: Validation, Supervision, Writing &#x2013; review &#x0026; editing. FD: Writing &#x2013; review &#x0026; editing, Validation, Project administration. LF: Writing &#x2013; review &#x0026; editing, Data curation. HL: Writing &#x2013; review &#x0026; editing, Project administration, Data curation.</p>
</sec>
<sec sec-type="COI-statement" id="sec24">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec25">
<title>Generative AI statement</title>
<p>The author(s) declared that Generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec26">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="ref5"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Bray</surname><given-names>F.</given-names></name> <name><surname>Ferlay</surname><given-names>J.</given-names></name> <name><surname>Soerjomataram</surname><given-names>I.</given-names></name> <name><surname>Siegel</surname><given-names>R. L.</given-names></name> <name><surname>Torre</surname><given-names>L. A.</given-names></name> <name><surname>Jemal</surname><given-names>A</given-names></name></person-group>. (<year>2018</year>). <article-title>Global cancer statistics 2018: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries</article-title>. <source>CA: A Cancer Journal for Clinicians</source>, <volume>68</volume>, <fpage>394</fpage>&#x2013;<lpage>424</lpage>. doi: <pub-id pub-id-type="doi">10.3322/caac.21492</pub-id></mixed-citation></ref>
<ref id="ref1"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Carion</surname><given-names>N.</given-names></name> <name><surname>Massa</surname><given-names>F.</given-names></name> <name><surname>Synnaeve</surname><given-names>G.</given-names></name></person-group> (<year>2020</year>). &#x201C;<article-title>End-to-end object detection with transformers</article-title>&#x201D; in <source>European conference on computer vision</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>213</fpage>&#x2013;<lpage>229</lpage>.</mixed-citation></ref>
<ref id="ref2"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Cloud</surname><given-names>A.</given-names></name></person-group> (<year>2017</year>). Tianchi medical AI competition [season 1]: intelligent diagnosis of pulmonary nodules. Available online at: <ext-link xlink:href="https://tianchi.aliyun.com/competition/entrance/231601/information" ext-link-type="uri">https://tianchi.aliyun.com/competition/entrance/231601/information</ext-link> (Accessed April, 2017).</mixed-citation></ref>
<ref id="ref3"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Cui</surname><given-names>Y.</given-names></name> <name><surname>Ren</surname><given-names>W.</given-names></name> <name><surname>Knoll</surname><given-names>A.</given-names></name></person-group> (<year>2024</year>). &#x201C;<article-title>Omni-kernel network for image restoration</article-title>&#x201D; in <source>Proceedings of the AAAI conference on artificial intelligence</source>, vol. <volume>38</volume> (<publisher-loc>Washington, D.C.</publisher-loc>: <publisher-name>AAAI</publisher-name>), <fpage>1426</fpage>&#x2013;<lpage>1434</lpage>.</mixed-citation></ref>
<ref id="ref4"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Deng</surname><given-names>M</given-names></name> <name><surname>Sun</surname><given-names>S</given-names></name> <name><surname>Li</surname><given-names>Z</given-names></name> <name><surname>Hu</surname><given-names>X</given-names></name> <name><surname>Wu</surname><given-names>X</given-names></name></person-group>. (<year>2025</year>). <source>FMNet: frequency-assisted mamba-like linear attention network for camouflaged object detection</source>. <comment>arXiv preprint arXiv:2503.11030</comment>. Available online at: <ext-link xlink:href="https://arxiv.org/pdf/2503.11030?" ext-link-type="uri">https://arxiv.org/pdf/2503.11030?</ext-link></mixed-citation></ref>
<ref id="ref6"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Han</surname><given-names>L.</given-names></name> <name><surname>Li</surname><given-names>F.</given-names></name> <name><surname>Yu</surname><given-names>H.</given-names></name> <name><surname>Xia</surname><given-names>K.</given-names></name> <name><surname>Xin</surname><given-names>Q.</given-names></name> <name><surname>Zou</surname><given-names>X.</given-names></name></person-group> (<year>2023</year>). <article-title>BiRPN-YOLOvX: a weighted bidirectional recursive feature pyramid algorithm for lung nodule detection</article-title>. <source>J. Xray Sci. Technol.</source> <volume>31</volume>, <fpage>301</fpage>&#x2013;<lpage>317</lpage>. doi: <pub-id pub-id-type="doi">10.3233/XST-221310</pub-id>, <pub-id pub-id-type="pmid">36617767</pub-id></mixed-citation></ref>
<ref id="ref7"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Haq</surname><given-names>I U</given-names></name></person-group>. (<year>2022</year>). <article-title>An overview of deep learning in medical imaging</article-title>. <comment>arXiv preprint arXiv:2202.08546</comment>. Available online at: <ext-link xlink:href="https://arxiv.org/pdf/2202.08546" ext-link-type="uri">https://arxiv.org/pdf/2202.08546</ext-link></mixed-citation></ref>
<ref id="ref8"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Henschke</surname><given-names>C. I.</given-names></name></person-group> (<year>2001</year>). <article-title>Early lung cancer action project: overall design and findings from baseline screening</article-title>. <source>Cancer</source> <volume>89</volume>, <fpage>2474</fpage>&#x2013;<lpage>2482</lpage>. doi: <pub-id pub-id-type="doi">10.1002/1097-0142(20001201)89:11+&lt;2474::AID-CNCR26&gt;3.0.CO;2-2</pub-id>, <pub-id pub-id-type="pmid">11147630</pub-id></mixed-citation></ref>
<ref id="ref9"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ji</surname><given-names>Z.</given-names></name> <name><surname>Wu</surname><given-names>Y.</given-names></name> <name><surname>Zeng</surname><given-names>X.</given-names></name> <name><surname>An</surname><given-names>Y.</given-names></name> <name><surname>Zhao</surname><given-names>L.</given-names></name> <name><surname>Wang</surname><given-names>Z.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Lung nodule detection in medical images based on improved YOLOv5s</article-title>. <source>IEEE Access</source> <volume>11</volume>, <fpage>76371</fpage>&#x2013;<lpage>76387</lpage>. doi: <pub-id pub-id-type="doi">10.1109/access.2023.3296530</pub-id></mixed-citation></ref>
<ref id="ref10"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jin</surname><given-names>H.</given-names></name> <name><surname>Yu</surname><given-names>C.</given-names></name> <name><surname>Gong</surname><given-names>Z.</given-names></name> <name><surname>Zheng</surname><given-names>R.</given-names></name> <name><surname>Zhao</surname><given-names>Y.</given-names></name> <name><surname>Fu</surname><given-names>Q.</given-names></name></person-group> (<year>2023</year>). <article-title>Machine learning techniques for pulmonary nodule computer-aided diagnosis using CT images: a systematic review</article-title>. <source>Biomed. Signal Process. Control</source> <volume>79</volume>:<fpage>104104</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bspc.2022.104104</pub-id></mixed-citation></ref>
<ref id="ref11"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Karki</surname><given-names>R. F. A.</given-names></name></person-group> (<year>2017</year>). <article-title>Multiple pulmonary nodules in malignancy</article-title>. <source>Curr. Opin. Pulm. Med.</source> <volume>23</volume>, <fpage>285</fpage>&#x2013;<lpage>289</lpage>. doi: <pub-id pub-id-type="doi">10.1097/MCP.0000000000000393</pub-id>, <pub-id pub-id-type="pmid">28463856</pub-id></mixed-citation></ref>
<ref id="ref12"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lanjewar</surname><given-names>M. G.</given-names></name> <name><surname>Panchbhai</surname><given-names>K. G.</given-names></name> <name><surname>Charanarur</surname><given-names>P.</given-names></name></person-group> (<year>2023</year>). <article-title>Lung cancer detection from CT scans using modified dense net with feature selection methods and ML classifiers</article-title>. <source>Expert Syst. Appl.</source> <volume>224</volume>:<fpage>119961</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.eswa.2023.119961</pub-id></mixed-citation></ref>
<ref id="ref13"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Li</surname><given-names>H.</given-names></name></person-group> (<year>2024</year>). &#x201C;<article-title>Rethinking features-fused-pyramid-neck for object detection</article-title>&#x201D; in <source>European conference on computer vision</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer Nature Switzerland</publisher-name>), <fpage>74</fpage>&#x2013;<lpage>90</lpage>.</mixed-citation></ref>
<ref id="ref14"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname><given-names>H.</given-names></name> <name><surname>Li</surname><given-names>J.</given-names></name> <name><surname>Wei</surname><given-names>H.</given-names></name> <name><surname>Liu</surname><given-names>Z.</given-names></name> <name><surname>Zhan</surname><given-names>Z.</given-names></name> <name><surname>Ren</surname><given-names>Q.</given-names></name></person-group> (<year>2024</year>). <article-title>Slim-neck by GSConv: a lightweight-design for real-time detector architectures</article-title>. <source>J. Real-Time Image Proc.</source> <volume>21</volume>:<fpage>62</fpage>. doi: <pub-id pub-id-type="doi">10.1007/s11554-024-01436-6</pub-id></mixed-citation></ref>
<ref id="ref15"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname><given-names>R.</given-names></name> <name><surname>Xiao</surname><given-names>C.</given-names></name> <name><surname>Huang</surname><given-names>Y.</given-names></name> <name><surname>Hassan</surname><given-names>H.</given-names></name> <name><surname>Huang</surname><given-names>B.</given-names></name></person-group> (<year>2022</year>). <article-title>Deep learning applications in computed tomography images for pulmonary nodule detection and diagnosis: a review</article-title>. <source>Diagnostics</source> <volume>12</volume>:<fpage>298</fpage>. doi: <pub-id pub-id-type="doi">10.3390/diagnostics12020298</pub-id>, <pub-id pub-id-type="pmid">35204388</pub-id></mixed-citation></ref>
<ref id="ref16"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Liu</surname><given-names>X.</given-names></name> <name><surname>Peng</surname><given-names>H.</given-names></name> <name><surname>Zheng</surname><given-names>N.</given-names></name></person-group> (<year>2023</year>). &#x201C;<article-title>Efficientvit: memory efficient vision transformer with cascaded group attention</article-title>&#x201D; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source> (<publisher-loc>New York</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>14420</fpage>&#x2013;<lpage>14430</lpage>.</mixed-citation></ref>
<ref id="ref17"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Marques</surname><given-names>S.</given-names></name> <name><surname>Schiavo</surname><given-names>F.</given-names></name> <name><surname>Ferreira</surname><given-names>C. A.</given-names></name> <name><surname>Pedrosa</surname><given-names>J.</given-names></name> <name><surname>Cunha</surname><given-names>A.</given-names></name> <name><surname>Campilho</surname><given-names>A.</given-names></name></person-group> (<year>2021</year>). <article-title>A multi-task CNN approach for lung nodule malignancy classification and characterization</article-title>. <source>Expert Syst. Appl.</source> <volume>184</volume>:<fpage>115469</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.eswa.2021.115469</pub-id></mixed-citation></ref>
<ref id="ref18"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mazzone</surname><given-names>P. J.</given-names></name> <name><surname>Lam</surname><given-names>L.</given-names></name></person-group> (<year>2022</year>). <article-title>Evaluating the patient with a pulmonary nodule: a review</article-title>. <source>JAMA</source> <volume>327</volume>, <fpage>264</fpage>&#x2013;<lpage>273</lpage>. doi: <pub-id pub-id-type="doi">10.1001/jama.2021.24287</pub-id>, <pub-id pub-id-type="pmid">35040882</pub-id></mixed-citation></ref>
<ref id="ref19"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Milletari</surname><given-names>F.</given-names></name> <name><surname>Navab</surname><given-names>N.</given-names></name> <name><surname>Ahmadi</surname><given-names>S. A.</given-names></name></person-group> (<year>2016</year>). &#x201C;<article-title>V-net: fully convolutional neural networks for volumetric medical image segmentation</article-title>&#x201D; in <source>2016 fourth international conference on 3D vision (3DV)</source> (<publisher-name>IEEE</publisher-name>), <volume>2016</volume>:<fpage>565</fpage>&#x2013;<lpage>571</lpage>. Available online at: <ext-link xlink:href="https://arxiv.org/pdf/1606.04797" ext-link-type="uri">https://arxiv.org/pdf/1606.04797</ext-link></mixed-citation></ref>
<ref id="ref20"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Qin</surname><given-names>D.</given-names></name> <name><surname>Leichner</surname><given-names>C.</given-names></name> <name><surname>Delakis</surname><given-names>M.</given-names></name></person-group> (<year>2024</year>). &#x201C;<article-title>Mobile net V4: universal models for the mobile ecosystem</article-title>&#x201D; in <source>European conference on computer vision</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer Nature Switzerland</publisher-name>), <fpage>78</fpage>&#x2013;<lpage>96</lpage>.</mixed-citation></ref>
<ref id="ref21"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Sunkara</surname><given-names>R.</given-names></name> <name><surname>Luo</surname><given-names>T.</given-names></name></person-group> (<year>2022</year>). &#x201C;<article-title>No more strided convolutions or pooling: a new CNN building block for low-resolution images and small objects</article-title>&#x201D; in <source>Joint European conference on machine learning and knowledge discovery in databases</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer Nature Switzerland</publisher-name>), <fpage>443</fpage>&#x2013;<lpage>459</lpage>.</mixed-citation></ref>
<ref id="ref22"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tang</surname><given-names>S.</given-names></name> <name><surname>Bao</surname><given-names>Q.</given-names></name> <name><surname>Ji</surname><given-names>Q.</given-names></name> <name><surname>Wang</surname><given-names>T.</given-names></name> <name><surname>Wang</surname><given-names>N.</given-names></name> <name><surname>Yang</surname><given-names>M.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Improvement of RT-DETR model for ground glass pulmonary nodule detection</article-title>. <source>PloS one</source> <volume>20</volume>:<fpage>e0317114</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pone.0317114</pub-id></mixed-citation></ref>
<ref id="ref23"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tang</surname><given-names>C.</given-names></name> <name><surname>Zhou</surname><given-names>F.</given-names></name> <name><surname>Sun</surname><given-names>J.</given-names></name> <name><surname>Zhang</surname><given-names>Y.</given-names></name></person-group> (<year>2025</year>). <article-title>Lung-YOLO: multiscale feature fusion attention and cross-layer aggregation for lung nodule detection</article-title>. <source>Biomed. Signal Process. Control</source> <volume>99</volume>:<fpage>106815</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bspc.2024.106815</pub-id></mixed-citation></ref>
<ref id="ref24"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname><given-names>X.</given-names></name> <name><surname>Zhang</surname><given-names>H.</given-names></name> <name><surname>Sun</surname><given-names>J.</given-names></name> <name><surname>Wang</surname><given-names>S.</given-names></name> <name><surname>Zhang</surname><given-names>Y.</given-names></name></person-group> (<year>2024</year>). <article-title>YOLO-MSRF for lung nodule detection</article-title>. <source>Biomed. Signal Process. Control</source> <volume>94</volume>:<fpage>106318</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bspc.2024.106318</pub-id></mixed-citation></ref>
<ref id="ref25"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname><given-names>X.</given-names></name> <name><surname>Yang</surname><given-names>L.</given-names></name> <name><surname>Liu</surname><given-names>S.</given-names></name> <name><surname>Cao</surname><given-names>L.</given-names></name> <name><surname>Wang</surname><given-names>N.</given-names></name> <name><surname>Li</surname><given-names>H.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Interpretation of the 2022 global cancer statistics report</article-title>. <source>Chin. J. Cancer</source> <volume>46</volume>, <fpage>710</fpage>&#x2013;<lpage>721</lpage>. doi: <pub-id pub-id-type="doi">10.3760/cma.j.cn112152-20240416-00152</pub-id></mixed-citation></ref>
<ref id="ref26"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Zhao</surname><given-names>Y.</given-names></name> <name><surname>Lv</surname><given-names>W.</given-names></name> <name><surname>Xu</surname><given-names>S.</given-names></name></person-group> (<year>2024</year>). &#x201C;<article-title>DETRs beat YOLOs on real-time object detection</article-title>&#x201D; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <volume>2024</volume>:<fpage>16965</fpage>&#x2013;<lpage>16974</lpage>. Available online at: <ext-link xlink:href="https://openaccess.thecvf.com/content/CVPR2024/html/Zhao_DETRs_Beat_YOLOs_on_Real-time_Object_Detection_CVPR_2024_paper.html" ext-link-type="uri">https://openaccess.thecvf.com/content/CVPR2024/html/Zhao_DETRs_Beat_YOLOs_on_Real-time_Object_Detection_CVPR_2024_paper.html</ext-link></mixed-citation></ref>
<ref id="ref27"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname><given-names>D.</given-names></name> <name><surname>Xu</surname><given-names>H.</given-names></name> <name><surname>Liu</surname><given-names>W.</given-names></name> <name><surname>Liu</surname><given-names>F.</given-names></name></person-group> (<year>2025</year>). <article-title>LN-DETR: cross-scale feature fusion and re-weighting for lung nodule detection</article-title>. <source>Sci. Rep.</source> <volume>15</volume>:<fpage>15543</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-025-00309-7</pub-id>, <pub-id pub-id-type="pmid">40319047</pub-id></mixed-citation></ref>
<ref id="ref28"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zuo</surname><given-names>W.</given-names></name> <name><surname>Zhou</surname><given-names>F.</given-names></name> <name><surname>Li</surname><given-names>Z.</given-names></name> <name><surname>Wang</surname><given-names>L.</given-names></name></person-group> (<year>2019</year>). <article-title>Multi-resolution CNN and knowledge transfer for candidate classification in lung nodule detection</article-title>. <source>IEEE Access</source> <volume>7</volume>, <fpage>32510</fpage>&#x2013;<lpage>32521</lpage>. doi: <pub-id pub-id-type="doi">10.1109/access.2019.2903587</pub-id></mixed-citation></ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1589149/overview">Xiaohao Cai</ext-link>, University of Southampton, United Kingdom</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3231343/overview">Madhusudan Lanjewar</ext-link>, Goa University, India</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3326866/overview">Liying Han</ext-link>, Hebei University of Technology, China</p>
</fn>
</fn-group>
</back>
</article>