<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Oncol.</journal-id>
<journal-title>Frontiers in Oncology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Oncol.</abbrev-journal-title>
<issn pub-type="epub">2234-943X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fonc.2025.1657159</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Oncology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>FC-YOLO: a fast inference backbone and lightweight attention mechanism-enhanced YOLO for detecting gastric adenocarcinoma in pathological image</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Zhang</surname>
<given-names>Hengtong</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Jia</surname>
<given-names>Jianxin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Wenlian</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yi</surname>
<given-names>Rigui</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yan</surname>
<given-names>Xusheng</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Sun</surname>
<given-names>Wenyue</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Wang</surname>
<given-names>Xinxin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Gao</surname>
<given-names>Yunfei</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Basic and Forensic Medicine, Baotou Medical College</institution>, <addr-line>Inner Mongolia, Baotou</addr-line>,&#xa0;<country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Key Laboratory of Human Anatomy at Universities of Inner Mongolia Autonomous Region</institution>, <addr-line>Inner Mongolia, Baotou</addr-line>,&#xa0;<country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>School of Computer Science and Technology, Baotou Medical College</institution>, <addr-line>Inner Mongolia, Baotou</addr-line>,&#xa0;<country>China</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>School of Medical Technology and Anesthesia, Baotou Medical College</institution>, <addr-line>Inner Mongolia, Baotou</addr-line>,&#xa0;<country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1130790/overview">Xi Wang</ext-link>, The Chinese University of Hong Kong, China</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3131884/overview">Abdullah Al Sakib</ext-link>, Westcliff University, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3135585/overview">Govind Mudavadkar</ext-link>, Northeastern University, United States</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Xinxin Wang, <email xlink:href="mailto:57667478@qq.com">57667478@qq.com</email>; Yunfei Gao, <email xlink:href="mailto:gyfzfl88@hotmail.com">gyfzfl88@hotmail.com</email>
</p>
</fn>
<fn fn-type="equal" id="fn003">
<p>&#x2020;These authors have contributed equally to this work</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>29</day>
<month>09</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>15</volume>
<elocation-id>1657159</elocation-id>
<history>
<date date-type="received">
<day>01</day>
<month>07</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>08</day>
<month>09</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Zhang, Jia, Zhang, Yi, Yan, Sun, Wang and Gao.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Zhang, Jia, Zhang, Yi, Yan, Sun, Wang and Gao</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Background</title>
<p>Gastric adenocarcinoma (GAC) is a leading cause of cancer-related mortality, but its histopathological diagnosis is challenged by image complexity and a shortage of pathologists. While deep learning models show promise, many are computationally demanding and lack the fine-grained feature extraction necessary for effective GAC detection.</p>
</sec>
<sec>
<title>Methods</title>
<p>We propose FC-YOLO, an optimized object detection framework for GAC histopathological image analysis. Based on the YOLOv11s architecture, FC-YOLO incorporates a FasterNet backbone for efficient multi-scale feature extraction, a lightweight Mixed Local-Channel Attention (MLCA) mechanism for feature recalibration, and Content-Aware ReAssembly of FEatures (CARAFE) for enhanced upsampling. The model was evaluated on a public dataset comprising 1,855 images and on a separate, independent clinical dataset consisting of 2,500 pathological images of gastric adenocarcinoma.</p>
</sec>
<sec>
<title>Results</title>
<p>On the public dataset, FC-YOLO achieved a mean Average Precision (mAP) of 82.8%, outperforming the baseline YOLOv11s by 2.6%, while maintaining a high inference speed of 131.56 FPS. On the independent clinical dataset, the model achieved an mAP of 85.7%, demonstrating strong generalization capabilities.</p>
</sec>
<sec>
<title>Conclusion</title>
<p>The lightweight and efficient design of FC-YOLO enables superior performance at a low computational cost. It represents a promising tool to assist pathologists by enhancing diagnostic accuracy and efficiency, particularly in resource-limited settings.</p>
</sec>
</abstract>
<kwd-group>
<kwd>gastric cancer</kwd>
<kwd>pathological images</kwd>
<kwd>prediction</kwd>
<kwd>deep learning</kwd>
<kwd>target detection</kwd>
</kwd-group>
<contract-sponsor id="cn001">Natural Science Foundation of Inner Mongolia Autonomous Region<named-content content-type="fundref-id">10.13039/501100004763</named-content>
</contract-sponsor>
<counts>
<fig-count count="12"/>
<table-count count="10"/>
<equation-count count="5"/>
<ref-count count="45"/>
<page-count count="20"/>
<word-count count="10470"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Gastrointestinal Cancers: Gastric and Esophageal Cancers</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Although the global incidence rate of gastric cancer has declined over the past few decades, it remains the fifth most common cancer type worldwide and the fourth leading cause of cancer-related mortality (<xref ref-type="bibr" rid="B1">1</xref>). In terms of pathological classification, over 95% of gastric cancers are adenocarcinomas (<xref ref-type="bibr" rid="B2">2</xref>). Early detection, accurate diagnosis, and timely surgical intervention are critical to reducing gastric cancer mortality. Histopathological diagnosis serves as the gold standard for confirming gastric cancer, and its outcomes significantly influence treatment planning, underscoring the necessity of robust and efficient pathological diagnostics. However, there is a severe shortage of pathologists globally, including in China (<xref ref-type="bibr" rid="B3">3</xref>). Additionally, due to the complexity of pathological images, the analysis and diagnostic process is inherently challenging and time-consuming, which may compromise diagnostic accuracy. As caseloads increase, pathologists face heightened workloads and occupational overload, further exacerbating diagnostic inaccuracies&#x2014;a problem particularly pronounced in remote and underdeveloped regions. Therefore, the development of computer-aided diagnosis (CAD) tools capable of assisting pathologists and enhancing both the efficiency and accuracy of diagnosis holds significant clinical importance.</p>
<p>Recent years have witnessed remarkable advancements in deep learning for computer-aided diagnosis of medical images, particularly in gastric lesion detection and diagnosis systems. Compared with traditional machine learning methods such as random forests and support vector machines, deep learning demonstrates superior capability in capturing discriminative features from medical images with enhanced flexibility and diagnostic accuracy. In routine clinical pathological practice, histopathological examination of specimens, typically through hematoxylin and eosin (H&amp;E) stained slides, is conventionally conducted under optical microscopy. Notably, deep convolutional neural networks (CNNs) have emerged as pivotal tools in computer vision and medical image analysis. The application of CNNs in digital pathological image analysis for gastric disease classification has garnered significant research attention. Models based on the DeepLabv3 architecture, as employed by Song et&#xa0;al. (<xref ref-type="bibr" rid="B4">4</xref>) and Lan et&#xa0;al. (<xref ref-type="bibr" rid="B5">5</xref>), have demonstrated high accuracy in gastric cancer detection. However, these models exhibit fluctuations in accuracy under specific scenarios. Furthermore, their inference times are often prolonged, which may impede diagnostic efficiency in clinical settings that demand the rapid processing of a large volume of samples, thereby highlighting a need for improvement in their real-time performance. Huang et&#xa0;al. (<xref ref-type="bibr" rid="B6">6</xref>) designed Gastro-MIL, a CNN-based model for the accurate diagnosis of gastric cancer directly from digital H&amp;E-stained images. While its diagnostic performance was reported to surpass that of junior pathologists, the model&#x2019;s processing pipeline is notably complex. 
It requires extensive partitioning of input images into tiles and relies on a multi-stage network architecture, which likely contributes to increased computational overhead and prolonged inference times. Kather et&#xa0;al. (<xref ref-type="bibr" rid="B7">7</xref>) trained a ResNet18 deep learning model to detect gastric cancer and predict Microsatellite Instability (MSI) in histological slides. Although their model is characterized by a low parameter count, it demonstrates notable deficiencies in accuracy, achieving an Area Under the Curve (AUC) of only 0.69 in a predominantly Asian cancer cohort, which suggests limited generalization capability across different ethnic populations. Zhang et&#xa0;al. (<xref ref-type="bibr" rid="B8">8</xref>) proposed a model named VENet, which accurately segments glands in pathological images. However, the model employs a complex network architecture and loss function, incorporating a multi-scale training strategy and an iterative optimization process, which increases computational complexity and results in high deployment and training costs. In a related study, Shi et&#xa0;al. (<xref ref-type="bibr" rid="B9">9</xref>) introduced GCLDNet, a deep learning model that, despite its excellent performance on the BOT dataset, may suffer from reduced detection accuracy for minute lesions. Furthermore, the model lacks validation on clinically acquired samples, and its large parameter count restricts its application on low-resource devices. Liang et&#xa0;al. (<xref ref-type="bibr" rid="B10">10</xref>) employed a weakly supervised learning approach for the subcellular-level segmentation of gastric adenocarcinoma, achieving an accuracy of 0.9109 on the BOT dataset. Although this method reduces the annotation burden for pathologists, its recognition capability in specific scenarios is insufficient, making it prone to misdiagnosis. 
Additionally, the model requires a lengthy training period, and its inference time is highly sensitive to parameter settings. Ning et&#xa0;al. (<xref ref-type="bibr" rid="B11">11</xref>) synergistically utilized U-Net and QuPath for diagnosing gastric adenocarcinoma; however, the model&#x2019;s generalization ability is constrained by a small sample size, and it similarly suffers from a complex processing workflow that prolongs inference time. Fu et&#xa0;al. (<xref ref-type="bibr" rid="B12">12</xref>) integrated Transformer and CNN architectures for the multi-class classification of gastric cancer pathology images, which enhanced model accuracy but concurrently increased computational overhead. Furthermore, the windowed attention mechanism and feature fusion process inherent to the Swin Transformer component increased per-sample processing time, leading to extended inference durations. Meanwhile, the application of the YOLOv4 model by Tung et&#xa0;al. (<xref ref-type="bibr" rid="B13">13</xref>) for detecting gastric adenocarcinoma, which lacked specific optimizations for histopathological features, was hampered by issues such as prolonged inference times and poor generalization capability. In a more recent study, Ma et&#xa0;al. (<xref ref-type="bibr" rid="B14">14</xref>) proposed an ensemble deep learning framework that fused VGG16, ResNet50, and MobileNetV2. This framework demonstrated exceptional performance in gastric cancer classification, surpassing standalone models. However, the fusion of three models resulted in high computational complexity and increased per-image processing time, posing challenges for real-time clinical applications and deployment on edge devices. Lomans et&#xa0;al. (<xref ref-type="bibr" rid="B15">15</xref>) trained an nnU-Net to assist in the pathological diagnosis of hereditary diffuse gastric cancer. 
Although its performance in lesion size estimation and cell type quantification approached that of pathologists, it exhibited insufficient adaptability to the complex tumor microenvironment. Moreover, its reliance on large-scale, meticulously annotated data&#xa0;restricts its broader adoption in resource-constrained medical institutions.</p>
<p>In summary, mainstream deep learning models for the detection of gastric adenocarcinoma in pathological images continue to face several critical challenges. High-accuracy models are often characterized by high computational complexity and prolonged inference times, which constrains their practical deployment in resource-limited hospitals and impedes their ability to meet the real-time requirements of clinical workflows. Conversely, lightweight models typically lack the requisite accuracy for reliable diagnosis. To address these limitations, we propose the FC-YOLO model to establish a deep learning framework capable of concurrently achieving high accuracy, rapid inference, and a lightweight design, tailored to the unique challenges of gastric adenocarcinoma pathology detection. This model incorporates three key modifications to the YOLOv11s baseline: first, the backbone network is replaced with FasterNet to more efficiently extract multi-scale features while enhancing inference speed; second, a lightweight MLCA attention mechanism is introduced to bolster the model&#x2019;s focus on critical diagnostic regions; and finally, the CARAFE upsampling operator is employed for a more precise restoration of lesion boundary details.</p>
<p>The contributions of this paper are as follows:</p>
<list list-type="simple">
<list-item>
<p>1. This study proposes an enhanced object detection framework for pathological images based on YOLOv11 architecture. We systematically integrate three key components: the FasterNet backbone network, MLCA attention mechanism, and CARAFE upsampling operator into the baseline model. This multi-component integration strategy significantly enhances detection accuracy while maintaining computational efficiency, offering a robust solution for target detection tasks in histopathological image analysis.</p>
</list-item>
<list-item>
<p>2. Our model achieves an mAP of 82.8% on the BOT gastric adenocarcinoma dataset, demonstrating a 2.6% performance enhancement compared to the baseline YOLOv11s architecture. This advancement establishes a novel framework for pathological image detection in gastric adenocarcinoma diagnostics, addressing critical challenges in clinical histopathology analysis.</p>
</list-item>
<list-item>
<p>3. Extensive comparative and ablation experiments have comprehensively validated the effectiveness of our model in detecting gastric adenocarcinoma within pathological images. This study presents an innovative modification of YOLOv11s for this purpose, and its findings are anticipated to be of considerable value to future researchers in the field of object detection who are focused on the identification and prediction of tumors in pathological imagery.</p>
</list-item>
</list>
</sec>
<sec id="s2">
<label>2</label>
<title>Proposed method</title>
<sec id="s2_1">
<label>2.1</label>
<title>Pathological image recognition model based on the YOLO network</title>
<p>The YOLO (You Only Look Once) framework represents a breakthrough in object detection neural networks, capable of simultaneous object localization and classification through a unified network architecture. Unlike conventional region proposal-based methods, YOLO demonstrates superior processing speed by employing an end-to-end detection paradigm. Built upon convolutional neural networks (CNNs), this framework processes entire images in a single forward pass rather than analyzing multiple image patches sequentially. This unified approach eliminates redundant computations inherent in sliding window techniques, enabling real-time performance while maintaining detection accuracy (<xref ref-type="bibr" rid="B16">16</xref>). Through continuous architectural evolution, multiple YOLO variants have been developed to enhance detection capabilities. In this study, we adopt YOLOv11s as our foundational framework, specifically optimized for histopathological image analysis tasks. Our rationale for selecting YOLOv11s is rooted in its integration of several state-of-the-art architectural optimizations. In comparison to the well-established YOLOv8 model, YOLOv11 introduces the C3K2 module. This module optimizes information propagation throughout the network by partitioning the feature map and applying a series of smaller kernels, which enhances feature representation using fewer parameters and at a lower computational cost than the C2f module in YOLOv8. Furthermore, its innovative C2PSA module refines the model&#x2019;s capacity for selective attention to regions of interest by applying spatial attention to the extracted features, all while maintaining a judicious balance between computational cost and detection accuracy. This architectural design affords YOLOv11 an advantage over YOLOv8 and its predecessors in scenarios requiring the precise detection of fine-grained object details. 
Consequently, this highly favorable performance-to-cost profile establishes it as an ideal starting point for our task of detecting gastric adenocarcinoma in pathological images, facilitating the integration and validation of our novel optimization modules without being encumbered by the overhead of an excessively large base model.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>The architecture of the FasterNet network</title>
<p>While the standard backbone in YOLOv11s is effective, it represents a general-purpose design. When applied to pathological images&#x2014;which are characterized by complex backgrounds and a high degree of similarity and redundancy between adjacent regions or across different feature map levels&#x2014;this generic approach can lead to exhaustive convolutional operations. This results in an inefficient allocation of computational resources and a constrained capacity for extracting critical features. To address this limitation, we introduce the FasterNet network architecture. In contrast to standard convolutional backbones, FasterNet is architected around the principle of Partial Convolution (PConv), a design that reduces redundant computations, thereby enhancing computational efficiency while simultaneously promoting hardware-friendly implementation. This novel neural network family demonstrates exceptional computational efficiency and robust performance across diverse vision tasks. The architecture prioritizes hardware-friendly design through structural simplification, as illustrated in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>. The framework comprises four hierarchical levels, each initiated with either an embedding layer (4&#xd7;4 convolution with stride 4) or a merging layer (2&#xd7;2 convolution with stride 2) for spatial downsampling and channel expansion. Each stage incorporates multiple FasterNet modules, with increased module density in the final two stages where memory access costs diminish and computational intensity (FLOPS) escalates. The core FasterNet module employs a partial convolution (PConv) layer followed by two point-wise convolutional (PWConv) layers, forming an inverted residual block with expanded intermediate channels and skip connections for feature reuse. Strategic architectural optimizations include: 1) Streamlined deployment of normalization and activation layers (post-PWConv only) to balance feature diversity with reduced inference latency; 2) Preferential use of batch normalization (BN) over layer normalization (LN) to leverage operator fusion capabilities with adjacent convolutional layers; 3) Dynamic resource allocation through multi-scale feature fusion - preserving moderate computational capacity in shallow layers for microscopic pattern extraction while intensifying FasterNet module density in deeper layers for macroscopic feature interpretation. These design principles collectively enhance detection accuracy while maintaining computational efficiency, particularly crucial for high-resolution pathological image analysis (<xref ref-type="bibr" rid="B17">17</xref>).</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>The overall architecture of FasterNet. The asterisk (*) in the &#x201c;Partial Convolution (PConv)&#x201d; diagram denotes the convolution.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1657159-g001.tif">
<alt-text content-type="machine-generated">Diagram illustrating a FasterNet model architecture. It begins with input preprocessing, followed by a sequence of four FasterNet blocks for stages one to four. Each block receives inputs through embedding and merging layers. Subsequent layers include global pooling, 1x1 convolution, and fully connected (FC) layers, ending with the output. Below, an inset details a Partial Convolution process with a schematic of input and output data flows within the FasterNet block.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>FC-YOLO for attention</title>
<p>The complex backgrounds and irregular target morphologies inherent to pathological images pose significant challenges to conventional attention mechanisms. Traditional attention methods, which establish channel-wise dependencies through global feature compression, often neglect critical local spatial context, thereby limiting their capacity to effectively represent features of heterogeneous lesions. While a recent study by Zubair et&#xa0;al. (<xref ref-type="bibr" rid="B18">18</xref>) proposed a multi-channel attention framework that enhanced the effectiveness and comprehensiveness of feature extraction in gastric cancer pathology images, the integration of this framework results in excessively high computational complexity. The parallel multi-channel computations and the stacking of attention mechanisms contribute to increased inference latency. Furthermore, this approach necessitates high-performance GPUs, rendering its deployment challenging in primary hospitals with limited resources. To address these limitations, we introduce a lightweight hybrid local attention mechanism, the Mixed Local-Channel Attention (MLCA), which we integrate into the model&#x2019;s backbone network. This innovative design enables efficient channel-spatial joint attention modeling with local-global feature synergy under low computational budgets. As depicted in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>, the core principle of the MLCA mechanism is predicated upon a two-stage pooling process. Initially, the mechanism converts input features into a vector of dimensions 1 &#xd7; C &#xd7; ks &#xd7; ks via a local pooling operation, designed for the efficient capture of local spatial information. Subsequently, feature extraction proceeds along two parallel branches: one branch is dedicated to extracting global contextual information, while the other focuses on preserving fine-grained local spatial details. The one-dimensional vectors generated by these branches are then processed by a 1D convolution (Conv1d), after which their resolution is restored to the original dimensions. A final information fusion step realizes the objective of the hybrid attention mechanism. Notably, the kernel size, <italic>k</italic>, of the 1D convolution is proportional to the number of channels, C. Its primary function is to capture local cross-channel interactions between each channel and its <italic>k</italic> adjacent neighbors. The formula for computing <italic>k</italic> is provided in <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>, where <italic>k</italic> represents the kernel size, C is the channel count, and &#x3b3; and b are hyperparameters with default values of 2. To ensure that the kernel size <italic>k</italic> is always odd, it is incremented by one if the calculated value is even. Furthermore, the MLCA mechanism simultaneously considers channel information, spatial dimensions, and multi-level features from both local and global perspectives. This design effectively addresses a common limitation of traditional channel attention mechanisms&#x2014;the neglect of spatial feature details&#x2014;thereby reducing the computational burden of spatial attention modules while enhancing the model&#x2019;s representational power and detection performance. 
Through the application of two-stage pooling and a dynamically optimized 1D convolution, MLCA not only improves processing speed but also circumvents the potential accuracy degradation associated with channel dimensionality reduction. Ultimately, MLCA strikes an ideal balance between model complexity and performance gain, significantly enhancing both the scalability of the attention mechanism and the efficacy of object detection (<xref ref-type="bibr" rid="B19">19</xref>). Consequently, the MLCA mechanism has the potential to improve the detection sensitivity of gastric adenocarcinoma foci by suppressing staining artifacts and interference from inflammatory cells, all while maintaining computational efficiency.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Left: Schematic diagram of the MLCA. Right: The structure of the MLCA. LAP (Local Average Pooling), which divides the feature map into k * k patches and applies average pooling to each patch; GAP (Global Average Pooling), which uses adaptive pooling to reduce the feature map to a 1 * 1 output size; UNAP (Anti-average Pooling), which mainly focuses on the figure&#x2019;s properties and scaling to the needed size.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1657159-g002.tif">
<alt-text content-type="machine-generated">Diagram illustrating the Mixed Local Channel Attention (MLCA) process. The left section shows a 3D input tensor with markings for dimensions C, W, and H. Operations include LAP, GAP, convolution, reshaping, and UNAP. The process combines outputs, leading to the final output tensor. The right section provides a flowchart of interconnected steps labeled as Input, LAP, GAP, Reshape, Conv1d, UNAP, and output, reflecting the same operations visually.</alt-text>
</graphic>
</fig>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi>&#x3a6;</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mo>&#x2758;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>log</mml:mtext>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mfrac>
<mml:mo>+</mml:mo>
<mml:mfrac>
<mml:mi>b</mml:mi>
<mml:mi>&#x3b3;</mml:mi>
</mml:mfrac>
<mml:msub>
<mml:mo>&#x2758;</mml:mo>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>CARAFE for FC-YOLO</title>
<p>In histopathology, the precise delineation of lesion boundaries is of paramount importance for diagnosis. However, the feature fusion module in YOLOv11s utilizes conventional upsampling methods, which are characterized by fixed convolutional kernel weights that remain agnostic to the input content. This static approach can result in the imprecise demarcation of cancer cell boundaries within pathological images and may also lead to suboptimal contextual awareness. To address these limitations, we implement CARAFE, which enhances contextual perception through large receptive fields (up to 5&#xd7;5) and dynamically predicts upsampling kernel weights based on feature semantics. This adaptive mechanism significantly improves the model&#x2019;s capability to exploit discriminative features in histopathological images with complex backgrounds, particularly for detecting subtle malignant patterns obscured by stromal interference (<xref ref-type="bibr" rid="B20">20</xref>).</p>
<p>The CARAFE module comprises two synergistic components: an upsampling kernel prediction module and a feature reassembly module, with its primary workflow illustrated in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>. In the initial stage, the input feature map &#x3c7; which contains target location information, undergoes channel compression. Subsequently, these compressed features are fed into a lightweight content encoder that dynamically predicts a unique reassembly kernel for each location. These kernels essentially define a set of weights for the neighboring pixels within the original feature map. A kernel normalization module then applies the softmax function to these weights, transforming them into a probability distribution. This mechanism enables the dynamic enhancement of minute details, such as alterations in cellular morphology, while simultaneously suppressing less relevant regions. In the second stage, based on a given upsampling rate &#x3c3;, the reassembly kernels predicted in the preceding stage are applied to the original feature map. Through a weighted reassembly of information from the local neighborhood of the original feature map, a new feature map &#x3c7;&#x2019; with dimensions of C &#xd7; &#x3c3;H &#xd7; &#x3c3;W is generated, thereby accomplishing the upsampling task. The integration of the CARAFE upsampling module enhances the feature extraction and fusion capabilities of the network&#x2019;s neck, effectively mitigating detection challenges posed by complex background interference and densely distributed small targets, such as cancer cell clusters, within pathological images.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Work principles of CARAFE. CARAFE is composed of two key components, kernel prediction module and content-aware reassembly module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1657159-g003.tif">
<alt-text content-type="machine-generated">Diagram illustrating a two-part processing model for image analysis. The top section depicts a &#x201c;Kernel Prediction Module&#x201d; that includes a &#x201c;Content Encoder&#x201d;, &#x201c;Kernel Normalizer&#x201d;, and a &#x201c;Channel Compressor&#x201d; leading to a processed output. The bottom section shows a &#x201c;Content-aware Reassembly Module&#x201d; with operations involving reassembly and example locations, transforming input into output through specified grid operations. The model sequence involves input images, encoding, normalization, reassembly, and final output visualization, with specific operations denoted by symbols for example location and reassemble operation.</alt-text>
</graphic>
</fig>
<p>The CARAFE upsampling procedure operates through a systematic pipeline to achieve feature map resolution enhancement. Initially, the input feature map &#x3c7; with dimensions H&#xd7;W&#xd7;C undergoes channel compression via a 1&#xd7;1 convolutional layer, reducing the channel depth from C to C_m (where C_m &lt; C) to optimize computational efficiency. Subsequently, a content encoder comprising convolutional layers generates a reorganization kernel tensor of shape H&#xd7;W&#xd7;&#x3c3;&#xb2;&#xd7;k_up&#xb2;, where &#x3c3; denotes the upsampling factor and k_up represents the receptive field size governing feature recombination. This tensor is spatially expanded to dimensions &#x3c3;H&#xd7;&#x3c3;W&#xd7;k_up&#xb2;, followed by kernel normalization to ensure the summation of convolutional weights equals unity. Within the content-aware reassembly module, each spatial position (i&#x2019;, j&#x2019;) in the output feature map &#x3c7;&#x2019; corresponds to a k_up&#xd7;k_up neighborhood region centered at (i/&#x3c3;, j/&#x3c3;) in the input feature map. The output activation is computed as the dot product between the predicted kernel weights and the unfolded input features. Notably, kernel sharing is implemented across channels at identical spatial locations, effectively balancing parameter efficiency and feature discriminability. Through this mechanism, the upsampled feature map &#x3c7;&#x2019; with dimensions &#x3c3;H&#xd7;&#x3c3;W&#xd7;C is reconstructed while preserving structural coherence and semantic granularity.</p>
</sec>
<sec id="s2_5">
<label>2.5</label>
<title>FC-YOLO</title>
<p>The FC-YOLO model proposed in this study employs a hierarchical feature processing mechanism. Following standardized preprocessing, input images are first fed into a FasterNet-based backbone for multi-scale feature extraction. The extracted deep features are then passed through an SPPF module, which fuses contextual information via multi-scale pooling. Subsequently, the features proceed to a C2PSA module, which incorporates Position-Sensitive Attention (PSA) to enhance feature extraction and selective residual connections to optimize gradient propagation. Finally, high-level semantic features are outputted by the MLCA attention mechanism. Within this stage, MLCA utilizes a local-global dual-path pooling strategy combined with dynamic weight allocation to intensify the feature response to atypical cells.</p>
<p>The enhanced neck branch adopts a bi-directional feature pyramid architecture to promote multi-scale information interaction. On the upsampling path, features are first upsampled by a CARAFE module and concatenated along the channel dimension with the output from Stage3 of the backbone. These fused features are then refined by a C3k2 module with cross-stage residual connections. This process is repeated: the refined features are again upsampled by CARAFE and fused with features from Stage2 of the backbone. Throughout this process, CARAFE&#x2019;s dynamic kernel generation mechanism effectively recovers the geometric structural features of minute lesions by modeling local context. The downsampling path utilizes 3&#xd7;3 standard convolutions with a stride of 2 to progressively compress spatial dimensions. Features at an 80&#xd7;80 resolution are downsampled to 40&#xd7;40, concatenated with the refined features from the first level of the upsampling path, and then fed into a C3k2 module to achieve cross-scale semantic fusion. A further downsampling to 20&#xd7;20 resolution follows, where the features are concatenated with the original MLCA-enhanced high-level features, thereby constructing a feature pyramid rich in multi-dimensional semantic information. The C3k2 module optimizes the parameter count by employing bottleneck structures and depth-wise separable convolutions, reducing the computational load while preserving representational capacity. Inference is ultimately performed synergistically by multi-scale detection heads. The entire architecture achieves efficient detection of multi-scale lesions in gastric cancer pathology images through the synergistic interplay of MLCA&#x2019;s cross-dimensional attention, CARAFE&#x2019;s dynamic content-aware upsampling, and FasterNet&#x2019;s high-efficiency feature extraction. A detailed illustration of the model architecture is provided in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>The structure of FC-YOLO: Backbone <bold>(A)</bold>, Neck <bold>(B)</bold>, Head <bold>(C)</bold>, V11Detect <bold>(D)</bold>, C3k2 <bold>(E)</bold>, and C2PSA <bold>(F)</bold>.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1657159-g004.tif">
<alt-text content-type="machine-generated">Flowchart of the FC-YOLO architecture, divided into three sections: Backbone, Neck, and Head. The Backbone includes FasterNet Blocks, followed by SPPF, C2PSA, and MLCA blocks. The Neck features CARAFE and C3k2 blocks, with concatenation operations linking them. The Head uses V11Detect for output processing. Inset diagrams for C3k2 and C2PSA show internal operations. The starting image is a pathology slide of tissue.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2_6">
<label>2.6</label>
<title>The fabrication of the dataset</title>
<p>Two distinct datasets were utilized in this study: the BOT Dataset and In-House Clinical Dataset. The BOT Dataset, initially comprising 700 gastric adenocarcinoma images, was expanded to a total of 1,855 images through random data augmentation and preprocessing. This expanded collection, serving as our internal dataset, was subsequently partitioned into training, validation, and test sets at an 8:1:1 ratio, resulting in 1,485 images for training, 175 for validation, and 184 for testing. The In-House Clinical Dataset was constructed from 2,500 pathological images of gastric adenocarcinoma, acquired via microscopic examination from samples of 50 patients at the Affiliated Hospital of Baotou Medical College, Inner Mongolia University of Science and Technology, between 2018 and 2023. This dataset was also partitioned according to an 8:1:1 ratio, yielding 2,000 images for the training set and 250 images each for the validation and test sets. <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref> provides a detailed summary of this dataset partitioning.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>The partitioning of the dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Name</th>
<th valign="middle" align="left">Dataset</th>
<th valign="middle" align="left">Data source</th>
<th valign="middle" align="left">Proportion</th>
<th valign="middle" align="left">Number of pictures</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="3" align="left" style="">BOT Dataset</td>
<td valign="middle" align="left" style="">training set</td>
<td valign="middle" rowspan="3" align="left" style="">The 2017 China Big Data Artificial Intelligence Innovation and Entrepreneurship Competition</td>
<td valign="middle" align="center" style="">80%</td>
<td valign="middle" align="center" style="">1485</td>
</tr>
<tr>
<td valign="middle" align="left" style="">validation set</td>
<td valign="middle" align="center" style="">10%</td>
<td valign="middle" align="center" style="">175</td>
</tr>
<tr>
<td valign="middle" align="left" style="">test set</td>
<td valign="middle" align="center" style="">10%</td>
<td valign="middle" align="center" style="">184</td>
</tr>
<tr>
<td valign="middle" rowspan="3" align="left" style="">In-House Clinical Dataset</td>
<td valign="middle" align="left" style="">training set</td>
<td valign="middle" rowspan="3" align="left" style="">Affiliated Hospital of Baotou Medical College, Inner Mongolia University of Science and Technology</td>
<td valign="middle" align="center" style="">80%</td>
<td valign="middle" align="center" style="">2000</td>
</tr>
<tr>
<td valign="middle" align="left" style="">validation set</td>
<td valign="middle" align="center" style="">10%</td>
<td valign="middle" align="center" style="">250</td>
</tr>
<tr>
<td valign="middle" align="left" style="">test set</td>
<td valign="middle" align="center" style="">10%</td>
<td valign="middle" align="center" style="">250</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2_7">
<label>2.7</label>
<title>Evaluation indicators</title>
<p>The trained model was applied to the test set, generating predictions that included bounding box coordinates and confidence scores. Confidence scores, ranging from 0 to 1, quantify the probability of a predicted bounding box belonging to a specific category, with higher values indicating greater certainty.</p>
<p>To rigorously evaluate prediction accuracy, seven metrics were employed:</p>
<p>1. Precision (P): Ratio of true positives (TP) to all positive predictions and the calculation method is shown in <xref ref-type="disp-formula" rid="eq2">Equation 2</xref>.</p>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mtext>Precision</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>2. Recall (R): Ratio of true positives to all actual positives and is calculated using <xref ref-type="disp-formula" rid="eq3">Equation 3</xref>.</p>
<disp-formula id="eq3">
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:mtext>Recall</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>3. mAP@50: Mean average precision at an intersection-over-union (IoU) threshold of 0.5, the calculation formula is shown in <xref ref-type="disp-formula" rid="eq4">Equation 4</xref>.</p>
<disp-formula id="eq4">
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:mtext>mAP</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mrow><mml:mtext>m</mml:mtext></mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mstyle>
<mml:msubsup>
<mml:mo>&#x222b;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mtext>p</mml:mtext>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>R</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mtext>dR</mml:mtext>
</mml:mrow>
<mml:mtext>N</mml:mtext>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>4. mAP@50-95: Mean average precision averaged over IoU thresholds from 0.5 to 0.95 (step: 0.05).</p>
<p>5. Frames Per Second (FPS): Real-time inference speed.</p>
<p>6. F1 Score: Harmonic mean of precision and recall, computed as indicated in <xref ref-type="disp-formula" rid="eq5">Equation 5</xref>.</p>
<disp-formula id="eq5">
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:mtext>F</mml:mtext>
<mml:mn>1</mml:mn>
<mml:mo>=</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>Precision</mml:mtext>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>Recall</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>Precision</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>Recall</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>7. Operations Per Second (GFLOPS): Computational complexity in giga floating-point operations per second.</p>
</sec>
<sec id="s2_8">
<label>2.8</label>
<title>Model parameter configuration and training</title>
<p>The hyperparameter settings for the training are shown in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Parameter settings.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Epoch</th>
<th valign="middle" align="center">Batch_size</th>
<th valign="middle" align="center">Initial-learning rate</th>
<th valign="middle" align="center">Optimizer</th>
<th valign="middle" align="center">Lr decay</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">600</td>
<td valign="middle" align="center">32</td>
<td valign="middle" align="center">0.01</td>
<td valign="middle" align="center">Adam</td>
<td valign="middle" align="center">Step</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Hyperparameter configurations were executed through a command-line interface (CLI). Upon initiating the training command, the model training process commenced. The program first loaded the predefined model architecture, followed by iterative training on the dataset. After completing 600 epochs, the training phase concluded, and the finalized model was saved to a user-specified directory path. Throughout this workflow, identical hyperparameter settings and training protocols were applied to all YOLOv11 model variants (e.g., YOLOv11n, YOLOv11s), ensuring consistency in comparative performance evaluation.</p>
</sec>
<sec id="s2_9">
<label>2.9</label>
<title>Data expansion</title>
<p>The histopathological images in both datasets for this study were annotated using the Labelme software. This was a two-tiered process: the initial annotations were performed by two physicians with over five years of experience in pathological diagnosis. To ensure accuracy, any annotations considered ambiguous were subsequently reviewed and finalized by a senior pathologist with more than ten years of diagnostic experience. This structured workflow was designed to minimize annotation bias and ensure the high quality of the ground-truth labels. To address the significant stain variations in histopathological images, we considered both standard normalization techniques and a more advanced component-wise approach. Standard methods, such as those proposed by Reinhard et&#xa0;al. (<xref ref-type="bibr" rid="B21">21</xref>) and Macenko et&#xa0;al. (<xref ref-type="bibr" rid="B22">22</xref>), are effective for global stain normalization by mapping the color space of an image to a target template. However, for gastric adenocarcinoma diagnosis, where subtle nuclear atypia and chromatin texture (stained by hematoxylin) are critical diagnostic features, a global transformation risks obscuring these minute but vital local details. Moreover, such a global stain normalization approach is detrimental to enhancing the model&#x2019;s generalization capability and its robustness against the inherent variations present in pathological images. Consequently, we adopted the more sophisticated method proposed by Tellez et&#xa0;al. (<xref ref-type="bibr" rid="B23">23</xref>). This approach was chosen to ensure that the augmented images could more accurately reflect both the cellular atypia and the image transformations arising from staining variations inherent in clinical practice, thereby enhancing the model&#x2019;s detection performance. 
Specifically, our data augmentation pipeline began with a series of spatial transformations, including random rotations, horizontal and vertical flips, isotropic scaling sampled from a uniform distribution, and elastic deformations (with parameters &#x3b1;=100, &#x3c3;=10) designed to simulate minor tissue distortions. Subsequently, the RGB image patches were converted into the Hematoxylin (H), Eosin (E), and Residual (R) color space. The intensity of each channel was then perturbed using an independent multiplicative factor &#x3b1;, sampled from a uniform distribution [0.95, 1.05], and an additive bias &#x3b2;, sampled from [&#x2212;0.05, 0.05]. After reconstructing the images back into the RGB space, we introduced further variations by randomly adjusting their brightness and contrast with factors sampled from a uniform distribution of [0.75, 1.5], and their color saturation with factors from [0.75, 1.25]. Finally, a Gaussian blur of random intensity was applied to a subset of these images. The specific parameters and their corresponding distributions are detailed in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref> of Appendix 1.</p>
</sec>
<sec id="s2_10">
<label>2.10</label>
<title>Visualization of the training network</title>
<p>The GradCAM++ method (<xref ref-type="bibr" rid="B24">24</xref>) was utilized to analyze the trained network, providing visual insights into the basis of the predictions.</p>
</sec>
<sec id="s2_11">
<label>2.11</label>
<title>Software and hardware</title>
<p>The computational framework was implemented using PyTorch 1.13.1. All experimental procedures - including data pipeline construction, preprocessing operations, network training, and inference tasks - were executed on a high-performance workstation equipped with an Intel Core i7-12700K CPU (3.20GHz base frequency, 4.40GHz turbo boost), 128GB DDR4 RAM, and dual NVIDIA GeForce RTX 4090 D GPUs with 24GB GDDR6X memory each.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Experiments and results</title>
<sec id="s3_1">
<label>3.1</label>
<title>Comparison of basic models</title>
<p>The selection of an appropriate baseline model is a critical step in the development of computer-aided diagnosis systems intended for practical clinical application. Our choice of YOLOv11s as the base architecture was informed by a multi-faceted evaluation of its detection accuracy on pathological images, inference speed, and computational efficiency, the latter being a proxy for deployment feasibility. While numerous YOLO variants exist, such as the widely adopted YOLOv5 and the powerful YOLOv8, we specifically selected YOLOv11 (<xref ref-type="bibr" rid="B25">25</xref>) as it represents the latest advancements in optimizing the accuracy-efficiency trade-off for real-time applications. To substantiate this choice, we conducted a preliminary comparative analysis of the YOLOv11 variants (<xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>). This comparison illuminated a fundamental principle: larger models (e.g., YOLOv11m/l/x) deliver superior accuracy but at a substantial computational cost, rendering their deployment impractical in many clinical settings with limited hardware resources. Conversely, the smallest model (YOLOv11n), while highly efficient, lacks the requisite accuracy for high-stakes diagnostic tasks, where a missed detection could have severe consequences. YOLOv11s, however, strikes a judicious balance. It offers a significant improvement in accuracy over YOLOv11n while maintaining a modest parameter count (9.45M) and a low computational load (21.7 GFLOPS). This &#x201c;sweet spot&#x201d; is paramount; it establishes a robust performance baseline that is sufficiently high for meaningful clinical assistance, yet efficient enough to permit real-time inference on standard GPUs. This equilibrium makes it an ideal foundation for our subsequent optimizations. Furthermore, compared to other object detection models, YOLOv11s incorporates state-of-the-art architectural improvements, providing an advantageous starting point for the integration of our custom modules: FasterNet, MLCA, and CARAFE. Consequently, YOLOv11s was chosen as the foundational model for our study.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Comparison of different versions of YOLOv11.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Models</th>
<th valign="middle" align="center">mAP50(%)</th>
<th valign="middle" align="center">Params[m]</th>
<th valign="middle" align="center">GFLOPS</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left" style="">YOLOv11n</td>
<td valign="middle" align="center" style="">78.2</td>
<td valign="middle" align="center" style="">2.62</td>
<td valign="middle" align="center" style="">6.6</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s</td>
<td valign="middle" align="center" style="">80.3</td>
<td valign="middle" align="center" style="">9.45</td>
<td valign="middle" align="center" style="">21.7</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11m</td>
<td valign="middle" align="center" style="">82.1</td>
<td valign="middle" align="center" style="">20.1</td>
<td valign="middle" align="center" style="">68.5</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11l</td>
<td valign="middle" align="center" style="">83.9</td>
<td valign="middle" align="center" style="">25.3</td>
<td valign="middle" align="center" style="">87.6</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11x</td>
<td valign="middle" align="center" style="">84.2</td>
<td valign="middle" align="center" style="">56.9</td>
<td valign="middle" align="center" style="">196</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>The comparison of accuracy among different backbone networks</title>
<p>To identify optimal CNN backbone architectures, we conducted a systematic evaluation of classical deep learning models (VGG16 (<xref ref-type="bibr" rid="B26">26</xref>), ResNet50 (<xref ref-type="bibr" rid="B27">27</xref>), MobileNetV2 (<xref ref-type="bibr" rid="B28">28</xref>), FasterNet, and EfficientViT (<xref ref-type="bibr" rid="B29">29</xref>)) under standardized experimental protocols. <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref> presents their validation performance in gastric adenocarcinoma detection tasks with data augmentation. The reported metrics represent mean accuracy scores derived from four-fold cross-validation. A comparative analysis revealed that FasterNet, ResNet50, and EfficientViT demonstrate comparable performance. Consequently, these architectures were selected to replace the backbone component of the YOLOv11 model. A subsequent comparative study was conducted to further investigate which of these CNN architectures is most adept for object detection tasks within the context of histopathological analysis.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>The comparison among backbone networks.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">CNN architecture</th>
<th valign="middle" align="left">Validation accuracy</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">VGG16</td>
<td valign="middle" align="center">0.757</td>
</tr>
<tr>
<td valign="middle" align="left">ResNet50</td>
<td valign="middle" align="center">0.763</td>
</tr>
<tr>
<td valign="middle" align="left">FasterNet</td>
<td valign="middle" align="center">0.766</td>
</tr>
<tr>
<td valign="middle" align="left">EfficientViT</td>
<td valign="middle" align="center">0.768</td>
</tr>
<tr>
<td valign="middle" align="left">MobileNetV2</td>
<td valign="middle" align="center">0.755</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>During the training phase, it is crucial to monitor the evolution of precision and recall, while the final training outcomes are assessed using mAP@0.5 and mAP@[0.5:0.95]. These metrics offer valuable insights into a model&#x2019;s performance and its generalization capability. As detailed in <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref> and <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>, we evaluated three high-performing backbone networks&#x2014;FasterNet, EfficientViT, and ResNet50&#x2014;by substituting them into the YOLOv11s architecture. The analysis of the results indicates that, compared to the baseline version, the FasterNet substitution yielded the most favorable performance. Specifically, it achieved a 1.2% increase in mAP and a 1.9% improvement in accuracy. Consequently, based on these findings, FasterNet was definitively selected as the backbone for our model.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Gastric cancer indicators under different models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Models</th>
<th valign="middle" align="center">P</th>
<th valign="middle" align="center">R</th>
<th valign="middle" align="center">mAP50</th>
<th valign="middle" align="center">mAP50-95</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left" style="">Baseline(YOLOv11s)</td>
<td valign="middle" align="center" style="">0.835</td>
<td valign="middle" align="center" style="">0.713</td>
<td valign="middle" align="center" style="">0.802</td>
<td valign="middle" align="center" style="">0.577</td>
</tr>
<tr>
<td valign="middle" align="left" style="">FasterNet</td>
<td valign="middle" align="center" style="">0.854</td>
<td valign="middle" align="center" style="">0.715</td>
<td valign="middle" align="center" style="">0.814</td>
<td valign="middle" align="center" style="">0.615</td>
</tr>
<tr>
<td valign="middle" align="left" style="">Efficientvit</td>
<td valign="middle" align="center" style="">0.841</td>
<td valign="middle" align="center" style="">0.710</td>
<td valign="middle" align="center" style="">0.806</td>
<td valign="middle" align="center" style="">0.578</td>
</tr>
<tr>
<td valign="middle" align="left" style="">ResNet50</td>
<td valign="middle" align="center" style="">0.795</td>
<td valign="middle" align="center" style="">0.682</td>
<td valign="middle" align="center" style="">0.794</td>
<td valign="middle" align="center" style="">0.544</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Visual Comparison of Different Backbone Networks. <bold>(A)</bold> Visualization results after FasterNet replacement. <bold>(B)</bold> Visualization results after EfficientViT replacement. <bold>(C)</bold> Visualization results after ResNet50 replacement. <bold>(D)</bold> Visualization results of the basic version.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1657159-g005.tif">
<alt-text content-type="machine-generated">Four sets of graphs labeled A, B, C, and D compare performance metrics for different network replacements. Each set includes four graphs showing metrics for precision, recall, mAP50, and mAP50-95. The illustrations represent results after replacing with FasterNet (A), EfficientViT (B), ResNet50 (C), and a basic version (D). Each graph plots the metric values against the number of data points ranging from zero to six hundred.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Attention mechanism compatibility experiment</title>
<p>To validate the architectural compatibility of MLCA, we conducted comparative experiments with six prevalent attention mechanisms (GAM (<xref ref-type="bibr" rid="B30">30</xref>), BiFormer (<xref ref-type="bibr" rid="B31">31</xref>), CA (<xref ref-type="bibr" rid="B32">32</xref>), CPCA (<xref ref-type="bibr" rid="B33">33</xref>), SimAM (<xref ref-type="bibr" rid="B34">34</xref>), and CBAM (<xref ref-type="bibr" rid="B35">35</xref>)) integrated into the baseline model. The Grad-CAM++ visualization technique was systematically applied to generate class-discriminative localization maps, enabling quantitative assessment of spatial attention distribution variations across different attention-enhanced architectures. This analytical approach elucidates region-specific feature prioritization patterns and establishes interpretable correlations between attention-driven feature selection and diagnostic performance metrics.</p>
<p>As shown in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>, our comparative experiments revealed that the MLCA attention mechanism outperforms conventional modules like CBAM and CA. Specifically, MLCA generates heatmaps with significantly expanded spatial regions of interest, indicating its ability to guide the model in capturing multi-scale features from target regions. By fusing local details with global contextual information, MLCA creates a comprehensive representation of the Region of Interest (ROI), a capability crucial for detecting minute lesions in pathological images. Furthermore, the high-activation regions (indicated in red) generated by MLCA are more extensive, suggesting an enhanced capacity for extracting salient features. We attribute this advantage to MLCA&#x2019;s unique dual-path weight allocation strategy. One path captures multi-resolution features via an atrous convolution pyramid, while the other employs channel-spatial joint attention to filter critical information. This dual approach effectively suppresses interference from complex tissue backgrounds, such as stromal fibrosis and inflammatory infiltration. Consequently, MLCA demonstrated superior performance in detecting gastric adenocarcinoma in pathological images compared to the other attention mechanisms tested.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Heatmaps for various attention mechanisms.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1657159-g006.tif">
<alt-text content-type="machine-generated">Comparison of image processing techniques on medical images. The first column shows original images, the second shows detection results with blue rectangles, and subsequent columns display heatmaps using various baseline models including GAM, CBAM, SimAM, CPCA, BiFormer, CA, and MLCA. Each row represents a different set of images processed with the corresponding method above.</alt-text>
</graphic>
</fig>
<p>Subsequently, we conducted a quantitative analysis using the mean Average Precision (mAP) evaluation metric. In this ablation study, only the attention mechanism module was systematically varied, while other components remained constant, followed by the measurement of the mAP value for each resulting model configuration. This methodology allowed for a direct comparison of mAP scores across different models, thereby enabling an assessment of the compatibility and efficacy of various attention mechanisms within the base architecture. The comparative experimental data are presented in <xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref>. The results demonstrate that the model variant equipped with the MLCA attention mechanism achieved superior detection performance (mAP) compared to both the baseline version (YOLOv11s without an additional attention module) and variants incorporating alternative attention mechanisms, specifically GAM, CBAM, SimAM, CPCA, BiFormer, and CA. Notably, we observed that the integration of the GAM and SimAM attention mechanisms resulted in a decrease in mAP by 0.3% and 0.7%, respectively, relative to the baseline. Conversely, incorporating the CBAM, CPCA, BiFormer, and CA attention mechanisms yielded improvements in mAP by 0.1%, 0.3%, 0.5%, and 0.7%, respectively. Significantly, the fusion of the MLCA attention mechanism led to the most substantial performance gain, enhancing the mAP by 1.1%. These findings strongly suggest that our proposed model, featuring the MLCA attention mechanism, is better suited and more effective for the task of detecting gastric adenocarcinoma in pathological images compared to the original YOLOv11s baseline model.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>The comparison between different attention modules.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">Attention</th>
<th valign="middle" align="center">F1(%)</th>
<th valign="middle" align="center">mAP(%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left" style="">YOLOv11s</td>
<td valign="middle" align="left" style="">&#x2014;&#x2014;</td>
<td valign="middle" align="center" style="">75.9</td>
<td valign="middle" align="center" style="">80.2</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s</td>
<td valign="middle" align="left" style="">GAM</td>
<td valign="middle" align="center" style="">74.5</td>
<td valign="middle" align="center" style="">79.9</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s</td>
<td valign="middle" align="left" style="">CBAM</td>
<td valign="middle" align="center" style="">75.2</td>
<td valign="middle" align="center" style="">80.3</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s</td>
<td valign="middle" align="left" style="">SimAM</td>
<td valign="middle" align="center" style="">74.1</td>
<td valign="middle" align="center" style="">79.5</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s</td>
<td valign="middle" align="left" style="">CPCA</td>
<td valign="middle" align="center" style="">74.2</td>
<td valign="middle" align="center" style="">80.5</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s</td>
<td valign="middle" align="left" style="">BiFormer</td>
<td valign="middle" align="center" style="">75.8</td>
<td valign="middle" align="center" style="">80.7</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s</td>
<td valign="middle" align="left" style="">CA</td>
<td valign="middle" align="center" style="">75.7</td>
<td valign="middle" align="center" style="">80.9</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s</td>
<td valign="middle" align="left" style="">MLCA</td>
<td valign="middle" align="center" style="">77.1</td>
<td valign="middle" align="center" style="">81.3</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Ablation experiment</title>
<p>We conducted ablation experiments to validate the impact of our three proposed improvements. As detailed in <xref ref-type="table" rid="T7">
<bold>Table&#xa0;7</bold>
</xref>, we evaluated eight experimental configurations by comparing them against the baseline YOLOv11s model. The performance of each configuration was assessed using mAP, F1-score, Recall, and Precision. We designated the modified configurations based on the added components&#x2014;for example, &#x201c;YOLOv11s+FasterNet&#x201d; for the FasterNet backbone, &#x201c;YOLOv11s+CARAFE&#x201d; for the CARAFE upsampling module, and &#x201c;YOLOv11s+MLCA&#x201d; for the MLCA attention mechanism. This naming pattern was applied consistently across all combinations.</p>
<table-wrap id="T7" position="float">
<label>Table&#xa0;7</label>
<caption>
<p>The impact of the fusion of different modules of the model on the metrics.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">mAP (%)</th>
<th valign="middle" align="center">F1 (%)</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">FPS</th>
</tr>
</thead>
<tbody>
<tr>
<th valign="middle" colspan="6" align="left" style="">Baseline</th>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s</td>
<td valign="middle" align="center" style="">80.2</td>
<td valign="middle" align="center" style="">76.9</td>
<td valign="middle" align="center" style="">0.713</td>
<td valign="middle" align="center" style="">0.835</td>
<td valign="middle" align="center" style="">125.78</td>
</tr>
<tr>
<th valign="middle" colspan="6" align="left" style="">Single Component Improvements</th>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s+FasterNet</td>
<td valign="middle" align="center" style="">81.4</td>
<td valign="middle" align="center" style="">78.2</td>
<td valign="middle" align="center" style="">0.705</td>
<td valign="middle" align="center" style="">0.854</td>
<td valign="middle" align="center" style="">129.23</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s+CARAFE</td>
<td valign="middle" align="center" style="">80.4</td>
<td valign="middle" align="center" style="">76.2</td>
<td valign="middle" align="center" style="">0.71</td>
<td valign="middle" align="center" style="">0.821</td>
<td valign="middle" align="center" style="">130.12</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s+ MLCA</td>
<td valign="middle" align="center" style="">81.3</td>
<td valign="middle" align="center" style="">77.2</td>
<td valign="middle" align="center" style="">0.712</td>
<td valign="middle" align="center" style="">0.843</td>
<td valign="middle" align="center" style="">124.42</td>
</tr>
<tr>
<th valign="middle" colspan="6" align="left" style="">Paired Component Improvements</th>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s+ MLCA+CARAFE</td>
<td valign="middle" align="center" style="">79.8</td>
<td valign="middle" align="center" style="">76.3</td>
<td valign="middle" align="center" style="">0.722</td>
<td valign="middle" align="center" style="">0.809</td>
<td valign="middle" align="center" style="">128.47</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s+FasterNet+ MLCA</td>
<td valign="middle" align="center" style="">81.9</td>
<td valign="middle" align="center" style="">77.9</td>
<td valign="middle" align="center" style="">0.727</td>
<td valign="middle" align="center" style="">0.84</td>
<td valign="middle" align="center" style="">126.89</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s+FasterNet+CARAFE</td>
<td valign="middle" align="center" style="">81.1</td>
<td valign="middle" align="center" style="">77.6</td>
<td valign="middle" align="center" style="">0.716</td>
<td valign="middle" align="center" style="">0.848</td>
<td valign="middle" align="center" style="">133.52</td>
</tr>
<tr>
<th valign="middle" colspan="6" align="left" style="">Final Model</th>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s+FasterNet+CARAFE+MLCA</td>
<td valign="middle" align="center" style="">82.8</td>
<td valign="middle" align="center" style="">79.8</td>
<td valign="middle" align="center" style="">0.745</td>
<td valign="middle" align="center" style="">0.861</td>
<td valign="middle" align="center" style="">131.56</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As shown in <xref ref-type="table" rid="T7">
<bold>Table&#xa0;7</bold>
</xref>, integrating the FasterNet backbone and the MLCA attention module into YOLOv11s improved accuracy by 1.2% and 1.1%, respectively. This result highlights FasterNet&#x2019;s superior ability to extract multi-scale histopathological features, guiding feature attention toward diagnostically critical patterns and improving feature representation. Notably, the MLCA module illustrates a clear accuracy-speed tradeoff: it increased mAP by 1.1% and the F1-score by 0.3%, with only a marginal reduction in inference speed. Furthermore, integrating CARAFE significantly improved computational efficiency. Its adaptive upsampling mechanism resulted in a 4.6% increase in inference speed compared to the baseline model.</p>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>Comparison experiments with other mainstream object detection algorithms</title>
<p>The proposed FC-YOLO algorithm enhances feature extraction from complex histopathological images, making it a high-performance solution for detecting gastric adenocarcinoma. To validate its performance, we compared it with several mainstream object detection architectures: Faster-RCNN (<xref ref-type="bibr" rid="B36">36</xref>), RT-DETR (<xref ref-type="bibr" rid="B37">37</xref>), RetinaNet (<xref ref-type="bibr" rid="B38">38</xref>), YOLOv5s (<xref ref-type="bibr" rid="B39">39</xref>), YOLOv7 (<xref ref-type="bibr" rid="B40">40</xref>), YOLOv8s (<xref ref-type="bibr" rid="B41">41</xref>), and the baseline YOLOv11s. To ensure a fair and valid assessment, we retrained each model from scratch under the same experimental conditions used for FC-YOLO. As detailed in <xref ref-type="table" rid="T8">
<bold>Table&#xa0;8</bold>
</xref> and <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>, the comprehensive benchmarking results confirm the diagnostic efficacy of FC-YOLO, a finding supported by this rigorous methodological consistency.</p>
<table-wrap id="T8" position="float">
<label>Table&#xa0;8</label>
<caption>
<p>Different model variants.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Model</th>
<th valign="middle" align="left">mAP (%)</th>
<th valign="middle" align="left">FPS</th>
<th valign="middle" align="left">F1 (%)</th>
<th valign="middle" align="left">Params[m]</th>
<th valign="middle" align="left">GFLOPs</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left" style="">Faster-RCNN<break/>RetinaNet<break/>RT-DETR<break/>YOLOv5s<break/>YOLOv7<break/>YOLOv8s<break/>YOLOv11s<break/>FC-YOLO</td>
<td valign="middle" align="center" style="">67.4<break/>71.8<break/>78.7<break/>79.1<break/>78.5<break/>78.9<break/>80.2<break/>82.8</td>
<td valign="middle" align="center" style="">85.42<break/>97.21<break/>119.97<break/>120.42<break/>117.35<break/>122.26<break/>125.78<break/>131.56</td>
<td valign="middle" align="center" style="">64.2<break/>68.7<break/>74.7<break/>75.5<break/>74.9<break/>74.3<break/>77.1<break/>79.8</td>
<td valign="middle" align="center" style="">127.35<break/>35.56<break/>32.7<break/>7.02<break/>37.1<break/>11.1<break/>9.45<break/>7.68</td>
<td valign="middle" align="center" style="">416.5<break/>120.4<break/>112.4<break/>16.5<break/>105.1<break/>28.8<break/>21.7<break/>15.9</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Compared with other mainstream object detection algorithms.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1657159-g007.tif">
<alt-text content-type="machine-generated">A comparison of various object detection models on two histological images. The models include Faster-RCNN, RetinaNet, RT-DETR, YOLOv5s, YOLOv7, YOLOv8s, YOLOv11s, and FC-YOLO. Each model's output is shown with bounding boxes highlighting detected features, such as adenocarcinoma, in different colors and confidences. The original images are included for reference in the first columns of each row.</alt-text>
</graphic>
</fig>
<p>A qualitative analysis of the visual detection results (<xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>) reveals distinct performance characteristics among the evaluated architectures. The transformer-based RT-DETR shows better detection consistency than Faster R-CNN and RetinaNet, with precision comparable to that of YOLOv5s. However, RT-DETR produces notable misclassification artifacts in histologically ambiguous regions. Faster R-CNN exhibits significant limitations, including both missed detections and classification errors. While YOLOv11s improves upon YOLOv5s by reducing missed detections, it still under-detects subtle pathological manifestations. In contrast, our proposed FC-YOLO architecture demonstrates an enhanced ability to discriminate microscopic lesions, particularly in challenging scenarios that require differentiating malignant glands (e.g., irregular lumen structures) from inflammatory infiltrates. These comparative evaluations confirm the diagnostic superiority of FC-YOLO, which reduces misclassifications while increasing true-positive detections relative to its state-of-the-art counterparts.</p>
<p>In our subsequent experimental evaluation, we used mean Average Precision (mAP), F1 score, Frames Per Second (FPS), parameter count, and Giga Floating-point Operations (GFLOPs) as key performance metrics. As detailed in <xref ref-type="table" rid="T8">
<bold>Table&#xa0;8</bold>
</xref>, our proposed FC-YOLO algorithm achieved a state-of-the-art mAP of 82.8%. Compared to the YOLOv11s baseline, FC-YOLO improved the mAP by 2.6% and the F1 score by 2.7% (to 79.8%), indicating a superior balance between precision and recall. Crucially, these accuracy gains were achieved alongside significant efficiency improvements. The integration of the FasterNet backbone and CARAFE upsampling module allowed us to reduce the parameter count by 18.7% (from 9.45M to 7.68M) and the computational load by 26.7% (from 21.7 to 15.9 GFLOPs), while increasing inference speed by 4.6%.</p>
<p>Beyond the baseline, we benchmarked FC-YOLO against other mainstream architectures. It significantly outperformed classic algorithms, achieving a 15.4% higher mAP than Faster R-CNN and an 11% higher mAP than RetinaNet. It also surpassed the modern hybrid Transformer-based model, RT-DETR, with a 4.1% mAP enhancement. When comparing other YOLO versions, we noted an intriguing result: in the specific context of gastric adenocarcinoma detection, YOLOv5s unexpectedly outperformed both YOLOv7 and YOLOv8s by 0.6% and 0.2% mAP, respectively. This observation is corroborated by <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>, which shows YOLOv5s produced fewer missed detections. Nevertheless, our FC-YOLO model still achieved a 3.7% higher mAP than YOLOv5s, despite having a comparable parameter count.</p>
<p>These results underscore that FC-YOLO establishes a new state-of-the-art balance between diagnostic accuracy and computational cost, a critical factor for practical clinical deployment (<xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>). This favorable trade-off is a direct result of our architectural choices. By replacing the standard backbone and upsampling modules with the more efficient FasterNet and CARAFE, we enhanced accuracy without increasing computational cost. The lightweight MLCA attention module further augmented the model&#x2019;s discriminative power at a negligible cost. In conclusion, FC-YOLO demonstrates a significant advantage for detecting gastric adenocarcinoma in pathological images, achieving superior accuracy while maintaining a lower computational cost and faster inference speed than current mainstream algorithms.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Accuracy-efficiency trade-offs.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1657159-g008.tif">
<alt-text content-type="machine-generated">Scatter plot illustrating the performance-efficiency trade-off for gastric adenocarcinoma detection. Detection accuracy (mAP at 0.5) is plotted against computational cost (GFLOPs). The proposed model is highlighted with a red star and shows the highest accuracy at approximately 0.82 with low computational cost. Other models, including Faster-RCNN, RetinaNet, and various YOLO versions, are represented with different shapes and colors. The legend specifies each model's symbol and color.</alt-text>
</graphic>
</fig>
<p>To gain a deeper understanding of FC-YOLO&#x2019;s diagnostic performance, we conducted a more granular analysis using Precision-Recall (PR) curves. These curves visualize the trade-off between precision and recall across various confidence thresholds, with a curve positioned closer to the top-right corner indicating superior performance. As shown in <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>, we compared the PR curves for FC-YOLO and the baseline YOLOv11s on the BOT dataset. The FC-YOLO curve is consistently positioned above the baseline, visually demonstrating that our model achieves higher detection sensitivity while being less prone to making erroneous predictions. We also used confusion matrices to evaluate classification performance by comparing true and predicted labels. In these matrices, superior performance is indicated by higher values along the diagonal and lower values in the off-diagonal cells. <xref ref-type="fig" rid="f10">
<bold>Figures&#xa0;10A,B</bold>
</xref> show the normalized confusion matrices for the baseline YOLOv11s and our FC-YOLO model, respectively, on the BOT dataset. A direct comparison reveals that FC-YOLO achieves higher detection accuracy, identifying gastric adenocarcinoma regions more precisely and with fewer misclassifications. In contrast, the baseline YOLOv11s model shows a higher incidence of both false positives and false negatives.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Comparison of the Precision-Recall (PR) curves for YOLOv11s and FC-YOLO.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1657159-g009.tif">
<alt-text content-type="machine-generated">Precision-recall curve comparing YOLOv11s and FC-YOLO models. The curve for YOLOv11s is dashed blue, with a mean average precision of 0.802. The FC-YOLO curve is solid orange, with a mean average precision of 0.828. The graph shows precision on the y-axis and recall on the x-axis.</alt-text>
</graphic>
</fig>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Confusion matrices. <bold>(A)</bold> Confusion matrix for the YOLOv11s model. <bold>(B)</bold> Confusion matrix for the FC-YOLO model.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1657159-g010.tif">
<alt-text content-type="machine-generated">Two confusion matrices labeled A and B, both normalized, display prediction accuracy for adenocarcinoma versus background. Matrix A shows an accuracy of 0.79 for adenocarcinoma and 1.00 for background, while B shows 0.82 for adenocarcinoma and 1.00 for background. The matrices indicate slight improvement in differentiation from A to B.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3_6">
<label>3.6</label>
<title>The verification of accuracy for In-House Clinical dataset</title>
<p>Subsequently, the gastric adenocarcinoma pathological images collected from our hospital were utilized as an In-House Clinical Dataset, on which the model was trained from scratch. The objective was to ascertain whether our model&#x2019;s enhancements retain their superiority for detecting gastric adenocarcinoma within a real-world clinical environment. A core challenge presented by such an independent clinical dataset is the inherent variability in staining protocols and slide preparation. Acknowledging that performing independent stain normalization is often impractical and difficult to maintain consistently in a live clinical workflow, we deliberately omitted this step. Instead, the focus of our evaluation was shifted to assessing the model&#x2019;s intrinsic robustness against these diverse staining schemes. The trained model was utilized to predict the collected gastric adenocarcinoma slices, and Grad-CAM heatmaps were generated to verify the regions of interest of our model. Moreover, comparisons were made with YOLOv5s, YOLOv7, YOLOv8s, and YOLOv11s, which exhibited relatively superior performance in the control experiments, as depicted in <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11</bold>
</xref>. It can be observed from the detection maps of different models that YOLOv7 and YOLOv8s present more missed detections during the detection process, especially in regions with complex backgrounds. Some missed detections are also found in YOLOv5s and YOLOv11s, particularly in the detection of small targets. In contrast, our model demonstrates a superior ability in target recognition of gastric adenocarcinoma. From the heatmaps, it can be noticed that the regions that our model focuses on are mostly the cancer cavities and areas with significant cellular atypia, which are more consistent with the characteristics of gastric adenocarcinoma. Furthermore, we also tested the inference speed of FC-YOLO in the clinically collected slices. As presented in <xref ref-type="table" rid="T9">
<bold>Table&#xa0;9</bold>
</xref>, the inference speed and accuracy of our model surpass those of other YOLO series models.</p>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>Prediction comparison chart.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1657159-g011.tif">
<alt-text content-type="machine-generated">A series of medical imagery comparing original microscopic images of tissue with detection outputs from different YOLO model versions: YOLOv5s, YOLOv7, YOLOv8s, YOLOv11s, and FC-YOLO. Each model is shown with overlaid boxes and labels indicating identified adenocarcinoma areas, highlighting variations in detection accuracy and visualization between models.</alt-text>
</graphic>
</fig>
<table-wrap id="T9" position="float">
<label>Table&#xa0;9</label>
<caption>
<p>Different model variants.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">mAP</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">FPS</th>
<th valign="middle" align="center">F1(%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left" style="">YOLOv5s</td>
<td valign="middle" align="center" style="">81.3</td>
<td valign="middle" align="center" style="">0.864</td>
<td valign="middle" align="center" style="">105.73</td>
<td valign="middle" align="center" style="">80.7</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv7</td>
<td valign="middle" align="center" style="">80.4</td>
<td valign="middle" align="center" style="">0.847</td>
<td valign="middle" align="center" style="">102.62</td>
<td valign="middle" align="center" style="">78.3</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv8s</td>
<td valign="middle" align="center" style="">80.1</td>
<td valign="middle" align="center" style="">0.856</td>
<td valign="middle" align="center" style="">108.98</td>
<td valign="middle" align="center" style="">79.5</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s</td>
<td valign="middle" align="center" style="">82.9</td>
<td valign="middle" align="center" style="">0.878</td>
<td valign="middle" align="center" style="">113.58</td>
<td valign="middle" align="center" style="">81.5</td>
</tr>
<tr>
<td valign="middle" align="left" style="">FC-YOLO</td>
<td valign="middle" align="center" style="">85.7</td>
<td valign="middle" align="center" style="">0.897</td>
<td valign="middle" align="center" style="">121.77</td>
<td valign="middle" align="center" style="">82.3</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_7">
<label>3.7</label>
<title>Cross-dataset validation experiment</title>
<p>To rigorously evaluate the generalizability and robustness of our proposed FC-YOLO model, we conducted a cross-dataset validation experiment. This experiment utilized our two independent datasets: the BOT dataset and an In-House Clinical dataset. Both FC-YOLO and the baseline YOLOv11s adhered to a strictly consistent cross-validation protocol. In the first arm, models were trained exclusively on the BOT dataset and then directly evaluated on the validation set of the In-House Clinical dataset. Conversely, in the second arm, models were trained solely on the In-House Clinical dataset and subsequently assessed on the BOT dataset&#x2019;s validation set.</p>
<p>The results of this cross-dataset validation are summarized in <xref ref-type="table" rid="T10">
<bold>Table&#xa0;10</bold>
</xref>. As anticipated, both models exhibited a degradation in performance when evaluated on previously unseen data. When trained on the BOT dataset and tested on the In-House Clinical dataset, the mAP of FC-YOLO decreased from 85.7% to 80.8% (a 4.9% drop), while the baseline YOLOv11s saw its mAP fall from 82.9% to 77.6% (a 5.3% drop). In the reciprocal scenario, the mAPs for FC-YOLO and YOLOv11s declined by 5.5% and 5.8%, respectively. This performance gap can be attributed to the domain shift between the datasets, likely arising from variations in staining protocols and histomorphological features. Crucially, however, FC-YOLO consistently outperformed YOLOv11s in terms of mAP across both cross-dataset scenarios, which more comprehensively reflects the effectiveness of our enhancements. Furthermore, the magnitude of performance degradation for FC-YOLO was smaller than that of the baseline model, indicating that the features learned by FC-YOLO are more generalizable. In summary, these internal-external cross-validation experiments confirm that, compared to the baseline YOLOv11s, FC-YOLO demonstrates superior generalization capability and robustness for the detection of gastric adenocarcinoma in pathological images.</p>
<table-wrap id="T10" position="float">
<label>Table&#xa0;10</label>
<caption>
<p>Cross-dataset validation performance comparison.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Training dataset</th>
<th valign="middle" align="center">Validating dataset</th>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">mAP (%)</th>
<th valign="middle" align="center">Performance drop(%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="2" align="left" style="">BOT dataset</td>
<td valign="middle" rowspan="2" align="left" style="">In-House Clinical dataset</td>
<td valign="middle" align="left" style="">FC-YOLO</td>
<td valign="middle" align="center" style="">80.8</td>
<td valign="middle" align="center" style="">4.9</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s</td>
<td valign="middle" align="center" style="">77.6</td>
<td valign="middle" align="center" style="">5.3</td>
</tr>
<tr>
<td valign="middle" rowspan="2" align="left" style="">In-House Clinical dataset</td>
<td valign="middle" rowspan="2" align="left" style="">BOT dataset</td>
<td valign="middle" align="left" style="">FC-YOLO</td>
<td valign="middle" align="center" style="">77.3</td>
<td valign="middle" align="center" style="">5.5</td>
</tr>
<tr>
<td valign="middle" align="left" style="">YOLOv11s</td>
<td valign="middle" align="center" style="">74.4</td>
<td valign="middle" align="center" style="">5.8</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_8">
<label>3.8</label>
<title>Analysis of representative failure cases</title>
<p>A critical evaluation of any diagnostic AI model requires a thorough analysis of its failure cases. To better understand the limitations of FC-YOLO and contextualize its performance for potential clinical application, we examined representative false-positive (FP) and false-negative (FN) cases from the test set. This analysis provides valuable insights into which specific histopathological types of gastric cancer challenge the model&#x2019;s detection capabilities and helps inform targeted strategies for future refinement.</p>
<p>Our analysis revealed that false positives (FPs) in both the BOT and In-House Clinical datasets occurred predominantly in regions of complex histological morphology with atypical hyperplasia (<xref ref-type="fig" rid="f12">
<bold>Figure&#xa0;12A</bold>
</xref>). These hyperplastic lesions often present with enlarged nuclei, hyperchromasia, pleomorphism, and prominent nucleoli&#x2014;characteristics that closely resemble well-differentiated adenocarcinoma. Our model likely captured these high-grade cytological features, leading to the erroneous classification of the region as malignant. This type of error highlights a key limitation: the model struggles to distinguish between true neoplastic changes and the severe atypia found in benign reactive lesions. Another common source of FPs was high-grade intraepithelial neoplasia (HGIN) (<xref ref-type="fig" rid="f12">
<bold>Figure&#xa0;12B</bold>
</xref>). Morphologically, HGIN cells are nearly indistinguishable from carcinoma cells. The gold standard for differentiating HGIN from invasive carcinoma is the integrity of the basement membrane&#x2014;specifically, whether it has been breached. As an object detection model, FC-YOLO is designed to identify overall cellular morphology and architecture, not to discern the integrity of a fine linear structure like the basement membrane. This task lies beyond the inherent capabilities of our object detection framework. Therefore, while the model&#x2019;s classification of HGIN as malignant is clinically incorrect, it is a logically consistent outcome from a pattern recognition standpoint.</p>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Analysis of misclassified cases. <bold>(A)</bold> Atypical hyperplasia. <bold>(B)</bold> High-grade intraepithelial neoplasia. <bold>(C)</bold> Poorly differentiated carcinoma. <bold>(D)</bold> Signet ring cell carcinoma.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1657159-g012.tif">
<alt-text content-type="machine-generated">A four-panel figure, labeled A to D, showing histopathological images incorrectly identified by the FC-YOLO model. Panels A and B display examples of false positives, with yellow boxes highlighting regions wrongly classified as cancerous. Panels C and D show false negatives, with red boxes highlighting cancerous regions missed by the model. Each panel includes a magnified inset for cellular detail. Blue boxes within the images display the model's probability scores for adenocarcinoma.</alt-text>
</graphic>
</fig>
<p>False negatives (FNs) represent a significant clinical risk. Our analysis revealed that FNs typically occurred in cases where malignant cells were sparsely distributed, poorly differentiated, or lacked clear architectural patterns. For instance (<xref ref-type="fig" rid="f12">
<bold>Figures&#xa0;12C, D</bold>
</xref>), poorly differentiated cancer cells often infiltrated the stroma loosely, either as single cells or in thin cords, with minimal to no formation of glandular structures. Another common source of FNs was signet ring cell carcinoma, a specific subtype of poorly differentiated adenocarcinoma. In these cases, the cytoplasm is filled with mucin, displacing the nucleus eccentrically to create a characteristic &#x2018;ring-like&#x2019; appearance. These cells are frequently scattered individually and can be quite inconspicuous. The YOLO model, by its nature, learns to recognize targets based on their &#x2018;shape&#x2019; and &#x2018;structure&#x2019;. When malignant cells do not form well-defined structures, such as the glandular tubes typical of more differentiated adenocarcinoma, the model may lack the distinct features required for confident identification.</p>
<p>In summary, our model faces two primary challenges: first, improving its ability to discriminate between malignant and benign structures amid atypical hyperplasia, and second, increasing its sensitivity to poorly differentiated and diffusely infiltrative cancers. This analysis directly informs our future research. We plan to augment our training dataset with more of these challenging cases, particularly signet ring cell carcinomas and high-grade intraepithelial neoplasia. Furthermore, incorporating subtype-specific annotations during the labeling process will be crucial, as this will enable the model to learn a richer repertoire of pathological features and ultimately enhance its overall discriminative power.</p>
</sec>
</sec>
<sec id="s4" sec-type="discussion">
<label>4</label>
<title>Discussion</title>
<p>This paper introduces FC-YOLO, a deep learning framework for the automated detection of gastric adenocarcinoma in histopathological images. The architecture is specifically designed to address key challenges in this domain, including complex backgrounds, high-density lesions, irregular glandular morphologies, and low feature contrast between malignant and benign regions. The FasterNet backbone enhances the model&#x2019;s multi-scale feature representation while minimizing information loss during extraction. The MLCA attention mechanism improves detection accuracy by efficiently recalibrating channel-spatial features, all without increasing computational overhead. Furthermore, the CARAFE upsampling module replaces conventional methods to reduce model parameters and computational load, thereby accelerating inference speed. This synergistic design simultaneously improves both diagnostic precision and operational efficiency, making it well-suited for clinical histopathology applications.</p>
<p>Notably, our comparative experiments revealed a counterintuitive phenomenon: for the specific task of detecting gastric adenocarcinoma, YOLOv5s marginally outperformed its more complex, larger-parameter counterparts, YOLOv7 and YOLOv8s. We hypothesize that this counterintuitive outcome stems from the dataset&#x2019;s unique characteristics. Architectures like YOLOv7 and YOLOv8s are highly optimized for large-scale, general-purpose datasets such as COCO, but this performance does not always translate to specialized domains like medical imaging. Histopathological images differ significantly from general datasets, as they feature complex tissue backgrounds and high inter-class similarity. In this context, the relative architectural simplicity of YOLOv5s may be more effective at extracting salient features from pathological images. This finding underscores the principle that the most complex model is not invariably the optimal choice for a given application, highlighting the indispensable need for empirical validation tailored to the task at hand, rather than an overreliance on model complexity (<xref ref-type="bibr" rid="B42">42</xref>).</p>
<p>Furthermore, our results also revealed the promising potential of the RT-DETR model in this detection task, which, to some extent, underscores the advantage of Transformer architectures in capturing global contextual information and offers valuable insights for the future direction of medical pathology image recognition. However, our observations during training indicate that Transformer-based models generally entail a larger parameter count and greater computational complexity compared to purely CNN-structured YOLO models. Although RT-DETR optimizes efficiency by integrating a hybrid CNN backbone with partial Transformer layers, its training and inference phases still demand more substantial computational resources and prolonged durations relative to certain lightweight YOLO variants. Crucially, the YOLO series of models offers a significant advantage in terms of ease of engineering and deployment, making them more accommodating for hospitals with limited hardware resources. Therefore, from a comprehensive standpoint that considers detection accuracy, computational efficiency, and practical deployment feasibility, our proposed FC-YOLO model still demonstrates a superior overall advantage compared to the Transformer-integrated RT-DETR model for the task of gastric adenocarcinoma pathology detection.</p>
<p>This study innovatively adopts YOLOv11s as the foundational architecture for gastric adenocarcinoma detection in histopathological images, representing the first application of this framework in pathological image analysis. We selected a YOLO-series model due to its advantages for clinical deployment, including simplified implementation and real-time detection capabilities. The&#xa0;framework&#x2019;s direct output of bounding box coordinates, confidence scores, and class probabilities enables seamless mapping to digital pathology coordinate systems. Furthermore, YOLO&#x2019;s single-stage detection paradigm allows for immediate lesion localization in a single forward pass&#x2014;a critical feature allowing pathologists to prioritize suspicious regions and minimize non-diagnostic viewing time. While YOLOv11s inherently offers favorable accuracy-speed balance, our FC-YOLO enhancements further optimize this equilibrium by improving detection precision while significantly reducing computational overhead, maintaining lightweight characteristics essential for clinical integration. The clinical applicability of YOLO-series architectures in pathological image analysis is well-established through precedent studies. Yu et&#xa0;al. (<xref ref-type="bibr" rid="B43">43</xref>) demonstrated YOLOv5s&#x2019; efficacy in mitotic figure detection within uterine smooth muscle tumors on whole slide images. Lee et&#xa0;al. (<xref ref-type="bibr" rid="B44">44</xref>) employed YOLOX to detect foci of lymphovascular invasion in gastric cancer and subsequently constructed an integrated deep learning model in conjunction with ConViT. Li et&#xa0;al. (<xref ref-type="bibr" rid="B45">45</xref>) enhanced YOLOv7 with BiFormer attention mechanisms, CARAFE upsampling, and GSConv modules, achieving improved accuracy and efficiency in vascular structure detection. Our experimental results corroborate CARAFE&#x2019;s effectiveness in accelerating inference speed for histopathological analysis. 
The FC-YOLO framework demonstrates exceptional diagnostic performance with 82.8% mAP and 79.8% F1-score in gastric adenocarcinoma detection, particularly excelling in challenging scenarios involving complex stromal backgrounds and irregular neoplastic morphologies. This advancement holds significant clinical value by reducing diagnostic oversights and enhancing pathologists&#x2019; workflow efficiency through prioritized lesion localization.</p>
<p>Although our proposed model performs well, this study has several limitations. First, the model&#x2019;s scope is confined to binary tumor versus non-tumor classification. It cannot perform more granular tasks, such as differentiating histological subtypes of adenocarcinoma (e.g., tubular, signet ring cell) or distinguishing adenocarcinoma from other malignancies like lymphoma or gastrointestinal stromal tumors. This specificity constrains its ability to provide comprehensive diagnostic support and limits its direct clinical applicability. Second, the study relies on a single-institution dataset, which restricts the model&#x2019;s generalization capabilities. A lack of multi-center, multi-device validation, as well as validation on authoritative public datasets, may compromise its performance in real-world clinical settings. Finally, while we used heatmaps for model interpretability, our analysis lacks a quantitative metric, such as the Dice coefficient, to rigorously evaluate the concordance between the model&#x2019;s attention and the true pathological regions.</p>
<p>Our future work will focus on addressing the following areas: enhancing the model&#x2019;s generalizability by collecting data from multiple medical centers; optimizing and training the model to identify various cancer subtypes for multi-cancer classification, with a strong emphasis on clinical utility; and exploring deployment strategies on hospital-grade equipment to construct a pathology diagnostic system applicable to large-scale screening, rapid preliminary triage, and multi-target correlational analysis. A critical subsequent step will be the seamless integration of this system with existing digital pathology workflows. This will necessitate the development of compatible interfaces or plugins for whole-slide image (WSI) viewers such as QuPath and Aperio ImageScope. Upon loading a WSI into these platforms, our model would operate in the background, automatically identifying and displaying suspicious regions of gastric adenocarcinoma as either heatmap overlays or interactive bounding boxes. This functionality would enable pathologists to rapidly locate areas of interest, while the heatmap overlay format would serve to increase their confidence in the model&#x2019;s findings. This approach does not render a diagnosis itself but rather flags regions that warrant expert review, thereby reducing the risk of missing minute or atypical foci, particularly under heavy workloads. In essence, the system is not intended to replace pathologists but to serve as an auxiliary tool that assists them in rapidly locating suspicious regions and provides diagnostic support, thus alleviating their workload. The ultimate objective is for the FC-YOLO technology to deliver tangible benefits to the patient population.</p>
</sec>
<sec id="s5" sec-type="conclusions">
<label>5</label>
<title>Conclusion</title>
<p>In this research, we propose an object detection model designated FC-YOLO, which employs deep learning techniques for the automated detection of gastric adenocarcinoma in pathological images. We replaced the backbone of YOLOv11s with FasterNet and integrated the MLCA attention mechanism and the CARAFE upsampling module. These modifications enhance the extraction of multi-scale critical features, thereby improving both the model&#x2019;s detection accuracy and inference speed. Experimental results obtained from the BOT dataset and a dataset collected from hospital sources demonstrate that FC-YOLO outperforms current mainstream object detection models in identifying gastric adenocarcinoma within pathological images, particularly in scenarios with complex backgrounds and for small target lesions. Our future research will pursue several avenues: (i) collecting a more extensive and varied pathological image dataset, encompassing a broader range of cancer categories and lesion types; (ii) developing a lightweight pathology diagnostic system based on our model for deployment on devices in real-world clinical settings, thereby providing diagnostic assistance to physicians; and (iii) further optimizing computational efficiency through methods such as model pruning and knowledge distillation.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found at Zenodo: <uri xlink:href="https://zenodo.org/records/15129785">https://zenodo.org/records/15129785</uri>. The private dataset is not available because of privacy regulations. The trained models for this project are available at <uri xlink:href="https://github.com/BTMC-JPAI/FC-YOLO">https://github.com/BTMC-JPAI/FC-YOLO</uri>.</p>
</sec>
<sec id="s7" sec-type="ethics-statement">
<title>Ethics statement</title>
<p>This study was reviewed and approved by the Ethics Committee of Baotou Medical College and was conducted in accordance with local legislation and institutional requirements. Due to the retrospective nature of the study, the requirement for informed consent was waived.</p>
</sec>
<sec id="s8" sec-type="author-contributions">
<title>Author contributions</title>
<p>HZ: Validation, Conceptualization, Methodology, Software, Visualization, Writing &#x2013; original draft. JJ: Project administration, Methodology, Writing &#x2013; review &amp; editing, Software, Funding acquisition, Conceptualization. XW: Investigation, Writing &#x2013; review &amp; editing, Formal analysis, Data curation, Conceptualization. YG: Writing &#x2013; review &amp; editing, Resources, Conceptualization, Funding acquisition. WZ: Data curation, Resources, Writing &#x2013; original draft. RY: Data curation, Writing &#x2013; original draft, Resources. XY: Validation, Writing &#x2013; original draft. WS: Data curation, Writing &#x2013; original draft.</p>
</sec>
<sec id="s9" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research and/or publication of this article. This research was funded by Inner Mongolia Autonomous Region Higher Education Innovation Team Development Plan, grant number NMGIRT2328; Natural Science Foundation of Inner Mongolia Autonomous Region of China, grant number 2023LHMS08058.</p>
</sec>
<ack>
<title>Acknowledgments</title>
<p>This work received vigorous support from the Key Laboratory Platform of Human Anatomy in Colleges and Universities of the Inner Mongolia Autonomous Region.</p>
</ack>
<sec id="s10" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s11" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If&#xa0;you identify any issues, please contact us.</p>
</sec>
<sec id="s12" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sung</surname> <given-names>H</given-names>
</name>
<name>
<surname>Ferlay</surname> <given-names>J</given-names>
</name>
<name>
<surname>Siegel</surname> <given-names>RL</given-names>
</name>
<name>
<surname>Laversanne</surname> <given-names>M</given-names>
</name>
<name>
<surname>Soerjomataram</surname> <given-names>I</given-names>
</name>
<name>
<surname>Jemal</surname> <given-names>A</given-names>
</name>
<etal/>
</person-group>. <article-title>Global cancer statistics 2020: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries</article-title>. <source>CA Cancer J Clin</source>. (<year>2021</year>) <volume>71</volume>:<page-range>209&#x2013;49</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.3322/caac.21660</pub-id>, PMID: <pub-id pub-id-type="pmid">33538338</pub-id></citation></ref>
<ref id="B2">
<label>2</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Morgan</surname> <given-names>E</given-names>
</name>
<name>
<surname>Arnold</surname> <given-names>M</given-names>
</name>
<name>
<surname>Camargo</surname> <given-names>MC</given-names>
</name>
<name>
<surname>Gini</surname> <given-names>A</given-names>
</name>
<name>
<surname>Kunzmann</surname> <given-names>AT</given-names>
</name>
<name>
<surname>Matsuda</surname> <given-names>T</given-names>
</name>
<etal/>
</person-group>. <article-title>The current and future incidence and mortality of gastric cancer in 185 countries, 2020-40: A population-based modelling study</article-title>. <source>EClinicalMedicine</source>. (<year>2022</year>) <volume>47</volume>:<elocation-id>101404</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.eclinm.2022.101404</pub-id>, PMID: <pub-id pub-id-type="pmid">35497064</pub-id></citation></ref>
<ref id="B3">
<label>3</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Metter</surname> <given-names>DM</given-names>
</name>
<name>
<surname>Colgan</surname> <given-names>TJ</given-names>
</name>
<name>
<surname>Leung</surname> <given-names>ST</given-names>
</name>
<name>
<surname>Timmons</surname> <given-names>CF</given-names>
</name>
<name>
<surname>Park</surname> <given-names>JY</given-names>
</name>
</person-group>. <article-title>Trends in the US and Canadian pathologist workforces from 2007 to 2017</article-title>. <source>JAMA Netw Open</source>. (<year>2019</year>) <volume>2</volume>:<fpage>e194337</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1001/jamanetworkopen.2019.4337</pub-id>, PMID: <pub-id pub-id-type="pmid">31150073</pub-id></citation></ref>
<ref id="B4">
<label>4</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Song</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>S</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>W</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Shao</surname> <given-names>L</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>J</given-names>
</name>
<etal/>
</person-group>. <article-title>Clinically applicable histopathological diagnosis system for gastric cancer detection using deep learning</article-title>. <source>Nat Commun</source>. (<year>2020</year>) <volume>11</volume>:<fpage>4294</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41467-020-18147-8</pub-id>, PMID: <pub-id pub-id-type="pmid">32855423</pub-id></citation></ref>
<ref id="B5">
<label>5</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lan</surname> <given-names>J</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>M</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>J</given-names>
</name>
<name>
<surname>Du</surname> <given-names>M</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>H</given-names>
</name>
<etal/>
</person-group>. <article-title>Using less annotation workload to establish a pathological auxiliary diagnosis system for gastric cancer</article-title>. <source>Cell Rep Med</source>. (<year>2023</year>) <volume>4</volume>:<elocation-id>101004</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.xcrm.2023.101004</pub-id>, PMID: <pub-id pub-id-type="pmid">37044091</pub-id></citation></ref>
<ref id="B6">
<label>6</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname> <given-names>B</given-names>
</name>
<name>
<surname>Tian</surname> <given-names>S</given-names>
</name>
<name>
<surname>Zhan</surname> <given-names>N</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>J</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>C</given-names>
</name>
<etal/>
</person-group>. <article-title>Accurate diagnosis and prognosis prediction of gastric cancer using deep learning on digital pathological images: A retrospective multicentre study</article-title>. <source>EBioMedicine</source>. (<year>2021</year>) <volume>73</volume>:<elocation-id>103631</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ebiom.2021.103631</pub-id>, PMID: <pub-id pub-id-type="pmid">34678610</pub-id></citation></ref>
<ref id="B7">
<label>7</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kather</surname> <given-names>JN</given-names>
</name>
<name>
<surname>Pearson</surname> <given-names>AT</given-names>
</name>
<name>
<surname>Halama</surname> <given-names>N</given-names>
</name>
<name>
<surname>J&#xe4;ger</surname> <given-names>D</given-names>
</name>
<name>
<surname>Krause</surname> <given-names>J</given-names>
</name>
<name>
<surname>Loosen</surname> <given-names>SH</given-names>
</name>
<etal/>
</person-group>. <article-title>Deep learning can predict microsatellite instability directly from histology in gastrointestinal cancer</article-title>. <source>Nat Med</source>. (<year>2019</year>) <volume>25</volume>:<page-range>1054&#x2013;6</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41591-019-0462-y</pub-id>, PMID: <pub-id pub-id-type="pmid">31160815</pub-id></citation></ref>
<ref id="B8">
<label>8</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>S</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>X</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>H</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>B</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<etal/>
</person-group>. <article-title>VENet: Variational energy network for gland segmentation of pathological images and early gastric cancer diagnosis of whole slide images</article-title>. <source>Comput Methods Programs Biomed</source>. (<year>2024</year>) <volume>250</volume>:<elocation-id>108178</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.cmpb.2024.108178</pub-id>, PMID: <pub-id pub-id-type="pmid">38652995</pub-id></citation></ref>
<ref id="B9">
<label>9</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shi</surname> <given-names>X</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>L</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>J</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>H</given-names>
</name>
</person-group>. <article-title>GCLDNet: Gastric cancer lesion detection network combining level feature aggregation and attention feature fusion</article-title>. <source>Front Oncol</source>. (<year>2022</year>) <volume>12</volume>:<elocation-id>901475</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fonc.2022.901475</pub-id>, PMID: <pub-id pub-id-type="pmid">36106104</pub-id></citation></ref>
<ref id="B10">
<label>10</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Liang</surname> <given-names>Q</given-names>
</name>
<name>
<surname>Nan</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Coppola</surname> <given-names>G</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>K</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>W</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>D</given-names>
</name>
<etal/>
</person-group>. <article-title>&#x201c;Weakly supervised biomedical image segmentation by reiterative learning,&#x201d;</article-title>. In: <source>IEEE journal of biomedical and health informatics</source>, (<publisher-loc>Piscataway, USA</publisher-loc>: <publisher-name>IEEE, IEEE Journal of Biomedical and Health Informatics</publisher-name>) vol. <volume>23</volume>. (<year>2019</year>). p. <page-range>1205&#x2013;14</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/JBHI.2018.2850040</pub-id>, PMID: <pub-id pub-id-type="pmid">29994489</pub-id></citation></ref>
<ref id="B11">
<label>11</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ning</surname> <given-names>X</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>R</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>N</given-names>
</name>
<name>
<surname>Xiao</surname> <given-names>X</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>S</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<etal/>
</person-group>. <article-title>Development of a deep learning-based model to diagnose mixed-type gastric cancer accurately</article-title>. <source>Int J Biochem Cell Biol</source>. (<year>2023</year>) <volume>162</volume>:<elocation-id>106452</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biocel.2023.106452</pub-id>, PMID: <pub-id pub-id-type="pmid">37482265</pub-id></citation></ref>
<ref id="B12">
<label>12</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fu</surname> <given-names>B</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>M</given-names>
</name>
<name>
<surname>He</surname> <given-names>J</given-names>
</name>
<name>
<surname>Cao</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>R</given-names>
</name>
<etal/>
</person-group>. <article-title>StoHisNet: A hybrid multi-classification model with CNN and Transformer for gastric pathology images</article-title>. <source>Comput Methods Programs Biomed</source>. (<year>2022</year>) <volume>221</volume>:<elocation-id>106924</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.cmpb.2022.106924</pub-id>, PMID: <pub-id pub-id-type="pmid">35671603</pub-id></citation></ref>
<ref id="B13">
<label>13</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tung</surname> <given-names>CL</given-names>
</name>
<name>
<surname>Chang</surname> <given-names>HC</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>BZ</given-names>
</name>
<name>
<surname>Hou</surname> <given-names>KJ</given-names>
</name>
<name>
<surname>Tsai</surname> <given-names>HH</given-names>
</name>
<name>
<surname>Tsai</surname> <given-names>CY</given-names>
</name>
<etal/>
</person-group>. <article-title>Identifying pathological slices of gastric cancer via deep learning</article-title>. <source>J Formos Med Assoc</source>. (<year>2022</year>) <volume>121</volume>:<page-range>2457&#x2013;64</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jfma.2022.05.004</pub-id>, PMID: <pub-id pub-id-type="pmid">35667953</pub-id></citation></ref>
<ref id="B14">
<label>14</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname> <given-names>J</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>F</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>R</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Y</given-names>
</name>
</person-group>. <article-title>Interpretable deep learning for gastric cancer detection: a fusion of AI architectures and explainability analysis</article-title>. <source>Front Immunol</source>. (<year>2025</year>) <volume>16</volume>:<elocation-id>1596085</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fimmu.2025.1596085</pub-id>, PMID: <pub-id pub-id-type="pmid">40510366</pub-id></citation></ref>
<ref id="B15">
<label>15</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lomans</surname> <given-names>R</given-names>
</name>
<name>
<surname>Angerilli</surname> <given-names>V</given-names>
</name>
<name>
<surname>Spronck</surname> <given-names>J</given-names>
</name>
<name>
<surname>Kodach</surname> <given-names>LL</given-names>
</name>
<name>
<surname>Gullo</surname> <given-names>I</given-names>
</name>
<name>
<surname>Carneiro</surname> <given-names>F</given-names>
</name>
<etal/>
</person-group>. <article-title>Deep learning for multiclass tumor cell detection in histopathology slides of hereditary diffuse gastric cancer</article-title>. <source>iScience</source>. (<year>2025</year>) <volume>28</volume>:<elocation-id>113064</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.isci.2025.113064</pub-id>, PMID: <pub-id pub-id-type="pmid">40727932</pub-id></citation></ref>
<ref id="B16">
<label>16</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shao</surname> <given-names>Yanhua</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Duo</given-names>
</name>
<name>
<surname>Chu</surname> <given-names>Hongyu</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Xiaoqiang</given-names>
</name>
<name>
<surname>Rao</surname> <given-names>Yunbo</given-names>
</name>
</person-group>. <article-title>A review on YOLO object detection based on deep learning</article-title>. <source>J Electron Inf Technol</source>. (<year>2022</year>) <volume>44</volume>(<issue>10</issue>):<page-range>3697&#x2013;708</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.11999/JEIT210790</pub-id>
</citation></ref>
<ref id="B17">
<label>17</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>J</given-names>
</name>
<name>
<surname>Kao</surname> <given-names>S-h</given-names>
</name>
<name>
<surname>He</surname> <given-names>H</given-names>
</name>
<name>
<surname>Zhuo</surname> <given-names>W</given-names>
</name>
<name>
<surname>Wen</surname> <given-names>S</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>C-H</given-names>
</name>
<etal/>
</person-group>. <article-title>&#x201c;Run, don&#x2019;t walk: chasing higher FLOPS for faster neural networks,&#x201d;</article-title>. In: <source>2023 IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source>. <publisher-loc>Vancouver, BC, Canada</publisher-loc>: <publisher-name>IEEE, 2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</publisher-name> (<year>2023</year>). p. <page-range>12021&#x2013;31</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52729.2023.01157</pub-id>
</citation></ref>
<ref id="B18">
<label>18</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zubair</surname> <given-names>M</given-names>
</name>
<name>
<surname>Owais</surname> <given-names>M</given-names>
</name>
<name>
<surname>Hassan</surname> <given-names>T</given-names>
</name>
<name>
<surname>Bendechache</surname> <given-names>M</given-names>
</name>
<name>
<surname>Hussain</surname> <given-names>M</given-names>
</name>
<name>
<surname>Hussain</surname> <given-names>I</given-names>
</name>
<etal/>
</person-group>. <article-title>An interpretable framework for gastric cancer classification using multi-channel attention mechanisms and transfer learning approach on histopathology images</article-title>. <source>Sci Rep</source>. (<year>2025</year>) <volume>15</volume>:<fpage>13087</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-025-97256-0</pub-id>, PMID: <pub-id pub-id-type="pmid">40240457</pub-id></citation></ref>
<ref id="B19">
<label>19</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wan</surname> <given-names>D</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>R</given-names>
</name>
<name>
<surname>Shen</surname> <given-names>S</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>T</given-names>
</name>
<name>
<surname>Lang</surname> <given-names>X</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>Z</given-names>
</name>
<etal/>
</person-group>. <article-title>Mixed local channel attention for object detection</article-title>. <source>Eng Appl Artif Intell</source>. (<year>2023</year>) <volume>123</volume>:<elocation-id>106442</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.engappai.2023.106442</pub-id>
</citation></ref>
<ref id="B20">
<label>20</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>J</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>K</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>R</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Loy</surname> <given-names>CC</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>D</given-names>
</name>
<etal/>
</person-group>. <article-title>&#x201c;CARAFE: content-aware reAssembly of FEatures,&#x201d;</article-title>. In: <source>2019 IEEE/CVF international conference on computer vision (ICCV)</source>. <publisher-loc>Seoul, Korea (South)</publisher-loc>: <publisher-name>IEEE, 2019 IEEE/CVF International Conference on Computer Vision (ICCV)</publisher-name> (<year>2019</year>). p. <page-range>3007&#x2013;16</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2019.00310</pub-id>
</citation></ref>
<ref id="B21">
<label>21</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Reinhard</surname> <given-names>E</given-names>
</name>
<name>
<surname>Adhikhmin</surname> <given-names>M</given-names>
</name>
<name>
<surname>Gooch</surname> <given-names>B</given-names>
</name>
<name>
<surname>Shirley</surname> <given-names>P</given-names>
</name>
</person-group>. <article-title>Color transfer between images</article-title>. In: <source>IEEE computer graphics and applications</source> (<publisher-loc>Seattle, USA</publisher-loc>: <publisher-name>IEEE, IEEE Computer Graphics and Applications</publisher-name>), vol. <volume>21</volume>. (<year>2001</year>). p. <page-range>34&#x2013;41</page-range>.</citation></ref>
<ref id="B22">
<label>22</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Macenko</surname> <given-names>M</given-names>
</name>
<name>
<surname>Niethammer</surname> <given-names>M</given-names>
</name>
<name>
<surname>Marron</surname> <given-names>JS</given-names>
</name>
<name>
<surname>Borland</surname> <given-names>D</given-names>
</name>
<name>
<surname>Woosley</surname> <given-names>JT</given-names>
</name>
<name>
<surname>Guan</surname> <given-names>X</given-names>
</name>
<etal/>
</person-group>. <article-title>A method for normalizing histology slides for quantitative analysis</article-title>. In: <source>2009 IEEE international symposium on biomedical imaging: from nano to macro</source> (<publisher-loc>Boston, MA, USA</publisher-loc>: <publisher-name>IEEE, 2009 IEEE International Symposium on Biomedical Imaging: From Nano to Macro</publisher-name>) (<year>2009</year>). doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ISBI.2009.5193250</pub-id>
</citation></ref>
<ref id="B23">
<label>23</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Tellez</surname> <given-names>D</given-names>
</name>
<name>
<surname>Balkenhol</surname> <given-names>M</given-names>
</name>
<name>
<surname>Karssemeijer</surname> <given-names>N</given-names>
</name>
<name>
<surname>Litjens</surname> <given-names>G</given-names>
</name>
<name>
<surname>van der Laak</surname> <given-names>J</given-names>
</name>
<name>
<surname>Ciompi</surname> <given-names>F</given-names>
</name>
<etal/>
</person-group>. <article-title>&#x201c;H and E stain augmentation improves generalization of convolutional networks for histopathological mitosis detection,&#x201d;</article-title>. In: <source>the proc.SPIE</source> (<publisher-loc>Houston, Texas, USA</publisher-loc>: <publisher-name>Proc. SPIE, Medical Imaging 2018</publisher-name>) (<year>2018</year>). doi:&#xa0;<pub-id pub-id-type="doi">10.1117/12.2293048</pub-id>
</citation></ref>
<ref id="B24">
<label>24</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Chattopadhay</surname> <given-names>A</given-names>
</name>
<name>
<surname>Sarkar</surname> <given-names>A</given-names>
</name>
<name>
<surname>Howlader</surname> <given-names>P</given-names>
</name>
<name>
<surname>Balasubramanian</surname> <given-names>VN</given-names>
</name>
</person-group>. <article-title>&#x201c;Grad-CAM++: generalized gradient-based visual explanations for deep convolutional networks,&#x201d;</article-title>. In: <source>2018 IEEE winter conference on applications of computer vision (WACV)</source> (<publisher-loc>Lake Tahoe, NV, USA</publisher-loc>: <publisher-name>IEEE, 2018 IEEE Winter Conference on Applications of Computer Vision (WACV)</publisher-name>) (<year>2018</year>). p. <page-range>839&#x2013;47</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/WACV.2018.00097</pub-id>
</citation></ref>
<ref id="B25">
<label>25</label>
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Khanam</surname> <given-names>R</given-names>
</name>
<name>
<surname>Hussain</surname> <given-names>M</given-names>
</name>
</person-group>. <source>YOLOv11: an overview of the key architectural enhancements</source> (<year>2024</year>). Available online at: <uri xlink:href="https://arxiv.org/abs/2410.17725">https://arxiv.org/abs/2410.17725</uri>. (Accessed September 15, 2025).</citation></ref>
<ref id="B26">
<label>26</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Simonyan</surname> <given-names>K</given-names>
</name>
<name>
<surname>Zisserman</surname> <given-names>A</given-names>
</name>
</person-group>. <article-title>&#x201c;Very Deep Convolutional Networks for Large-Scale Image Recognition&#x201d;</article-title>. In: <source>3rd International Conference on Learning Representations, ICLR 2015</source> (<publisher-loc>San Diego, CA, USA</publisher-loc>) (<year>2015</year>). Available online at: <uri xlink:href="http://arxiv.org/abs/1409.1556">http://arxiv.org/abs/1409.1556</uri>.</citation></ref>
<ref id="B27">
<label>27</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>K</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>S</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J</given-names>
</name>
</person-group>. <article-title>&#x201c;Deep residual learning for image recognition,&#x201d;</article-title>. In: <source>2016 IEEE conference on computer vision and pattern recognition (CVPR)</source> (<publisher-loc>Las Vegas, NV, USA</publisher-loc>: <publisher-name>IEEE, 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</publisher-name>) (<year>2016</year>). p. <page-range>770&#x2013;8</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2016.90</pub-id>
</citation></ref>
<ref id="B28">
<label>28</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sandler</surname> <given-names>M</given-names>
</name>
<name>
<surname>Howard</surname> <given-names>A</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>M</given-names>
</name>
<name>
<surname>Zhmoginov</surname> <given-names>A</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>L-C</given-names>
</name>
</person-group>. <article-title>&#x201c;MobileNetV2: inverted residuals and linear bottlenecks,&#x201d;</article-title>. In: <source>2018 IEEE/CVF conference on computer vision and pattern recognition</source> (<publisher-loc>Salt Lake City, UT, USA</publisher-loc>: <publisher-name>IEEE, 2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition</publisher-name>) (<year>2018</year>). p.&#xa0;<page-range>4510&#x2013;20</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2018.00474</pub-id>
</citation></ref>
<ref id="B29">
<label>29</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Cai</surname> <given-names>H</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>M</given-names>
</name>
<name>
<surname>Gan</surname> <given-names>C</given-names>
</name>
<name>
<surname>Han</surname> <given-names>S</given-names>
</name>
</person-group>. <article-title>&#x201c;EfficientViT: lightweight multi-scale attention for high-resolution dense prediction,&#x201d;</article-title>. In: <source>2023 IEEE/CVF international conference on computer vision (ICCV)</source>. <publisher-loc>Paris, France</publisher-loc>: <publisher-name>IEEE, 2023 IEEE/CVF International Conference on Computer Vision (ICCV)</publisher-name> (<year>2023</year>). p. <page-range>17256&#x2013;67</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV51070.2023.01587</pub-id>
</citation></ref>
<ref id="B30">
<label>30</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Shao</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Hoffmann</surname> <given-names>N</given-names>
</name>
</person-group>. <article-title>Global attention mechanism: retain information to enhance channel-spatial interactions</article-title>. <source>ArXiv abs/2112.05561</source>. (<year>2021</year>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2112.05561</pub-id>
</citation></ref>
<ref id="B31">
<label>31</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname> <given-names>L</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X</given-names>
</name>
<name>
<surname>Ke</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>W</given-names>
</name>
<name>
<surname>Lau</surname> <given-names>R</given-names>
</name>
</person-group>. <article-title>&#x201c;BiFormer: vision transformer with bi-level routing attention,&#x201d;</article-title>. In: <source>2023 IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source> (<publisher-loc>Vancouver, BC, Canada</publisher-loc>: <publisher-name>IEEE, 2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</publisher-name>) (<year>2023</year>). p. <page-range>10323&#x2013;33</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52729.2023.00995</pub-id>
</citation></ref>
<ref id="B32">
<label>32</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Hou</surname> <given-names>Q</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>D</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>J</given-names>
</name>
</person-group>. <article-title>&#x201c;Coordinate attention for efficient mobile network design,&#x201d;</article-title>. In: <source>2021 IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source> (<publisher-loc>Nashville, TN, USA</publisher-loc>: <publisher-name>IEEE, 2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</publisher-name>) (<year>2021</year>). p. <page-range>13708&#x2013;17</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR46437.2021.01350</pub-id>
</citation></ref>
<ref id="B33">
<label>33</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname> <given-names>H</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Zou</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>M</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>C</given-names>
</name>
<name>
<surname>Song</surname> <given-names>Y</given-names>
</name>
<etal/>
</person-group>. <article-title>Channel prior convolutional attention for medical image segmentation</article-title>. <source>Comput Biol Med</source>. (<year>2024</year>) <volume>178</volume>:<elocation-id>108784</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.108784</pub-id>, PMID: <pub-id pub-id-type="pmid">38941900</pub-id></citation></ref>
<ref id="B34">
<label>34</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Yang</surname> <given-names>L</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>R-Y</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>X</given-names>
</name>
</person-group>. <article-title>&#x201c;SimAM: A simple, parameter-free attention module for convolutional neural networks,&#x201d;</article-title>. In: <person-group person-group-type="editor">
<name>
<surname>Meila</surname> <given-names>M</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>T</given-names>
</name>
</person-group>, editors. <source>Proceedings of the 38th international conference on machine learning</source>, vol. <volume>139</volume>. <publisher-loc>Virtual Event</publisher-loc>: <publisher-name>Proceedings of Machine Learning Research</publisher-name> (<year>2021</year>). p. <page-range>11863&#x2013;74</page-range>. Available online at: <uri xlink:href="https://proceedings.mlr.press/v139/yang21o.html">https://proceedings.mlr.press/v139/yang21o.html</uri>. (Accessed September 15, 2025).</citation></ref>
<ref id="B35">
<label>35</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Woo</surname> <given-names>S</given-names>
</name>
<name>
<surname>Park</surname> <given-names>J</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>JY</given-names>
</name>
<name>
<surname>Kweon</surname> <given-names>IS</given-names>
</name>
</person-group>. <article-title>&#x201c;CBAM: convolutional block attention module,&#x201d;</article-title>. In: <source>Computer vision &#x2013; ECCV 2018: 15th european conference, munich, Germany, september 8&#x2013;14, 2018, proceedings, part VII</source>. <publisher-name>Springer-Verlag</publisher-name>, <publisher-loc>Berlin, Heidelberg</publisher-loc> (<year>2018</year>). p. <fpage>3</fpage>&#x2013;<lpage>19</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-030-01234-2_1</pub-id>
</citation></ref>
<ref id="B36">
<label>36</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ren</surname> <given-names>S</given-names>
</name>
<name>
<surname>He</surname> <given-names>K</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J</given-names>
</name>
</person-group>. <article-title>&#x201c;Faster R-CNN: towards real-time object detection with region proposal networks,&#x201d;</article-title>. In: <source>IEEE transactions on pattern analysis and machine intelligence</source> (<publisher-loc>Boston, MA, USA</publisher-loc>: <publisher-name>IEEE, 2016 IEEE Transactions on Pattern Analysis and Machine Intelligence</publisher-name>), vol. <volume>39</volume>. (<year>2017</year>). p. <page-range>1137&#x2013;49</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2016.2577031</pub-id>, PMID: <pub-id pub-id-type="pmid">27295650</pub-id></citation></ref>
<ref id="B37">
<label>37</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Lv</surname> <given-names>W</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>S</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>J</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>G</given-names>
</name>
<name>
<surname>Dang</surname> <given-names>Q</given-names>
</name>
<etal/>
</person-group>. <article-title>&#x201c;DETRs beat YOLOs on real-time object detection,&#x201d;</article-title>. In: <source>2024 IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source> (<publisher-loc>Seattle, WA, USA</publisher-loc>: <publisher-name>IEEE, 2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</publisher-name>) (<year>2024</year>). p. <page-range>16965&#x2013;74</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52733.2024.01605</pub-id>
</citation></ref>
<ref id="B38">
<label>38</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Lin</surname> <given-names>T-Y</given-names>
</name>
<name>
<surname>Goyal</surname> <given-names>P</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R</given-names>
</name>
<name>
<surname>He</surname> <given-names>K</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname> <given-names>P</given-names>
</name>
</person-group>. <article-title>&#x201c;Focal loss for dense object detection,&#x201d;</article-title>. In: <source>2017 IEEE international conference on computer vision (ICCV)</source> (<publisher-loc>Venice, Italy</publisher-loc>: <publisher-name>IEEE, 2017 IEEE International Conference on Computer Vision (ICCV)</publisher-name>) (<year>2017</year>). p. <fpage>2999</fpage>&#x2013;<lpage>3007</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2017.324</pub-id>
</citation></ref>
<ref id="B39">
<label>39</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jocher</surname> <given-names>G</given-names>
</name>
<name>
<surname>Chaurasia</surname> <given-names>A</given-names>
</name>
<name>
<surname>Stoken</surname> <given-names>A</given-names>
</name>
<name>
<surname>Borovec</surname> <given-names>J</given-names>
</name>
<name>
<surname>Kwon</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Michael</surname> <given-names>K</given-names>
</name>
<etal/>
</person-group>. <article-title>ultralytics/yolov5: v7.0 - YOLOv5 SOTA realtime instance segmentation</article-title>. <source>Zenodo</source>. (<year>2022</year>). doi:&#xa0;<pub-id pub-id-type="doi">10.5281/zenodo.7347926</pub-id>
</citation></ref>
<ref id="B40">
<label>40</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C-Y</given-names>
</name>
<name>
<surname>Bochkovskiy</surname> <given-names>A</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H-YM</given-names>
</name>
</person-group>. <article-title>&#x201c;YOLOv7: trainable bag-of-freebies sets new state-of-the-art for real-time object detectors,&#x201d;</article-title>. In: <source>2023 IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source> (<publisher-loc>Vancouver, BC, Canada</publisher-loc>: <publisher-name>IEEE, 2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</publisher-name>) (<year>2023</year>). p. <page-range>7464&#x2013;75</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR52729.2023.00721</pub-id>
</citation></ref>
<ref id="B41">
<label>41</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Varghese</surname> <given-names>R</given-names>
</name>
<name>
<surname>S.</surname> <given-names>M</given-names>
</name>
</person-group>. <article-title>&#x201c;YOLOv8: A novel object detection algorithm with enhanced performance and robustness,&#x201d;</article-title>. In: <source>2024 international conference on advances in data engineering and intelligent computing systems (ADICS)</source> (<publisher-loc>Chennai, India</publisher-loc>: <publisher-name>IEEE, 2024 International Conference on Advances in Data Engineering and&#xa0;Intelligent Computing Systems (ADICS)</publisher-name>) (<year>2024</year>). p. <fpage>1</fpage>&#x2013;<lpage>6</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ADICS58448.2024.10533619</pub-id>
</citation></ref>
<ref id="B42">
<label>42</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Vakanski</surname> <given-names>A</given-names>
</name>
<name>
<surname>Xian</surname> <given-names>M</given-names>
</name>
</person-group>. <article-title>Evaluation of complexity measures for deep learning generalization in medical image analysis</article-title>. In: <source>2021 IEEE 31st international workshop on machine learning for signal processing (MLSP)</source> (<publisher-loc>Gold Coast, Australia</publisher-loc>: <publisher-name>IEEE, 2021 IEEE 31st International Workshop on Machine Learning for Signal Processing (MLSP)</publisher-name>) (<year>2021</year>). p. <fpage>1</fpage>&#x2013;<lpage>6</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/MLSP52302.2021.9596501</pub-id>, PMID: <pub-id pub-id-type="pmid">35527797</pub-id></citation></ref>
<ref id="B43">
<label>43</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname> <given-names>H</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>S</given-names>
</name>
<name>
<surname>Ji</surname> <given-names>J</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Zhi</surname> <given-names>W</given-names>
</name>
<name>
<surname>Mo</surname> <given-names>N</given-names>
</name>
<etal/>
</person-group>. <article-title>A deep-learning-based artificial intelligence system for the pathology diagnosis of uterine smooth muscle tumor</article-title>. <source>Life (Basel)</source>. (<year>2022</year>) <volume>13</volume>:<elocation-id>3</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/life13010003</pub-id>, PMID: <pub-id pub-id-type="pmid">36675952</pub-id></citation></ref>
<ref id="B44">
<label>44</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname> <given-names>J</given-names>
</name>
<name>
<surname>Cha</surname> <given-names>S</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>J</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>JJ</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>N</given-names>
</name>
<name>
<surname>Jae Gal</surname> <given-names>SG</given-names>
</name>
<etal/>
</person-group>. <article-title>Ensemble deep learning model to predict lymphovascular invasion in gastric cancer</article-title>. <source>Cancers (Basel)</source>. (<year>2024</year>) <volume>16</volume>:<elocation-id>430</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/cancers16020430</pub-id>, PMID: <pub-id pub-id-type="pmid">38275871</pub-id></citation></ref>
<ref id="B45">
<label>45</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>C</given-names>
</name>
<name>
<surname>Che</surname> <given-names>S</given-names>
</name>
<name>
<surname>Gong</surname> <given-names>H</given-names>
</name>
<name>
<surname>Ding</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Xi</surname> <given-names>J</given-names>
</name>
<etal/>
</person-group>. <article-title>PI-YOLO: dynamic sparse attention and lightweight convolutional based YOLO for vessel detection in pathological images</article-title>. <source>Front Oncol</source>. (<year>2024</year>) <volume>14</volume>:<elocation-id>1347123</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fonc.2024.1347123</pub-id>, PMID: <pub-id pub-id-type="pmid">39184041</pub-id></citation></ref>
</ref-list>
</back>
</article>