<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2025.1529814</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Artificial Intelligence</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Precision enhancement in wireless capsule endoscopy: a novel transformer-based approach for real-time video object detection</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Habe</surname> <given-names>Tsedeke Temesgen</given-names></name>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2897297/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Haataja</surname> <given-names>Keijo</given-names></name>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Toivanen</surname> <given-names>Pekka</given-names></name>
<uri xlink:href="http://loop.frontiersin.org/people/1787641/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff><institution>School of Computing, University of Eastern Finland, Kuopio</institution>, <addr-line>North Savo</addr-line>, <country>Finland</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Pengcheng Liu, University of York, United Kingdom</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Aydin Eresen, University of California, Irvine, United States</p>
<p>Yan Wen, University of Lincoln, United Kingdom</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Tsedeke Temesgen Habe <email>tshabe&#x00040;uef.fi</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>30</day>
<month>04</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>8</volume>
<elocation-id>1529814</elocation-id>
<history>
<date date-type="received">
<day>17</day>
<month>11</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>03</day>
<month>04</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2025 Habe, Haataja and Toivanen.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Habe, Haataja and Toivanen</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<sec>
<title>Background</title>
<p>Wireless Capsule Endoscopy (WCE) enables non-invasive imaging of the gastrointestinal tract but generates vast video data, making real-time and accurate abnormality detection challenging. Traditional detection methods struggle with uncontrolled illumination, complex textures, and high-speed processing demands.</p></sec>
<sec>
<title>Methods</title>
<p>This study presents a novel approach using Real-Time Detection Transformer (RT-DETR), a transformer-based object detection model, specifically optimized for WCE video analysis. The model captures contextual information between frames and handles variable image conditions. It was evaluated using the Kvasir-Capsule dataset, with performance assessed across three RT-DETR variants: Small (S), Medium (M), and X-Large (X).</p></sec>
<sec>
<title>Results</title>
<p>RT-DETR-X achieved the highest detection precision. RT-DETR-M offered a practical trade-off between accuracy and speed, while RT-DETR-S processed frames at 270 FPS, enabling real-time performance. All three models demonstrated improved detection accuracy and computational efficiency compared to baseline methods.</p></sec>
<sec>
<title>Discussion</title>
<p>The RT-DETR framework significantly enhances precision and real-time performance in gastrointestinal abnormality detection using WCE. Its clinical potential lies in supporting faster and more accurate diagnosis. Future work will focus on further optimization and deployment in endoscopic video analysis systems.</p></sec></abstract>
<kwd-group>
<kwd>capsule endoscopy</kwd>
<kwd>object detection</kwd>
<kwd>real-time processing</kwd>
<kwd>transformer models</kwd>
<kwd>video analysis</kwd>
<kwd>wireless communication</kwd>
<kwd>medical imaging</kwd>
<kwd>deep learning</kwd>
</kwd-group>
<counts>
<fig-count count="8"/>
<table-count count="4"/>
<equation-count count="0"/>
<ref-count count="52"/>
<page-count count="16"/>
<word-count count="10564"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Pattern Recognition</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1 Introduction</title>
<p>Wireless capsule endoscopy (WCE) is an advanced technique that has been introduced to capture images of the gastrointestinal tract from inside using a wireless capsule that can be swallowed by the patient. In traditional endoscopy, by contrast, it is only possible to partially examine the small intestine due to the invasive nature of procedures such as colonoscopy.</p>
<p>A number of gastrointestinal disorders are frequently seen during wireless capsule endoscopy, such as abnormalities of the ampulla of Vater (Weerakkody et al., <xref ref-type="bibr" rid="B45">2024</xref>), angiectasia (Igawa et al., <xref ref-type="bibr" rid="B15">2015</xref>; Saltzman, <xref ref-type="bibr" rid="B34">2024</xref>), fresh blood and blood hematin (Kimberly and Baillie, <xref ref-type="bibr" rid="B18">2006</xref>), erosion (Feldman et al., <xref ref-type="bibr" rid="B9">2020</xref>), erythema (Ginsberg et al., <xref ref-type="bibr" rid="B10">2011</xref>), foreign bodies (Ikenberry et al., <xref ref-type="bibr" rid="B3">2011</xref>), lymphatic edema (Strober et al., <xref ref-type="bibr" rid="B39">1967</xref>), polyps (Machicado et al., <xref ref-type="bibr" rid="B26">2020</xref>), and ulcers (Kuipers et al., <xref ref-type="bibr" rid="B19">1995</xref>). This study focuses on 10 of these pathology classes for detection and analysis using the Kvasir-Capsule dataset.</p>
<p>Although WCE, unlike colonoscopy, offers clinical benefits that include early diagnosis of disease, the technology poses a challenge in data analysis. A single WCE procedure can produce more than fifty thousand images, amounting to several hours of video that a specialist has to go through carefully. This manual review is inefficient and time-consuming compared with computer-aided analysis, and it exposes the patient to the risk of misdiagnosis and of missing abnormalities that should be noticeable. It is therefore important to have automated systems that meet the efficiency and accuracy needs of detecting abnormalities in WCE videos in real time.</p>
<p>Our prior study benchmarked deep learning models for WCE detection, identifying RT-DETR as a promising solution (Habe et al., <xref ref-type="bibr" rid="B13">2024</xref>). The following models were implemented and compared: RT-MDET; RT-MDET variants; SSD; SSD variants; YOLOv3; Faster R-CNN; EfficientDet; and RetinaNet. When these above-mentioned models were used on WCE data, the strength of each model could be seen in terms of the benefits it provided; however, restrictions could also be observed in terms of their weaknesses in the context of the ever-changing environment of the WCE data.</p>
<p>In our earlier studies (Habe et al., <xref ref-type="bibr" rid="B13">2024</xref>), we tested and investigated several deep-learning models to overcome these issues in WCE video analysis. They include RT-MDET variants, SSD variants, YOLOv3, Faster R-CNN, EfficientDet, and RetinaNet, which have been developed and benchmarked. All of these models were useful in terms of the unique features they provided, yet when applied in the WCE data context, each exhibited certain weaknesses.</p>
<list list-type="bullet">
<list-item><p><bold>RT-MDET variants:</bold> These models were intended for real-time detection in WCE videos because of their focus on two parameters: efficiency and precision. Although they were effective in detecting abnormalities visualized under stable lighting conditions and against simple structures, they suffered from low precision under variable lighting conditions and with complex structures.</p></list-item>
<list-item><p><bold>SSD variants and YOLOv3:</bold> These models are lightweight and well suited to real-time operation. At the same time, however, they provided significantly worse detection results in most cases, especially in the presence of certain low-contrast abnormalities or areas with poor lighting.</p></list-item>
<list-item><p><bold>Faster R-CNN and EfficientDet:</bold> These models were especially effective in terms of detection accuracy, which was occasionally even higher than that of other techniques when distinguishing minor elements in the WCE videos. However, this came with increased computational complexity and often longer processing time, so they were not as suitable for real-time clinical use.</p></list-item>
<list-item><p><bold>RetinaNet:</bold> This allowed for a more balanced model, which gave good accuracy with realistic processing time. However, like most models, it had its limitations in the fact that it could not easily be applied to WCE data which had different textures and also contained fluids.</p></list-item>
</list>
<sec>
<title>1.1 Problem statement</title>
<p>The challenges in WCE imaging, particularly in the gastrointestinal tract, are significant. Detecting abnormalities becomes harder because of image quality variability together with visualization blurring and motion artifacts and also mucus and bubbles and food residues that exist in the images (Sadeghi et al., <xref ref-type="bibr" rid="B32">2024</xref>). The model faces difficulties predicting across all pathology types because of its data imbalance. High data volumes (often exceeding 50,000 frames per patient) create complexity in WCE analysis while demanding significant storage capacities along with powerful computational capabilities (Sadeghi et al., <xref ref-type="bibr" rid="B32">2024</xref>; Pascual et al., <xref ref-type="bibr" rid="B29">2022</xref>). The evaluation process by medical experts takes significant time which proves the necessity for advanced automated processing methods. The scarcity of annotated data presents difficulties in training reliable models because additional methods must be employed (Pascual et al., <xref ref-type="bibr" rid="B29">2022</xref>). Algorithmic constraints also play a role, as prior object detection models like Faster R-CNN and YOLOv3 struggle with detecting small lesions and handling the complex, variable textures found in the gastrointestinal tract (Zhang et al., <xref ref-type="bibr" rid="B51">2024c</xref>). Studies have shown that these models often fail due to the intricate background patterns and illumination variability present in WCE datasets (Gui et al., <xref ref-type="bibr" rid="B12">2024</xref>). By implementing real-time processing capabilities and advanced feature extraction methods the RT-DETR model addresses existing limitations in Gastrointestinal endoscopy systems.</p></sec>
<sec>
<title>1.2 Proposed solution</title>
<p>In order to overcome these challenges, we introduce the Real-Time Detection Transformer (RT-DETR) model as a video analysis architecture built with the transformer network. Transformers (Wang et al., <xref ref-type="bibr" rid="B44">2025</xref>) have shown excellent performance in capturing long-range dependencies and, more recently, in computer vision, owing to their capacity to model long-distance relations and context in sequences of data. These strengths are utilized in the RT-DETR model to improve the identification of GI pathologies in WCE videos, particularly in poor light conditions. RT-DETR effectively addresses the limitations listed in Section 1.1 by leveraging multi-scale feature interaction, an optimized lightweight design, loss functions, self-attention mechanisms, and real-time processing (Zhang et al., <xref ref-type="bibr" rid="B49">2024a</xref>; Lv et al., <xref ref-type="bibr" rid="B25">2024</xref>; Guemas et al., <xref ref-type="bibr" rid="B11">2024</xref>).</p></sec>
<sec>
<title>1.3 Contributions</title>
<p>The primary contributions of this work are as follows:</p>
<list list-type="bullet">
<list-item><p><bold>Novel application:</bold> We propose the RT-DETR model for real-time object detection on WCE videos since there are existing models that do not achieve adequate performance.</p></list-item>
<list-item><p><bold>Model enhancements:</bold> We offer several architectural modifications aimed explicitly at enhancing performance on WCE data, such as the use of various preprocessing techniques and efficient attention mechanisms.</p></list-item>
<list-item><p><bold>Comprehensive evaluation:</bold> We perform several performance evaluation experiments on a highly selected WCE dataset and show that the proposed RT-DETR model is superior to all the other existing models with regard to both accuracy and speed.</p></list-item>
<list-item><p><bold>Clinical relevance:</bold> The use and implementation of the presented RT-DETR model in WCE-based diagnosis can accelerate the process of video analysis and increase the effectiveness of detection results.</p></list-item>
</list></sec></sec>
<sec id="s2">
<title>2 Related work</title>
<p>Various deep learning methods have been applied to WCE diagnosis, evolving from traditional feature-based approaches to advanced deep learning techniques (Bordbar et al., <xref ref-type="bibr" rid="B4">2023</xref>; Varam et al., <xref ref-type="bibr" rid="B41">2023</xref>; Alawode et al., <xref ref-type="bibr" rid="B2">2024</xref>; Alavala et al., <xref ref-type="bibr" rid="B1">2024</xref>; Wu et al., <xref ref-type="bibr" rid="B46">2023</xref>; Oh et al., <xref ref-type="bibr" rid="B28">2023</xref>). These methods aim to enhance detection accuracy and efficiency in analyzing WCE data.</p>
<sec>
<title>2.1 Traditional object detection approaches</title>
<p>Bordbar et al. (<xref ref-type="bibr" rid="B4">2023</xref>) conducted a study where a 3D-CNN model is used for multiclass classification of WCE frames which is a major improvement as compared to traditional approaches where handcrafted features are used along with classical machine learning techniques such as SVM. Bordbar et al. (<xref ref-type="bibr" rid="B4">2023</xref>) noted that traditional techniques did not do well when handling variability in WCE images and specifically in identifying small and intricate lesions. The 3D-CNN (Bordbar et al., <xref ref-type="bibr" rid="B4">2023</xref>), which included temporal information across frames of the video, improved the accuracy, but computational load and real-time performance remained a problem. The implementation of SVM demonstrates exceptional accuracy through its ability to achieve 99.41% detection precision when using color features based on HSI color space to distinguish between normal and abnormal patterns (Li et al., <xref ref-type="bibr" rid="B21">2012</xref>; Khun et al., <xref ref-type="bibr" rid="B17">2009</xref>). Current research on Random Forest and k-NN methods in WCE detection remains scarce, although these classifiers appear in comparative analyses because of their ability to handle diverse datasets, and Random Forest offers additional benefits in feature selection and overfitting reduction (Khun et al., <xref ref-type="bibr" rid="B17">2009</xref>). Clinicians face a major challenge with the vast number of WCE images so additional research must focus on creating automated systems which combine deep learning models with current classifiers to boost accuracy and decrease processing times (Varam et al., <xref ref-type="bibr" rid="B41">2023</xref>; Li et al., <xref ref-type="bibr" rid="B21">2012</xref>).</p></sec>
<sec>
<title>2.2 Deep learning in medical imaging</title>
<p>Deep learning techniques have significantly advanced medical imaging applications, particularly in WCE image analysis. Several CNN-based models, including InceptionV3, EfficientNetV2, ResNet, DenseNet, and MobileNet, have been developed to improve feature extraction and classification performance (Varam et al., <xref ref-type="bibr" rid="B41">2023</xref>; Oh et al., <xref ref-type="bibr" rid="B28">2023</xref>). However, CNNs have limitations in incorporating global contextual features, leading to misclassification of visually similar disease classes (Oh et al., <xref ref-type="bibr" rid="B28">2023</xref>). ShuffleNetV1 and ResNet56 demonstrated effectiveness in binary classification tasks but struggled with multiclass cases due to class imbalance and the lack of global contextual information (Wu et al., <xref ref-type="bibr" rid="B46">2023</xref>).</p>
<p>To address these limitations, hybrid architectures have been proposed. The Minimum Spanning Tree (MST) and Spatial Pyramid Pooling, integrated with an EfficientNet-CondConv architecture, have improved hierarchical feature extraction for WCE images (Sharmila and Geetha, <xref ref-type="bibr" rid="B36">2024</xref>). Additionally, PitTree Fusion Algorithms and conditional convolutions have been incorporated to enhance adaptability to varying input resolutions and complexity (Sharmila and Geetha, <xref ref-type="bibr" rid="B36">2024</xref>).</p>
<p>Transformer-based models have also shown promise in WCE analysis. YOLOv8, enhanced with VanillaNet and an Advanced Feature Pyramid Network (AFPN), has been optimized for real-time detection with high feature extraction accuracy (Liang et al., <xref ref-type="bibr" rid="B24">2024</xref>). Comparisons across 14 different CNN-based models indicate that YOLO series models, particularly YOLOv8n, achieve high accuracy and fast inference speeds, reaching up to 416 FPS (Zhang et al., <xref ref-type="bibr" rid="B50">2024b</xref>).</p>
<p>Vision Transformer (ViT) architectures have been introduced to further improve feature extraction. FLATer, a ViT-derived architecture, has demonstrated superior performance by capturing long-range dependencies and global features in endoscopic images (Oh et al., <xref ref-type="bibr" rid="B28">2023</xref>). Swin Transformer and CaiT models, achieving 79.15% accuracy on the Kvasir Capsule dataset and 98.63% on the Red Lesion Endoscopy (RLE) dataset, have been identified as more effective than CNNs due to their ability to model long-distance dependencies (Wu et al., <xref ref-type="bibr" rid="B46">2023</xref>). However, computational overhead remains a limitation in clinical applications.</p>
<p>Recent research has explored Multi-Scale Coupled Attention (MSCA) networks, designed to improve object detection in varying scales (Li et al., <xref ref-type="bibr" rid="B20">2024a</xref>). Ablation studies confirm that MSCCA and MSCSA modules enhance feature recognition precision and stability, making these networks suitable for complex visual scenarios. The combination of CNNs and Transformers has been shown to enhance global feature extraction, leading to improved performance in differentiating GI lesions, including polyps and cancers (Tang et al., <xref ref-type="bibr" rid="B40">2023</xref>).</p>
<p>Furthermore, active learning techniques have been incorporated into ViT models to improve training efficiency in scenarios with limited labeled data (Tang et al., <xref ref-type="bibr" rid="B40">2023</xref>). Despite their advantages, ViTs require extensive computational resources, and further research is needed to optimize these architectures for real-time applications (Li et al., <xref ref-type="bibr" rid="B23">2024c</xref>). Advances in lightweight transformer models and hybrid architectures continue to refine the balance between accuracy, computational efficiency, and real-time diagnostic applicability (Pornvoraphat et al., <xref ref-type="bibr" rid="B31">2023</xref>; Chen et al., <xref ref-type="bibr" rid="B6">2023</xref>).</p></sec>
<sec>
<title>2.3 Real-Time DEtection TRansformer</title>
<p>The VST model that applies T2T-ViT is designed to focus on regions likely to contain polyps, thereby improving detection accuracy (de Moura Lima et al., <xref ref-type="bibr" rid="B8">2023</xref>). The DETR model, when integrated with a ResNet-50 backbone, effectively addresses various object detection tasks, benefiting from transformers&#x00027; ability to learn long-range dependencies within images (de Moura Lima et al., <xref ref-type="bibr" rid="B8">2023</xref>). These properties enable accurate detection of polyps (de Moura Lima et al., <xref ref-type="bibr" rid="B8">2023</xref>). Additionally, ViT-H/14 has been utilized as the primary classification model for gastroscopic images, leveraging transfer learning with pre-training on the ImageNet-21k dataset (Chae and Cho, <xref ref-type="bibr" rid="B5">2023</xref>). The ViT-H/14 and BiT-L models facilitate relevant feature extraction from small image patches, improving model performance and classification accuracy (Chae and Cho, <xref ref-type="bibr" rid="B5">2023</xref>).</p>
<p>RT-DETR and Deformable DETR models, both based on the Transformer architecture, have been evaluated for real-time object detection (Zhang et al., <xref ref-type="bibr" rid="B49">2024a</xref>). RT-DETR achieves a balanced trade-off between precision and recall, demonstrating high inference speed at 46.9 FPS. Although this is lower than the YOLO series, it remains well-suited for real-time detection tasks (Zhang et al., <xref ref-type="bibr" rid="B49">2024a</xref>).</p>
<p>RT-DETRv2 introduces several enhancements over the original RT-DETR model, optimizing its performance for real-time applications. The deformable attention module has been modified to include multi-scale sampling points, improving the model&#x00027;s ability to learn selective multi-scale features (Lv et al., <xref ref-type="bibr" rid="B25">2024</xref>). Additionally, a discrete sampling operator is employed to replace the grid_sample function, eliminating deployment issues without affecting performance (Lv et al., <xref ref-type="bibr" rid="B25">2024</xref>). Dynamic data augmentation is incorporated, adjusting augmentation strength during training to enhance generalization to target domains (Lv et al., <xref ref-type="bibr" rid="B25">2024</xref>). Moreover, RT-DETRv2 introduces scale-adaptive hyperparameter tuning, which optimizes learning rates based on model size, improving feature quality in smaller networks like ResNet18 while preserving efficiency in larger networks such as ResNet101 (Lv et al., <xref ref-type="bibr" rid="B25">2024</xref>).</p>
<p>A comparative study demonstrated that RT-DETRv2 outperformed its predecessor in both AP and FPS metrics on the COCO dataset across different model sizes (Lv et al., <xref ref-type="bibr" rid="B25">2024</xref>). For instance, RT-DETRv2-S, based on ResNet18, achieved an AP of 47.9, marking a 1.4-point improvement, while maintaining a stable FPS of 217 (Lv et al., <xref ref-type="bibr" rid="B25">2024</xref>). Further ablation studies validated the improvements, showing that reducing sampling points in the deformable attention module did not significantly compromise accuracy while maintaining efficiency (Lv et al., <xref ref-type="bibr" rid="B25">2024</xref>).</p>
<p>Alavala et al. (<xref ref-type="bibr" rid="B1">2024</xref>) proposed a pipeline utilizing the Swin Transformer model for classifying WCE frames into bleeding and non-bleeding categories, while RT-DETR was employed for bleeding region detection and segmentation. The Swin Transformer captures both local and global spatial dependencies, while RT-DETR integrates a hybrid encoder and uncertainty-minimal query selection for precise abnormality detection (Alavala et al., <xref ref-type="bibr" rid="B1">2024</xref>). The preprocessing techniques of Lab color space conversion and CLAHE help models perform better through contrast enhancement and artifact reduction according to Alavala et al. (<xref ref-type="bibr" rid="B1">2024</xref>). The model attained 66.7% average precision (AP) as well as 98.5% classification accuracy while performing on the validation set (Alavala et al., <xref ref-type="bibr" rid="B1">2024</xref>). The study (Muzammul et al., <xref ref-type="bibr" rid="B27">2024</xref>) introduced a novel approach for UAV aerial image analysis, leveraging Slicing Aided Hyper Inference (SAHI) alongside the RT-DETR-X model. The objective was to improve detection accuracy and efficiency in high-resolution aerial imagery, using the VisDrone-DET dataset for evaluation. The RT-DETR-X model demonstrated real-time object detection capabilities, enhanced by the SAHI method, particularly in identifying small objects within high-resolution scenes (Muzammul et al., <xref ref-type="bibr" rid="B27">2024</xref>).</p>
<p>DETR and Faster R-CNN have also been applied to the localization, detection, and characterization of focal liver lesions (FLLs) in ultrasound images (Dadoun et al., <xref ref-type="bibr" rid="B7">2022</xref>). While DETR achieved superior accuracy with a specificity of 90% and sensitivity of 97%, making it well-suited for real-time clinical applications, Faster R-CNN performed better in certain lesion characterization tasks (Dadoun et al., <xref ref-type="bibr" rid="B7">2022</xref>). This comparison highlights the potential of transformer-based models to enhance diagnostic accuracy in medical imaging.</p>
<p>The RT-DETR model has further been applied for malaria diagnosis by automating the detection and classification of four Plasmodium species in thin blood films (Guemas et al., <xref ref-type="bibr" rid="B11">2024</xref>). The model exhibited high sensitivity, achieving a 90% recall rate in detecting <italic>P. falciparum</italic>. However, distinguishing species such as <italic>P. vivax</italic> and <italic>P. ovale</italic> remains challenging due to their morphological similarities. Overall, RT-DETR was found to be as effective as YOLOv8x for patient-level detection, demonstrating potential for real-time diagnostic applications on low-cost devices, including smartphones (Guemas et al., <xref ref-type="bibr" rid="B11">2024</xref>).</p>
<p>A two-stage detection algorithm incorporating depth maps, Visual Saliency Transformer, and DETR has been developed for polyp detection in colonoscopy images (de Moura Lima et al., <xref ref-type="bibr" rid="B8">2023</xref>). This approach achieved a detection accuracy of 92.6% on the Kvasir-SEG dataset, demonstrating improvements in depth map utilization, saliency extraction, and transformer-based feature learning (de Moura Lima et al., <xref ref-type="bibr" rid="B8">2023</xref>).</p>
<p>The Residual Convolution DETR (RPC-DETR) model introduces several optimizations relevant to medical image analysis, including WCE video detection (Shao et al., <xref ref-type="bibr" rid="B35">2024</xref>). The addition of a Residual Convolution block (RPC-block) enhances feature extraction while reducing computational costs, making it suitable for real-time applications. Additionally, the Shape-IoU loss function improves bounding box regression by accounting for shape variations, which is particularly useful for detecting gastrointestinal abnormalities in WCE images (Shao et al., <xref ref-type="bibr" rid="B35">2024</xref>).</p>
<p>For colorectal cancer screening, YOLOv5 has been enhanced with a P-C3 module and Context Feature Augmentation (CFA) to improve the detection of small and low-contrast polyps in colonoscopy images (Wan et al., <xref ref-type="bibr" rid="B43">2024b</xref>). The integration of a Coordinate Attention Mechanism (CAM) further refines feature selection, enhancing model focus on relevant areas. Evaluation results indicate that the improved YOLOv5 outperformed YOLOv8, RT-DETR R50, and other state-of-the-art methods in polyp detection (Wan et al., <xref ref-type="bibr" rid="B43">2024b</xref>).</p>
<p>The Deformable DETR model has also been applied for breast cancer detection in mammographic images (Xu et al., <xref ref-type="bibr" rid="B47">2024</xref>). The study examined the effectiveness of design choices from Deformable DETR in medical imaging and found that multi-scale feature fusion and complex encoder structures, while beneficial for natural images, may not always improve performance in medical datasets. Instead, simpler architectures were found to be more effective, particularly when handling high-resolution images with small regions of interest (Xu et al., <xref ref-type="bibr" rid="B47">2024</xref>). This insight is relevant to RT-DETR in WCE analysis, as optimizing model complexity may enhance both speed and detection accuracy.</p>
<p>In segmentation tasks, Point SEGTR has been introduced as a deep weakly semi-supervised model derived from DETR (Shi et al., <xref ref-type="bibr" rid="B37">2023</xref>). This framework leverages fully supervised and weakly supervised data, incorporating multi-point and symmetric consistency constraints to improve segmentation stability and effectiveness. Such techniques are particularly beneficial for RT-DETR applications in colonoscopy, where annotated training data is often limited (Shi et al., <xref ref-type="bibr" rid="B37">2023</xref>).</p>
<p>Researchers assessed RT-DETR variants (ResNet18, ResNet34, and ResNet50) for detecting colorectal polyps on both the Kvasir-SEG and CVC-ColonDB datasets (Yu et al., <xref ref-type="bibr" rid="B48">2025</xref>). RT-DETR-ResNet34 demonstrated the best AP&#x00040;0.5 performance, with 0.8859 on Kvasir-SEG and 0.8551 on CVC-ColonDB, outscoring RT-DETR-ResNet18 and RT-DETR-ResNet50 in most test cases (Yu et al., <xref ref-type="bibr" rid="B48">2025</xref>). PD-YOLO outperformed all other models in the experiments, achieving an AP&#x00040;0.5 score of 0.8828 on CVC-ColonDB and 0.9478 on Kvasir-SEG, and also exhibited better recall values and F1-scores (Yu et al., <xref ref-type="bibr" rid="B48">2025</xref>).</p></sec>
<sec>
<title>2.4 Classification of WCE frames</title>
<p>The study classified WCE frames into three categories: Lesion Frames, which contain pathologies such as ulcers, polyps, and bleeding; Normal Frames, which show no pathologically altered tissue; and Poor Frames, in which visibility is compromised by artifacts such as mucus, shadows, or bubbles (Bordbar et al., <xref ref-type="bibr" rid="B4">2023</xref>).</p>
<p>The study categorized the WCE frames into nine classes: foreign body, reduced mucosal view, ileocecal valve, pylorus, ulcer, erosion, lymphangiectasia, erythema, and normal mucosa (Varam et al., <xref ref-type="bibr" rid="B41">2023</xref>). High classification accuracy was obtained across a variety of gastrointestinal diseases, demonstrating the robustness of the ViT (Varam et al., <xref ref-type="bibr" rid="B41">2023</xref>) model in handling diverse diseases. However, the study also revealed some challenges, the key one being misclassification, especially between visually similar classes such as Erosion and Angiectasia (Varam et al., <xref ref-type="bibr" rid="B41">2023</xref>).</p>
<p>Another study classified WCE frames into categories such as normal frames, inflammatory diseases, vascular lesions, polyps, tumors, and bleeding, and achieved real-time execution with an average frame rate of 30 FPS (Wu et al., <xref ref-type="bibr" rid="B46">2023</xref>).</p></sec>
<sec>
<title>2.5 Datasets</title>
<p>The reviewed studies utilized large, publicly available datasets, including Kvasir-Capsule (Varam et al., <xref ref-type="bibr" rid="B41">2023</xref>; Pogorelov et al., <xref ref-type="bibr" rid="B30">2017</xref>; Smedsrud et al., <xref ref-type="bibr" rid="B38">2021</xref>; Oh et al., <xref ref-type="bibr" rid="B28">2023</xref>; Wu et al., <xref ref-type="bibr" rid="B46">2023</xref>; Sharmila and Geetha, <xref ref-type="bibr" rid="B36">2024</xref>), Red Lesion Endoscopy (RLE) (Wu et al., <xref ref-type="bibr" rid="B46">2023</xref>), Kvasir-SEG (de Moura Lima et al., <xref ref-type="bibr" rid="B8">2023</xref>; Wan et al., <xref ref-type="bibr" rid="B43">2024b</xref>), and ETIS-Larib Polyp DB (Oh et al., <xref ref-type="bibr" rid="B28">2023</xref>; Wan et al., <xref ref-type="bibr" rid="B43">2024b</xref>). The Kvasir-Capsule dataset (Smedsrud et al., <xref ref-type="bibr" rid="B38">2021</xref>) consists of 47,238 partially labeled images that were manually reviewed and allocated to one of 14 categories of gastrointestinal lesions. Because of class imbalance issues, the authors performed under-sampling to prepare balanced samples for training the models (Varam et al., <xref ref-type="bibr" rid="B41">2023</xref>). This approach was critical for preventing bias towards less complex but frequent classes such as Normal Mucosa and for enhancing model performance, especially on clinically relevant classes that are less frequent (Varam et al., <xref ref-type="bibr" rid="B41">2023</xref>; Oh et al., <xref ref-type="bibr" rid="B28">2023</xref>).</p>
<p>The study (de Moura Lima et al., <xref ref-type="bibr" rid="B8">2023</xref>) uses four public datasets for training and validation: CVC-ClinicDB with 612 images, CVC-ColonDB with 300 images, ETIS-LaribPolypDB with 196 high-resolution images, and Kvasir-SEG with 1,000 images. The study (Chae and Cho, <xref ref-type="bibr" rid="B5">2023</xref>) uses two datasets, Gastroscopic Dataset A and Gastroscopic Dataset B, which contain pathological data on gastric abnormalities and early gastric cancer and include data from the AI Hub of the National Information Society Agency of South Korea. The study (Liang et al., <xref ref-type="bibr" rid="B24">2024</xref>) used a dataset from Zhujiang Hospital with 105 GIST pathological slides that were reviewed by two pathologists, and adopted data augmentation (random cropping and Mosaic augmentation).</p></sec>
<sec>
<title>2.6 Summary of related work</title>
<p>The literature reviewed above shows the development of object detection and classification models in WCE video analysis from traditional approaches to deep learning. Earlier studies used models such as 3D-CNNs, which improved the ability to detect spatial and temporal characteristics in WCE frames but suffered from high computational costs and limited real-time performance (Bordbar et al., <xref ref-type="bibr" rid="B4">2023</xref>). With the progressive development of deep learning, models including ResNet (Varam et al., <xref ref-type="bibr" rid="B41">2023</xref>; Oh et al., <xref ref-type="bibr" rid="B28">2023</xref>), EfficientNet (Varam et al., <xref ref-type="bibr" rid="B41">2023</xref>; Oh et al., <xref ref-type="bibr" rid="B28">2023</xref>), and Vision Transformers (ViTs) (Varam et al., <xref ref-type="bibr" rid="B41">2023</xref>; Oh et al., <xref ref-type="bibr" rid="B28">2023</xref>) have markedly enhanced classification ability through their potential to capture global context and long-range dependencies. However, these models can be computationally intensive, making their application in real-time clinical settings challenging.</p>
<p>Recent studies have increasingly focused on integrating transformers with CNNs to develop more effective models for handling the ambiguity of WCE data (Sharmila and Geetha, <xref ref-type="bibr" rid="B36">2024</xref>; Liang et al., <xref ref-type="bibr" rid="B24">2024</xref>). Transformer-based architectures have been shown to mitigate class imbalance issues and enhance the detection of small and intricate lesions (Sharmila and Geetha, <xref ref-type="bibr" rid="B36">2024</xref>; Liang et al., <xref ref-type="bibr" rid="B24">2024</xref>; Lv et al., <xref ref-type="bibr" rid="B25">2024</xref>). Additionally, improvements to models such as RT-DETR have been introduced to enhance real-time object detection, achieving better accuracy while maintaining high operational speed&#x02014;an essential requirement for clinical applications (Zhang et al., <xref ref-type="bibr" rid="B49">2024a</xref>; Lv et al., <xref ref-type="bibr" rid="B25">2024</xref>; Alavala et al., <xref ref-type="bibr" rid="B1">2024</xref>).</p>
<p>Besides model innovations, access to and use of large and varied data repositories such as Kvasir-Capsule and Kvasir-SEG have been critical to the development of these solutions. These datasets, together with data augmentation methods, have been used to overcome class imbalance and to enhance the ability of the models to generalize (Varam et al., <xref ref-type="bibr" rid="B41">2023</xref>; Smedsrud et al., <xref ref-type="bibr" rid="B38">2021</xref>; Oh et al., <xref ref-type="bibr" rid="B28">2023</xref>; Wu et al., <xref ref-type="bibr" rid="B46">2023</xref>).</p>
<p>In summary, the related work is consistent with the fact that video analysis in WCE has been progressively enhanced by deep learning and transformer-based models, as well as the ongoing research to improve the accuracy, efficiency, and capacity of handling various medical imaging tasks.</p></sec></sec>
<sec sec-type="methods" id="s3">
<title>3 Methodology</title>
<p>The method described in this section is designed to enhance the precision, computational efficiency, and real-time suitability of RT-DETR for WCE video analysis. The primary issues addressed by this method are the class-imbalance problem, architecture improvements, lesion detection accuracy enhancement, and the clinical significance of the data. In accordance with best practices, we built the model on the code released by the original developers, benefiting from the features and optimizations they introduced (Lv et al., <xref ref-type="bibr" rid="B25">2024</xref>).</p>
<sec>
<title>3.1 Proposed method: RT-DETR with ResNet for WCE pathology detection</title>
<p>In this research, we put forward a customized object detection system that uses the RT-DETR framework, equipped with a modified ResNet backbone for feature extraction. To work with our WCE dataset, we adapt the ResNet architecture by starting with pre-trained weights and fine-tuning the later layers, all while keeping the early layers frozen to preserve valuable features from the pre-training. The Hybrid Encoder exploits multi-scale feature extraction from the varying stages of its backbone to grab both fine and large features essential for finding small pathologies. A customized backbone, integrated with a transformer-based decoder, is designed to enhance both precision and computational performance in WCE video pathology detection.</p>
<sec>
<title>3.1.1 Data acquisition and preprocessing</title>
<p>The dataset used in this study consists of 16,938 WCE images, covering various gastrointestinal pathologies (<xref ref-type="table" rid="T1">Table 1</xref>). To ensure compatibility with the RT-DETR model, all images were converted to COCO format for integration with the MMDetection framework. Several preprocessing steps were applied to standardize image dimensions, enhance visual quality, and optimize model performance.</p>


<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Performance metrics for RT-DETR small, medium, and large size models as per classes.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Classes</bold></th>
<th valign="top" align="center" colspan="3"><bold>RT-DETR-S</bold></th>
<th valign="top" align="center" colspan="3"><bold>RT-DETR-M</bold></th>
<th valign="top" align="center" colspan="3"><bold>RT-DETR-X</bold></th>
</tr>
<tr>
<th/>
<th valign="top" align="center"><bold>Precision</bold></th>
<th valign="top" align="center"><bold>Recall</bold></th>
<th valign="top" align="center"><bold>F1-Score</bold></th>
<th valign="top" align="center"><bold>Precision</bold></th>
<th valign="top" align="center"><bold>Recall</bold></th>
<th valign="top" align="center"><bold>F1-Score</bold></th>
<th valign="top" align="center"><bold>Precision</bold></th>
<th valign="top" align="center"><bold>Recall</bold></th>
<th valign="top" align="center"><bold>F1-Score</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Ampulla of vater</td>
<td valign="top" align="center">0.83</td>
<td valign="top" align="center">0.91</td>
<td valign="top" align="center">0.87</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
</tr> <tr>
<td valign="top" align="left">Angiectasia</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">0.99</td>
</tr> <tr>
<td valign="top" align="left">Blood fresh</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.92</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.91</td>
<td valign="top" align="center">0.95</td>
</tr> <tr>
<td valign="top" align="left">Blood hematin</td>
<td valign="top" align="center">0.83</td>
<td valign="top" align="center">0.83</td>
<td valign="top" align="center">0.83</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
</tr> <tr>
<td valign="top" align="left">Erosion</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.92</td>
<td valign="top" align="center">0.94</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.92</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.90</td>
</tr> <tr>
<td valign="top" align="left">Erythema</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">0.97</td>
</tr> <tr>
<td valign="top" align="left">Foreign body</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">0.99</td>
</tr> <tr>
<td valign="top" align="left">Lymphangiectasia</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">0.99</td>
</tr> <tr>
<td valign="top" align="left">Polyp</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.93</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
</tr> <tr>
<td valign="top" align="left">Ulcer</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.96</td>
<td valign="top" align="center">0.96</td>
<td valign="top" align="center">0.94</td>
<td valign="top" align="center">0.96</td>
</tr> <tr>
<td valign="top" align="left">Background</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
</tr></tbody>
</table>
</table-wrap>


<p>Each image was resized to 512 &#x000D7; 512 pixels to maintain uniform input dimensions. EXIF orientation metadata was stripped to ensure consistent pixel alignment. To enhance contrast and improve lesion visibility, contrast adjustment was performed using CLAHE (Contrast Limited Adaptive Histogram Equalization), which enhances local contrast while preventing over-amplification of noise. Normalization was applied using mean and standard deviation scaling to standardize pixel intensity values across the dataset.</p>
<p>To improve generalization and synthetically extend the dataset, data augmentation was applied, generating five additional copies per image. The transformations included:</p>
<list list-type="bullet">
<list-item><p>Flipping (horizontal and vertical) with a 50% probability.</p></list-item>
<list-item><p>Random rotations of 90&#x000B0; (clockwise, counterclockwise, upside-down, or none) and minor random rotations between &#x02013;12&#x000B0; and &#x0002B;12&#x000B0;.</p></list-item>
<list-item><p>Random horizontal shearing between &#x02013;5&#x000B0; and &#x0002B;5&#x000B0;.</p></list-item>
<list-item><p>Random brightness adjustments between &#x02013;25% and &#x0002B;25%.</p></list-item>
<list-item><p>Random exposure corrections between &#x02013;11% and &#x0002B;11%.</p></list-item>
<list-item><p>Gaussian blurring using variable kernel sizes, where the standard deviation for the Gaussian filter was randomly selected between 0 and 3.1 pixels. Since Gaussian filters operate on discrete kernel sizes, the fractional standard deviations were rounded to the nearest applicable kernel size.</p></list-item>
<list-item><p>Salt and pepper noise applied to 1.5% of image pixels to simulate real-world noise artifacts.</p></list-item>
</list>
<p>These augmentations enhance model robustness by ensuring exposure to various transformations that may occur in real-world WCE images.</p></sec>
<sec>
<title>3.1.2 Data loading strategies</title>
<p>Important steps in the creation and training of models include the loading of data and the techniques used to optimize it. The loading and preprocessing of data are done in parallel, which speeds up the overall training process by minimizing the time required to load and prepare each batch and by making the best use of the computational resources. After organizing the dataset, additional augmentations were applied dynamically during training to improve model generalization. The augmentations incorporated random photometric distortion with a probability of 50%, random zooming out to provide padding, and random IoU-based cropping with an 80% chance, to teach the model to detect pathologies under different lighting conditions, scales, and contexts. Bounding boxes were processed to guarantee their validity following transformations. Horizontal flipping was randomly applied, and images were resized to 640 &#x000D7; 640 pixels to keep input dimensions consistent. The augmentations were applied through epoch 117, after which they were stopped to stabilize the training process. A batch size of 12 per GPU was used for training, along with a validation batch size of 32, which ensured effective data processing in both training and evaluation. In addition, the developed model uses weight decay together with gradient clipping, and the AdamW optimizer together with a dynamic learning rate, to improve training efficiency. All of these techniques provide high accuracy and generality while facilitating quick model convergence.</p></sec>
<sec>
<title>3.1.3 Model architecture</title>
<p>This work extends the RT-DETR framework presented by Lv et al. (<xref ref-type="bibr" rid="B25">2024</xref>), shown in <xref ref-type="fig" rid="F1">Figure 1</xref>, which is aimed at real-time object detection. The fundamental part of our model is a ResNet backbone of varying size, initialized with ImageNet pre-trained weights. To maintain learned features and adapt the model to the WCE dataset, we keep the lower ResNet layers fixed while fine-tuning the upper layers for pathology detection. Within the RT-DETR model, a backbone (ResNet-18, ResNet-34, or ResNet-101) extracts hierarchical features from the input images. These features are then fed into the Hybrid Encoder, which passes the data through several modules: the AIFI module (Attention-based Intra-scale Feature Interaction) and the CCFM module (CNN-based Cross-scale Feature-fusion Module), along with the IoU Aware Query Selection mechanism. After the IoU Aware Query Selection module refines the object queries, the Transformer Decoder carries out the final detection and returns the predicted bounding boxes and object labels (see <xref ref-type="fig" rid="F1">Figure 1</xref>).</p>


<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>RT-DETR model for wireless capsule endoscopy image detection. The figure is adapted and redrawn by the author based on the architecture presented in Lv et al. (<xref ref-type="bibr" rid="B25">2024</xref>).</p></caption>
<alt-text>A system architecture diagram showing a pipeline for medical image analysis using a capsule endoscopy device and an RT-DETR model. The process begins with a capsule device capturing gastrointestinal images and wirelessly transmitting them. These images are fed into an RT-DETR model composed of components: Backbone, Hybrid Encoder, AIFI Module, CCFM Module, IoU Aware Query Selection, and Transformer Decoder. The output is a medical image with a label indicating the detected region, Ampulla of Vater: 0.94, highlighted in a red bounding box.</alt-text>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1529814-g0001.tif"/>
</fig>


<p>Our architecture integrates efficient feature extraction with transformers&#x00027; multi-scale abilities to provide accurate and real-time identification of pathologies in WCE data, utilizing pre-trained weights for initialization and fine-tuning the higher layers while leaving the early layers frozen to retain helpful pre-trained features. The Hybrid Encoder takes advantage of multi-scale feature extraction at different backbone stages, which helps it detect both finer and larger features important for the discovery of small pathologies. By integrating this customized backbone with an effective transformer-based decoder, our approach tackles the problem of detecting pathologies in WCE videos while providing both precision and timely performance. The RT-DETR model, which is known for its performance in real-time object detection without using NMS, has been adapted for the WCE video analysis problem.</p>
<p>The RT-DETR model was preferred in part because of its ResNet backbone, which uses customized connections. The customized ResNet (Lv et al., <xref ref-type="bibr" rid="B25">2024</xref>) connections help increase gradient flow and decrease computation, while the residual connections mitigate the vanishing gradient problem, allowing deeper networks to be trained. This backbone is well suited to capturing details in WCE images, where many of the features are small and irregularly shaped. The backbones employed are customized ResNet structures of varying sizes (ResNet-18, ResNet-34, and ResNet-101), chosen to balance detection accuracy against computational complexity.</p>
<p>RT-DETR incorporates a hybrid encoder that processes WCE images through convolutional layers together with attention mechanisms for local and global feature extraction. The model extracts basic spatial features while concentrating on particular areas of interest. The deformable attention module (Yu et al., <xref ref-type="bibr" rid="B48">2025</xref>; Lv et al., <xref ref-type="bibr" rid="B25">2024</xref>) allows the model to extract features at different resolutions which improves its capability to detect pathology features in lesions with diverse sizes and textures.</p>
<p>Adaptive attention offsets were used for feature selection optimization by altering the receptive field during training. Positional encoding techniques were integrated for the spatial consistency of WCE video sequences. All modifications were implemented following the official RT-DETR repository, ensuring compatibility with transformer-based object detection methods (Zhao et al., <xref ref-type="bibr" rid="B52">2023</xref>; Lv et al., <xref ref-type="bibr" rid="B25">2024</xref>).</p></sec></sec>
<sec>
<title>3.2 Training procedure</title>
<p>The training is carried out step by step to refine the RT-DETR model for WCE pathology detection. Training and validation are carried out on the Kvasir-Capsule dataset, which has been converted to COCO format. The model is trained for 120 epochs on two NVIDIA Quadro RTX 8000 GPUs with an overall batch size of 24 (12 per GPU).</p>
<p>A custom loss function is employed, which is a mix of classification, bounding box regression, and localization loss. In order to balance class distributions and enhance detection reliability, the loss components are dynamically weighted. The AdamW optimizer is used with an initial learning rate of 0.0001 under a cosine annealing schedule where the learning rate is progressively decreased. Weight decay of 0.05 and gradient clipping are employed to avoid overfitting and stabilize training.</p>
<p>Scale-adaptive hyperparameters dynamically adjust learning rates based on detector size to ensure consistency in feature extraction across model sizes. Data augmentation techniques outlined in Section 3.1.1 are utilized to increase model robustness and generalization.</p></sec>
<sec>
<title>3.3 Evaluation</title>
<p>Model performance evaluation for WCE pathology detection relies on standard object detection metrics that include accuracy, precision, recall, F1-score, and mean Average Precision together with Intersection over Union. The selected metrics evaluate detection reliability for clinical purposes across various pathology groups. Real-time feasibility is determined by measuring the inference speed (FPS) and performing ROC-AUC analysis as well as confusion matrix evaluations. Robustness testing occurs under different conditions such as illumination levels and image artifacts and frame rates to validate the model&#x00027;s performance. The evaluation procedures follow previously used methods to ensure consistency in result reporting (Habe et al., <xref ref-type="bibr" rid="B13">2024</xref>; Jin and Zhang, <xref ref-type="bibr" rid="B16">2024</xref>; Wu et al., <xref ref-type="bibr" rid="B46">2023</xref>; Li et al., <xref ref-type="bibr" rid="B22">2024b</xref>).</p></sec></sec>
<sec sec-type="results" id="s4">
<title>4 Results</title>
<p>This study evaluates the performance of three RT-DETR variants (RT-DETR-S, RT-DETR-M, and RT-DETR-X) on the Kvasir-Capsule dataset for WCE pathology detection. The evaluation considers key metrics such as Average Precision (AP), Recall, F1-score, ROC AUC, and inference speed (FPS). The models are compared against existing object detection methods to assess their accuracy and efficiency.</p>
<sec>
<title>4.1 Model performance and accuracy</title>
<p>The results in <xref ref-type="table" rid="T2">Table 2</xref> highlight the high detection accuracy of RT-DETR models. RT-DETR-X achieves the highest AP (78.3%) at IoU 0.50:0.95, confirming its superior capability in pathology detection. However, RT-DETR-M follows closely with an AP of 78.1%, a marginal difference of 0.2 percentage points, while offering a balanced approach between accuracy and computational efficiency. RT-DETR-S achieves an AP of 77.8%, demonstrating competitive accuracy while significantly outperforming the other variants in inference speed.</p>


<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Performance comparison of RT-DETR models.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Metric</bold></th>
<th valign="top" align="center"><bold>RT-DETR-S</bold></th>
<th valign="top" align="center"><bold>RT-DETR-M</bold></th>
<th valign="top" align="center"><bold>RT-DETR-X</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><bold>Total time (s)</bold></td>
<td valign="top" align="center">14.00</td>
<td valign="top" align="center">17.00</td>
<td valign="top" align="center">24.00</td>
</tr> <tr>
<td valign="top" align="left"><bold>Average FPS</bold></td>
<td valign="top" align="center">270.52</td>
<td valign="top" align="center">187.08</td>
<td valign="top" align="center">59.30</td>
</tr> <tr>
<td valign="top" align="left"><bold>Evaluation time (s)</bold></td>
<td valign="top" align="center">14.00</td>
<td valign="top" align="center">17.00</td>
<td valign="top" align="center">24.00</td>
</tr> <tr>
<td valign="top" align="left"><bold>Average precision (AP) &#x00040;[IoU=0.50:0.95]</bold></td>
<td valign="top" align="center">0.778</td>
<td valign="top" align="center">0.781</td>
<td valign="top" align="center">0.783</td>
</tr> <tr>
<td valign="top" align="left"><bold>AP &#x00040;[IoU=0.50]</bold></td>
<td valign="top" align="center">0.982</td>
<td valign="top" align="center">0.980</td>
<td valign="top" align="center">0.974</td>
</tr> <tr>
<td valign="top" align="left"><bold>AP [IoU=0.75]</bold></td>
<td valign="top" align="center">0.841</td>
<td valign="top" align="center">0.853</td>
<td valign="top" align="center">0.855</td>
</tr> <tr>
<td valign="top" align="left"><bold>AP [IoU=0.50:0.95 | area=small]</bold></td>
<td valign="top" align="center">0.515</td>
<td valign="top" align="center">0.471</td>
<td valign="top" align="center">0.463</td>
</tr> <tr>
<td valign="top" align="left"><bold>AP [IoU=0.50:0.95 | area=medium]</bold></td>
<td valign="top" align="center">0.714</td>
<td valign="top" align="center">0.720</td>
<td valign="top" align="center">0.718</td>
</tr> <tr>
<td valign="top" align="left"><bold>AP [IoU=0.50:0.95 | area=large]</bold></td>
<td valign="top" align="center">0.821</td>
<td valign="top" align="center">0.828</td>
<td valign="top" align="center">0.836</td>
</tr> <tr>
<td valign="top" align="left"><bold>Average Recall (AR) [IoU=0.50:0.95 | maxDets=1]</bold></td>
<td valign="top" align="center">0.803</td>
<td valign="top" align="center">0.806</td>
<td valign="top" align="center">0.807</td>
</tr> <tr>
<td valign="top" align="left"><bold>AR [IoU=0.50:0.95 | maxDets=10]</bold></td>
<td valign="top" align="center">0.820</td>
<td valign="top" align="center">0.822</td>
<td valign="top" align="center">0.824</td>
</tr> <tr>
<td valign="top" align="left"><bold>AR &#x00040;[IoU=0.50:0.95 | maxDets=100]</bold></td>
<td valign="top" align="center">0.847</td>
<td valign="top" align="center">0.854</td>
<td valign="top" align="center">0.839</td>
</tr> <tr>
<td valign="top" align="left"><bold>AR &#x00040;[IoU=0.50:0.95 | area=small]</bold></td>
<td valign="top" align="center">0.646</td>
<td valign="top" align="center">0.603</td>
<td valign="top" align="center">0.556</td>
</tr> <tr>
<td valign="top" align="left"><bold>AR &#x00040;[IoU=0.50:0.95 | area=medium]</bold></td>
<td valign="top" align="center">0.800</td>
<td valign="top" align="center">0.815</td>
<td valign="top" align="center">0.792</td>
</tr> <tr>
<td valign="top" align="left"><bold>AR &#x00040;[IoU=0.50:0.95 | area=large]</bold></td>
<td valign="top" align="center">0.884</td>
<td valign="top" align="center">0.883</td>
<td valign="top" align="center">0.873</td>
</tr></tbody>
</table>
</table-wrap>


<p>In AP at IoU 0.50, RT-DETR-M scores slightly higher (98.0%) than RT-DETR-X (97.4%), suggesting that it maintains strong detection confidence at a relaxed threshold. However, RT-DETR-X achieves the highest AP at IoU 0.75 (85.5%), making it the most reliable in precise localization of pathology regions. These minor differences indicate that the training process for RT-DETR-X could be further optimized to fully utilize its parameter-rich architecture and maximize performance.</p></sec>
<sec>
<title>4.2 Inference speed and computational efficiency</title>
<p>Inference speed is critical for real-time medical applications. RT-DETR-S achieves the highest FPS (270.52), making it the best choice for real-time WCE analysis. RT-DETR-M follows with 187.08 FPS, offering a strong balance between speed and accuracy. RT-DETR-X, while achieving the highest detection precision, operates at 59.3 FPS due to its larger architecture. Given the Kvasir-Capsule dataset&#x00027;s sizable 47,238 images, RT-DETR-M&#x00027;s competitive performance relative to RT-DETR-X suggests that further tuning of hyperparameters and training epochs for RT-DETR-X might unlock additional gains in accuracy.</p>
<p>These results confirm that RT-DETR models maintain computational efficiency while ensuring high accuracy. RT-DETR-M provides a strong trade-off between inference speed and detection performance, making it ideal for clinical settings where both precision and efficiency are required.</p></sec>
<sec>
<title>4.3 Comparative analysis with other models</title>
<p>A comparison of RT-DETR models with existing object detection frameworks is presented in <xref ref-type="table" rid="T3">Table 3</xref>. For RT-DETR-M (ours) and RT-DETR-S (ours), the RT-DETR results reported by Yu et al. (<xref ref-type="bibr" rid="B48">2025</xref>) on the Kvasir-SEG dataset serve as the direct baseline. RT-DETR-M (ours) achieves an AP50:95 of 78.1%, a 7.08 percentage-point improvement over RT-DETR-ResNet34 (71.02%) from Yu et al. (<xref ref-type="bibr" rid="B48">2025</xref>). Similarly, RT-DETR-S records an AP50:95 of 77.8%, surpassing RT-DETR-ResNet18 (70.38%) (Yu et al., <xref ref-type="bibr" rid="B48">2025</xref>) by 7.42 percentage points. These improvements confirm that our RT-DETR models outperform previous RT-DETR implementations in accuracy while maintaining a better balance between computational efficiency and detection performance.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Comparative analysis of RT-DETR variants and current object detection models for WCE pathology detection.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="center"><bold>Dataset</bold></th>
<th valign="top" align="center"><bold>Input Size</bold></th>
<th valign="top" align="center"><bold>AP50:95<sub>val</sub></bold></th>
<th valign="top" align="center"><bold>AP50<sub>val</sub></bold></th>
<th valign="top" align="center"><bold>&#x00023;Params (M)</bold></th>
<th valign="top" align="center"><bold>&#x00023;Epochs</bold></th>
<th valign="top" align="center"><bold>FPS</bold></th>
<th valign="top" align="center"><bold>Ref.</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">YOLOv8-L</td>
<td valign="top" align="center">WCE-BleedGen</td>
<td valign="top" align="center">640</td>
<td valign="top" align="center">68.9</td>
<td valign="top" align="center">80.2</td>
<td valign="top" align="center">43</td>
<td valign="top" align="center">150</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">Alavala et al. (<xref ref-type="bibr" rid="B1">2024</xref>) </td>
</tr> <tr>
<td valign="top" align="left">CRH-YOLO</td>
<td valign="top" align="center">LDPolypVideo</td>
<td valign="top" align="center">320</td>
<td valign="top" align="center">67.8</td>
<td valign="top" align="center">95.7</td>
<td valign="top" align="center">0.91</td>
<td valign="top" align="center">300</td>
<td valign="top" align="center">96.5</td>
<td valign="top" align="center">Wan et al. (<xref ref-type="bibr" rid="B42">2024a</xref>) </td>
</tr> <tr>
<td valign="top" align="left">PD-YOLO</td>
<td valign="top" align="center">CVC-ColonDB</td>
<td valign="top" align="center">640</td>
<td valign="top" align="center">70.6</td>
<td valign="top" align="center">94.7</td>
<td valign="top" align="center">11.9</td>
<td valign="top" align="center">300</td>
<td valign="top" align="center">45.2</td>
<td valign="top" align="center">Yu et al. (<xref ref-type="bibr" rid="B48">2025</xref>) </td>
</tr> <tr>
<td valign="top" align="left">PD-YOLO</td>
<td valign="top" align="center">Kvasir-SEG</td>
<td valign="top" align="center">640</td>
<td valign="top" align="center">63.9</td>
<td valign="top" align="center">88.2</td>
<td valign="top" align="center">11.9</td>
<td valign="top" align="center">300</td>
<td valign="top" align="center">45</td>
<td valign="top" align="center">Yu et al. (<xref ref-type="bibr" rid="B48">2025</xref>) </td>
</tr> <tr>
<td valign="top" align="left">RT-DETR-ResNet50</td>
<td valign="top" align="center">CVC-ColonDB</td>
<td valign="top" align="center">640</td>
<td valign="top" align="center">65.04</td>
<td valign="top" align="center">84.11</td>
<td valign="top" align="center">42.8</td>
<td valign="top" align="center">150</td>
<td valign="top" align="center">11.8</td>
<td valign="top" align="center">Yu et al. (<xref ref-type="bibr" rid="B48">2025</xref>) </td>
</tr> <tr>
<td valign="top" align="left">RT-DETR-ResNet18</td>
<td valign="top" align="center">CVC-ColonDB</td>
<td valign="top" align="center">640</td>
<td valign="top" align="center">64.17</td>
<td valign="top" align="center">85.23</td>
<td valign="top" align="center">20.1</td>
<td valign="top" align="center">150</td>
<td valign="top" align="center">21.9</td>
<td valign="top" align="center">Yu et al. (<xref ref-type="bibr" rid="B48">2025</xref>) </td>
</tr> <tr>
<td valign="top" align="left">RT-DETR-ResNet34</td>
<td valign="top" align="center">CVC-ColonDB</td>
<td valign="top" align="center">640</td>
<td valign="top" align="center">66.39</td>
<td valign="top" align="center">85.51</td>
<td valign="top" align="center">30.1</td>
<td valign="top" align="center">150</td>
<td valign="top" align="center">16.9</td>
<td valign="top" align="center">Yu et al. (<xref ref-type="bibr" rid="B48">2025</xref>) </td>
</tr> <tr>
<td valign="top" align="left">RT-DETR-ResNet50</td>
<td valign="top" align="center">Kvasir-SEG</td>
<td valign="top" align="center">640</td>
<td valign="top" align="center">68.66</td>
<td valign="top" align="center">88.30</td>
<td valign="top" align="center">42.8</td>
<td valign="top" align="center">150</td>
<td valign="top" align="center">11.3</td>
<td valign="top" align="center">Yu et al. (<xref ref-type="bibr" rid="B48">2025</xref>) </td>
</tr> <tr>
<td valign="top" align="left">RT-DETR-ResNet34</td>
<td valign="top" align="center">Kvasir-SEG</td>
<td valign="top" align="center">640</td>
<td valign="top" align="center">71.02</td>
<td valign="top" align="center">88.59</td>
<td valign="top" align="center">30.1</td>
<td valign="top" align="center">150</td>
<td valign="top" align="center">17.3</td>
<td valign="top" align="center">Yu et al. (<xref ref-type="bibr" rid="B48">2025</xref>) </td>
</tr> <tr>
<td valign="top" align="left">RT-DETR-ResNet18</td>
<td valign="top" align="center">Kvasir-SEG</td>
<td valign="top" align="center">640</td>
<td valign="top" align="center">70.38</td>
<td valign="top" align="center">89.42</td>
<td valign="top" align="center">20.1</td>
<td valign="top" align="center">150</td>
<td valign="top" align="center">21.6</td>
<td valign="top" align="center">Yu et al. (<xref ref-type="bibr" rid="B48">2025</xref>) </td>
</tr> <tr>
<td valign="top" align="left">RT-DETR-R101</td>
<td valign="top" align="center">WCE-BleedGen</td>
<td valign="top" align="center">640</td>
<td valign="top" align="center">81.0</td>
<td valign="top" align="center">66.7</td>
<td valign="top" align="center">75</td>
<td valign="top" align="center">150</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">Alavala et al. (<xref ref-type="bibr" rid="B1">2024</xref>) </td>
</tr> <tr>
<td valign="top" align="left">RT-DETR-R50</td>
<td valign="top" align="center">LDPolypVideo</td>
<td valign="top" align="center">640</td>
<td valign="top" align="center">65.2</td>
<td valign="top" align="center">90.2</td>
<td valign="top" align="center">42.8</td>
<td valign="top" align="center">300</td>
<td valign="top" align="center">17.2</td>
<td valign="top" align="center">Wan et al. (<xref ref-type="bibr" rid="B42">2024a</xref>) </td>
</tr> <tr>
<td valign="top" align="left">DETR-DC5-R101</td>
<td valign="top" align="center">WCE-BleedGen</td>
<td valign="top" align="center">640</td>
<td valign="top" align="center">72.3</td>
<td valign="top" align="center">61.2</td>
<td valign="top" align="center">58</td>
<td valign="top" align="center">500</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">Alavala et al. (<xref ref-type="bibr" rid="B1">2024</xref>) </td>
</tr> <tr>
<td valign="top" align="left">DETR-R50</td>
<td valign="top" align="center">WCE-BleedGen</td>
<td valign="top" align="center">224</td>
<td valign="top" align="center">73.28</td>
<td valign="top" align="center">74.47</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">500</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">Alawode et al. (<xref ref-type="bibr" rid="B2">2024</xref>) </td>
</tr> <tr>
<td valign="top" align="left"><bold>RT-DETR-S-R18</bold></td>
<td valign="top" align="center">Kvasir-Capsule</td>
<td valign="top" align="center">640</td>
<td valign="top" align="center"><bold>77.8</bold></td>
<td valign="top" align="center"><bold>98.2</bold></td>
<td valign="top" align="center"><bold>20</bold></td>
<td valign="top" align="center"><bold>120</bold></td>
<td valign="top" align="center"><bold>270.52</bold></td>
<td valign="top" align="center">ours </td>
</tr> <tr>
<td valign="top" align="left"><bold>RT-DETR-M-R34</bold></td>
<td valign="top" align="center">Kvasir-Capsule</td>
<td valign="top" align="center">640</td>
<td valign="top" align="center"><bold>78.1</bold></td>
<td valign="top" align="center"><bold>98.0</bold></td>
<td valign="top" align="center"><bold>31</bold></td>
<td valign="top" align="center"><bold>120</bold></td>
<td valign="top" align="center"><bold>187.08</bold></td>
<td valign="top" align="center">ours </td>
</tr> <tr>
<td valign="top" align="left"><bold>RT-DETR-X-R101</bold></td>
<td valign="top" align="center">Kvasir-Capsule</td>
<td valign="top" align="center">640</td>
<td valign="top" align="center"><bold>78.3</bold></td>
<td valign="top" align="center"><bold>97.4</bold></td>
<td valign="top" align="center"><bold>76</bold></td>
<td valign="top" align="center"><bold>120</bold></td>
<td valign="top" align="center"><bold>59.3</bold></td>
<td valign="top" align="center">ours </td>
</tr></tbody>
</table>
<table-wrap-foot>
<fn id="TN1"><p>RT-DETR-S-R18, RT-DETR-M-R34, and RT-DETR-X-R101 denote our models evaluated on the Kvasir-Capsule dataset with different ResNet backbone sizes.</p></fn>
<fn id="TN2"><p><bold>HarDNet-CPS</bold> achieved the highest <bold>AP50 (91.10%)</bold> on Kvasir-SEG, demonstrating strong segmentation performance.</p></fn>
<fn id="TN3"><p><bold>RT-DETR models from the PD-YOLO study</bold> (ResNet18, ResNet34, and ResNet50) were tested on the <bold>Kvasir-SEG</bold> and <bold>CVC-ColonDB</bold> datasets, showing strong detection accuracy.</p></fn>
<fn id="TN4"><p><bold>RT-DETR-R50 was tested on LDPolypVideo</bold> and showed competitive results with an <bold>AP50 of 90.2% and FPS of 17.2</bold>.</p></fn>
<fn id="TN5"><p><bold>CRH-YOLO achieved the best FPS (96.5) and highest AP50 (95.7%) on LDPolypVideo</bold>, showing its efficiency in real-time detection.</p></fn>
<fn id="TN6"><p><bold>Our models achieve the highest AP50 scores, demonstrating their effectiveness for polyp detection in endoscopic images.</bold></p></fn>
<fn id="TN7"><p>RT-DETR achieves an <bold>AP50 of 88.9% on ImageNet-VID</bold> (Chae and Cho, <xref ref-type="bibr" rid="B5">2023</xref>; Hao et al., <xref ref-type="bibr" rid="B14">2024</xref>).</p></fn>
<fn id="TN8"><p><bold>Real-time deep learning processing</bold> enables <bold>WCE video analysis in endoscopic procedures</bold>, with an <bold>average inference speed of 14.1 ms</bold> (Sahafi et al., <xref ref-type="bibr" rid="B33">2022</xref>).</p></fn>
</table-wrap-foot>
</table-wrap>


<p>The RT-DETR-R101 in Alavala et al. (<xref ref-type="bibr" rid="B1">2024</xref>) attains an AP50:95 of 81.0%, which is 2.7 percentage points higher than RT-DETR-X (ours). This can be explained by the fact that RT-DETR-R101 (Alavala et al., <xref ref-type="bibr" rid="B1">2024</xref>) was trained for 150 epochs while RT-DETR-X (ours) was trained for 120 epochs, which gave it additional time for feature enhancement. However, RT-DETR-X (ours) reaches an AP50 of 97.4% compared to 66.7% for RT-DETR-R101 (Alavala et al., <xref ref-type="bibr" rid="B1">2024</xref>), which shows better detection quality at the 0.50 IoU threshold. Because RT-DETR-R101 (Alavala et al., <xref ref-type="bibr" rid="B1">2024</xref>) has a higher AP50:95 but a lower AP50 score, it may also be overfitting, resulting in poor stability when analyzing real-world WCE video.</p>
<p>Comparing RT-DETR models with other object detection frameworks, PD-YOLO (Yu et al., <xref ref-type="bibr" rid="B48">2025</xref>) and CRH-YOLO (Wan et al., <xref ref-type="bibr" rid="B42">2024a</xref>) achieve AP50 scores of 94.7% and 95.7%, respectively. However, their AP50:95 scores drop to 70.6% and 67.8%, indicating weaker localization precision under stricter IoU thresholds. This suggests that while these models excel in high-confidence detections, they struggle with more challenging pathology cases that require refined localization accuracy.</p>
<p>The overall results confirm that RT-DETR-X, RT-DETR-M, and RT-DETR-S demonstrate superior accuracy, robustness, and adaptability for clinical applications. While RT-DETR-R101 (Alavala et al., <xref ref-type="bibr" rid="B1">2024</xref>) reports a slightly higher AP50:95, its lower AP50 score and longer training setup indicate trade-offs in overfitting and computational efficiency. Our models maintain a strong balance between precision, robustness, and feasibility, making them highly suitable for WCE video analysis.</p></sec>
<sec>
<title>4.4 Classification performance and diagnostic precision</title>
<p>The classification results in <xref ref-type="table" rid="T1">Table 1</xref> demonstrate that RT-DETR models effectively detect gastrointestinal abnormalities with high accuracy. RT-DETR-M achieves the highest F1-score across most pathology classes, ensuring a balanced trade-off between precision and recall. The model reaches an F1-score of 1.00 for Ampulla of Vater, Angiectasia, Blood Fresh, Blood Hematin, Polyp, and Lymphangiectasia, indicating exceptional reliability in detecting these abnormalities. RT-DETR-X closely follows with comparable performance but records slightly lower recall for Erosion at 0.88 compared to 0.92 for RT-DETR-M, and for Ulcer at 0.94 compared to 0.97. RT-DETR-S maintains competitive classification accuracy, though its recall for Blood Fresh and Erosion remains at 0.92, slightly lower than the other two models. These variations suggest that RT-DETR-M achieves the best balance, while RT-DETR-X offers higher precision for select abnormalities.</p>
<p>The confusion matrix analysis in <xref ref-type="fig" rid="F2">Figures 2</xref>&#x02013;<xref ref-type="fig" rid="F4">4</xref> further confirms the effectiveness of the models in distinguishing between pathological and non-pathological frames. RT-DETR-M achieves true positive rates exceeding 99% in key pathologies such as Ampulla of Vater, Angiectasia, and Polyp, reinforcing its classification stability. RT-DETR-X performs similarly but shows a slight drop in recall for a few classes. RT-DETR-S, while optimized for real-time performance, still maintains high classification accuracy, though it exhibits a minor reduction in sensitivity for detecting certain abnormalities. The Background class remains consistently undetected across all models, ensuring that the models do not mistakenly classify non-pathological regions as abnormalities.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>RT-DETR-S normalized confusion matrix.</p></caption>
<alt-text>A confusion matrix visualizing classification accuracy of different gastrointestinal conditions. Diagonal values show high accuracy for most classes such as Lymphangiectasia, Ulcer, and Foreign body. Background class shows misclassification across many classes, confirming model challenges in detecting non-lesion areas.</alt-text>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1529814-g0002.tif"/>
</fig>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>RT-DETR-M normalized confusion matrix.</p></caption>
<alt-text>Another confusion matrix showing improved accuracy with perfect classification for conditions like Ampulla of Vater, Angiectasia, Blood (fresh and hematin), and Polyp. The background class has a 50% misclassification rate, further confirming it as the most challenging class.</alt-text>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1529814-g0003.tif"/>
</fig>

<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>RT-DETR-X normalized confusion matrix.</p></caption>
<alt-text>A third confusion matrix highlighting reduced model performance for some classes such as Erosion (87.5%) and Erythema (90%), while most others remain highly accurate. Background class still exhibits misclassification with several false positives spread across different classes.</alt-text>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1529814-g0004.tif"/>
</fig>



<p>The ability of the models to differentiate between pathology and non-pathology regions is further supported by the ROC AUC scores in <xref ref-type="table" rid="T4">Table 4</xref> and <xref ref-type="fig" rid="F5">Figures 5</xref>&#x02013;<xref ref-type="fig" rid="F7">7</xref>. RT-DETR-M achieves an ROC AUC of 0.99, confirming its superior ability to generalize across different pathology types. RT-DETR-X follows with an ROC AUC of 0.97, demonstrating high precision in its classifications, while RT-DETR-S, with an ROC AUC of 0.93, remains an efficient model suited for real-time clinical applications.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Comparative model performance.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Model variant</bold></th>
<th valign="top" align="center"><bold>ROC AUC (Average)</bold></th>
<th valign="top" align="center" colspan="3"><bold>Macro Avg</bold></th>
<th valign="top" align="center" colspan="3"><bold>Weighted Avg</bold></th>
</tr>
<tr>
<th/>
<th/>
<th valign="top" align="center"><bold>Precision</bold></th>
<th valign="top" align="center"><bold>Recall</bold></th>
<th valign="top" align="center"><bold>F1-Score</bold></th>
<th valign="top" align="center"><bold>Precision</bold></th>
<th valign="top" align="center"><bold>Recall</bold></th>
<th valign="top" align="center"><bold>F1-Score</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">RT-DETR-S</td>
<td valign="top" align="center">0.93</td>
<td valign="top" align="center">0.87</td>
<td valign="top" align="center">0.85</td>
<td valign="top" align="center">0.86</td>
<td valign="top" align="center">0.96</td>
<td valign="top" align="center">0.94</td>
<td valign="top" align="center">0.95</td>
</tr> <tr>
<td valign="top" align="left">RT-DETR-M</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">0.91</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">0.90</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.94</td>
<td valign="top" align="center">0.95</td>
</tr> <tr>
<td valign="top" align="left">RT-DETR-X</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.90</td>
<td valign="top" align="center">0.87</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.98</td>
</tr></tbody>
</table>
</table-wrap>

<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>ROC curve for RT-DETR-S.</p></caption>
<alt-text>A multi-class ROC curve showing the performance of a medical image classification model across 12 classes. Each curve represents a different class such as Ampulla of Vater, Angiectasia, Blood (fresh and hematin), Erosion, Erythema, Foreign body, Lymphangiectasia, Polyp, Ulcer, and background. The AUC values are listed in the legend, with most classes achieving high AUC scores above 0.90, except for background (AUC = 0.48).</alt-text>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1529814-g0005.tif"/>
</fig>
<fig id="F6" position="float">
<label>Figure 6</label>
<caption><p>ROC curve for RT-DETR-M.</p></caption>
<alt-text>A second ROC curve comparing the classification performance of the RT-DETR model. The chart demonstrates nearly perfect classification for most classes including Ampulla of Vater, Angiectasia, Blood (fresh and hematin), and Polyp with AUC = 1.00. Only Erosion and background fall below the perfect score, indicating extremely high model accuracy overall.</alt-text>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1529814-g0006.tif"/>
</fig>
<fig id="F7" position="float">
<label>Figure 7</label>
<caption><p>ROC curve for RT-DETR-X.</p></caption>
<alt-text>Another ROC curve variation showing high performance across several medical imaging classes. Most classes maintain AUC values close to or at 1.00, such as Ampulla of Vater and Blood - hematin. Background class remains the weakest with an AUC of 0.48, demonstrating the model&#x00027;s difficulty distinguishing background.</alt-text>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1529814-g0007.tif"/>
</fig>


</sec>
</sec>
<sec sec-type="discussion" id="s5">
<title>5 Discussion</title>
<p>The study outcomes demonstrate how different RT-DETR variants affect performance levels when detecting pathologies through WCE. RT-DETR models offer a more adaptable and clinically practical solution for pathology detection in WCE compared to existing object detection frameworks. While models like CRH-YOLO (Wan et al., <xref ref-type="bibr" rid="B42">2024a</xref>) and PD-YOLO (Yu et al., <xref ref-type="bibr" rid="B48">2025</xref>) demonstrate strong performance in specific tasks, they lack the robustness needed for comprehensive WCE analysis, particularly in handling diverse pathology types with varying lesion sizes. The research data shows that higher model complexity does not automatically lead to better accuracy results. The additional parameters in RT-DETR-X lead to only a 0.2 percentage-point improvement in AP50:95 detection performance when compared to RT-DETR-M, which suggests that model size may not be as important as optimizing training strategies and hyperparameters.</p>
<p>The selection process for models heavily depends on how quickly they can generate inferences. RT-DETR-S offers the fastest frame rate of 270.52 FPS which makes it ideal for real-time diagnostic use. The fast operation of this system leads to slightly diminished detection accuracy for small target objects including ulcers and polyps. The combination of high accuracy and speed performance in RT-DETR-M results in 187.08 FPS making this model the optimal choice for real-time WCE analysis. RT-DETR-X demonstrates the highest AP50:95 score of 78.3 percent but runs at 59.30 FPS which makes it suitable for offline or post-procedure analysis when real-time operation is not necessary.</p>
<p>Model efficiency depends heavily on the backbone architecture design. The RT-DETR-S model with ResNet-18 architecture focuses on speed but struggles to detect smaller or complex abnormalities. The RT-DETR-M network with ResNet-34 architecture demonstrates superior pathology type classification consistency which makes it an optimal selection for medical use. RT-DETR-X utilizes ResNet-101 for feature extraction and detection sensitivity enhancement but requires substantial computational power that hinders its deployment in real-time applications.</p>
<p>The classification performance of RT-DETR-M shows better detection reliability when identifying Fresh Blood and Erosion which ensures reliable medical application detection. The confusion matrix analysis shows that RT-DETR-M delivers an excellent true positive rate which qualifies it as an ideal model for medical applications. The recall performance of RT-DETR-X remains lower than its precision rates which may affect its ability to detect uncommon abnormalities. RT-DETR-S offers enhanced speed performance at the cost of sensitivity which needs thorough examination before clinical implementation.</p>
<p>The ROC AUC analysis demonstrates the reliability of RT-DETR models through its results. RT-DETR-M achieves the best AUC values across diverse pathology classes, which shows its strong capability to detect abnormalities accurately. RT-DETR-X demonstrates strong performance in medical contexts since it detects fine lesions, whereas RT-DETR-S stands out due to its speed advantages when operating on WCE videos in real-time conditions.</p>
<p>Researchers should focus their work towards better detection of smaller lesions and enhance model recall efficacy and training techniques to optimize model operational performance. Computational efficiency can be preserved through multi-scale feature extraction techniques and adaptive learning approaches that would improve detection performance. Research should explore how longer training sessions combined with learning rate modifications affect the performance of RT-DETR-X in terms of optimizing its complex structure for better accuracy results.</p>
<p>RT-DETR models establish a flexible method for WCE pathology detection which scales effectively according to different requirements. RT-DETR-S serves real-time diagnostic needs while RT-DETR-M strikes a performance and speed equilibrium and RT-DETR-X provides maximum detection precision for detailed offline evaluations. The flexibility of these models allows for effective integration into clinical workflows, enhancing early disease detection and improving patient outcomes.</p></sec>
<sec sec-type="conclusions" id="s6">
<title>6 Conclusion</title>
<p>In this work, we addressed the improvement of the accuracy and time efficiency of analyzing WCE video with the help of transformer models. The presented methodology employed RT-DETR variants with novel backbones including ResNet-18, ResNet-34, and ResNet-101 as well as HybridEncoder to enhance feature learning. The results obtained on the Kvasir-Capsule dataset, reformatted to the COCO format, are presented in <xref ref-type="table" rid="T2">Table 2</xref>; they show the desired balance between speed and accuracy of the models, with RT-DETR-M and RT-DETR-X achieving, as expected, the highest accuracy for large objects and consistent detection results. It was found that these improvements in the detection of WCE videos can be linked to the incorporation of object detection transformer models that are particularly useful in detecting long-range features and spatial connections.</p>
<p>The conclusions also underlined the significance of choosing the right backbone; while ResNet-101 showed the best accuracy for important diagnostic tasks, ResNet-34 would be suitable for faster execution without significant loss in precision. In addition, we observed that the HybridEncoder, which is responsible for improving multi-scale feature extraction, improved detection in all the models. This research adds to the current knowledge of transformer models applied to the medical field for analysis of images, and provides a solution with high real-time performance for pathological diagnosis in WCE videos.</p>
<p>Future investigations should concentrate on resolving inconsistencies in assessment by fine-tuning hyperparameters, regularization methods, and model structures to enhance the stability and generalizability of larger-size RT-DETR models. To mitigate overfitting, improvements can be made to learning rate schedules, weight decay approaches, and data augmentation techniques, while pruning and lightweight transformer adjustments can boost efficiency. Additionally, validating the models across a diverse range of WCE datasets and integrating real-time optimization methods (such as quantization and hardware acceleration) will ensure that RT-DETR-X maintains reliable accuracy and efficiency in practical applications. The application prototype allowed real-time testing of WCE video analysis for detection assessment as shown in <xref ref-type="fig" rid="F8">Figure 8</xref>.</p>
<fig id="F8" position="float">
<label>Figure 8</label>
<caption><p>Graphical user interface (GUI) of the RT-DETR video processor.</p></caption>
<alt-text>A screenshot of the RT-DETR video processor application. The central panel displays a frame with a detected region labeled &#x00027;Erosion: 0.91.&#x00027; Detection results on the right include bounding box coordinates and confidence scores for multiple classes. The bottom strip displays thumbnails for different detected objects including Lymphangiectasia, Ulcer, Blood - fresh, and Angiectasia, each with labels and bounding boxes.</alt-text>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-08-1529814-g0008.tif"/>
</fig>

</sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <ext-link ext-link-type="uri" xlink:href="https://osf.io/dv2ag/wiki/home/">https://osf.io/dv2ag/wiki/home/</ext-link>.</p>
</sec>
<sec sec-type="ethics-statement" id="s8">
<title>Ethics statement</title>
<p>Written informed consent from the patients or the patients&#x00027; legal guardian/next of kin was not required to participate in this study in accordance with the national legislation and the institutional requirements.</p>
</sec>
<sec sec-type="author-contributions" id="s9">
<title>Author contributions</title>
<p>TH: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Validation, Visualization, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. KH: Conceptualization, Formal analysis, Funding acquisition, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. PT: Conceptualization, Formal analysis, Funding acquisition, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing.</p>
</sec>
<sec sec-type="funding-information" id="s10">
<title>Funding</title>
<p>The author(s) declare that no financial support was received for the research and/or publication of this article.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s11">
<title>Generative AI statement</title>
<p>The author(s) declare that no Gen AI was used in the creation of this manuscript.</p></sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Alavala</surname> <given-names>S.</given-names></name> <name><surname>Vadde</surname> <given-names>A. K.</given-names></name> <name><surname>Kancheti</surname> <given-names>A.</given-names></name> <name><surname>Gorthi</surname> <given-names>S.</given-names></name></person-group> (<year>2024</year>). <article-title>A robust pipeline for classification and detection of bleeding frames in wireless capsule endoscopy using swin transformer and rt-detr</article-title>. <source>arXiv preprint arXiv:2406.08046</source>.</citation>
</ref>
<ref id="B2">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Alawode</surname> <given-names>B.</given-names></name> <name><surname>Hamza</surname> <given-names>S.</given-names></name> <name><surname>Ghimire</surname> <given-names>A.</given-names></name> <name><surname>Velayudhan</surname> <given-names>D.</given-names></name></person-group> (<year>2024</year>). <article-title>Transformer-based wireless capsule endoscopy bleeding tissue detection and classification</article-title>. <source>arXiv preprint arXiv:2412.19218</source>.</citation>
</ref>
<ref id="B3">
<citation citation-type="journal"><person-group person-group-type="author"><collab>ASGE Standards of Practice Committee; Ikenberry S. O. Jue T. L. Anderson M. A. Appalaneni V. Banerjee S.</collab></person-group>. (<year>2011</year>). <article-title>Management of ingested foreign bodies and food impactions</article-title>. <source>Gastrointest. Endosc</source>. <volume>73</volume>, <fpage>1085</fpage>&#x02013;<lpage>1091</lpage>. <pub-id pub-id-type="doi">10.1016/j.gie.2010.11.010</pub-id><pub-id pub-id-type="pmid">21628009</pub-id></citation></ref>
<ref id="B4">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bordbar</surname> <given-names>M.</given-names></name> <name><surname>Helfroush</surname> <given-names>M. S.</given-names></name> <name><surname>Danyali</surname> <given-names>H.</given-names></name> <name><surname>Ejtehadi</surname> <given-names>F.</given-names></name></person-group> (<year>2023</year>). <article-title>Wireless capsule endoscopy multiclass classification using three-dimensional deep convolutional neural network model</article-title>. <source>Biomed. Eng. Online</source> <volume>22</volume>:<fpage>124</fpage>. <pub-id pub-id-type="doi">10.1186/s12938-023-01186-9</pub-id><pub-id pub-id-type="pmid">38098015</pub-id></citation></ref>
<ref id="B5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chae</surname> <given-names>J.-W.</given-names></name> <name><surname>Cho</surname> <given-names>H.-C.</given-names></name></person-group> (<year>2023</year>). <article-title>Enhanced classification of gastric lesions and early gastric cancer diagnosis in gastroscopy using multi-filter autoaugment</article-title>. <source>IEEE Access</source> <volume>11</volume>, <fpage>29391</fpage>&#x02013;<lpage>29399</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2023.3260983</pub-id></citation>
</ref>
<ref id="B6">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>W.</given-names></name> <name><surname>Zhang</surname> <given-names>R.</given-names></name> <name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Bao</surname> <given-names>F.</given-names></name> <name><surname>Lv</surname> <given-names>H.</given-names></name> <name><surname>Li</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Pact-net: parallel CNNS and transformers for medical image segmentation</article-title>. <source>Comput. Methods Programs Biomed</source>. <volume>242</volume>:<fpage>107782</fpage>. <pub-id pub-id-type="doi">10.1016/j.cmpb.2023.107782</pub-id><pub-id pub-id-type="pmid">37690317</pub-id></citation></ref>
<ref id="B7">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dadoun</surname> <given-names>H.</given-names></name> <name><surname>Rousseau</surname> <given-names>A. L.</given-names></name> <name><surname>de Kerviler</surname> <given-names>E.</given-names></name> <name><surname>Correas</surname> <given-names>J. M.</given-names></name> <name><surname>Tissier</surname> <given-names>A. M.</given-names></name> <name><surname>Joujou</surname> <given-names>F.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Deep learning for the detection, localization, and characterization of focal liver lesions on abdominal us images</article-title>. <source>Radiol. Artif. Intell</source>. <volume>4</volume>:<fpage>e210110</fpage>. <pub-id pub-id-type="doi">10.1148/ryai.210110</pub-id><pub-id pub-id-type="pmid">35652113</pub-id></citation></ref>
<ref id="B8">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>de Moura Lima</surname> <given-names>A. C.</given-names></name> <name><surname>de Paiva</surname> <given-names>L. F.</given-names></name> <name><surname>Br&#x000E1;z</surname> <given-names>G.</given-names></name> <name><surname>de Almeida</surname> <given-names>J. D. S.</given-names></name> <name><surname>Silva</surname> <given-names>A. C.</given-names></name> <name><surname>Coimbra</surname> <given-names>M. T.</given-names></name></person-group> (<year>2023</year>). <article-title>A two-stage method for polyp detection in colonoscopy images based on saliency object extraction and transformers</article-title>. <source>IEEE Access</source> <volume>11</volume>, <fpage>76108</fpage>&#x02013;<lpage>76119</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2023.3297097</pub-id></citation>
</ref>
<ref id="B9">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Feldman</surname> <given-names>M.</given-names></name> <name><surname>Friedman</surname> <given-names>L. S.</given-names></name> <name><surname>Brandt</surname> <given-names>L. J.</given-names></name></person-group> (<year>2020</year>). <source>Sleisenger and Fordtran&#x00027;s Gastrointestinal and Liver Disease: Pathophysiology, Diagnosis, Management</source>. <publisher-loc>New York</publisher-loc>: <publisher-name>Elsevier Health Sciences</publisher-name>.</citation>
</ref>
<ref id="B10">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Ginsberg</surname> <given-names>G. G.</given-names></name> <name><surname>Kochman</surname> <given-names>M. L.</given-names></name> <name><surname>Norton</surname> <given-names>I. D.</given-names></name> <name><surname>Gostout</surname> <given-names>C. J.</given-names></name></person-group> (<year>2011</year>). <source>Clinical Gastrointestinal Endoscopy E-Book</source>. <publisher-loc>New York</publisher-loc>: <publisher-name>Elsevier Health Sciences</publisher-name>.</citation>
</ref>
<ref id="B11">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Guemas</surname> <given-names>E.</given-names></name> <name><surname>Routier</surname> <given-names>B.</given-names></name> <name><surname>Ghelfenstein-Ferreira</surname> <given-names>T.</given-names></name> <name><surname>Cordier</surname> <given-names>C.</given-names></name> <name><surname>Hartuis</surname> <given-names>S.</given-names></name> <name><surname>Marion</surname> <given-names>B.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Automatic patient-level recognition of four plasmodium species on thin blood smear by a real-time detection transformer (RT-DETR) object detection algorithm: a proof-of-concept and evaluation</article-title>. <source>Microbiol. Spectr</source>. <volume>12</volume>:<fpage>e0144023</fpage>. <pub-id pub-id-type="doi">10.1128/spectrum.01440-23</pub-id><pub-id pub-id-type="pmid">38171008</pub-id></citation></ref>
<ref id="B12">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gui</surname> <given-names>H.</given-names></name> <name><surname>Jiao</surname> <given-names>H.</given-names></name> <name><surname>Li</surname> <given-names>L.</given-names></name> <name><surname>Jiang</surname> <given-names>X.</given-names></name> <name><surname>Su</surname> <given-names>T.</given-names></name> <name><surname>Pang</surname> <given-names>Z.</given-names></name></person-group> (<year>2024</year>). <article-title>Breast tumor detection and diagnosis using an improved faster R-CNN in dce-MRI</article-title>. <source>Bioengineering</source> <volume>11</volume>:<fpage>1217</fpage>. <pub-id pub-id-type="doi">10.3390/bioengineering11121217</pub-id><pub-id pub-id-type="pmid">39768035</pub-id></citation></ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Habe</surname> <given-names>T. T.</given-names></name> <name><surname>Haataja</surname> <given-names>K.</given-names></name> <name><surname>Toivanen</surname> <given-names>P.</given-names></name></person-group> (<year>2024</year>). <article-title>Efficiency meets accuracy: benchmarking object detection models for pathology detection in wireless capsule endoscopy</article-title>. <source>IEEE Access</source> <volume>12</volume>, <fpage>126793</fpage>&#x02013;<lpage>126817</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2024.3456100</pub-id></citation>
</ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hao</surname> <given-names>C.</given-names></name> <name><surname>Wu</surname> <given-names>H.</given-names></name> <name><surname>Tao</surname> <given-names>Z.</given-names></name></person-group> (<year>2024</year>). <source>A novel method of video object detection based on improved rt-detr</source>. SSRN.</citation>
</ref>
<ref id="B15">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Igawa</surname> <given-names>A.</given-names></name> <name><surname>Oka</surname> <given-names>S.</given-names></name> <name><surname>Tanaka</surname> <given-names>S.</given-names></name> <name><surname>Kunihara</surname> <given-names>S.</given-names></name> <name><surname>Nakano</surname> <given-names>M.</given-names></name> <name><surname>Aoyama</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>Major predictors and management of small-bowel angioectasia</article-title>. <source>BMC Gastroenterol</source>. <volume>15</volume>:<fpage>108</fpage>. <pub-id pub-id-type="doi">10.1186/s12876-015-0337-8</pub-id><pub-id pub-id-type="pmid">26302944</pub-id></citation></ref>
<ref id="B16">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jin</surname> <given-names>M.</given-names></name> <name><surname>Zhang</surname> <given-names>J.</given-names></name></person-group> (<year>2024</year>). <article-title>Research on microscale vehicle logo detection based on real-time detection transformer (RT-DETR)</article-title>. <source>Sensors</source> <volume>24</volume>:<fpage>6987</fpage>. <pub-id pub-id-type="doi">10.3390/s24216987</pub-id><pub-id pub-id-type="pmid">39517882</pub-id></citation></ref>
<ref id="B17">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Khun</surname> <given-names>P. C.</given-names></name> <name><surname>Zhuo</surname> <given-names>Z.</given-names></name> <name><surname>Yang</surname> <given-names>L. Z.</given-names></name> <name><surname>Liyuan</surname> <given-names>L.</given-names></name> <name><surname>Jiang</surname> <given-names>L.</given-names></name></person-group> (<year>2009</year>). &#x0201C;Feature selection and classification for wireless capsule endoscopic frames,&#x0201D; in <source>2009 International Conference on Biomedical and Pharmaceutical Engineering</source>, 1&#x02013;6. <pub-id pub-id-type="doi">10.1109/ICBPE.2009.5384106</pub-id></citation>
</ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kimberly</surname> <given-names>J.</given-names></name> <name><surname>Baillie</surname> <given-names>J.</given-names></name></person-group> (<year>2006</year>). <article-title>Endoscopy of the upper GI tract: a training manual</article-title>. <source>Gastroenterology</source> <volume>131</volume>:<fpage>1654</fpage>. <pub-id pub-id-type="doi">10.1053/j.gastro.2006.09.047</pub-id></citation>
</ref>
<ref id="B19">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kuipers</surname> <given-names>E. J.</given-names></name> <name><surname>Thijs</surname> <given-names>J. C.</given-names></name> <name><surname>Festen</surname> <given-names>H. P.</given-names></name></person-group> (<year>1995</year>). <article-title>The prevalence of helicobacter pylori in peptic ulcer disease</article-title>. <source>Aliment. Pharmacol. Therap</source>. <volume>9</volume>, <fpage>59</fpage>&#x02013;<lpage>69</lpage>.</citation>
</ref>
<ref id="B20">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>F.</given-names></name> <name><surname>Yan</surname> <given-names>H.</given-names></name> <name><surname>Shi</surname> <given-names>L.</given-names></name></person-group> (<year>2024a</year>). <article-title>Multi-scale coupled attention for visual object detection</article-title>. <source>Sci. Rep</source>. <volume>14</volume>:<fpage>11191</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-024-60897-8</pub-id><pub-id pub-id-type="pmid">38755252</pub-id></citation></ref>
<ref id="B21">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>J.</given-names></name> <name><surname>Ma</surname> <given-names>J.</given-names></name> <name><surname>Tillo</surname> <given-names>T.</given-names></name> <name><surname>Zhang</surname> <given-names>B.</given-names></name> <name><surname>Lim</surname> <given-names>E. G.</given-names></name></person-group> (<year>2012</year>). <article-title>&#x0201C;A training based support vector machine technique for blood detection in wireless capsule endoscopy images,&#x0201D;</article-title> in <source>2012 IEEE-EMBS Conference on Biomedical Engineering and Sciences</source>, 826&#x02013;830. <pub-id pub-id-type="doi">10.1109/IECBES.2012.6498194</pub-id></citation>
</ref>
<ref id="B22">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Yoshimura</surname> <given-names>T.</given-names></name> <name><surname>Horima</surname> <given-names>Y.</given-names></name> <name><surname>Sugimori</surname> <given-names>H.</given-names></name></person-group> (<year>2024b</year>). <article-title>A hessian-based deep learning preprocessing method for coronary angiography image analysis</article-title>. <source>Electronics</source> <volume>13</volume>:<fpage>3676</fpage>. <pub-id pub-id-type="doi">10.3390/electronics13183676</pub-id></citation>
</ref>
<ref id="B23">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>Z.</given-names></name> <name><surname>Zheng</surname> <given-names>X.</given-names></name> <name><surname>Mu</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>M.</given-names></name> <name><surname>Liu</surname> <given-names>G.</given-names></name></person-group> (<year>2024c</year>). <article-title>The intelligent gastrointestinal metaplasia assessment based on deformable transformer with token merging</article-title>. <source>Biomed. Signal Process. Control</source> <volume>95</volume>:<fpage>106454</fpage>. <pub-id pub-id-type="doi">10.1016/j.bspc.2024.106454</pub-id></citation>
</ref>
<ref id="B24">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liang</surname> <given-names>H.</given-names></name> <name><surname>Li</surname> <given-names>Z.</given-names></name> <name><surname>Lin</surname> <given-names>W.</given-names></name> <name><surname>Xie</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>S.</given-names></name> <name><surname>Luo</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Enhancing gastrointestinal stromal tumor (gist) diagnosis: an improved yolov8 deep learning approach for precise mitotic detection</article-title>. <source>IEEE Access</source> <volume>12</volume>, <fpage>116829</fpage>&#x02013;<lpage>116840</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2024.3446613</pub-id></citation>
</ref>
<ref id="B25">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lv</surname> <given-names>W.</given-names></name> <name><surname>Zhao</surname> <given-names>Y.</given-names></name> <name><surname>Chang</surname> <given-names>Q.</given-names></name> <name><surname>Huang</surname> <given-names>K.</given-names></name> <name><surname>Wang</surname> <given-names>G.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name></person-group> (<year>2024</year>). <article-title>Rt-detrv2: improved baseline with bag-of-freebies for real-time detection transformer</article-title>. <source>arXiv preprint arXiv:2407.17140</source>.</citation>
</ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Machicado</surname> <given-names>J. D.</given-names></name> <name><surname>Greer</surname> <given-names>J. B.</given-names></name> <name><surname>Yadav</surname> <given-names>D.</given-names></name></person-group> (<year>2020</year>). <article-title>Epidemiology of gastrointestinal diseases</article-title>. <source>Geriatr. Gastroenterol</source>. <volume>24</volume>, <fpage>1007</fpage>&#x02013;<lpage>1011</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-90761-1_7-1</pub-id></citation>
</ref>
<ref id="B27">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Muzammul</surname> <given-names>M.</given-names></name> <name><surname>Algarni</surname> <given-names>A.</given-names></name> <name><surname>Ghadi</surname> <given-names>Y. Y.</given-names></name> <name><surname>Assam</surname> <given-names>M.</given-names></name></person-group> (<year>2024</year>). <article-title>Enhancing uav aerial image analysis: integrating advanced sahi techniques with real-time detection models on the visdrone dataset</article-title>. <source>IEEE Access</source> <volume>12</volume>, <fpage>21621</fpage>&#x02013;<lpage>21633</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2024.3363413</pub-id></citation>
</ref>
<ref id="B28">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Oh</surname> <given-names>S.</given-names></name> <name><surname>Oh</surname> <given-names>D.</given-names></name> <name><surname>Kim</surname> <given-names>D.</given-names></name> <name><surname>Song</surname> <given-names>W.</given-names></name> <name><surname>Hwang</surname> <given-names>Y.</given-names></name> <name><surname>Cho</surname> <given-names>N.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Video analysis of small bowel capsule endoscopy using a transformer network</article-title>. <source>Diagnostics</source> <volume>13</volume>:<fpage>3133</fpage>. <pub-id pub-id-type="doi">10.3390/diagnostics13193133</pub-id><pub-id pub-id-type="pmid">37835876</pub-id></citation></ref>
<ref id="B29">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pascual</surname> <given-names>G.</given-names></name> <name><surname>Vitri&#x000E1;</surname> <given-names>J.</given-names></name> <name><surname>Segu&#x000ED;</surname> <given-names>S.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;Time-coherent embeddings for wireless capsule endoscopy,&#x0201D;</article-title> in <source>2022 26th International Conference on Pattern Recognition (ICPR)</source>, 4248&#x02013;4255. <pub-id pub-id-type="doi">10.1109/ICPR56361.2022.9956652</pub-id></citation>
</ref>
<ref id="B30">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pogorelov</surname> <given-names>K.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Kvasir: a multi-class image dataset for computer aided gastrointestinal disease detection,&#x0201D;</article-title> in <source>Proceedings of the 8th ACM Multimedia Systems Conference, MMSys</source>, 164&#x02013;169. <pub-id pub-id-type="doi">10.1145/3083187.3083212</pub-id></citation>
</ref>
<ref id="B31">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pornvoraphat</surname> <given-names>P.</given-names></name> <name><surname>Tiankanon</surname> <given-names>K.</given-names></name> <name><surname>Pittayanon</surname> <given-names>R.</given-names></name> <name><surname>Sunthornwetchapong</surname> <given-names>P.</given-names></name> <name><surname>Vateekul</surname> <given-names>P.</given-names></name> <name><surname>Rerknimitr</surname> <given-names>R.</given-names></name></person-group> (<year>2023</year>). <article-title>Real-time gastric intestinal metaplasia diagnosis tailored for bias and noisy-labeled data with multiple endoscopic imaging</article-title>. <source>Comput. Biol. Med</source>. <volume>154</volume>:<fpage>106582</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.106582</pub-id><pub-id pub-id-type="pmid">36738708</pub-id></citation></ref>
<ref id="B32">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sadeghi</surname> <given-names>V.</given-names></name> <name><surname>Sanahmadi</surname> <given-names>Y.</given-names></name> <name><surname>Behdad</surname> <given-names>M.</given-names></name> <name><surname>Vard</surname> <given-names>A.</given-names></name> <name><surname>Sharifi</surname> <given-names>M.</given-names></name> <name><surname>Raeisi</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Pixel-wise annotation for clear and contaminated regions segmentation in wireless capsule endoscopy images: a multicentre database</article-title>. <source>Data Brief</source> <volume>57</volume>:<fpage>110927</fpage>. <pub-id pub-id-type="doi">10.1016/j.dib.2024.110927</pub-id><pub-id pub-id-type="pmid">39351133</pub-id></citation></ref>
<ref id="B33">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sahafi</surname> <given-names>A.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Rasmussen</surname> <given-names>C. L. M.</given-names></name> <name><surname>Bollen</surname> <given-names>P.</given-names></name> <name><surname>Baatrup</surname> <given-names>G.</given-names></name> <name><surname>Blanes-Vidal</surname> <given-names>V.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Edge artificial intelligence wireless video capsule endoscopy</article-title>. <source>Sci. Rep</source>. <volume>12</volume>:<fpage>13732</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-022-17502-7</pub-id><pub-id pub-id-type="pmid">35962014</pub-id></citation></ref>
<ref id="B34">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Saltzman</surname> <given-names>J. R.</given-names></name></person-group> (<year>2024</year>). <source>Angiodysplasia of the Gastrointestinal Tract</source>. eds L. S. Friedman and C. Meyer. Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.uptodate.com/contents/angiodysplasia-of-the-gastrointestinal-tract">https://www.uptodate.com/contents/angiodysplasia-of-the-gastrointestinal-tract</ext-link> (accessed August 19, 2024).</citation>
</ref>
<ref id="B35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shao</surname> <given-names>D.</given-names></name> <name><surname>Jiang</surname> <given-names>J.</given-names></name> <name><surname>Ma</surname> <given-names>L.</given-names></name> <name><surname>Lai</surname> <given-names>H.</given-names></name> <name><surname>Yi</surname> <given-names>S.</given-names></name></person-group> (<year>2024</year>). <article-title>Real-time medical lesion screening: accurate and rapid detectors</article-title>. <source>J. Real-Time Image Proc</source>. <volume>21</volume>:<fpage>134</fpage>. <pub-id pub-id-type="doi">10.1007/s11554-024-01512-x</pub-id></citation>
</ref>
<ref id="B36">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sharmila</surname> <given-names>V.</given-names></name> <name><surname>Geetha</surname> <given-names>S.</given-names></name></person-group> (<year>2024</year>). <article-title>Gastro intestinal disease classification using hierarchical spatio pyramid tranfonet with pittree fusion and efficient-condconv swishnet</article-title>. <source>IEEE Access</source> <volume>12</volume>, <fpage>113972</fpage>&#x02013;<lpage>113987</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2024.3438799</pub-id></citation>
</ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shi</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>H.</given-names></name> <name><surname>Ji</surname> <given-names>H.</given-names></name> <name><surname>Liu</surname> <given-names>H.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>He</surname> <given-names>N.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>A deep weakly semi-supervised framework for endoscopic lesion segmentation</article-title>. <source>Med. Image Anal</source>. <volume>90</volume>:<fpage>102973</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2023.102973</pub-id><pub-id pub-id-type="pmid">37757643</pub-id></citation></ref>
<ref id="B38">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Smedsrud</surname> <given-names>P. H.</given-names></name> <name><surname>Thambawita</surname> <given-names>V.</given-names></name> <name><surname>Hicks</surname> <given-names>S. A.</given-names></name> <name><surname>Gjestang</surname> <given-names>H.</given-names></name> <name><surname>Nedrejord</surname> <given-names>O. O.</given-names></name> <name><surname>N&#x000E5;ss</surname> <given-names>E.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Kvasir-capsule, a video capsule endoscopy dataset</article-title>. <source>Sci. Data</source> <volume>8</volume>:<fpage>142</fpage>. <pub-id pub-id-type="doi">10.1038/s41597-021-00920-z</pub-id><pub-id pub-id-type="pmid">34045470</pub-id></citation></ref>
<ref id="B39">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Strober</surname> <given-names>W.</given-names></name> <name><surname>Wochner</surname> <given-names>R. D.</given-names></name> <name><surname>Carbone</surname> <given-names>P. P.</given-names></name> <name><surname>Waldmann</surname> <given-names>T. A.</given-names></name></person-group> (<year>1967</year>). <article-title>Intestinal lymphangiectasia: a protein-losing enteropathy with hypogammaglobulinemia, lymphocytopenia and impaired homograft rejection</article-title>. <source>J. Clin. Invest</source>. <volume>46</volume>, <fpage>1643</fpage>&#x02013;<lpage>1656</lpage>. <pub-id pub-id-type="doi">10.1172/JCI105656</pub-id><pub-id pub-id-type="pmid">4168730</pub-id></citation></ref>
<ref id="B40">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tang</surname> <given-names>S.</given-names></name> <name><surname>Yu</surname> <given-names>X.</given-names></name> <name><surname>Cheang</surname> <given-names>C. F.</given-names></name> <name><surname>Liang</surname> <given-names>Y.</given-names></name> <name><surname>Zhao</surname> <given-names>P.</given-names></name> <name><surname>Yu</surname> <given-names>H. H.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Transformer-based multi-task learning for classification and segmentation of gastrointestinal tract endoscopic images</article-title>. <source>Comput. Biol. Med</source>. <volume>157</volume>:<fpage>106723</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.106723</pub-id><pub-id pub-id-type="pmid">36907035</pub-id></citation></ref>
<ref id="B41">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Varam</surname> <given-names>D.</given-names></name> <name><surname>Mitra</surname> <given-names>R.</given-names></name> <name><surname>Mkadmi</surname> <given-names>M.</given-names></name> <name><surname>Riyas</surname> <given-names>R. A.</given-names></name> <name><surname>Abuhani</surname> <given-names>D. A.</given-names></name> <name><surname>Dhou</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Wireless capsule endoscopy image classification: an explainable AI approach</article-title>. <source>IEEE Access</source> <volume>11</volume>, <fpage>105262</fpage>&#x02013;<lpage>105280</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2023.3319068</pub-id></citation>
</ref>
<ref id="B42">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wan</surname> <given-names>J.</given-names></name> <name><surname>Zhu</surname> <given-names>W.</given-names></name> <name><surname>Chen</surname> <given-names>B.</given-names></name> <name><surname>Wang</surname> <given-names>L.</given-names></name> <name><surname>Chang</surname> <given-names>K.</given-names></name> <name><surname>Meng</surname> <given-names>X.</given-names></name></person-group> (<year>2024a</year>). <article-title>CRH-YOLO for precise and efficient detection of gastrointestinal polyps</article-title>. <source>Sci. Rep</source>. <volume>14</volume>, <fpage>1</fpage>&#x02013;<lpage>19</lpage>. <pub-id pub-id-type="doi">10.1038/s41598-024-81842-9</pub-id><pub-id pub-id-type="pmid">39627309</pub-id></citation></ref>
<ref id="B43">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wan</surname> <given-names>J. J.</given-names></name> <name><surname>Zhu</surname> <given-names>P. C.</given-names></name> <name><surname>Chen</surname> <given-names>B. L.</given-names></name> <name><surname>Yu</surname> <given-names>Y. T.</given-names></name></person-group> (<year>2024b</year>). <article-title>A semantic feature enhanced YOLOv5-based network for polyp detection from colonoscopy images</article-title>. <source>Sci. Rep</source>. <volume>14</volume>:<fpage>15478</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-024-66642-5</pub-id><pub-id pub-id-type="pmid">38969765</pub-id></citation></ref>
<ref id="B44">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Deng</surname> <given-names>Y.</given-names></name> <name><surname>Zheng</surname> <given-names>Y.</given-names></name> <name><surname>Chattopadhyay</surname> <given-names>P.</given-names></name> <name><surname>Wang</surname> <given-names>L.</given-names></name></person-group> (<year>2025</year>). <article-title>Vision transformers for image classification: a comparative survey</article-title>. <source>Technologies</source> <volume>13</volume>:<fpage>32</fpage>. <pub-id pub-id-type="doi">10.3390/technologies13010032</pub-id></citation>
</ref>
<ref id="B45">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Weerakkody</surname> <given-names>Y.</given-names></name> <name><surname>Bell</surname> <given-names>D.</given-names></name> <name><surname>Morgan</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2024</year>). <source>Ampulla of Vater</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://radiopaedia.org/">Radiopaedia.org</ext-link>.</citation>
</ref>
<ref id="B46">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>S.</given-names></name> <name><surname>Zhang</surname> <given-names>R.</given-names></name> <name><surname>Yan</surname> <given-names>J.</given-names></name> <name><surname>Li</surname> <given-names>C.</given-names></name> <name><surname>Liu</surname> <given-names>Q.</given-names></name> <name><surname>Wang</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>High-speed and accurate diagnosis of gastrointestinal disease: learning on endoscopy images using lightweight transformer with local feature attention</article-title>. <source>Bioengineering</source> <volume>10</volume>:<fpage>1416</fpage>. <pub-id pub-id-type="doi">10.3390/bioengineering10121416</pub-id><pub-id pub-id-type="pmid">38136007</pub-id></citation></ref>
<ref id="B47">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>Y.</given-names></name> <name><surname>Shen</surname> <given-names>Y.</given-names></name> <name><surname>Fernandez-Granda</surname> <given-names>C.</given-names></name> <name><surname>Heacock</surname> <given-names>L.</given-names></name> <name><surname>Geras</surname> <given-names>K. J.</given-names></name></person-group> (<year>2024</year>). <article-title>Understanding differences in applying DETR to natural and medical images</article-title>. <source>arXiv [Preprint]. arXiv:2405.17677</source>.</citation>
</ref>
<ref id="B48">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>Y.</given-names></name> <name><surname>Lin</surname> <given-names>K.</given-names></name> <name><surname>Hong</surname> <given-names>J.</given-names></name> <name><surname>Tsai</surname> <given-names>R.-G.</given-names></name> <name><surname>Huang</surname> <given-names>Y.</given-names></name></person-group> (<year>2025</year>). <article-title>PD-YOLO: colon polyp detection model based on enhanced small-target feature extraction</article-title>. <source>Comput. Mater. Continua</source> <volume>82</volume>, <fpage>913</fpage>&#x02013;<lpage>928</lpage>. <pub-id pub-id-type="doi">10.32604/cmc.2024.058467</pub-id></citation>
</ref>
<ref id="B49">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>C.</given-names></name> <name><surname>Bracke</surname> <given-names>M.</given-names></name> <name><surname>da Silva Torres</surname> <given-names>R.</given-names></name> <name><surname>Gansel</surname> <given-names>L. C.</given-names></name></person-group> (<year>2024a</year>). <article-title>Rapid detection of salmon louse larvae in seawater based on machine learning</article-title>. <source>Aquaculture</source> <volume>592</volume>:<fpage>741252</fpage>. <pub-id pub-id-type="doi">10.1016/j.aquaculture.2024.741252</pub-id></citation>
</ref>
<ref id="B50">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>T.</given-names></name> <name><surname>Feng</surname> <given-names>Y.</given-names></name> <name><surname>Zhao</surname> <given-names>Y.</given-names></name> <name><surname>Lei</surname> <given-names>Y.</given-names></name> <name><surname>Ying</surname> <given-names>N.</given-names></name> <name><surname>Song</surname> <given-names>F.</given-names></name> <etal/></person-group>. (<year>2024b</year>). <article-title>SI-ViT: shuffle instance-based vision transformer for pancreatic cancer ROSE image classification</article-title>. <source>Comput. Methods Programs Biomed</source>. <volume>244</volume>:<fpage>107969</fpage>. <pub-id pub-id-type="doi">10.1016/j.cmpb.2023.107969</pub-id><pub-id pub-id-type="pmid">38064958</pub-id></citation></ref>
<ref id="B51">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Mao</surname> <given-names>Y.</given-names></name> <name><surname>Lu</surname> <given-names>X.</given-names></name> <name><surname>Zou</surname> <given-names>X.</given-names></name> <name><surname>Huang</surname> <given-names>H.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name> <etal/></person-group>. (<year>2024c</year>). <article-title>From single to universal: tiny lesion detection in medical imaging</article-title>. <source>Artif. Intell. Rev</source>. <volume>57</volume>:<fpage>192</fpage>. <pub-id pub-id-type="doi">10.1007/s10462-024-10762-x</pub-id></citation>
</ref>
<ref id="B52">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>Y.</given-names></name> <name><surname>Lv</surname> <given-names>W.</given-names></name> <name><surname>Xu</surname> <given-names>S.</given-names></name> <name><surname>Wei</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>G.</given-names></name> <name><surname>Dang</surname> <given-names>Q.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>DETRs beat YOLOs on real-time object detection</article-title>. <source>arXiv [Preprint]. arXiv:2304.08069</source>.</citation>
</ref>
</ref-list>
</back>
</article>