<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Med.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Medicine</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Med.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-858X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmed.2026.1781499</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Uterine cancer classification from CT images using convolutional feature extraction and transformer-based self-attention</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Alshdaifat</surname> <given-names>Eman Hussein</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<uri xlink:href="https://loop.frontiersin.org/people/2905445"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Sindiani</surname> <given-names>Amer Mahmoud</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Alhatamleh</surname> <given-names>Salem</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<uri xlink:href="https://loop.frontiersin.org/people/3093003"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Malkawi</surname> <given-names>Rami</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Madain</surname> <given-names>Rola</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<uri xlink:href="https://loop.frontiersin.org/people/3168219"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Almahmoud</surname> <given-names>Rawan Eimad</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Al-Smadi</surname> <given-names>Bara&#x00027;a</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Al-Mnayyis</surname> <given-names>Asma&#x00027;a Mohammad</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<uri xlink:href="https://loop.frontiersin.org/people/2828566"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Amin</surname> <given-names>Mohammad</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<uri xlink:href="https://loop.frontiersin.org/people/2898364"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Abd-alrazaq</surname> <given-names>Alaa</given-names></name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<uri xlink:href="https://loop.frontiersin.org/people/2573573"/>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Obstetrics and Gynecology, Faculty of Medicine, Yarmouk University</institution>, <city>Irbid</city>, <country country="jo">Jordan</country></aff>
<aff id="aff2"><label>2</label><institution>Department of Obstetrics and Gynecology, Faculty of Medicine, Jordan University of Science and Technology</institution>, <city>Irbid</city>, <country country="jo">Jordan</country></aff>
<aff id="aff3"><label>3</label><institution>Computer Science Department, Faculty of Information Technology and Computer Sciences, Yarmouk University</institution>, <city>Irbid</city>, <country country="jo">Jordan</country></aff>
<aff id="aff4"><label>4</label><institution>Department of Information Systems, Faculty of Information Technology and Computer Science, Yarmouk University</institution>, <city>Irbid</city>, <country country="jo">Jordan</country></aff>
<aff id="aff5"><label>5</label><institution>Department of Internal Medicine, College of Medicine, Yarmouk University</institution>, <city>Irbid</city>, <country country="jo">Jordan</country></aff>
<aff id="aff6"><label>6</label><institution>AI Center for Precision Health, Weill Cornell Medicine-Qatar, Qatar Foundation</institution>, <city>Doha</city>, <country country="qa">Qatar</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Alaa Abd-alrazaq, <email xlink:href="mailto:Aaa4027@qatar-med.cornell.edu">Aaa4027@qatar-med.cornell.edu</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-25">
<day>25</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>13</volume>
<elocation-id>1781499</elocation-id>
<history>
<date date-type="received">
<day>05</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>04</day>
<month>02</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>05</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Alshdaifat, Sindiani, Alhatamleh, Malkawi, Madain, Almahmoud, Al-Smadi, Al-Mnayyis, Amin and Abd-alrazaq.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Alshdaifat, Sindiani, Alhatamleh, Malkawi, Madain, Almahmoud, Al-Smadi, Al-Mnayyis, Amin and Abd-alrazaq</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-25">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Background</title>
<p>Accurate and early diagnosis of uterine cancer from computed tomography images remains a challenging task due to the complexity of anatomical structures and the subtle visual differences between normal, benign, and malignant uterine tissues. Traditional diagnostic approaches and conventional deep learning models often fail to effectively capture both local and global image characteristics.</p></sec>
<sec>
<title>Objective</title>
<p>This study aims to develop and validate a novel hybrid deep learning framework that integrates convolutional feature extraction with transformer-based global attention mechanisms to improve the accuracy and robustness of uterine cancer classification from computed tomography images.</p></sec>
<sec>
<title>Methods</title>
<p>In the proposed framework, DenseNet121 is employed as a convolutional neural network feature extractor, while a transformer encoder is utilized to model long-range contextual dependencies through multi-head self-attention. DenseNet121 captures discriminative local features from computed tomography images, which are subsequently processed by the transformer to enhance global feature representation. The performance of the proposed model is evaluated using the KAUH uterine cancer computed tomography dataset, which includes three classes: normal, benign, and malignant. The proposed approach is compared with several state-of-the-art deep learning models, including VGG16, VGG19, MobileNetV2, ResNet50, and DenseNet121.</p></sec>
<sec>
<title>Results</title>
<p>Experimental results demonstrate that the proposed hybrid model outperforms the comparative models. It achieves an accuracy of 87.44%, sensitivity of 87.13%, specificity of 95.20%, an F1 score of 87.17%, and an area under the receiver operating characteristic curve of 99.41%.</p></sec>
<sec>
<title>Conclusion</title>
<p>The results confirm the effectiveness of integrating convolutional neural networks with transformer-based self-attention mechanisms for significantly improving uterine cancer classification from computed tomography images. The proposed model shows strong potential as a computer-aided decision-support tool for radiologists to assist in the detection of uterine cancer and may be extended to various real-world clinical applications.</p></sec></abstract>
<kwd-group>
<kwd>classification</kwd>
<kwd>CT images</kwd>
<kwd>deep learning</kwd>
<kwd>diagnosis</kwd>
<kwd>real dataset</kwd>
<kwd>uterine cancer</kwd>
</kwd-group>
<funding-group>
 <funding-statement>The author(s) declared that financial support was received for this work and/or its publication. Open Access funding provided by the Weill Cornell Medicine - Qatar Health Sciences Library and Qatar National Library.</funding-statement>
</funding-group>
<counts>
<fig-count count="6"/>
<table-count count="3"/>
<equation-count count="17"/>
<ref-count count="43"/>
<page-count count="15"/>
<word-count count="9425"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Obstetrics and Gynecology</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Uterine tumors, both benign and malignant, pose a substantial healthcare challenge because of their high occurrence rates. Uterine cancer is the most frequently occurring gynecologic malignancy among women in the United States, corresponding to a 3.4% lifetime risk (<xref ref-type="bibr" rid="B1">1</xref>). It was the sixth most prevalent cancer among women in 2012 (<xref ref-type="bibr" rid="B2">2</xref>), whereas breast cancer remains the most common cancer among women in the United States. The most common benign tumor in women is leiomyoma (<xref ref-type="bibr" rid="B3">3</xref>). These tumors develop during the reproductive years and the perimenopausal period and can cause debilitating symptoms such as hemorrhage, pelvic heaviness, and pain (<xref ref-type="bibr" rid="B4">4</xref>). Malignant uterine tumors include endometrial cancer and uterine sarcomas; the latter are rare but carry a poor prognosis.</p>
<p>Uterine cancers develop more frequently among women of advanced age, women who reach menopause later, and women with hormonal imbalances or who have never given birth (<xref ref-type="bibr" rid="B5">5</xref>). Genetic predispositions such as Lynch syndrome, together with obesity, which raises estrogen levels and can promote tumor development, constitute additional risk factors (<xref ref-type="bibr" rid="B6">6</xref>). Studying the risk factors associated with uterine cancer is necessary because of rising incidence and mortality, with death rates increasing by approximately 1.1% per year from 1999 to 2016 and Black women experiencing higher rates than white women (<xref ref-type="bibr" rid="B7">7</xref>). Diagnosing the disease at its earliest stages and providing prompt medical intervention remain the principal means of lowering mortality and delivering better patient care.</p>
<p>Deep learning algorithms have been applied to the evaluation and interpretation of CT images with the objective of improving diagnostic accuracy. For instance, they have been used to help distinguish between benign and malignant uterine smooth muscle tumors (<xref ref-type="bibr" rid="B8">8</xref>). Deep learning also enables automatic tumor segmentation, which has become a vital component of treatment planning. Its ability to improve the diagnostic accuracy of CT images has been demonstrated in studies of ovarian cancer (<xref ref-type="bibr" rid="B9">9</xref>), which further supports its application to the evaluation of uterine smooth muscle tumors. Deep learning methods for staging endometrial cancer from MRI images have likewise produced positive results (<xref ref-type="bibr" rid="B10">10</xref>). These applications still face multiple difficulties, with dataset discrepancies being the main obstacle; future studies should develop models that exhibit reduced variability and incorporate multimodal imaging (<xref ref-type="bibr" rid="B11">11</xref>).</p>
<p>Despite significant advances in the use of deep learning for medical image analysis, diagnosing uterine cancer from computed tomography images remains challenging. Most previous works have relied on CNN architectures that are designed and trained to extract local features, with limited representation of long-range contextual relations within the image. Thus far, only a limited number of studies have addressed the classification of uterine cancer CT images into normal, benign, and malignant categories with high accuracy using realistic datasets from clinical settings. The contributions of this study to uterine cancer diagnosis using artificial intelligence (AI) can be summarized as follows:</p>
<list list-type="bullet">
<list-item><p>A novel hybrid deep learning framework is proposed, combining the DenseNet121 network for local feature extraction with a Transformer Encoder for capturing long-range contextual dependencies using a multi-headed self-attention mechanism.</p></list-item>
<list-item><p>The accuracy of classifying CT images of uterine cancer into three categories (normal, benign, and malignant) is improved, surpassing the performance of traditional deep learning models such as VGG16, VGG19, MobileNetV2, ResNet50, and DenseNet121.</p></list-item>
<list-item><p>The proposed model&#x00027;s effectiveness is experimentally verified using a real clinical dataset (KAUH-UCCTD), enhancing the reliability of the results and their generalizability for real-world medical applications.</p></list-item>
<list-item><p>Outstanding performance is demonstrated across several evaluation metrics, including accuracy, sensitivity, specificity, F1 score, and the area under the ROC curve (AUC), showing the model&#x00027;s strength in differentiating between various conditions and reducing diagnostic errors.</p></list-item>
<list-item><p>A promising model is presented as a clinical decision-support system that can assist radiologists in the early and accurate detection of uterine cancer, with the potential to be developed and expanded for use in actual clinical settings.</p></list-item>
</list>
<p>The remainder of this paper is arranged as follows: a thorough analysis of the pertinent research on uterine diseases is given in Section 2. Section 3 describes the dataset and the applied methodology and explains the structure of the proposed model in depth. The experimental outcomes of the proposed model are presented in Section 4. The scalability and comprehensiveness of the proposed model are discussed in Section 5, and the results are summarized and future research opportunities are outlined in Section 6.</p></sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<p>Deep learning models used for many types of medical imaging, including computed tomography, magnetic resonance imaging, and histology imaging, have progressively helped diagnose uterine cancer and classify benign and malignant tumors. The most recent findings from studies on using AI for uterine cancer classification are presented in this section.</p>
<p>The study presented in (<xref ref-type="bibr" rid="B12">12</xref>) introduced a modified YOLOv5 framework for the classification of uterine abnormalities from ultrasound images. The work redesigned the head module of the original YOLOv5 architecture, added a global attention mechanism (GAM), and used a modified activation function and ResNeXt-based CSP blocks to create a new backbone that captures subtle features from noisy ultrasound images more effectively. A dataset of 3,026 clinically acquired ultrasound images, labeled into three classes (cervical cyst, uterine fibroid, and normal), was used for model training and validation. Experimental results showed that the proposed method achieved an accuracy of 80.5%, indicating the effectiveness of the architectural enhancements and attention mechanisms in improving the classification of uterine abnormalities from ultrasound.</p>
<p>Another research work (<xref ref-type="bibr" rid="B13">13</xref>) aimed to offer an objective evaluation of the factors influencing the success of uterine fibroid embolization (UFE) and to promote an interpretable machine learning (ML) approach to aid clinical decision-making using features extracted from pre-operative MRI. A carefully filtered dataset of 74 patients and 311 fibroids was created, and Deep Set Networks were used to allow permutation-equivariant aggregation of features over the collection of fibroids. The proposed models reached accuracies of 81&#x02013;88% (AUC = 0.81&#x02013;0.87) for individual symptoms and 75% (AUC = 0.74) for overall clinical success, whereas at the fibroid level, ensemble tree-based techniques reached an accuracy of at most 76%.</p>
<p>Another study (<xref ref-type="bibr" rid="B14">14</xref>) presented the role of AI in the diagnosis and treatment of uterine fibroids and uterine sarcomas, covering studies published between 2019 and March 2025. The research examined various AI techniques which included radiomics machine learning and deep neural network systems that worked with ultrasound and MRI medical images. The methods successfully distinguished between benign leiomyomas and malignant leiomyosarcomas which helped doctors develop treatment plans and improved results from minimally invasive procedures including HIFU and uterine artery embolization. AI systems demonstrated better performance than expert radiologists in most cases according to the results while multiple studies reached diagnostic accuracy levels above 0.85 AUC scores.</p>
<p>In the study (<xref ref-type="bibr" rid="B15">15</xref>), a framework was proposed for surgical decision-making in uterine fibroid management through integrating female sex hormone levels with fibroid characteristics. The study included 618 women diagnosed with uterine fibroids (UFs) from a multicenter hospital, of whom 238 underwent surgery. Multiple supervised ML algorithms, such as support vector machine (SVM), decision tree (DT), random forest (RF), logistic regression (LR), and k-nearest neighbors (KNN), with 126 different input combinations derived from hormonal markers (FSH, LH, E2, PRL, and AMH) and morphological variables. The RF model achieved the highest accuracy of 91% and an AUC of 0.88 using LH, FSH, E2, and AMH. The model showed a high level of clinical concordance through external validation using 20 independent cases, achieving a 90% agreement rate with a blinded gynecologist.</p>
<p>The study (<xref ref-type="bibr" rid="B16">16</xref>) proposed a multicenter deep learning-based framework for automatically outlining clinical target volumes (CTV) and planning target volumes (PTV) in radiotherapy for uterine cancers. The research employed the self-configuring nnU-Net architecture to address manual contouring challenges, including inter-observer differences, variation across institutions, and heavy workload demands. The researchers collected a dataset of 602 contrast-enhanced CT scans covering cervical and endometrial cancer cases from multiple medical centers and conducted a detailed evaluation of three nnU-Net configurations (2D slice-level, 3D full-resolution, and 3D cascaded) using the Dice similarity coefficient (DSC), average surface distance (ASD), and 95th-percentile Hausdorff distance (HD95). The 3D full-resolution nnU-Net provided the best overall results, with average DSC values of 83.42% for PTV and 81.23% for CTV in internal testing. Additionally, evaluations by experienced radiation oncologists showed that approximately 90% of the automatically created contours required no changes or only minor adjustments, demonstrating the clinical usefulness of the proposed method. The study (<xref ref-type="bibr" rid="B17">17</xref>) introduced deep learning-based nnU-Net models for the automatic segmentation of uterine fibroids and their surrounding organs from MRI images to support high-intensity focused ultrasound (HIFU) surgery planning, using a retrospective dataset of 550 T2-weighted MR images. The evaluation showed that the proposed 3D nnU-Net significantly outperformed state-of-the-art methods such as HIFUNet, U-Net, R2U-Net, ConvUNeXt, and 2D nnU-Net, achieving DSC values of 92.55% for the uterus, 89.63% for the endometrium, 90.45% for the urethral orifice, 97.75% for the bladder, 95.63% for fibroids, and 92.69% for the spine.</p>
<p>In a study (<xref ref-type="bibr" rid="B18">18</xref>) a novel deep learning-based 3D super-resolution DWI (SR-DWI) radiomics model was proposed for predicting the prognosis of high-intensity focused ultrasound (HIFU) ablation of uterine fibroids. Radiomics features were extracted from manually segmented fibroid regions and subsequently reduced using <italic>t</italic>-test, Pearson&#x00027;s correlation, and the LASSO regression algorithm. machine learning classifiers, including SVM, RF, and LightGBM, were trained and validated on multicenter datasets with both internal and external testing cohorts. All DWI radiomics models showed superior AUC, according to experimental results. The best-performing HR-DWI model (SVM) achieved an AUC of approximately 0.805 in internal testing, whereas the SR-DWI&#x02013;based models demonstrated better performance, with AUC values of 0.876 in internal testing and 0.800 in external validation, indicating a statistically significant improvement over HR-DWI (<italic>P</italic> &#x0003C; 0.05). The findings show the potential of combining deep learning-based super-resolution imaging with radiomics. This approach can improve pre-operative prognostic assessment of eligible candidates for HIFU therapy.</p>
<p>In the study (<xref ref-type="bibr" rid="B19">19</xref>), an image processing-based solution was proposed for the diagnosis of cervical cancer from uterine cervix images using transfer learning architectures, with the aim of reducing workload and assisting experts by leveraging deep convolutional neural networks. Histogram equalization was applied to enhance image contrast before classification, while Gaussian filtering was used to suppress noise. Several transfer learning models, including AlexNet, MobileNetV2, DenseNet201, ResNet50, VGG19, and Xception, were systematically evaluated using a 10-fold cross-validation strategy on the Herlev pap-smear dataset. The experimental results showed that the VGG19 model achieved the best performance, with an accuracy of 98.26%, outperforming the other models on the same dataset.</p>
<p>Another study (<xref ref-type="bibr" rid="B20">20</xref>) proposed a machine-learning-based radiomics framework using pre-operative contrast-enhanced computed tomography (CECT) images to differentiate uterine leiomyomas from leiomyosarcomas. After standardized pre-processing, PyRadiomics was used to extract the radiomic features, and three methods (Boruta, LASSO, and RFE) were used to select features, with multiple classifiers such as GLM, RF, and SVM. The diagnostic performance of the proposed models reached strong results because they achieved test data AUC results between 0.78 and 0.97 which exceeded the performance of radiologists who achieved AUC results between 0.70 and 0.78 when the patient&#x00027;s clinical data were available. The study demonstrates how CT-based radiomics can function as a decision-support system during pre-operative evaluations. The EfficientNetB0 model achieved classification with an accuracy of 99% in the study (<xref ref-type="bibr" rid="B21">21</xref>) which used 1,990 ultrasound images to automatically classify uterine fibroids into two categories. The study employed an attention-based model which used attention mechanisms to guide the model towards important clinical areas while it neglected non-essential parts of the body. <xref ref-type="table" rid="T1">Table 1</xref> presents a comparison of previous research on uterine diseases.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Comparison of previous works on uterine diseases.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Ref</bold></th>
<th valign="top" align="left"><bold>Year</bold></th>
<th valign="top" align="left"><bold>Methodology</bold></th>
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="left"><bold>Results</bold></th>
<th valign="top" align="left"><bold>Limitations</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B12">12</xref>)</td>
<td valign="top" align="left">2025</td>
<td valign="top" align="left">Improved YOLOv5 with enhanced ResNeXt backbone, and GAM attention for ultrasound image classification</td>
<td valign="top" align="left">3,026 ultrasound images</td>
<td valign="top" align="left">Accuracy: 80.5%</td>
<td valign="top" align="left">Single-center dataset</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B13">13</xref>)</td>
<td valign="top" align="left">2025</td>
<td valign="top" align="left">Machine learning with deep set networks for patient-level prediction and fibroid-level prediction using MRI features</td>
<td valign="top" align="left">74 patients, 311 fibroids (MRI dataset)</td>
<td valign="top" align="left">Clinical outcome accuracy 75% (AUC = 0.74)<break/> symptom. prediction 81&#x02013;88%.<break/> fibroid-level accuracy up to 76%</td>
<td valign="top" align="left">Single-center study small dataset</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B14">14</xref>)</td>
<td valign="top" align="left">2025</td>
<td valign="top" align="left">AI methods applied to ultrasound, MRI, and HIFU workflows</td>
<td valign="top" align="left">Multiple prior studies (2019&#x02013;2025), datasets range from small single-center cohorts to multicenter MRI/HIFU datasets</td>
<td valign="top" align="left">Achieved AUC values &#x0003E;0.85 for diagnosis and prognosis, strong performance</td>
<td valign="top" align="left">No new experimental validation</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B15">15</xref>)</td>
<td valign="top" align="left">2025</td>
<td valign="top" align="left">Supervised ML models (RF, SVM, DT, LR, KNN) using female sex hormone parameters and fibroid features for surgical decision support</td>
<td valign="top" align="left">618 uterine fibroid patients from three hospitals</td>
<td valign="top" align="left">Best RF model 91% accuracy<break/> 90% agreement with a blinded gynecologist</td>
<td valign="top" align="left">No model calibration analysis</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B16">16</xref>)</td>
<td valign="top" align="left">2025</td>
<td valign="top" align="left">nnU-Net&#x02013;based deep learning (2D slice-level, 3D full-resolution, and 3D cascaded)</td>
<td valign="top" align="left">602 multicenter CT scans (cervical and endometrial cancers)</td>
<td valign="top" align="left">Best DSC: 83.42% (PTV), 81.23% (CTV), &#x0007E;90% contours clinically acceptable</td>
<td valign="top" align="left">High computational cost for 3D models. CT-only data</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B17">17</xref>)</td>
<td valign="top" align="left">2024</td>
<td valign="top" align="left">3D nnU-Net deep learning for multi-organ MRI segmentation and 3D reconstruction for HIFU planning</td>
<td valign="top" align="left">550 retrospective T2-weighted MRI scans</td>
<td valign="top" align="left">Significantly outperformed HIFUNet, U-Net, R2U-Net, ConvUNeXt, and 2D nnU-Net<break/> DSC = 95.63% (fibroids), 92.55% (uterus)</td>
<td valign="top" align="left">Single-center retrospective dataset</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B18">18</xref>)</td>
<td valign="top" align="left">2024</td>
<td valign="top" align="left">DL-based 3D SR-DWI radiomics &#x0002B; ML (SVM, RF, LightGBM)</td>
<td valign="top" align="left">Multicenter MRI DWI datasets (360 patients, internal and external validation)</td>
<td valign="top" align="left">SR-DWI achieved AUC 0.876 (internal) and 0.800 (external), outperforming radiologists</td>
<td valign="top" align="left">Manual segmentation Limited clinical features included</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B19">19</xref>)</td>
<td valign="top" align="left">2024</td>
<td valign="top" align="left">Image pre-processing (histogram equalization &#x0002B; Gaussian filter) &#x0002B; transfer learning CNNs (AlexNet, DenseNet201, MobileNetV2, ResNet50, Xception, VGG19)</td>
<td valign="top" align="left">Herlev Pap-smear</td>
<td valign="top" align="left">Accuracy: 98.26%<break/> f1-measure: 0.9671<break/> specificity: 0.9896<break/> sensitivity: 0.9631<break/> precision: 0.9711<break/> MCC: 0.9552</td>
<td valign="top" align="left">Limited to a single public dataset, binary classification only</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B20">20</xref>)</td>
<td valign="top" align="left">2024</td>
<td valign="top" align="left">CT-based radiomics with ML (GLM, RF, SVM) and LASSO, Boruta, RFE</td>
<td valign="top" align="left">65 patients (30 leiomyosarcoma, 35 leiomyoma)</td>
<td valign="top" align="left">AUC from 0.78 to 0.97</td>
<td valign="top" align="left">Small sample size single-center study</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B21">21</xref>)</td>
<td valign="top" align="left">2024</td>
<td valign="top" align="left">Attention-based fine-tuned EfficientNetB0 for ultrasound image classification</td>
<td valign="top" align="left">1,990 Ultrasound images</td>
<td valign="top" align="left">Accuracy = 99%</td>
<td valign="top" align="left">Single dataset</td>
</tr></tbody>
</table>
</table-wrap>
<p>The development of hybrid deep learning systems has demonstrated potential to improve medical imaging techniques and cancer detection methods. Alswilem and Pacal (<xref ref-type="bibr" rid="B22">22</xref>) conducted a comprehensive comparative study that assessed both the computational efficiency and the diagnostic accuracy of deep learning models for automated breast cancer detection in ultrasound imaging. The study found that RexNet-200 achieved optimal performance with minimal computational resources (13.81 million parameters and 3.05 GFLOPs) while maintaining 95% accuracy, underscoring the balance between model complexity and clinical utility that also guided the design of our DenseNet121-Transformer. Demirta&#x0015F; Alpsalaz et al. (<xref ref-type="bibr" rid="B23">23</xref>) created a hybrid model that combines EfficientNet-B3 with a Vision Transformer to detect colon cancer through an attention fusion mechanism, reaching 96.2% accuracy and an MCC score of 0.961. Their work demonstrated that attention-based fusion effectively harmonizes local texture features extracted by CNNs with global contextual dependencies captured by transformers, directly supporting our use of multi-head self-attention for feature integration. &#x000C7;akmak (<xref ref-type="bibr" rid="B24">24</xref>) examined various machine learning techniques to improve the diagnosis of hematological disorders and achieved 98.38% accuracy with LightGBM, showing that feature selection and model optimization are crucial components of clinical decision-support systems. &#x000C7;akmak and Pacal (<xref ref-type="bibr" rid="B25">25</xref>) compared four CNN architectures (ResNet50, DenseNet169, InceptionV3, InceptionV4) for classifying breast ultrasound images and found that InceptionV3 achieved the best results, with 96.67% accuracy and 96.55% precision. Their findings demonstrate the effectiveness of densely connected architectures and multi-scale feature extraction, which informed our selection of DenseNet121 as the convolutional backbone. Alpsalaz et al. (<xref ref-type="bibr" rid="B26">26</xref>) created a MaxViT-based deep learning model to classify Alzheimer&#x00027;s disease from MRI scans and achieved 99.60% accuracy by combining transfer learning with multi-axis attention mechanisms. This research shows how vision transformers capture both fine and coarse details in medical images, which supports our choice of transformer-based self-attention in our uterine cancer classification system.</p>
<p>Recent studies show that deep learning techniques can successfully analyze medical images across different clinical settings. Convolutional neural network approaches have enabled accurate detection and classification of brain tumors from magnetic resonance imaging (<xref ref-type="bibr" rid="B27">27</xref>), and advanced object detection frameworks, including YOLO-based models, detect abdominal diseases effectively by combining strong feature extraction with careful pre-processing of complicated anatomical regions (<xref ref-type="bibr" rid="B28">28</xref>). Data pre-processing and class-imbalance handling have likewise been shown to be essential for improving the accuracy of machine learning models in medical diagnosis (<xref ref-type="bibr" rid="B29">29</xref>). Together, this body of work indicates that hybrid CNN-Transformer architectures with attention mechanisms are effective in medical imaging and provides the theoretical and empirical foundation for our proposed methodology.</p></sec>
<sec id="s3">
<label>3</label>
<title>Methods</title>
<sec>
<label>3.1</label>
<title>Study design</title>
<p>This study proposes a hybrid deep learning framework for the classification of uterine CT images from the King Abdullah University Hospital Uterine Cancer CT Dataset (KAUH-UCCTD) into three diagnostic classes: Benign, Malignant, and Normal. The methodology follows a systematic process that begins with dataset verification and controlled data partitioning, followed by standard image pre-processing and training-time data augmentation to enhance model robustness and generalization. As shown in <xref ref-type="fig" rid="F1">Figure 1</xref>, a pre-trained DenseNet121 is used as the feature-extraction backbone and is extended with a patch embedding stage that turns the convolutional feature maps into a sequence of tokens, which are then fed into stacked Transformer encoder layers where long-range contextual dependencies are captured through self-attention. The resulting global representation is aggregated and passed to an MLP classification head to produce the final predictions, and the whole model is trained end-to-end using a multi-class cross-entropy objective.</p>
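<p>To make the training pipeline concrete, the following minimal sketch outlines one possible end-to-end optimization step in PyTorch. The stage modules <monospace>feature_extractor</monospace>, <monospace>patch_transformer</monospace>, and <monospace>mlp_head</monospace> are hypothetical placeholders for the components detailed in the following subsections, not the exact implementation used in the experiments.</p>
<code language="python">import torch.nn.functional as nnf

def training_step(batch, feature_extractor, patch_transformer, mlp_head, optimizer):
    """One end-to-end optimization step with a multi-class cross-entropy objective."""
    images, labels = batch                            # pre-processed CT slices and class labels
    local_features = feature_extractor(images)        # DenseNet121 convolutional features
    global_repr = patch_transformer(local_features)   # patch tokens refined by self-attention
    logits = mlp_head(global_repr)                    # scores for normal / benign / malignant
    loss = nnf.cross_entropy(logits, labels)          # multi-class cross-entropy objective
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
</code>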
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>Overall architecture of the proposed DenseNet121-transformer hybrid model for uterine cancer classification.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1781499-g0001.tif">
<alt-text content-type="machine-generated">Flowchart diagram illustrates a medical image classification pipeline using CT data. Steps include data preparation, feature extraction via Enhanced DenseNet121, patch-transformer representation learning, and MLP classifier, producing outputs: normal, benign, or malignant classifications.</alt-text>
</graphic>
</fig></sec>
<sec>
<label>3.2</label>
<title>Data collection from a hospital (KAUH)</title>
<p>In this study, the KAUH-UCCTD dataset was used. The dataset was provided by King Abdullah University Hospital at Jordan University of Science and Technology and contains 2,870 images from 600 women between the ages of 22 and 80. Many of these patients underwent multiple imaging procedures, and numerous image slices were acquired for each procedure. The collection consists of images from patients who were retrospectively diagnosed by radiologists between early 2019 and late June 2024. The data collection process took 6 months, beginning in June and concluding in November 2024. The images were gathered, assessed, and classified by the hospital physicians before being filed. The collection includes CT slices in three views: sagittal, coronal, and axial. The images were acquired on a 64-channel Philips Brilliance CT scanner and exported in JPG format. <xref ref-type="table" rid="T2">Table 2</xref> shows the number and distribution of images in each dataset category, while <xref ref-type="fig" rid="F2">Figure 2</xref> shows an example from each category.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>The number and distribution of images in each KAUH-UCCTD category.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Case</bold></th>
<th valign="top" align="center"><bold>Number of images</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Normal</td>
<td valign="top" align="center">502</td>
</tr>
<tr>
<td valign="top" align="left">Benign</td>
<td valign="top" align="center">1,513</td>
</tr>
<tr>
<td valign="top" align="left">Malignant</td>
<td valign="top" align="center">855</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Total</bold></td>
<td valign="top" align="center"><bold>2,870</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The bold value represents the total number of images across all categories.</p>
</table-wrap-foot>
</table-wrap>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>An example from the image dataset (KAUH-UCCTD).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1781499-g0002.tif">
<alt-text content-type="machine-generated">Nine CT scan images of the pelvic region are arranged in three rows labeled Normal, Benign, and Malignant. Each row contains three scan views: axial, sagittal, and coronal, with red arrows and dashed circles highlighting the prostate area to compare typical, benign, and malignant conditions.</alt-text>
</graphic>
</fig>
<p>To avoid potential data leakage, the dataset was split on a patient-level basis rather than a slice-level basis. All computed tomography slices belonging to the same patient were assigned exclusively to one of the training, validation, or test sets. This ensures that no patient overlaps across the different subsets and provides a fair and unbiased evaluation of the model&#x00027;s generalization performance.</p>
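<p>A minimal sketch of such a patient-level partition is shown below, assuming each slice record carries a patient identifier; the 80/10/10 ratio matches the split described in Section 3.3, and the function and variable names are illustrative rather than taken from the actual implementation.</p>
<code language="python">from sklearn.model_selection import GroupShuffleSplit

def patient_level_split(image_paths, labels, patient_ids, seed=42):
    """Assign all slices of a given patient to exactly one of train / validation / test."""
    # Step 1: reserve 80% of patients (and all of their slices) for training.
    outer = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=seed)
    train_idx, rest_idx = next(outer.split(image_paths, labels, groups=patient_ids))
    # Step 2: split the remaining patients evenly into validation and test (10% / 10%).
    rest_groups = [patient_ids[i] for i in rest_idx]
    inner = GroupShuffleSplit(n_splits=1, train_size=0.5, random_state=seed)
    val_part, test_part = next(inner.split(rest_idx, groups=rest_groups))
    return train_idx, rest_idx[val_part], rest_idx[test_part]
</code>
</sec>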
<sec>
<label>3.3</label>
<title>Data pre-processing and augmentation</title>
<p>All CT images pass through a common pre-processing pipeline to minimize variability and standardize the input representations prior to training (<xref ref-type="bibr" rid="B30">30</xref>). The first step is to resize each image spatially and convert it into a three-channel format that matches the pre-trained feature-extraction backbone. Furthermore, to aid the optimization process and make the intensity distributions comparable across samples, pixel values are normalized for each channel independently, as given by the following normalization equation.</p>
<disp-formula id="EQ1"><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>x</mml:mi><mml:mo>-</mml:mo><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(1)</label></disp-formula>
<p>where <italic>x</italic> is the original input image tensor, &#x003BC; is the channel-wise mean vector, and <bold>&#x003C3;</bold> is the corresponding standard deviation vector. This transformation guarantees that the input features are centered and scaled to comparable ranges before being passed to the model. To address data scarcity and inter-class imbalance, standard data augmentation techniques were applied to the training subset. Let the original training set consist of pairs (<italic>x</italic><sub><italic>i</italic></sub>, <italic>y</italic><sub><italic>i</italic></sub>), where <italic>x</italic><sub><italic>i</italic></sub> is an image sample and <italic>y</italic><sub><italic>i</italic></sub> its corresponding class label. An augmentation operator <inline-formula><mml:math id="M2"><mml:mrow><mml:mi mathvariant="script">A</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>&#x000B7;</mml:mo></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> is used to create additional transformed samples:</p>
<disp-formula id="EQ2"><mml:math id="M3"><mml:mrow><mml:msub><mml:mi mathvariant='script'>D</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>u</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy='false'>&#x0007B;</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:mi mathvariant='script'>A</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>&#x0007D;</mml:mo></mml:mrow></mml:math><label>(2)</label></disp-formula>
<p>The augmentation process increases intra-class variability while preserving semantic class identity and is used to equalize the number of samples across classes, resulting in a balanced training distribution (<xref ref-type="bibr" rid="B31">31</xref>). Following pre-processing and augmentation, the dataset is partitioned into three mutually exclusive subsets: a training set <inline-formula><mml:math id="M4"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>80</mml:mn><mml:mi>%</mml:mi></mml:math></inline-formula>, a validation set <inline-formula><mml:math id="M5"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>v</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>10</mml:mn><mml:mi>%</mml:mi></mml:math></inline-formula>, and a test set <inline-formula><mml:math id="M6"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>10</mml:mn><mml:mi>%</mml:mi></mml:math></inline-formula>. The training subset is used for parameter optimization, the validation subset guides model selection and convergence control, and the test subset is reserved for final performance evaluation, ensuring an unbiased assessment of the proposed framework.</p>
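<p>A sketch of this pre-processing and training-time augmentation pipeline using torchvision transforms is shown below; the particular augmentation operations, the 224 &#x000D7; 224 input size, and the ImageNet normalization statistics are illustrative assumptions rather than the exact settings used in this study.</p>
<code language="python">from torchvision import transforms

# Channel-wise normalization statistics (ImageNet values assumed for illustration).
MEAN = [0.485, 0.456, 0.406]
STD = [0.229, 0.224, 0.225]

# Training pipeline: resize, 3-channel conversion, augmentation A(.), then normalization (Eq. 1).
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=3),   # replicate the CT slice to three channels
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=10),
    transforms.ToTensor(),
    transforms.Normalize(mean=MEAN, std=STD),      # x_norm = (x - mu) / sigma per channel
])

# Validation/test pipeline: identical pre-processing but without augmentation.
eval_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
    transforms.Normalize(mean=MEAN, std=STD),
])
</code>
</sec>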
<sec>
<label>3.4</label>
<title>Model architecture and design</title>
<sec>
<label>3.4.1</label>
<title>Enhanced DenseNet121 feature extraction</title>
<p>The DenseNet121 architecture was adopted as the feature extraction backbone, as its dense connectivity enhances information flow across layers and supports robust representation learning. These dense connections allow for efficient feature reuse and also help gradients propagate easily through the deep layers. Let <bold>I</bold> &#x02208; &#x0211D;<sup><italic>H</italic>&#x000D7;<italic>W</italic>&#x000D7;<italic>C</italic></sup> denote a pre-processed CT image that is fed into the network (<xref ref-type="bibr" rid="B32">32</xref>). The input image is processed through a series of densely connected convolutional blocks, where each block progressively transforms the input into higher-level feature representations. The resulting feature map is denoted as <inline-formula><mml:math id="M7"><mml:mstyle mathvariant="bold"><mml:mtext>F</mml:mtext></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>C</mml:mi></mml:mrow><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula>, and the overall transformation performed by the DenseNet121 backbone is represented by &#x003A6;(&#x000B7;). Unlike the standard DenseNet121 configuration, which applies global pooling followed by direct classification using high-level features, the proposed framework modifies this pipeline by further refining the extracted feature map <italic>F</italic> to better support subsequent global modeling. Specifically, a normalization operation is applied to stabilize the feature distribution (<xref ref-type="bibr" rid="B33">33</xref>), followed by a non-linear activation function to enhance representational capacity.</p>
<disp-formula id="EQ3"><mml:math id="M8"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>F</mml:mi><mml:mo>=</mml:mo><mml:mtext>&#x003A6;</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>I</mml:mtext></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(3)</label></disp-formula>
<disp-formula id="EQ4"><mml:math id="M9"><mml:mrow><mml:msup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>F</mml:mi></mml:mstyle><mml:mo>&#x00027;</mml:mo></mml:msup><mml:mo>=</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>B</mml:mi><mml:mi>N</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>F</mml:mi></mml:mstyle><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:math><label>(4)</label></disp-formula>
<p>where <italic>BN</italic>(&#x000B7;) denotes batch normalization and &#x003C3;(&#x000B7;) denotes the rectified linear activation function. This enhancement step promotes feature consistency and prepares the representation for downstream tokenization. The enhanced feature map <bold>F</bold>&#x02032; retains the rich spatial and semantic information while exhibiting reduced redundancy owing to dense feature reuse. The proposed architecture overcomes the locality limitation of pure CNN-based classifiers by taking the convolutional output immediately before any global pooling operation, thus allowing smooth integration with the subsequent patch embedding and Transformer encoder modules. As a result, DenseNet121 serves not only as a local feature extractor but also as a robust foundation for capturing both fine-grained anatomical details and high-level contextual patterns present in uterine CT images.</p>
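<p>A minimal sketch of this enhanced feature-extraction stage, assuming the torchvision DenseNet121 implementation (whose final feature maps have 1,024 channels), is given below; it mirrors Equations 3, 4 and is illustrative rather than the exact code used in the experiments.</p>
<code language="python">import torch.nn as nn
from torchvision import models

class EnhancedDenseNetFeatures(nn.Module):
    """DenseNet121 features taken before global pooling, refined by BN and ReLU (Eqs. 3, 4)."""

    def __init__(self):
        super().__init__()
        backbone = models.densenet121(weights="IMAGENET1K_V1")   # pre-trained backbone
        self.phi = backbone.features        # Phi(.): densely connected convolutional blocks
        self.bn = nn.BatchNorm2d(1024)      # BN(.) over the 1,024 output feature channels
        self.act = nn.ReLU(inplace=True)    # sigma(.): rectified linear activation

    def forward(self, image):
        f = self.phi(image)                 # F = Phi(I), shape B x 1024 x Hf x Wf
        return self.act(self.bn(f))         # F' = sigma(BN(F))
</code>
</sec>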
<sec>
<label>3.4.2</label>
<title>Patch-transformer representation learning</title>
<p>Following convolutional feature extraction, the enhanced feature map produced by the DenseNet121 backbone is transformed into a sequence-based representation suitable for attention-based modeling. Let <inline-formula><mml:math id="M10"><mml:mrow><mml:msup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>F</mml:mi></mml:mstyle><mml:mo>&#x00027;</mml:mo></mml:msup><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mi>&#x0211D;</mml:mi><mml:mrow><mml:msub><mml:mi>H</mml:mi><mml:mi>f</mml:mi></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>f</mml:mi></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mi>f</mml:mi></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> denote the refined convolutional feature map obtained from the previous stage. To bridge the gap between convolutional and Transformer-based processing (<xref ref-type="bibr" rid="B34">34</xref>), a patch embedding operation is applied to partition <bold>F</bold>&#x02032; into a set of non-overlapping patches. This operation can be expressed as shown in Equation (5) (<xref ref-type="bibr" rid="B35">35</xref>).</p>
<disp-formula id="EQ5"><mml:math id="M11"><mml:mrow><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>P</mml:mi></mml:mstyle><mml:mo>=</mml:mo><mml:mi>&#x003A8;</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>F</mml:mi></mml:mstyle><mml:mo>&#x00027;</mml:mo></mml:msup><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:math><label>(5)</label></disp-formula>
<p>Here, the symbol &#x003A8;(&#x000B7;) denotes a learnable patch embedding function implemented as a convolutional projection, while <inline-formula><mml:math id="M12"><mml:mstyle mathvariant="bold"><mml:mtext>P</mml:mtext></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mi>D</mml:mi></mml:mrow></mml:msup><mml:mtext>&#x000A0;</mml:mtext></mml:math></inline-formula>denotes the resulting patch token sequence (<xref ref-type="bibr" rid="B36">36</xref>). In this notation, <italic>N</italic><sub><italic>p</italic></sub> is the total number of patches extracted from the feature map and <italic>D</italic> is the token embedding dimension. This tokenization preserves localized spatial information while enabling sequential modeling. Moreover, because the Transformer architecture is permutation-invariant, explicit positional information must be injected to preserve the spatial order of the tokens. A set of learnable positional embeddings <inline-formula><mml:math id="M13"><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>E</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mi>D</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> is therefore added to the patch token sequence.</p>
<disp-formula id="EQ6"><mml:math id="M14"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>X</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mstyle mathvariant="bold"><mml:mtext>P</mml:mtext></mml:mstyle><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>E</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(6)</label></disp-formula>
<p>where <bold>X</bold><sub>0</sub> denotes the position-aware token representation. This step enables the model to distinguish patches originating from different spatial locations in the original CT image. The position-aware tokens are then fed into a stack of Transformer encoder layers to capture long-range dependencies and global contextual relationships. Each Transformer encoder layer consists of a multi-head self-attention (MHSA) mechanism followed by a feed-forward multilayer perceptron (MLP), both wrapped with residual connections and layer normalization (<xref ref-type="bibr" rid="B37">37</xref>). Given an input token sequence <bold>X</bold>, the self-attention operation is defined in Equation (7).</p>
<disp-formula id="EQ7"><mml:math id="M15"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>Q</mml:mtext></mml:mstyle><mml:mo>,</mml:mo><mml:mstyle mathvariant="bold"><mml:mtext>K</mml:mtext></mml:mstyle><mml:mo>,</mml:mo><mml:mstyle mathvariant="bold"><mml:mtext>V</mml:mtext></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>Q</mml:mtext></mml:mstyle><mml:msup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>K</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:msqrt><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msqrt></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>V</mml:mtext></mml:mstyle></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(7)</label></disp-formula>
<p>Here, <bold>Q</bold>, <bold>K</bold>, and <bold>V</bold> denote the query, key, and value matrices generated from <bold>X</bold>, respectively, and <italic>d</italic><sub><italic>k</italic></sub> is the dimensionality of each attention head. The outputs of the individual attention heads are concatenated and linearly projected to form the final MHSA output (<xref ref-type="bibr" rid="B38">38</xref>). To stabilize training and preserve gradient flow, residual connections are applied in every encoder block, yielding the following transformations.</p>
<disp-formula id="EQ8"><mml:math id="M16"><mml:mrow><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>Z</mml:mi></mml:mstyle><mml:mo>=</mml:mo><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>X</mml:mi></mml:mstyle><mml:mo>+</mml:mo><mml:mi>M</mml:mi><mml:mi>H</mml:mi><mml:mi>S</mml:mi><mml:mi>A</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>L</mml:mi><mml:mi>N</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>X</mml:mi></mml:mstyle><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:math><label>(8)</label></disp-formula>
<disp-formula id="EQ9"><mml:math id="M17"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mstyle mathvariant="bold"><mml:mtext>Y</mml:mtext></mml:mstyle><mml:mo>=</mml:mo><mml:mstyle mathvariant="bold"><mml:mtext>Z</mml:mtext></mml:mstyle><mml:mo>&#x0002B;</mml:mo><mml:mi>M</mml:mi><mml:mi>L</mml:mi><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>L</mml:mi><mml:mi>N</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>Z</mml:mtext></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(9)</label></disp-formula>
<p>where <italic>LN</italic>(&#x000B7;) denotes layer normalization and <bold>Y</bold> is the output of a single Transformer encoder layer. By stacking multiple such layers, the model exploits both local convolutional features and global contextual information across the entire image. This Patch-Transformer representation learning stage therefore overcomes the locality constraint of convolutional networks and relates distant anatomical regions, capturing the fine, spatially distributed patterns that characterize uterine CT images.</p>
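<p>The following PyTorch sketch illustrates one way Equations (5)&#x02013;(9) could be realized: a convolutional patch embedding, learnable positional embeddings, and a pre-norm Transformer encoder block built on multi-head self-attention. The module names and hyperparameters (patch size, embedding dimension, number of heads and layers) are illustrative assumptions rather than the exact configuration used in this study.</p>
<code language="Python">
# Illustrative PyTorch sketch of Equations (5)-(9); hyperparameters are assumptions.
import torch
import torch.nn as nn

class PatchEmbed(nn.Module):
    """Psi(.): projects the refined feature map F' into Np tokens of dimension D."""
    def __init__(self, in_channels=1024, embed_dim=256, patch_size=1):
        super().__init__()
        self.proj = nn.Conv2d(in_channels, embed_dim,
                              kernel_size=patch_size, stride=patch_size)

    def forward(self, f_prime):                      # (B, Cf, Hf, Wf)
        p = self.proj(f_prime)                       # (B, D, Hp, Wp)
        return p.flatten(2).transpose(1, 2)          # P: (B, Np, D)

class EncoderBlock(nn.Module):
    """One Transformer encoder layer: MHSA and MLP with pre-LayerNorm residuals."""
    def __init__(self, dim=256, heads=8, mlp_ratio=4.0, dropout=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(dim)
        self.attn = nn.MultiheadAttention(dim, heads, dropout=dropout, batch_first=True)
        self.ln2 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, int(dim * mlp_ratio)), nn.GELU(),
            nn.Dropout(dropout), nn.Linear(int(dim * mlp_ratio), dim),
        )

    def forward(self, x):
        h = self.ln1(x)
        z = x + self.attn(h, h, h, need_weights=False)[0]   # Equation (8)
        return z + self.mlp(self.ln2(z))                    # Equation (9)

class PatchTransformer(nn.Module):
    def __init__(self, num_patches=49, dim=256, depth=4, heads=8):
        super().__init__()
        self.embed = PatchEmbed(embed_dim=dim)
        self.pos = nn.Parameter(torch.zeros(1, num_patches, dim))   # E_pos
        self.blocks = nn.Sequential(*[EncoderBlock(dim, heads) for _ in range(depth)])

    def forward(self, f_prime):
        x0 = self.embed(f_prime) + self.pos          # Equation (6)
        return self.blocks(x0)                       # Y: (B, Np, D)
</code>
</sec>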
<sec>
<label>3.4.3</label>
<title>Classification and training strategy</title>
<p>Following Patch-Transformer representation learning, the output token sequence <inline-formula><mml:math id="M18"><mml:mstyle mathvariant="bold"><mml:mtext>Y</mml:mtext></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mi>D</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> encodes both local spatial features and global contextual relationships. A global aggregation operation is applied across the token dimension to obtain a fixed-length representation suitable for classification. If <bold>y</bold><sub><italic>i</italic></sub> denotes the <italic>i</italic>-th token in the sequence, the aggregated feature vector <bold>g</bold> &#x02208; &#x0211D;<sup><italic>D</italic></sup> is computed as follows.</p>
<disp-formula id="EQ10"><mml:math id="M19"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mstyle mathvariant="bold"><mml:mtext>g</mml:mtext></mml:mstyle><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover></mml:mstyle><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>y</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(10)</label></disp-formula>
<p>This operation summarizes the overall image representation while preserving the discriminative information learned by the Transformer encoder. The resulting vector <bold>g</bold> is then passed to an MLP classification head (<xref ref-type="bibr" rid="B39">39</xref>), which serves as the decision-making component of the proposed framework. The MLP head consists of a sequence of fully connected layers with non-linear activation functions and regularization mechanisms, and produces a vector of class logits.</p>
<disp-formula id="EQ11"><mml:math id="M20"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>y</mml:mtext></mml:mstyle></mml:mrow><mml:mo>^</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>g</mml:mtext></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(11)</label></disp-formula>
<p>where <italic>f</italic><sub><italic>cls</italic></sub>(&#x000B7;) represents the mapping executed by the MLP head and <inline-formula><mml:math id="M21"><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>y</mml:mtext></mml:mstyle></mml:mrow><mml:mo>^</mml:mo></mml:mover><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> denotes the unnormalized prediction scores for the <italic>C</italic> diagnostic classes. The network is trained end-to-end with a multi-class cross-entropy loss. The loss is calculated between the true label vector <italic>y</italic> and the predicted class probabilities <inline-formula><mml:math id="M22"><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>p</mml:mtext></mml:mstyle></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:math></inline-formula> obtained through a softmax transformation, and is given as:</p>
<disp-formula id="EQ12"><mml:math id="M23"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo class="qopname">log</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mo class="qopname">^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>p</mml:mtext></mml:mstyle></mml:mrow><mml:mo class="qopname">^</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>y</mml:mtext></mml:mstyle></mml:mrow><mml:mo class="qopname">^</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(12)</label></disp-formula>
<p>where <italic>y</italic><sub><italic>c</italic></sub> and <inline-formula><mml:math id="M24"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> denote the true and predicted probabilities for class <italic>c</italic>, respectively. This objective encourages the model to assign high confidence to the correct diagnostic category. A gradient-based adaptive optimizer is used for model optimization, and a validation-guided learning rate schedule dynamically controls the learning process: the schedule monitors the validation loss and lowers the learning rate when convergence stagnates, which stabilizes training and helps the model avoid poor local minima. As an additional regularization mechanism, dropout applied in the MLP head reduces overfitting and improves performance on unseen data (<xref ref-type="bibr" rid="B40">40</xref>). Overall, this classification and training strategy enables the proposed architecture to convert rich Patch-Transformer representations into reliable diagnostic predictions while remaining robust and reproducible across training runs.</p>
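<p>The aggregation, classification head, and training objective in Equations (10)&#x02013;(12), together with the adaptive optimizer and validation-guided scheduling described above, could be sketched in PyTorch as follows. The layer sizes, dropout rate, and scheduler settings are illustrative assumptions rather than the exact values used in the experiments.</p>
<code language="Python">
# Sketch of Equations (10)-(12) and the training strategy: token averaging,
# an MLP head with dropout, cross-entropy loss, Adam, and a validation-driven
# learning rate schedule. Names and hyperparameters are illustrative only.
import torch
import torch.nn as nn

class ClassificationHead(nn.Module):
    def __init__(self, dim=256, num_classes=3, dropout=0.3):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(dim, 128), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(128, num_classes),
        )

    def forward(self, tokens):                       # tokens Y: (B, Np, D)
        g = tokens.mean(dim=1)                       # Equation (10): token-wise average
        return self.mlp(g)                           # Equation (11): class logits

# Equation (12): cross-entropy over softmax probabilities is obtained directly
# from nn.CrossEntropyLoss, which expects the raw logits as input.
criterion = nn.CrossEntropyLoss()
head = ClassificationHead()                          # in practice, optimize the full model
optimizer = torch.optim.Adam(head.parameters(), lr=1e-4)
# Reduce the learning rate when the validation loss stops improving.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min",
                                                       factor=0.5, patience=3)

# Per epoch (sketch): loss = criterion(head(tokens), labels); loss.backward();
# optimizer.step(); then scheduler.step(val_loss) after validation.
</code>
</sec></sec>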
<sec>
<label>3.5</label>
<title>Evaluation protocol</title>
<p>To ensure that the proposed model is assessed in a clinically meaningful manner, evaluation is not limited to a single aggregate score. Instead, a set of complementary metrics is reported to reflect the different diagnostic risks and decision priorities encountered in uterine cancer screening and triage (<xref ref-type="bibr" rid="B41">41</xref>). Accuracy provides an overall estimate of correctness; however, in clinical practice it may hide critical failure modes when the number of normal cases differs from the number of abnormal cases. Precision therefore quantifies how trustworthy a positive prediction is, i.e., when the model flags a case as malignant or abnormal, how often the alert is correct; this is directly relevant to reducing unnecessary follow-up imaging, biopsies, and patient anxiety associated with false-positive findings. Sensitivity (recall) measures the proportion of true disease cases that are detected and is often the most important criterion in oncology, because a missed tumor can delay treatment and worsen outcomes. Sensitivity is complemented by specificity, which indicates how well the system correctly identifies non-disease cases; high specificity supports clinical workflow efficiency by avoiding over-referral and lightening the load on radiology and oncology services. The F1-score provides a single balanced indicator that combines precision and sensitivity, and is especially useful when clinicians require one summary measure that still reflects both false-alarm and missed-case behavior. Finally, because clinical decision thresholds may differ across hospitals and screening policies, a threshold-independent assessment is carried out using ROC analysis and the AUC (<xref ref-type="bibr" rid="B42">42</xref>). The ROC curve visualizes the trade-off between sensitivity and the false-positive rate as the decision threshold is varied, while the AUC captures the model&#x00027;s global discriminative power; together they indicate whether the model can separate malignant from non-malignant patterns consistently across operating points and risk tolerances. In the multi-class setting (benign/malignant/normal), these metrics are computed per class as well as in aggregate, allowing clinicians to examine class-wise reliability (e.g., malignant detection) rather than relying on an overall score alone.</p>
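<p>For illustration, the per-class metrics defined in Equations (13)&#x02013;(17) below can be computed one-vs-rest from a multi-class confusion matrix, as in the following Python sketch; the example counts in the closing comment are hypothetical.</p>
<code language="Python">
# Illustrative sketch of Equations (13)-(17), computed one-vs-rest from a
# 3x3 confusion matrix (rows: true classes, columns: predicted classes).
import numpy as np

def per_class_metrics(cm, class_index):
    cm = np.asarray(cm, dtype=float)
    tp = cm[class_index, class_index]
    fn = cm[class_index].sum() - tp
    fp = cm[:, class_index].sum() - tp
    tn = cm.sum() - tp - fn - fp
    accuracy = (tp + tn) / cm.sum()
    precision = tp / (tp + fp)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    f1 = 2 * precision * sensitivity / (precision + sensitivity)
    return dict(accuracy=accuracy, precision=precision,
                sensitivity=sensitivity, specificity=specificity, f1=f1)

# Example with hypothetical counts for classes (Benign, Malignant, Normal):
# cm = [[71, 5, 4], [6, 46, 5], [3, 7, 63]]
# per_class_metrics(cm, class_index=1)   # metrics for the Malignant class
</code>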
<disp-formula id="EQ13"><mml:math id="M25"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>A</mml:mi><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>T</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>T</mml:mi><mml:mi>N</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(13)</label></disp-formula>
<disp-formula id="EQ14"><mml:math id="M26"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(14)</label></disp-formula>
<disp-formula id="EQ15"><mml:math id="M27"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>S</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(15)</label></disp-formula>
<disp-formula id="EQ16"><mml:math id="M28"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>S</mml:mi><mml:mi>p</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:mi>i</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>N</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(16)</label></disp-formula>
<disp-formula id="EQ17"><mml:math id="M29"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>F</mml:mi><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>S</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x000B7;</mml:mo><mml:mfrac><mml:mrow><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>&#x000B7;</mml:mo><mml:mi>S</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>S</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>y</mml:mi></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(17)</label></disp-formula></sec></sec>
<sec sec-type="results" id="s4">
<label>4</label>
<title>Results</title>
<p>This section reports the findings of the proposed model, evaluated on the KAUH-UCCTD uterine tumor dataset collected from King Abdullah University Hospital in Jordan. Each model was trained using the same set of parameters: 50 epochs, a learning rate of 0.0001, the Adam optimizer, a batch size of 16, an input image size of 224 &#x000D7; 224, and a categorical cross-entropy loss function. The dataset was divided into three subsets: 80% for training, 10% for validation, and 10% for testing. All models were trained locally in a Jupyter Notebook environment on an RTX 3050 GPU.</p>
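<p>For clarity, this shared training configuration could be expressed in PyTorch roughly as follows; the random tensors and the simple stand-in model are placeholders rather than the authors&#x00027; released pipeline.</p>
<code language="Python">
# Sketch of the shared training configuration: 50 epochs, Adam (lr 1e-4),
# batch size 16, 224x224 inputs, cross-entropy loss, and an 80/10/10 split.
# Random tensors stand in for the KAUH-UCCTD images (placeholders only).
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset, random_split

EPOCHS, BATCH_SIZE, LR, IMG_SIZE, NUM_CLASSES = 50, 16, 1e-4, 224, 3

images = torch.randn(100, 3, IMG_SIZE, IMG_SIZE)       # placeholder data
labels = torch.randint(0, NUM_CLASSES, (100,))
dataset = TensorDataset(images, labels)

n_train, n_val = int(0.8 * len(dataset)), int(0.1 * len(dataset))
n_test = len(dataset) - n_train - n_val
train_set, val_set, test_set = random_split(dataset, [n_train, n_val, n_test])
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)

# Stand-in model; replace with the DenseNet121-Transformer or any baseline.
model = nn.Sequential(nn.Flatten(), nn.Linear(3 * IMG_SIZE * IMG_SIZE, NUM_CLASSES))
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

for epoch in range(1):                                  # EPOCHS in the actual experiments
    for x, y in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()
</code>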
<p>All model performance metrics are reported, including accuracy, precision, sensitivity, specificity, F1-score, and AUC. On the KAUH-UCCTD dataset, the proposed model is compared against five existing models: VGG16, VGG19, MobileNetV2, DenseNet121, and ResNet50. <xref ref-type="table" rid="T3">Table 3</xref> shows that the proposed model achieved better results than all baseline models, reaching 87.44% accuracy, 87.48% precision, 87.13% sensitivity, 95.20% specificity, an F1-score of 87.17%, and an AUC of 99.41%, confirming its effectiveness for uterine cancer detection. Among the baseline models, DenseNet121 ranked second with 84.80% accuracy and 92.40% specificity, reflecting its ability to extract and classify complex CT image features. VGG19 and ResNet50 achieved the lowest accuracies, at 80.17% and 75.33%, respectively. <xref ref-type="fig" rid="F3">Figure 3</xref> visualizes these results.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Performance of the evaluated models for uterine CT image classification.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="center"><bold>Accuracy</bold></th>
<th valign="top" align="center"><bold>Precision</bold></th>
<th valign="top" align="center"><bold>Sensitivity</bold></th>
<th valign="top" align="center"><bold>Specificity</bold></th>
<th valign="top" align="center"><bold>F1 Score</bold></th>
<th valign="top" align="center"><bold>AUC</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">VGG16</td>
<td valign="top" align="center">83.03%</td>
<td valign="top" align="center">82.84%</td>
<td valign="top" align="center">83.04%</td>
<td valign="top" align="center">91.52%</td>
<td valign="top" align="center">82.74%</td>
<td valign="top" align="center">95.42%</td>
</tr>
<tr>
<td valign="top" align="left">VGG19</td>
<td valign="top" align="center">80.17%</td>
<td valign="top" align="center">84.37%</td>
<td valign="top" align="center">80.20%</td>
<td valign="top" align="center">90.09%</td>
<td valign="top" align="center">80.50%</td>
<td valign="top" align="center">95.21%</td>
</tr>
<tr>
<td valign="top" align="left">MobileNetV2</td>
<td valign="top" align="center">83.70%</td>
<td valign="top" align="center">83.53%</td>
<td valign="top" align="center">83.76%</td>
<td valign="top" align="center">91.85%</td>
<td valign="top" align="center">83.55%</td>
<td valign="top" align="center">96.02%</td>
</tr>
<tr>
<td valign="top" align="left">DenseNet121</td>
<td valign="top" align="center">84.80%</td>
<td valign="top" align="center">84.54%</td>
<td valign="top" align="center">84.81%</td>
<td valign="top" align="center">92.40%</td>
<td valign="top" align="center">84.59%</td>
<td valign="top" align="center">96.62%</td>
</tr>
<tr>
<td valign="top" align="left">ResNet50</td>
<td valign="top" align="center">75.33%</td>
<td valign="top" align="center">78.25%</td>
<td valign="top" align="center">75.37%</td>
<td valign="top" align="center">87.67%</td>
<td valign="top" align="center">74.79%</td>
<td valign="top" align="center">90.87%</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Proposed model</bold></td>
<td valign="top" align="center"><bold>87.44%</bold></td>
<td valign="top" align="center"><bold>87.48%</bold></td>
<td valign="top" align="center"><bold>87.13%</bold></td>
<td valign="top" align="center"><bold>95.20%</bold></td>
<td valign="top" align="center"><bold>87.17%</bold></td>
<td valign="top" align="center"><bold>99.41%</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The bold values in the last row (Proposed Model) indicate the best performance results among all compared models.</p>
</table-wrap-foot>
</table-wrap>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>Performance comparison of different models.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1781499-g0003.tif">
<alt-text content-type="machine-generated">Bar chart comparing six models&#x02014;VGG16, VGG19, MobileNetV2, DenseNet121, ResNet50, and Proposed Model&#x02014;across six metrics: accuracy, precision, sensitivity, specificity, F1 score, and AUC. Proposed Model shows the highest values consistently across all metrics.</alt-text>
</graphic>
</fig>
<p>The high AUC indicates that the proposed model ranks the three uterine tissue classes well across all classification thresholds. Accuracy, in contrast, is computed at a single decision threshold and therefore does not fully capture this ranking ability, particularly in a multi-class setting with unbalanced classes. At the selected threshold, slight visual overlap between benign and malignant cases leads to some misclassifications and lowers the accuracy, even though the model discriminates strongly between the two groups overall. This explains why the area under the receiver operating characteristic curve is very high while the accuracy remains comparatively lower.</p>
<p>The classification accuracy and loss curves obtained during training and validation over 50 epochs highlight the differences between the proposed model and the standard DenseNet121, as shown in <xref ref-type="fig" rid="F4">Figure 4</xref>. The proposed model (top row) converges steadily, reaching approximately 90% accuracy while maintaining a small gap between training and validation accuracy, indicating good generalization with overfitting kept under control. Its training loss decreases continuously, while the validation loss remains low and stable with only minor fluctuations attributable to the natural variability of medical CT data. The bottom row shows the corresponding curves for DenseNet121: training accuracy is satisfactory, but a larger gap separates training and validation performance, and the validation loss stays high and decreases slowly, suggesting that the model struggles to capture the global relationships present in the images. These observations indicate that combining DenseNet121 with the Transformer self-attention mechanism improves training stability and training&#x02013;validation convergence, and thereby generalization, which is consistent with the better quantitative results the proposed model achieved on the KAUH-UCCTD dataset.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>Comparison of training and validation performance between the proposed model and DenseNet121.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1781499-g0004.tif">
<alt-text content-type="machine-generated">Four line charts compare training and validation accuracy and loss over fifty epochs for a proposed model and DenseNet121. Both models show increasing accuracy and decreasing loss, with training curves outperforming validation curves in each chart.</alt-text>
</graphic>
</fig>
<p>The <xref ref-type="fig" rid="F5">Figure 5</xref> graphs below show the confusion matrices for all comparative models in classifying uterine CT images in KAUH-UCCTD in terms of VGG16, VGG19, MobileNetV2, DenseNet121, ResNet50, and our design model. The problem of identifying Benign vs. Malignant can be seen in traditional models such as VGG16 and VGG19; they are highly confused in both models, as reflected in high off-diameter points. While improvements are made in DenseNet121 and MobileNetV2 designs, but are mistaken in some cases regarding malignancies. The model presented does the best job in having the largest number of correct predictions on the main diameter, along with a considerable decrease in errors, especially in the Malignant class, exhibiting a high level of sensitivity in malign predictions and a decrease in false positives. Moreover, the model does a near-perfect job in predicting the Normal class with a negligible level of interference in the result, depicting a high level of discrimination between healthy and infected regions. The result-making matrices together ensure that the combination of the DenseNet121 model and the Self-Attention mechanism (Transformer model), in fact, increases the efficiency level in distinguishing the three classes effectively and makes the model more reliable in clinical practices.</p>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>Confusion matrices of all compared models on the KAUH-UCCTD dataset.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1781499-g0005.tif">
<alt-text content-type="machine-generated">Grid containing six heatmap-style confusion matrices for model comparison, labeled Confusion Matrix for VGG16, VGG19, MobileNetV2, DenseNet121, ResNet50, and Proposed Model. Axes indicate true and predicted classes: Benign, Malignant, and Normal. Color intensity represents prediction count, and each cell includes numeric values for predictions per class. Top center title reads &#x0201C;Confusion Matrices of All Models.&#x0201D; Each matrix evaluates model classification performance.</alt-text>
</graphic>
</fig>
<p>False-negative predictions in the malignant class represent a particularly critical limitation in cancer diagnosis, as they may lead to delayed clinical intervention and adversely affect patient outcomes. Although the proposed model demonstrates high overall sensitivity and specificity, a small number of malignant cases were misclassified, primarily due to morphological similarities between benign and malignant uterine tissues. In a clinical decision-support context, such a system should be used as an assistive tool rather than a standalone diagnostic solution, with final decisions made by experienced radiologists. Moreover, future work will focus on reducing false-negative rates by incorporating cost-sensitive learning strategies, threshold optimization, or ensemble-based approaches that prioritize malignant case detection.</p></sec>
<sec sec-type="discussion" id="s5">
<label>5</label>
<title>Discussion</title>
<sec>
<label>5.1</label>
<title>Generalization across different data sets</title>
<p>To assess the generalizability of the proposed model beyond the original dataset, its performance was evaluated on an independent dataset, the KAUH-UCM dataset (<xref ref-type="bibr" rid="B43">43</xref>). This dataset consists of 1,814 uterine MRI images, each belonging to one of three classes (normal, benign, and malignant). The proposed model generalized satisfactorily, achieving an overall accuracy of 85.71%, a macro-averaged precision of 85.94%, a macro-averaged sensitivity of 85.25%, and a macro-averaged F1-score of 85.53%. Notably, the model also achieved a macro-averaged specificity of 92.70%, indicating that it rarely labels negative cases as positive; limiting false positives in this way is highly desirable for a clinical decision-support model and supports its clinical relevance and reliability. Performance on the independent dataset is, however, slightly lower than on the original dataset (88.10%).</p>
<p>The confusion matrix in <xref ref-type="fig" rid="F6">Figure 6</xref> summarizes the correctness of the model&#x00027;s classifications for all three classes. The model correctly classified 71 of 80 benign examples (88.75%), with nine misclassifications. In the malignant class, 46 of 57 examples (80.70%) were correctly classified, indicating the model&#x00027;s capability to identify pathological abnormalities. In the normal class, 63 of 73 examples (86.30%) were correctly classified. The relatively small numbers of misclassifications between the benign and malignant classes (five and six examples, respectively) suggest the existence of morphological similarities between these two classes.</p>
<fig position="float" id="F6">
<label>Figure 6</label>
<caption><p>Confusion matrix of the proposed model on the KAUH-UCM Dataset.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-13-1781499-g0006.tif">
<alt-text content-type="machine-generated">Confusion matrix for a proposed model showing classification results for three classes: Benign, Malignant, and Normal. Diagonal cells display correct predictions with 71 Benign, 46 Malignant, and 63 Normal. Off-diagonal cells indicate misclassifications. Color intensity represents the number of samples, with a scale bar ranging from 0 to 70.</alt-text>
</graphic>
</fig></sec>
<sec>
<label>5.2</label>
<title>Performance analysis and computational considerations</title>
<p>The current study demonstrates the model&#x00027;s overall classification capability but does not report results for specific subgroups such as different tumor stages, tumor sizes, or anatomical locations. Subgroup testing is needed to determine how well the model performs across various patient groups in clinical settings. The study also did not evaluate computational efficiency or inference time, both of which are essential requirements for real-time clinical use. Because the proposed model couples a DenseNet121 backbone with a lightweight Transformer encoder, reasonably fast inference is expected, but runtime performance still requires detailed analysis and optimization. Future research will therefore evaluate performance across tumor subtypes and anatomical variations and work to improve inference speed for practical clinical deployment.</p></sec>
<sec>
<label>5.3</label>
<title>Practical and research implications</title>
<p>The proposed deep learning system could be integrated into a hospital&#x00027;s medical information infrastructure as a clinical decision-support tool that assists radiologists in the early and accurate diagnosis of uterine cancer from CT images of the uterus. Integration would begin with the acquisition of CT images by the hospital&#x00027;s imaging equipment and their storage in the Picture Archiving and Communication System (PACS). These images would then ideally flow automatically to the proposed system, where they would be resized and normalized to match the training configuration. The DenseNet121&#x02013;Transformer model would process the images and produce class probabilities for the three categories: normal, benign, and malignant. The output would then flow back to the hospital information system (HIS) and the radiology information system (RIS), where radiologists could access it through an interactive user interface without interacting directly with the underlying machine learning pipeline, since the system acts purely as a decision-support tool for the hospital&#x00027;s radiologists and physicians.</p></sec>
<sec>
<label>5.4</label>
<title>Limitations of the study</title>
<p>Although the proposed (DenseNet121&#x02013;Transformer) architecture yields promising results, several shortcomings remain to be addressed in future work. While the dataset provides multiple views (axial, sagittal, and coronal), the model analyzes each view independently and does not exploit inter-slice or 3D volumetric information. The dataset also contains a degree of class imbalance, which can affect training even when data augmentation strategies are applied. Although the attention mechanism of the Transformer provides some feature-level explainability, this study does not employ additional explainability tools, which are essential for increasing clinical trust. Finally, although the proposed model shows consistent improvements over the comparative methods across all evaluation metrics, statistical significance analysis was not performed, which is a specific limitation of the current work. Future research will apply formal statistical testing over multiple experimental repetitions to establish the robustness and reliability of the proposed method.</p>
<p>The study was conducted at a single tertiary healthcare facility that provides high-quality imaging services and treats patients from diverse backgrounds. While this provides a dependable basis for model development and testing, the data remain restricted to one site. Future research will therefore use datasets from multiple medical facilities and different equipment manufacturers to improve the model&#x00027;s performance across varied clinical environments.</p></sec></sec>
<sec id="s6">
<label>6</label>
<title>Conclusion and future works</title>
<p>This study presented an efficient hybrid deep learning architecture that combines DenseNet121 with a Transformer-based self-attention mechanism for classifying uterine CT images from the KAUH-UCCTD dataset into normal, benign, and malignant classes. The proposed hybrid model couples the strength of convolutional neural networks in capturing local textural features with the ability of the Transformer encoder to model global context across the image. The experimental results showed that the proposed method outperforms established architectures such as VGG16, VGG19, MobileNetV2, DenseNet121, and ResNet50 in terms of accuracy, sensitivity, specificity, F1-score, and AUC.</p>
<p>Future work will extend the study by validating the proposed framework on multi-institutional, multi-center datasets, which would improve its generalizability and clinical reliability. Incorporating 3D volumetric information and inter-slice relationships from CT examinations could further improve diagnostic performance. Integrating explainable AI (XAI) techniques, such as attention maps or saliency visualization, would make the model more interpretable and increase clinician trust. Additional directions include real-time deployment in hospital environments, optimization for computational efficiency, and integration with clinical workflows to support practical decision-making in routine medical practice.</p></sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="ethics-statement" id="s8">
<title>Ethics statement</title>
<p>Written informed consent was not obtained from the individual(s) for the publication of any potentially identifiable images or data included in this article because this study was conducted according to the guidelines and with the approval of the Institutional Review Board (IRB No. 21/171/2024) at King Abdullah University Hospital, Jordan University of Science and Technology, Jordan. Institutional Review Board approval has been granted. Written informed consent for participation was not required for this study in accordance with the national legislation and the institutional requirements.</p>
</sec>
<sec sec-type="author-contributions" id="s9">
<title>Author contributions</title>
<p>EA: Data curation, Visualization, Conceptualization, Writing &#x02013; original draft. AS: Validation, Data curation, Writing &#x02013; original draft, Resources. SA: Writing &#x02013; original draft, Methodology, Software. RMal: Software, Investigation, Methodology, Writing &#x02013; original draft. RMad: Validation, Data curation, Resources, Writing &#x02013; original draft. RA: Visualization, Writing &#x02013; review &#x00026; editing, Investigation, Data curation. BA-S: Project administration, Visualization, Writing &#x02013; review &#x00026; editing, Resources, Software. AA-M: Project administration, Writing &#x02013; review &#x00026; editing, Conceptualization, Investigation. MA: Software, Writing &#x02013; review &#x00026; editing, Supervision, Methodology. AA-a: Validation, Project administration, Visualization, Conceptualization, Writing &#x02013; review &#x00026; editing, Funding acquisition, Investigation.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s11">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<mixed-citation publication-type="web"><collab>NCI Division of Cancer Control and Population Sciences</collab>. <source>Cancer Stat Facts: Uterine Cancer</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://seer.cancer.gov/statfacts/html/corp.html">https://seer.cancer.gov/statfacts/html/corp.html</ext-link> (Accessed December 25, 2024).</mixed-citation>
</ref>
<ref id="B2">
<label>2.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Felix</surname> <given-names>AS</given-names></name> <name><surname>Brinton</surname> <given-names>LA</given-names></name></person-group>. <article-title>Cancer progress and priorities: uterine cancer</article-title>. <source>Cancer Epidemiol Biomarkers Prev</source>. (<year>2018</year>) <volume>27</volume>:<fpage>985</fpage>&#x02013;<lpage>94</lpage>. doi: <pub-id pub-id-type="doi">10.1158/1055-9965.EPI-18-0264</pub-id><pub-id pub-id-type="pmid">30181320</pub-id></mixed-citation>
</ref>
<ref id="B3">
<label>3.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Stewart</surname> <given-names>EA</given-names></name> <name><surname>Cookson</surname> <given-names>CL</given-names></name> <name><surname>Gandolfo</surname> <given-names>RA</given-names></name> <name><surname>Schulze-Rath</surname> <given-names>R</given-names></name></person-group>. <article-title>Epidemiology of uterine fibroids: a systematic review</article-title>. <source>BJOG.</source> (<year>2017</year>) <volume>124</volume>:<fpage>1501</fpage>&#x02013;<lpage>12</lpage>. doi: <pub-id pub-id-type="doi">10.1111/1471-0528.14640</pub-id><pub-id pub-id-type="pmid">28296146</pub-id></mixed-citation>
</ref>
<ref id="B4">
<label>4.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ledford</surname> <given-names>LR</given-names></name> <name><surname>Lockwood</surname> <given-names>S</given-names></name></person-group>. <article-title>Scope and epidemiology of gynecologic cancers: an overview</article-title>. in <source>Seminars in Oncology Nursing</source>. Amsterdam: Elsevier (<year>2019</year>). p. <fpage>147</fpage>&#x02013;<lpage>50</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.soncn.2019.03.002</pub-id><pub-id pub-id-type="pmid">30902519</pub-id></mixed-citation>
</ref>
<ref id="B5">
<label>5.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Venkatesh</surname> <given-names>A</given-names></name> <name><surname>Isaacs</surname> <given-names>C</given-names></name></person-group>. <article-title>Trends in Uterine cancer mortality in the United States: a 50-year population-based analysis</article-title>. <source>Obstet Gynecol.</source> (<year>2024</year>) <volume>143</volume>:<fpage>e130</fpage>&#x02013;<lpage>1</lpage>. doi: <pub-id pub-id-type="doi">10.1097/AOG.0000000000005543</pub-id><pub-id pub-id-type="pmid">37678887</pub-id></mixed-citation>
</ref>
<ref id="B6">
<label>6.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Somasegar</surname> <given-names>S</given-names></name> <name><surname>Bashi</surname> <given-names>A</given-names></name> <name><surname>Lang</surname> <given-names>SM</given-names></name> <name><surname>Liao</surname> <given-names>CI</given-names></name> <name><surname>Johnson</surname> <given-names>C</given-names></name> <name><surname>Darcy</surname> <given-names>KM</given-names></name> <etal/></person-group>. <article-title>Trends in uterine cancer mortality in the United States: a 50-year population-based analysis</article-title>. <source>Obstet Gynecol</source>. (<year>2023</year>) <volume>142</volume>:<fpage>978</fpage>&#x02013;<lpage>86</lpage>. doi: <pub-id pub-id-type="doi">10.1097/AOG.0000000000005321</pub-id><pub-id pub-id-type="pmid">37678887</pub-id></mixed-citation>
</ref>
<ref id="B7">
<label>7.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Henley</surname> <given-names>SJ</given-names></name></person-group>. <article-title>Uterine cancer incidence and mortality&#x02014;United States, 1999&#x02013;2016</article-title>. <source>MMWR Morb Mortal Wkly Rep.</source> (<year>2018</year>) <volume>67</volume>:<fpage>1333</fpage>&#x02013;<lpage>8</lpage>. doi: <pub-id pub-id-type="doi">10.15585/mmwr.mm6748a1</pub-id></mixed-citation>
</ref>
<ref id="B8">
<label>8.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>H</given-names></name> <name><surname>Luo</surname> <given-names>S</given-names></name> <name><surname>Ji</surname> <given-names>J</given-names></name> <name><surname>Wang</surname> <given-names>Z</given-names></name> <name><surname>Zhi</surname> <given-names>W</given-names></name> <name><surname>Mo</surname> <given-names>N</given-names></name> <etal/></person-group>. <article-title>A deep-learning-based artificial intelligence system for the pathology diagnosis of uterine smooth muscle tumor</article-title>. <source>Life.</source> (<year>2022</year>) <volume>13</volume>:<fpage>3</fpage>. doi: <pub-id pub-id-type="doi">10.3390/life13010003</pub-id><pub-id pub-id-type="pmid">36675952</pub-id></mixed-citation>
</ref>
<ref id="B9">
<label>9.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jan</surname> <given-names>YT</given-names></name> <name><surname>Tsai</surname> <given-names>PS</given-names></name> <name><surname>Huang</surname> <given-names>WH</given-names></name> <name><surname>Chou</surname> <given-names>LY</given-names></name> <name><surname>Huang</surname> <given-names>SC</given-names></name> <name><surname>Wang</surname> <given-names>JZ</given-names></name> <etal/></person-group>. <article-title>Machine learning combined with radiomics and deep learning features extracted from CT images: a novel AI model to distinguish benign from malignant ovarian tumors</article-title>. <source>Insights Imaging</source>. (<year>2023</year>) <volume>14</volume>:<fpage>68</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s13244-023-01412-x</pub-id><pub-id pub-id-type="pmid">37093321</pub-id></mixed-citation>
</ref>
<ref id="B10">
<label>10.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mao</surname> <given-names>W</given-names></name> <name><surname>Chen</surname> <given-names>C</given-names></name> <name><surname>Gao</surname> <given-names>H</given-names></name> <name><surname>Xiong</surname> <given-names>L</given-names></name> <name><surname>Lin</surname> <given-names>Y</given-names></name></person-group>. <article-title>A deep learning-based automatic staging method for early endometrial cancer on MRI images</article-title>. <source>Front. Physiol.</source> (<year>2022</year>) <volume>13</volume>:<fpage>974245</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fphys.2022.974245</pub-id><pub-id pub-id-type="pmid">36111158</pub-id></mixed-citation>
</ref>
<ref id="B11">
<label>11.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Buddenkotte</surname> <given-names>T</given-names></name> <name><surname>Rundo</surname> <given-names>L</given-names></name> <name><surname>Woitek</surname> <given-names>R</given-names></name> <name><surname>Sanchez</surname> <given-names>LE</given-names></name> <name><surname>Beer</surname> <given-names>L</given-names></name> <name><surname>Crispin-Ortuzar</surname> <given-names>M</given-names></name> <etal/></person-group>. <article-title>Deep learning-based segmentation of multisite disease in ovarian cancer</article-title>. <source>Eur Radiol Exp.</source> (<year>2023</year>) <volume>7</volume>:<fpage>77</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s41747-023-00388-z</pub-id><pub-id pub-id-type="pmid">38057616</pub-id></mixed-citation>
</ref>
<ref id="B12">
<label>12.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>Z</given-names></name> <name><surname>Xu</surname> <given-names>S</given-names></name> <name><surname>Li</surname> <given-names>K</given-names></name> <name><surname>Zhao</surname> <given-names>W</given-names></name> <name><surname>Xu</surname> <given-names>T</given-names></name> <name><surname>Xia</surname> <given-names>Y</given-names></name> <etal/></person-group>. <article-title>Abnormal uterine classification based on an improved YOLOv5 framework from ultrasound images</article-title>. in <source>Sixteenth International Conference on Graphics and Image Processing (ICGIP 2024)</source> (<year>2025</year>) p. <fpage>256</fpage>&#x02013;<lpage>65</lpage>. doi: <pub-id pub-id-type="doi">10.1117/12.3060403</pub-id></mixed-citation>
</ref>
<ref id="B13">
<label>13.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sepehr</surname> <given-names>J</given-names></name> <name><surname>Caprio</surname> <given-names>A</given-names></name> <name><surname>Sam</surname> <given-names>L</given-names></name> <name><surname>Lee</surname> <given-names>BC</given-names></name> <name><surname>Sabuncu</surname> <given-names>MR</given-names></name> <name><surname>Lamparello</surname> <given-names>NA</given-names></name></person-group>. <article-title>et al. Predicting clinical outcomes and symptom relief in uterine fibroid embolization using machine learning on MRI features</article-title>. <source>AI.</source> 6:200. doi: <pub-id pub-id-type="doi">10.3390/ai6090200</pub-id></mixed-citation>
</ref>
<ref id="B14">
<label>14.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tinelli</surname> <given-names>A</given-names></name> <name><surname>Morciano</surname> <given-names>A</given-names></name> <name><surname>Sparic</surname> <given-names>R</given-names></name> <name><surname>Hatirnaz</surname> <given-names>S</given-names></name> <name><surname>Malgieri</surname> <given-names>LE</given-names></name> <name><surname>Malvasi</surname> <given-names>A</given-names></name> <etal/></person-group>. <article-title>Artificial intelligence and uterine fibroids: a useful combination for diagnosis and treatment</article-title>. <source>J Clin Med.</source> 14:3454. doi: <pub-id pub-id-type="doi">10.3390/jcm14103454</pub-id><pub-id pub-id-type="pmid">40429449</pub-id></mixed-citation>
</ref>
<ref id="B15">
<label>15.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>&#x000D6;z</surname> <given-names>I</given-names></name> <name><surname>Yegin</surname> <given-names>EE</given-names></name> <name><surname>&#x000D6;z</surname> <given-names>AU</given-names></name> <name><surname>Ulukaya</surname> <given-names>E</given-names></name></person-group>. <article-title>An AI-driven clinical decision support framework utilizing female sex hormone parameters for surgical decision guidance in uterine fibroid management</article-title>. <source>Medicina (B Aires)</source>. (<year>2025</year>) <volume>62</volume>:<fpage>1</fpage>. doi: <pub-id pub-id-type="doi">10.3390/medicina62010001</pub-id><pub-id pub-id-type="pmid">41597287</pub-id></mixed-citation>
</ref>
<ref id="B16">
<label>16.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>B</given-names></name> <name><surname>Liu</surname> <given-names>J</given-names></name> <name><surname>Fang</surname> <given-names>M</given-names></name> <name><surname>Zhu</surname> <given-names>H</given-names></name> <name><surname>Zhang</surname> <given-names>Y</given-names></name> <name><surname>Zhang</surname> <given-names>H</given-names></name> <etal/></person-group>. <article-title>Multicenter deep learning-based automatic delineation of CTV and PTV in uterine malignancy CT imaging</article-title>. <source>Radiother Oncol</source>. (<year>2025</year>) <volume>214</volume>:<fpage>111212</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.radonc.2025.111212</pub-id><pub-id pub-id-type="pmid">41120056</pub-id></mixed-citation>
</ref>
<ref id="B17">
<label>17.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>T</given-names></name> <name><surname>Wen</surname> <given-names>Y</given-names></name> <name><surname>Wang</surname> <given-names>Z</given-names></name></person-group>. <article-title>nnU-Net based segmentation and 3D reconstruction of uterine fibroids with MRI images for HIFU surgery planning</article-title>. <source>BMC Med Imaging.</source> (<year>2024</year>) <volume>24</volume>:<fpage>233</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12880-024-01385-3</pub-id><pub-id pub-id-type="pmid">39243001</pub-id></mixed-citation>
</ref>
<ref id="B18">
<label>18.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>C</given-names></name> <name><surname>He</surname> <given-names>Z</given-names></name> <name><surname>Lv</surname> <given-names>F</given-names></name> <name><surname>Liao</surname> <given-names>H</given-names></name> <name><surname>Xiao</surname> <given-names>Z</given-names></name></person-group>. <article-title>Predicting the prognosis of HIFU ablation of uterine fibroids using a deep learning-based 3D super-resolution DWI radiomics model: a multicenter study</article-title>. <source>Acad Radiol.</source> (<year>2024</year>) <volume>31</volume>:<fpage>4996</fpage>&#x02013;<lpage>5007</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.acra.2024.06.027</pub-id><pub-id pub-id-type="pmid">38969576</pub-id></mixed-citation>
</ref>
<ref id="B19">
<label>19.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>G&#x000F6;ker</surname> <given-names>H</given-names></name></person-group>. <article-title>Detection of cervical cancer from uterine cervix images using transfer learning architectures</article-title>. <source>Eskisehir Tech Univ J Sci Technol A Appl Sci Eng</source>. <volume>25</volume>:<fpage>222</fpage>&#x02013;<lpage>39</lpage>. doi: <pub-id pub-id-type="doi">10.18038/estubtda.1384489</pub-id></mixed-citation>
</ref>
<ref id="B20">
<label>20.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Santoro</surname> <given-names>M</given-names></name> <name><surname>Zybin</surname> <given-names>V</given-names></name> <name><surname>Coada</surname> <given-names>VA</given-names></name> <name><surname>Mantovani</surname> <given-names>G</given-names></name> <name><surname>Paolani</surname> <given-names>G</given-names></name> <name><surname>Stanislao</surname> <given-names>MD</given-names></name> <etal/></person-group>. <article-title>Machine learning applied to pre-operative computed-tomography-based radiomic features can accurately differentiate uterine leiomyoma from leiomyosarcoma: a pilot study</article-title>. <source>Cancers (Basel).</source> (<year>2024</year>) <volume>16</volume>:<fpage>1570</fpage>. doi: <pub-id pub-id-type="doi">10.3390/cancers16081570</pub-id><pub-id pub-id-type="pmid">38672651</pub-id></mixed-citation>
</ref>
<ref id="B21">
<label>21.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xi</surname> <given-names>H</given-names></name> <name><surname>Wang</surname> <given-names>W</given-names></name></person-group>. <article-title>Deep learning based uterine fibroid detection in ultrasound images</article-title>. <source>BMC Med Imaging.</source> (<year>2024</year>) <volume>24</volume>:<fpage>218</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12880-024-01389-z</pub-id><pub-id pub-id-type="pmid">39160500</pub-id></mixed-citation>
</ref>
<ref id="B22">
<label>22.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Alswilem</surname> <given-names>L</given-names></name> <name><surname>Pacal</surname> <given-names>M</given-names></name></person-group>. <article-title>Computational efficiency and accuracy of deep learning models for automated breast cancer detection in ultrasound imaging</article-title>. <source>Artif Intell Appl Sci.</source> (<year>2025</year>) <volume>1</volume>:<fpage>1</fpage>&#x02013;<lpage>6</lpage>. doi: <pub-id pub-id-type="doi">10.69882/adba.ai.2025071</pub-id></mixed-citation>
</ref>
<ref id="B23">
<label>23.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Alpsalaz</surname> <given-names>SD</given-names></name> <name><surname>Aslan</surname> <given-names>E</given-names></name> <name><surname>&#x000D6;z&#x000FC;pak</surname> <given-names>Y</given-names></name> <name><surname>Alpsalaz</surname> <given-names>F</given-names></name> <name><surname>Uzel</surname> <given-names>H</given-names></name> <name><surname>Bereznychenko</surname> <given-names>V</given-names></name> <etal/></person-group>. <article-title>Hybrid deep learning with attention fusion for enhanced colon cancer detection</article-title>. <source>Sci Rep.</source> (<year>2025</year>) <volume>15</volume>:<fpage>45583</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-025-29447-8</pub-id></mixed-citation>
</ref>
<ref id="B24">
<label>24.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>&#x000C7;akmak</surname> <given-names>Y</given-names></name></person-group>. <article-title>Machine learning approaches for enhanced diagnosis of hematological disorders</article-title>. <source>Comput Syst Artif Intell.</source> (<year>2025</year>) <volume>1</volume>:<fpage>8</fpage>&#x02013;<lpage>14</lpage>. doi: <pub-id pub-id-type="doi">10.69882/adba.csai.2025072</pub-id></mixed-citation>
</ref>
<ref id="B25">
<label>25.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>&#x000C7;akmak</surname> <given-names>Y</given-names></name> <name><surname>Pacal</surname> <given-names>N</given-names></name></person-group>. <article-title>Deep learning for automated breast cancer detection in ultrasound: A comparative study of four CNN architectures</article-title>. <source>Artif Intell Appl Sci.</source> (<year>2025</year>) <volume>1</volume>:<fpage>13</fpage>&#x02013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.69882/adba.ai.2025073</pub-id></mixed-citation>
</ref>
<ref id="B26">
<label>26.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Aslan</surname> <given-names>E</given-names></name> <name><surname>Alpsalaz</surname> <given-names>SD</given-names></name> <name><surname>Alpsalaz</surname> <given-names>F</given-names></name> <name><surname>Uzel</surname> <given-names>H</given-names></name></person-group>. <article-title>Alzheimer&#x00027;s classification with a MaxViT-based deep learning model using magnetic resonance imaging</article-title>. <source>J Appl Sci Technol Trends.</source> (<year>2025</year>) <volume>6</volume>. doi: <pub-id pub-id-type="doi">10.38094/jastt62453</pub-id></mixed-citation>
</ref>
<ref id="B27">
<label>27.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>K&#x000F6;r</surname> <given-names>H</given-names></name> <name><surname>Mazman</surname> <given-names>R</given-names></name></person-group>. <article-title>Brain tumor detection and classification with deep learning based CNN method</article-title>. <source>Comput Syst Artif Intell.</source> (<year>2025</year>) <volume>1</volume>:<fpage>15</fpage>&#x02013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.69882/adba.csai.2025073</pub-id></mixed-citation>
</ref>
<ref id="B28">
<label>28.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kiran</surname> <given-names>HE</given-names></name></person-group>. <article-title>Deep learning-based detection of abdominal diseases using YOLOv9 models and advanced pre-processing techniques</article-title>. <source>Comput Electron Med.</source> (<year>2025</year>) <volume>2</volume>:<fpage>20</fpage>&#x02013;<lpage>5</lpage>. doi: <pub-id pub-id-type="doi">10.69882/adba.cem.2025014</pub-id></mixed-citation>
</ref>
<ref id="B29">
<label>29.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Abushahla</surname> <given-names>KH</given-names></name> <name><surname>Pala</surname> <given-names>MA</given-names></name></person-group>. <article-title>Optimizing diabetes prediction: addressing data imbalance with machine learning algorithms</article-title>. <source>ADBA Comput Sci</source>. (<year>2024</year>) <volume>1</volume>:<fpage>26</fpage>&#x02013;<lpage>35</lpage>. doi: <pub-id pub-id-type="doi">10.69882/adba.cs.2024075</pub-id></mixed-citation>
</ref>
<ref id="B30">
<label>30.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rama</surname> <given-names>J</given-names></name> <name><surname>Nalini</surname> <given-names>C</given-names></name> <name><surname>Kumaravel</surname> <given-names>A</given-names></name></person-group>. <article-title>Image pre-processing: enhance the performance of medical image classification using various data augmentation technique</article-title>. <source>ACCENTS Trans Image Process Comput Vis.</source> (<year>2019</year>) <volume>5</volume>:<fpage>14</fpage>&#x02013;<lpage>17</lpage>. doi: <pub-id pub-id-type="doi">10.19101/TIPCV.413001</pub-id></mixed-citation>
</ref>
<ref id="B31">
<label>31.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Goceri</surname> <given-names>E</given-names></name></person-group>. <article-title>Medical image data augmentation: techniques, comparisons and interpretations</article-title>. <source>Artif Intell Rev.</source> (<year>2023</year>) <volume>56</volume>:<fpage>12561</fpage>&#x02013;<lpage>605</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10462-023-10453-z</pub-id><pub-id pub-id-type="pmid">37362888</pub-id></mixed-citation>
</ref>
<ref id="B32">
<label>32.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Siddarth</surname> <given-names>SG</given-names></name> <name><surname>Chokkalingam</surname> <given-names>S</given-names></name></person-group>. <article-title>DenseNet 121 framework for automatic feature extraction of diabetic retinopathy images</article-title>. In: <source>International Conference on Emerging Systems and Intelligent Computing (ESIC)</source>; 2024 Feb; New York, NY: IEEE (<year>2024</year>), p. <fpage>338</fpage>&#x02013;<lpage>42</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ESIC60604.2024.10481664</pub-id></mixed-citation>
</ref>
<ref id="B33">
<label>33.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rajkumar</surname> <given-names>R</given-names></name></person-group>. <article-title>Deep learning feature extraction using attention-based DenseNet 121 for copy move forgery detection</article-title>. <source>Int. J. Image Graph.</source> (<year>2023</year>) <volume>23</volume>:<fpage>2350042</fpage>. doi: <pub-id pub-id-type="doi">10.1142/S0219467823500420</pub-id></mixed-citation>
</ref>
<ref id="B34">
<label>34.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Y</given-names></name> <name><surname>Sun</surname> <given-names>G</given-names></name> <name><surname>Qiu</surname> <given-names>Y</given-names></name> <name><surname>Zhang</surname> <given-names>L</given-names></name> <name><surname>Chhatkuli</surname> <given-names>A</given-names></name> <name><surname>Van Gool</surname> <given-names>L</given-names></name> <etal/></person-group>. <article-title>Transformer in convolutional neural networks</article-title>. <source>arXiv</source> [Preprint]. <source>arXiv:2106.03180</source> (<year>2021</year>).</mixed-citation>
</ref>
<ref id="B35">
<label>35.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dosovitskiy</surname> <given-names>A</given-names></name> <etal/></person-group>. <article-title>An image is worth 16x16 words: transformers for image recognition at scale</article-title>. <source>arXiv</source> [Preprint]. <source>arXiv:2010.11929</source> (<year>2020</year>).</mixed-citation>
</ref>
<ref id="B36">
<label>36.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>H</given-names></name> <name><surname>Xiao</surname> <given-names>B</given-names></name> <name><surname>Codella</surname> <given-names>N</given-names></name> <name><surname>Liu</surname> <given-names>M</given-names></name> <name><surname>Dai</surname> <given-names>X</given-names></name> <name><surname>Yuan</surname> <given-names>L</given-names></name> <etal/></person-group>. <article-title>CvT: introducing convolutions to vision transformers</article-title>. in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision.</source> (<year>2021</year>) <fpage>22</fpage>&#x02013;<lpage>31</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ICCV48922.2021.00009</pub-id></mixed-citation>
</ref>
<ref id="B37">
<label>37.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xiao</surname> <given-names>X</given-names></name> <name><surname>Zhang</surname> <given-names>D</given-names></name> <name><surname>Hu</surname> <given-names>G</given-names></name> <name><surname>Jiang</surname> <given-names>Y</given-names></name> <name><surname>Xia</surname> <given-names>S</given-names></name></person-group>. <article-title>CNN&#x02013;MHSA: a convolutional neural network and multi-head self-attention combined approach for detecting phishing websites</article-title>. <source>Neural Netw.</source> (<year>2020</year>) <volume>125</volume>:<fpage>303</fpage>&#x02013;<lpage>12</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neunet.2020.02.013</pub-id><pub-id pub-id-type="pmid">32172140</pub-id></mixed-citation>
</ref>
<ref id="B38">
<label>38.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tan</surname> <given-names>H</given-names></name> <name><surname>Liu</surname> <given-names>X</given-names></name> <name><surname>Yin</surname> <given-names>B</given-names></name> <name><surname>Li</surname> <given-names>X</given-names></name></person-group>. <article-title>MHSA-Net: multihead self-attention network for occluded person re-identification</article-title>. <source>IEEE Trans Neural Netw Learn Syst.</source> (<year>2022</year>) <volume>34</volume>:<fpage>8210</fpage>&#x02013;<lpage>24</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TNNLS.2022.3144163</pub-id><pub-id pub-id-type="pmid">35312622</pub-id></mixed-citation>
</ref>
<ref id="B39">
<label>39.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Raghu</surname> <given-names>S</given-names></name> <name><surname>Sriraam</surname> <given-names>N</given-names></name></person-group>. <article-title>Optimal configuration of multilayer perceptron neural network classifier for recognition of intracranial epileptic seizures</article-title>. <source>Expert Syst Appl.</source> (<year>2017</year>) <volume>89</volume>:<fpage>205</fpage>&#x02013;<lpage>21</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.eswa.2017.07.029</pub-id></mixed-citation>
</ref>
<ref id="B40">
<label>40.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dino</surname> <given-names>HI</given-names></name> <name><surname>Abdulrazzaq</surname> <given-names>MB</given-names></name></person-group>. <article-title>Facial expression classification based on SVM, KNN and MLP classifiers</article-title>. In: <italic>2019 International Conference on Advanced Science and Engineering (ICOASE)</italic>. IEEE (<year>2019</year>) <fpage>70</fpage>&#x02013;<lpage>5</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ICOASE.2019.8723728</pub-id></mixed-citation>
</ref>
<ref id="B41">
<label>41.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>M&#x000FC;ller</surname> <given-names>D</given-names></name> <name><surname>Soto-Rey</surname> <given-names>I</given-names></name> <name><surname>Kramer</surname> <given-names>F</given-names></name></person-group>. <article-title>Towards a guideline for evaluation metrics in medical image segmentation</article-title>. <source>BMC Res Notes.</source> (<year>2022</year>) <volume>15</volume>:<fpage>1210</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s13104-022-06096-y</pub-id><pub-id pub-id-type="pmid">35725483</pub-id></mixed-citation>
</ref>
<ref id="B42">
<label>42.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Heydarian</surname> <given-names>M</given-names></name> <name><surname>Doyle</surname> <given-names>TE</given-names></name> <name><surname>Samavi</surname> <given-names>R</given-names></name></person-group>. <article-title>MLCM: multi-label confusion matrix</article-title>. <source>IEEE Access.</source> (<year>2022</year>) <volume>10</volume>:<fpage>19083</fpage>&#x02013;<lpage>95</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2022.3151048</pub-id></mixed-citation>
</ref>
<ref id="B43">
<label>43.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Altal</surname> <given-names>OF</given-names></name> <name><surname>Sindiani</surname> <given-names>AM</given-names></name> <name><surname>Mhannaa</surname> <given-names>HYA</given-names></name> <name><surname>Alhatamleh</surname> <given-names>S</given-names></name> <name><surname>Amin</surname> <given-names>M</given-names></name> <name><surname>Akhdar</surname> <given-names>HF</given-names></name> <etal/></person-group>. <article-title>WOAENet: a whale optimization-guided ensemble deep learning with soft voting for uterine cancer diagnosis based on MRI images</article-title>. <source>Front Artif Intell.</source> (<year>2025</year>) <volume>8</volume>:<fpage>1664201</fpage>. <pub-id pub-id-type="pmid">41190038</pub-id></mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3066053/overview">Ishak Pacal</ext-link>, Igdir &#x000DC;niversitesi, T&#x000FC;rkiye</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3187222/overview">Emrah Aslan</ext-link>, Mardin Artuklu University, T&#x000FC;rkiye</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3356888/overview">Akif Akgul</ext-link>, Hittite University, T&#x000FC;rkiye</p>
</fn>
</fn-group>
</back>
</article>