<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Phys.</journal-id>
<journal-title>Frontiers in Physics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Phys.</abbrev-journal-title>
<issn pub-type="epub">2296-424X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1478750</article-id>
<article-id pub-id-type="doi">10.3389/fphy.2024.1478750</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Physics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Advanced gastrointestinal tract organ differentiation using an integrated swin transformer U-Net model for cancer care</article-title>
<alt-title alt-title-type="left-running-head">Sharma et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fphy.2024.1478750">10.3389/fphy.2024.1478750</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Sharma</surname>
<given-names>Neha</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Gupta</surname>
<given-names>Sheifali</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Almogren</surname>
<given-names>Ahmad</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Bharany</surname>
<given-names>Salil</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2126275/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Altameem</surname>
<given-names>Ayman</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Rehman</surname>
<given-names>Ateeq Ur</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2770907/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Chitkara University Institute of Engineering and Technology</institution>, <institution>Chitkara University</institution>, <addr-line>Rajpura</addr-line>, <addr-line>Punjab</addr-line>, <country>India</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Department of Computer Science</institution>, <institution>College of Computer and Information Sciences</institution>, <institution>King Saud University</institution>, <addr-line>Riyadh</addr-line>, <country>Saudi Arabia</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Department of Natural and Engineering Sciences</institution>, <institution>College of Applied Studies and Community Services</institution>, <institution>King Saud University</institution>, <addr-line>Riyadh</addr-line>, <country>Saudi Arabia</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>School of Computing</institution>, <institution>Gachon University</institution>, <addr-line>Seongnam-si</addr-line>, <country>Republic of Korea</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/976359/overview">Wenjun Liu</ext-link>, Beijing University of Posts and Telecommunications (BUPT), China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/711875/overview">Nguyen Quoc Khanh Le</ext-link>, Taipei Medical University, Taiwan</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2270497/overview">Imran Iqbal</ext-link>, New York University, United States</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Neha Sharma, <email>sharma.neha@chitkara.edu.in</email>; Ateeq Ur Rehman, <email>202411144@gachon.ac.kr</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>12</day>
<month>12</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>12</volume>
<elocation-id>1478750</elocation-id>
<history>
<date date-type="received">
<day>10</day>
<month>08</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>02</day>
<month>12</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Sharma, Gupta, Almogren, Bharany, Altameem and Rehman.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Sharma, Gupta, Almogren, Bharany, Altameem and Rehman</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>The segmentation of gastrointestinal (GI) organs, including the stomach, small intestine, and large intestine, is crucial for radio oncologists to plan effective cancer therapy. This study presents an innovative semantic segmentation approach that integrates the Swin Transformer Block with the U-Net model to delineate healthy GI organs accurately using MRI data. The paper presents a novel approach that merges the Swin Transformer and U-Net models to leverage global context learning capabilities and fine-grained spatial resolution. Incorporating this integration greatly enhances the model&#x2019;s capacity to achieve precise and comprehensive semantic segmentation, specifically in accurately outlining the gastrointestinal tract in MRI data. It utilizes the Swin Transformer, incorporating a shift-based windowing technique to gather contextual information efficiently while ensuring scalability. This novel architecture effectively balances local and global contexts, improving performance across various computer vision tasks, especially in medical imaging for segmenting the gastrointestinal tract. The model was trained and tested on the UW Madison GI Tract dataset, which comprises 38,496 MRI images from actual cancer cases. By leveraging the self-attention mechanisms of the Swin Transformer to capture global context and long-term dependencies, this approach combines the strengths of both models. The proposed architecture achieved a loss of 0.0949, a dice coefficient of 0.9190, and an Intersection over Union (IoU) score of 0.8454, demonstrating its effectiveness in providing high accuracy and robust performance. This technology holds significant potential for integration into clinical processes, enhancing the precision of radiation therapy for GI cancer patients.</p>
</abstract>
<kwd-group>
<kwd>swin transformer</kwd>
<kwd>U-Net model</kwd>
<kwd>segmentation</kwd>
<kwd>gastrointestinal tract</kwd>
<kwd>radiation therapy</kwd>
<kwd>UW Madison GI tract dataset</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Medical Physics and Imaging</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Gastrointestinal cancers include cancers of the colon, liver, stomach, and esophagus, which are among the most common and deadly in the world [<xref ref-type="bibr" rid="B1">1</xref>]. They are a substantial source of health burden, especially among older men, and have led to high mortality rates around the globe. GLOBOCAN, the cancer statistics database of the International Agency for Research on Cancer, reported that cancer remains one of the most common diseases and causes of death, accounting for around 1.93 million new cases in 2020 and 900,000 deaths. These statistics highlight essential issues that should be urgently addressed through preventive measures, proper early detection methods, and better treatment protocols to mitigate the global burden of cancer [<xref ref-type="bibr" rid="B2">2</xref>].</p>
<p>The treatment of GI cancers is generally wide-ranging and based on the type of cancer. For example, patients who have colon cancer often undergo surgical intervention, chemotherapy, or radiation therapy [<xref ref-type="bibr" rid="B3">3</xref>, <xref ref-type="bibr" rid="B4">4</xref>]. Among the three, radiation therapy tends to play a central role because it utilizes high-energy X-rays to eliminate cancer cells [<xref ref-type="bibr" rid="B4">4</xref>]. Radiation therapy poses a significant challenge in the GI tract, as it requires radiation oncologists to deliver radiation close to the cancer cells without affecting the rest of the healthy tissues [<xref ref-type="bibr" rid="B5">5</xref>, <xref ref-type="bibr" rid="B6">6</xref>]. Determining tumor size and location helps to optimize treatment plans by giving a higher radiation dose to cancerous tissues, thus providing an effective and targeted approach. Segmentation further allows for easy follow-up of the treatment response, where clinicians can analyze changes in the affected organs&#x2019; size and shape during therapy [<xref ref-type="bibr" rid="B7">7</xref>, <xref ref-type="bibr" rid="B8">8</xref>].</p>
<p>There has been a revolution in clinical practice over the past decade with the emergence of deep learning as a transformative tool, mainly in the diagnostic space of medical imaging [<xref ref-type="bibr" rid="B9">9</xref>, <xref ref-type="bibr" rid="B10">10</xref>]. Techniques like image classification, object recognition, and segmentation have dramatically improved disease diagnosis and treatment planning, thus increasing the accuracy and personalization of patient care [<xref ref-type="bibr" rid="B11">11</xref>, <xref ref-type="bibr" rid="B12">12</xref>]. Deep learning models such as CNN and U-Net architectures have exhibited promising performance in segmenting small intestines, large intestines, and stomachs from MRI scans [<xref ref-type="bibr" rid="B13">13</xref>, <xref ref-type="bibr" rid="B14">14</xref>]. Training on considerable datasets facilitates the models to recognize and outline diseased areas, providing clinicians with relevant information for early detection, treatment, and follow-up monitoring [<xref ref-type="bibr" rid="B15">15</xref>&#x2013;<xref ref-type="bibr" rid="B17">17</xref>].</p>
<p>This study introduces a novel deep-learning model that integrates Swin Transformer Blocks and U-Net architecture for the semantic segmentation of GI structures, explicitly targeting the small intestine, large intestine, and stomach. The model leverages the strengths of U-Net, which is optimized for segmentation tasks, and Swin Transformer, which effectively captures global context and pixel relationships within images. Our model achieves highly detailed and precise segmentation by combining these two approaches. The proposed model holds potential for significant clinical applications, enhancing the ability to accurately identify anatomical structures and improving diagnostic, therapeutic, and follow-up capabilities in GI cancer management. This research effort makes significant contributions as follows:<list list-type="simple">
<list-item>
<p>&#x2022; The paper presents a novel approach that merges the Swin Transformer and U-Net models to leverage global context learning capabilities and fine-grained spatial resolution. Incorporating this integration greatly enhances the model&#x2019;s capacity to achieve precise and comprehensive semantic segmentation, specifically in accurately outlining the gastrointestinal tract in MRI data.</p>
</list-item>
<list-item>
<p>&#x2022; The paper utilizes the Swin Transformer, incorporating a shift-based windowing technique to gather contextual information efficiently while ensuring scalability. This novel architecture effectively balances local and global contexts, improving performance across various computer vision tasks, especially in medical imaging for segmenting the gastrointestinal tract.</p>
</list-item>
<list-item>
<p>&#x2022; The U-Net architecture captures intricate details and preserves spatial information. The model effectively integrates context data and high-resolution information by utilizing skip connections, resulting in accurate localization of object boundaries.</p>
</list-item>
</list>
</p>
<p>The following outlines the later parts of this study: <xref ref-type="sec" rid="s2">Section 2</xref> summarizes the literature work, and <xref ref-type="sec" rid="s3">Section 3</xref> addresses the input dataset. <xref ref-type="sec" rid="s4">Section 4</xref> elaborates on the proposed Integrated Swin Transformer U-Net Model, <xref ref-type="sec" rid="s5">Section 5</xref> represents the results, <xref ref-type="sec" rid="s6">Section 6</xref> offers a comparative investigation of the proposed model with state-of-the-art outcomes, and <xref ref-type="sec" rid="s7">Section 7</xref> presents the conclusion.</p>
</sec>
<sec id="s2">
<title>2 Literature work</title>
<p>Many medical imaging researchers have used deep learning architectures to build segmentation and classification models for the gastrointestinal system. Ganz et al. [<xref ref-type="bibr" rid="B18">18</xref>] developed software based on narrow-band imaging (NBI) data to differentiate polyps autonomously. The proposed model outperforms previous algorithms for automatically segmenting 87 images. Wang et al. [<xref ref-type="bibr" rid="B19">19</xref>] developed a technique named &#x201c;Polyp-Alert&#x201d; to support endoscopists in locating polyps during colonoscopy. By monitoring the detected polyp edge(s), the method aggregates images of the same polyp(s) in one shot. V&#xe1;zquez et al. [<xref ref-type="bibr" rid="B20">20</xref>] provided an enlarged segmentation dataset intending to create a novel robust norm for research into colonoscopy image analysis. The proposed dataset includes four relevant classifications for evaluating the endoluminal scene, each serving a different therapeutic need. Using the dataset, the authors train conventional fully convolutional networks (FCNs) to construct new baselines.</p>
<p>Brandao et al. [<xref ref-type="bibr" rid="B21">21</xref>] described a DL-based segmentation algorithm for identifying lesions in colonoscopy images. Shape-from-shading is also used to provide a more comprehensive picture of tissues. Depth is introduced as an extra input channel alongside the RGB data in their network models, and the resulting network performs better. The segmentation model achieved an IoU of 48%, producing an IoU of 56.95% on the CVC-Colon dataset. Dijkstra et al. [<xref ref-type="bibr" rid="B22">22</xref>] described a one-step method for detecting polyps. The approach leverages an FCNN model for segmenting polyps. They tested the proposed network on different datasets, and their outcomes were promising.</p>
<p>Banik et al. [<xref ref-type="bibr" rid="B23">23</xref>] offered a multiscale patch network for automatic polyp area segmentation. The patches are then concatenated for precise polyp area pixel label annotation. The proposed model was validated using the CVC-Clinic DB. Wang et al. [<xref ref-type="bibr" rid="B24">24</xref>] created a multiscale MCNet for segmentation of GI Tract endoscopic images, using global and local contexts as training guidance. One global subnetwork determines each input image&#x2019;s worldwide structure. They then build two cascaded local subnetworks based on the worldwide subnetwork&#x2019;s output feature maps to collect regional appearance. Three subnetworks learn feature maps concatenated for the lesion segmentation task. Galdran et al. [<xref ref-type="bibr" rid="B25">25</xref>] described a new approach for gastrointestinal polyp delineation using an encoder-decoder approach. In the proposed method, pre-trained encoder-decoder architecture was successively joined. Sharma et al. [<xref ref-type="bibr" rid="B26">26</xref>] used an encoder and a standard U-Net architecture. More sophisticated algorithms with remarkable performance in various classification contexts are available. One can encode these models to generate a distinctive U-Net design and improve output. Ye et al. [<xref ref-type="bibr" rid="B27">27</xref>] proposed SIA-UNet, a modified network including MRI sequence information. Extensive studies on the UWM database were conducted to evaluate the suggested model. Chou et al. [<xref ref-type="bibr" rid="B28">28</xref>] employed Mask R-CNN along with U-Net techniques to distinguish the GI parts. Sharma et al. [<xref ref-type="bibr" rid="B29">29</xref>] suggested a model that is a U-Net design built from the ground up and utilized for image segmentation.</p>
<p>Li et al. [<xref ref-type="bibr" rid="B30">30</xref>] examined and combined several 2.5D data creation strategies to make the most of the images and proposed a 2.5D feature combination approach with adjacent weighting. Their solution integrates several representation processes by deeply combining multidimensional convolutions into fundamental modules. Extensive experiments on a publicly accessible GI database show that the 2.5D combination strategy outperforms the 2.5D method devoid of feature combination by 0.36% on dice and 0.12% on Jaccard. Using two methods&#x2014;a UNet with a ResNet50 encoder and a sparser UNet&#x2014;Chia et al. [<xref ref-type="bibr" rid="B31">31</xref>] looked at FiLM, a technique for leveraging pixel width and height image data to improve the UNet design. Leveraging a variety of methods in an ensemble, Georgescu et al. [<xref ref-type="bibr" rid="B32">32</xref>] offered a fresh strategy for building ensembles of different medical image segmentation architectures. Choosing the structures with the highest scores reveals that DiPE surpasses several designs and ensemble-building approaches.</p>
</sec>
<sec id="s3">
<title>3 Input dataset</title>
<p>The UW Madison GI tract dataset is employed in the proposed study. The University of Wisconsin has released the dataset, which is available on the Kaggle platform [<xref ref-type="bibr" rid="B33">33</xref>]. The collection contains 38,496 MRI scans of the GI tract for actual cancer patients. The ground truth of the dataset is in RLE (Run Length Encoding) format, so the ground truth mask is created using RLE decoding. The segmentation mask is divided into three classes: small bowel, large bowel, and stomach. The size of the images in the dataset is not the same for all the images, so the images have been resized to make them all the same size. The input size for the images is set to 240 &#xd7; 240. <xref ref-type="table" rid="T1">Table 1</xref> displays some of the dataset&#x2019;s sample images and corresponding ground truth masks. <xref ref-type="fig" rid="F1">Figures 1A, B</xref> show two MRI images. In contrast, <xref ref-type="fig" rid="F1">Figures 1C, D</xref> show the ground truth masks, with yellow representing the large bowel, green representing the small bowel, and red representing the stomach. The dataset has been separated in the ratio of 70:15:15 for training, testing, and validation, respectively.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Different hyperparameters.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Parameters name</th>
<th align="center">Parameter value</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Batch Size</td>
<td align="center">8</td>
</tr>
<tr>
<td align="center">Learning Rate</td>
<td align="center">0.0001</td>
</tr>
<tr>
<td align="center">Epochs</td>
<td align="center">70</td>
</tr>
<tr>
<td align="center">Processing Time</td>
<td align="center">6 h 37 min 43 s</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>UW Madison GI Tract Dataset <bold>(A)</bold> and <bold>(B)</bold> Input Images and <bold>(C)</bold> and <bold>(D)</bold> Respective Ground Truth Masks (yellow color shows the large intestine, green shows the small intestine, and red shows the stomach).</p>
</caption>
<graphic xlink:href="fphy-12-1478750-g001.tif"/>
</fig>
</sec>
<sec id="s4">
<title>4 Proposed integrated swin transformer U-Net model for GI tract segmentation</title>
<p>The proposed Integrated Swin Transformer U-Net Model combines the Swin Transformer design, a breakthrough in computer vision [<xref ref-type="bibr" rid="B34">34</xref>], with the U-Net model [<xref ref-type="bibr" rid="B35">35</xref>], which is well-known for its segmentation capabilities. Combining UNet with the Swin Transformer gives the benefits of both the fine-grained spatial resolution of UNet and the high-level context information of the Swin Transformer. With its unique U-shaped topology, combining a contracting path for context with an expansive path for accurate localization, the U-Net architecture captures minute details. Conversely, the Swin Transformer retains rapid localized detail extraction and adds shifted windows (hence &#x201c;Swin&#x201d;) to capture broader context. The Swin Transformer is advantageous over pure transformers for vision tasks due to its hierarchical feature learning and computational efficiency. Using shifted windows to capture localized self-attention reduces the quadratic complexity of processing entire images, making it more scalable and manageable for high-resolution inputs. This &#x201c;shifted window&#x201d; mechanism also enables Swin Transformers to capture fine-grained details and global context as information flows between neighboring windows across layers. This combination of localized and global attention makes Swin Transformers particularly effective for image segmentation, where understanding both local structures and overall context is crucial, providing a balanced and efficient approach that pure transformers lack.</p>
<p>Here, employing self-attention techniques to grasp the global context and long-term relationships of images, the Swin Transformer Block and U-Net model have been constructed to combine their capabilities. Its unique feature allows it to combine international academic content with local inference. The proposed model includes three main components: encoder, bottleneck layer, and decoder. In the encoder, decoder, and bottleneck part of the U-Net model, Swin transformer blocks are used to gradually reduce the spatial dimension while increasing the complexity of the received data. Between the encoder&#x2019;s downsampling and the decoder&#x2019;s upsampling, the bottleneck layer refines and compresses the coding features, thus allowing us to know how much information flows through the network. The U-Net model&#x2019;s decoder component improves features gathered during the encoding phase, enabling the network to record fine-grained data. A detailed description of the encoder, decoder, and bottleneck block of the proposed model is given in the following sections.</p>
<sec id="s4-1">
<title>4.1 Swin transformer</title>
<p>The Swin Transformer block, a vital component of the Swin Transformer architecture, presents a shift-based windowing technique to gather contextual information quickly while retaining scalability. The model&#x2019;s name, &#x201c;Swin&#x201d;, is derived from Shifted Windows, which divides the image into non-overlapping windows and applies the attention mechanism within them. To capture relationships between windows, they are shifted in successive layers. This enables the model to capture local and global context without requiring the whole attention mechanism to cover the entire image. <xref ref-type="fig" rid="F2">Figure 2</xref> shows the Swin transformer block arrangements. <xref ref-type="fig" rid="F2">Figure 2</xref>, comprising component 1 and component 2, leverages a hierarchical structure for effective image processing. Component 1 initiates with Layer Normalization (LN) to standardize input features, followed by Window-based Multi-head Self-Attention (W-MSA) for capturing local dependencies in a windowed context. Subsequent Layer Normalization ensures stability, and a Multi-Layer Perceptron (MLP) extracts complex features. Component 2 maintains this pattern with LN for normalization, Shifted Window-based Multi-head Self-Attention (SW-MSA) to capture global information with window shifts, and LN for stability. The final MLP facilitates further feature extraction. This dual-block architecture enables the Swin Transformer to simultaneously consider local and global image details, enhancing its performance across diverse computer vision tasks. Cross-window communication incorporates global context, layer normalization, and multi-layer perceptron blocks process patch embeddings to ensure non-linearity and feature transformation. This novel architecture balances local and global context, making the Swin Transformer block particularly successful for various computer vision tasks, including GI tract segmentation in medical imaging.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Swin transformer block.</p>
</caption>
<graphic xlink:href="fphy-12-1478750-g002.tif"/>
</fig>
<p>Consecutive Swin Transformer components calculated with the shifted window partitioning technique are illustrated in <xref ref-type="disp-formula" rid="e1">Equations 1</xref>&#x2013;<xref ref-type="disp-formula" rid="e4">4</xref>:<disp-formula id="e1">
<mml:math id="m1">
<mml:mrow>
<mml:msup>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>l</mml:mi>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mi>M</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="italic">LN</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
<disp-formula id="e2">
<mml:math id="m2">
<mml:mrow>
<mml:msup>
<mml:mi>Y</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>M</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="italic">LN</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>l</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>l</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
<disp-formula id="e3">
<mml:math id="m3">
<mml:mrow>
<mml:msup>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>W</mml:mi>
<mml:mi>M</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="italic">LN</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>Y</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mi>Y</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
<disp-formula id="e4">
<mml:math id="m4">
<mml:mrow>
<mml:msup>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>M</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="italic">LN</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>where <inline-formula id="inf1">
<mml:math id="m5">
<mml:mrow>
<mml:msup>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>l</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf2">
<mml:math id="m6">
<mml:mrow>
<mml:msup>
<mml:mi>Y</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denote the output features of the WMSA module and the MLP module for block <inline-formula id="inf3">
<mml:math id="m7">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> respectively.</p>
</sec>
<sec id="s4-2">
<title>4.2 Encoder (downsampling path)</title>
<p>The encoder of the Swin U-Net network consists of a linear embedding block followed by a succession of Swin Transformer blocks capturing local and global information in the image, as shown in <xref ref-type="fig" rid="F3">Figure 3</xref>. Combining the inventive token-based architecture of the Swin Transformer with the conventional feature extraction powers of the U-Net, the Encoder&#x2014;or Downsampling Path&#x2014;divides the input image into fixed-size patches; this transformational method treats each patch as a &#x201c;token&#x201d; for self-attention computation. It compiles global contextual data essential for exact medical image segmentation. Every encoder layer improves token representations so the model may understand visual content at ever-rising degrees of abstraction. The encoder enables the Swin U-Net to excel in complex medical image analysis, adapt to different scales, and learn relevant features by combining self-attention mechanisms with hierarchical feature extraction, enabling it to obtain good results in semantic segmentation tasks, making it a potent tool for accurate and context-aware anatomical structure identification in medical images.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Proposed integrated swin transformer U-net model.</p>
</caption>
<graphic xlink:href="fphy-12-1478750-g003.tif"/>
</fig>
<p>Initially, the input image has dimensions 240 &#xd7; 240 &#xd7; C (where C stands for the number of channels). After that, this processed image passes via several Swin Transformer Blocks. Swin Transformer Block 1 generates a feature map of dimensions 60 &#xd7; 60 &#xd7; C; Swin Transformer Block 2 generates a feature map of dimensions 30 &#xd7; 30 &#xd7; 2C after that. Swin Transformer Block 3 finally creates a 15 &#xd7; 15 &#xd7; 4C feature map. Skip connections between related blocks in the encoder and decoder help to guarantee thorough feature preservation: Skip Connection 1/4 links between Swin Transformer Block 1 and Swin Transformer Block 8. Skip Connection 1/8 links between Swin Transformer Block 2 and Swin Transformer Block 7. Skip connection 1/16 links between Block 3 and Block 6. These links directly connect relevant feature maps from the encoder to the decoder, minimizing spatial information loss via downsampling. Skip connections ensure that spatial information is maintained and enhanced throughout the segmentation process.</p>
</sec>
<sec id="s4-3">
<title>4.3 Bottleneck block</title>
<p>In the Swin U-Net model, the encoded data is refined through a bottleneck process consisting of two Swin Transformer blocks (<xref ref-type="fig" rid="F3">Figure 3</xref>). The bottom layer is the main point of the network, where the regional capacity of the U-Net model and the hierarchical features collected by the Swin Transformer combine perfectly. Between the encoder&#x2019;s downsampling and the decoder&#x2019;s upsampling, this layer compresses and improves the coding characteristics employing two Swin Transformer Blocks (Blocks 4 and 5), allowing more data to flow through the network. The bottleneck layer combines the general concepts of Swin Transformer with the fine-grained data in U-Net, reducing the complexity of the connection while ensuring that the model preserves all information about the input image. This integration improves Swin U-Net&#x2019;s ability to accurately segment medical images and collect comprehensive and small local data. It is the foundation for standards of excellence in medical image analysis.</p>
</sec>
<sec id="s4-4">
<title>4.4 Decoder (upsampling path)</title>
<p>The decoder is an essential part of the Swin U-Net model and is responsible for using the best features of the bottleneck process and the encoder cross-connection to generate feature maps. The decoder is a linear combination that provides the encoder process&#x2019;s fine details to reconstruct the original image&#x2019;s segmentation map. This allows the Swin U-Net to effectively capture all the collected data and the Swin Transformer to maintain the complex regional features. This allows the model to achieve high performance. Swin Transformer Block 6 generates a 15 &#xd7; 15 &#xd7; 4C feature map, Swin Transformer Block 7 creates a 30 &#xd7; 30 &#xd7; 2C feature map, and Swin Transformer Block 8 produces a 60 &#xd7; 60 &#xd7; C feature map. The patch merging layer then reconstructs the segmented image, effectively segmenting the intestinal tract while maintaining the original size of 240 &#xd7; 240 &#xd7; C. The boundary and content of the region are preserved, which is crucial in processing medical images. This integration allows the model to combine the global understanding provided by the Swin Transformer with the real-time accuracy provided by U-Net, leading to the best performance in the semantic segmentation task where the treatment plan requires anatomical structure information.</p>
</sec>
</sec>
<sec id="s5">
<title>5 Results analysis</title>
<p>This research proposed an Integrated Swin Transformer U-Net Model to segment the gastrointestinal tract with MRI data. The model runs on the Google Colab platform using Keras and TensorFlow framework. <xref ref-type="table" rid="T1">Table 1</xref> describes the Swin Transformer U-Net model&#x2019;s training parameters proposed for the GI organ segmentation task. The selected batch size is 8 to balance the two objectives so that it will not lose any performance and minimize memory use. The learning rate has been set to be 0.0001, which is small enough to ensure that convergence is stable instead of overshooting, which is what matters most for such complex architecture deep learning models like Swin Transformer U-Net. The model was trained for 70 epochs; thus, there were more than enough iterations to fit the dataset without overfitting. This training run took 6 h, 37 min, and 43 s, demonstrating how computer-intensive training efficient models can be on vast amounts of medical data. The following section presents this model&#x2019;s results and showcases how it can help segment small bowel, large bowel, and stomach from MRI images.</p>
<sec id="s5-1">
<title>5.1 Loss analysis</title>
<p>The loss plot analysis for gastrointestinal tract segmentation entails watching the convergence of loss curves unique to the small bowel, large intestine, and stomach segmentation. These curves represent the model&#x2019;s accuracy in segmenting each area. Monitoring the training and validation loss curves is critical to ensure the model learns properly without overfitting. <xref ref-type="fig" rid="F4">Figure 4</xref> represents the training and validation loss plots by implementing the proposed design. In <xref ref-type="fig" rid="F4">Figure 4</xref>, we can observe a sharp decline in loss during the fifth epoch. Subsequently, the loss gradually decreases, reaching a value of 0.0472 for the training and 0.0929 for the validation.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Training and validation loss curve.</p>
</caption>
<graphic xlink:href="fphy-12-1478750-g004.tif"/>
</fig>
</sec>
<sec id="s5-2">
<title>5.2 Dice coefficient analysis</title>
<p>The accuracy of segmentation for the small intestine, large intestine, and stomach regions is assessed using Dice coefficient plots, which illustrate how well the predicted segmentations from the proposed Integrated Swin Transformer U-Net Model align with the ground truth masks. Higher Dice coefficients indicate better alignment. <xref ref-type="fig" rid="F5">Figure 5</xref> displays the Dice curves generated by the proposed Ensemble of Swin Transformer Block and U-Net Model. As shown in <xref ref-type="fig" rid="F5">Figure 5</xref>, the Dice value starts at 0 and rapidly increases between epochs 0 and 10, followed by a more gradual rise. Ultimately, the Dice coefficient reaches final values of 0.9571 for training and 0.9203 for validation.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Training and validation dice coefficient curve.</p>
</caption>
<graphic xlink:href="fphy-12-1478750-g005.tif"/>
</fig>
</sec>
<sec id="s5-3">
<title>5.3 IoU coefficient analysis</title>
<p>Evaluating the Intersection over Union (IoU) coefficient plots for gastrointestinal tract segmentation using the proposed Integrated Swin Transformer U-Net Model involves assessing the model&#x2019;s accuracy in delineating the boundaries of the small intestine, large intestine, and stomach regions. These plots demonstrate how closely the model&#x2019;s predicted segmentations align with the ground truth masks, with higher IoU coefficients indicating superior segmentation quality. <xref ref-type="fig" rid="F6">Figure 6</xref> presents the IoU curve generated by the proposed model. As depicted in <xref ref-type="fig" rid="F6">Figure 6</xref>, the IoU value increases from the 10th epoch and continues to rise gradually. Ultimately, the IoU coefficient achieves a final value of 0.9147 for the training dataset and 0.8490 for the validation dataset.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Training and validation IoU coefficient curve.</p>
</caption>
<graphic xlink:href="fphy-12-1478750-g006.tif"/>
</fig>
</sec>
<sec id="s5-4">
<title>5.4 Performance analysis for test dataset</title>
<p>
<xref ref-type="table" rid="T2">Table 2</xref> shows the performance parameters of the proposed segmentation model for training, testing, and validation datasets. Three crucial measurements of the model&#x2019;s performance are loss, dice, and IoU. With a low loss of 0.0472, a high Dice value of 0.9571, and an IoU value of 0.9147, the model shows accurate segmentation and significant overlap with the ground truth during training. The model retains its segmentation quality over the testing and validation phases with slightly higher loss values, demonstrating constant Dice and IoU Coefficients of around 0.9190 to 0.9203 and 0.8454 to 0.8490, respectively. These results show that the model can generalize its segmentation skills to previously unseen data while maintaining consistent performance.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Performance parameters.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Parameter</th>
<th align="center">Train</th>
<th align="center">Validation</th>
<th align="center">Test</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Loss</td>
<td align="center">0.0472</td>
<td align="center">0.0929</td>
<td align="center">0.0949</td>
</tr>
<tr>
<td align="center">Dice Coefficient</td>
<td align="center">0.9571</td>
<td align="center">0.9203</td>
<td align="center">0.9190</td>
</tr>
<tr>
<td align="center">IoU Coefficient</td>
<td align="center">0.9147</td>
<td align="center">0.8490</td>
<td align="center">0.8454</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s5-5">
<title>5.5 Visual analysis</title>
<p>
<xref ref-type="fig" rid="F7">Figure 7</xref> provides a comparison of gastrointestinal tract segmentation results on MRI images, organized into four columns: &#x201c;Original Image&#x201d;, &#x201c;Predicted Mask Image&#x201d;, &#x201c;Ground Truth Mask Image&#x201d;, and &#x201c;Miss Mask Image&#x201d;. Each row represents a different MRI slice. The &#x201c;Original Image&#x201d; column shows the raw grayscale MRI scans. In contrast, the &#x201c;Predicted Mask Image&#x201d; column displays the segmentation masks generated by the proposed model, where different regions are color-coded for straightforward interpretation: red represents the large bowel, green corresponds to the small bowel, and blue indicates the stomach. This color scheme is consistent across the &#x201c;Ground Truth Mask Image&#x201d; column, which shows expert-annotated masks that serve as the benchmark for evaluating the model&#x2019;s accuracy.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Visualization of results.</p>
</caption>
<graphic xlink:href="fphy-12-1478750-g007.tif"/>
</fig>
<p>The &#x201c;Miss Mask Image&#x201d; column highlights discrepancies between the model&#x2019;s predictions and the ground truth annotations, using green to indicate true positives (areas predicted by the model and present in the ground truth) and red for false negatives (areas in the ground truth but the model missed). This layout effectively visualizes the model&#x2019;s strengths and limitations, allowing for a quick assessment of its accuracy in segmenting the small bowel, large bowel, and stomach within MRI scans of the gastrointestinal tract.</p>
<p>The proposed integration of the Swin Transformer and U-Net architectures offers several notable advantages over existing GI tract segmentation methods, primarily by combining the global context-capturing capabilities of the Swin Transformer with the spatial precision of the U-Net. Unlike traditional convolutional neural networks (CNNs) or standalone U-Net models, which focus on local features, the Swin Transformer&#x2019;s hierarchical structure with shifted windows allows efficient processing of local and global information, enhancing segmentation accuracy, particularly in complex anatomical structures. This combined approach proves robust in handling variations in MRI data, such as differences in organ shape and texture. It is especially effective in distinguishing between similar tissues, where boundaries are often ambiguous. However, the model has limitations, including higher computational requirements due to the transformer layers, which could restrict its applicability in clinical settings with limited resources.</p>
</sec>
</sec>
<sec id="s6">
<title>6 Comparison with state of the art</title>
<p>
<xref ref-type="table" rid="T3">Table 3</xref> provides a comparative overview of several image segmentation approaches assessed for their effectiveness in the context of a given goal, most likely in medical imaging or computer vision, in 2022. The techniques described include transfer learning encoders, U-Net architecture, Mask RCNN, a mix of U-Net and transfer learning models, U-Net applied to 2.5D images, U-Net paired with ResNet 50, and ensemble learning. The associated Dice coefficient and IoU/Jaccard scores serve as performance measures, assessing the quality of segmentation findings. Highlights include the proposed model, which has a Dice value of 0.92 and an IoU/Jaccard of 0.84, and additional algorithms with varied segmentation accuracy. In this case, the superior performance of Swin Transformer-U-Net can be attributed to the combination of global content and spatial accuracy. Swin Transformer&#x2019;s moving window effectively captures surface irregularities, enabling the model to identify minor differences between similar tissues in the colon. Furthermore, the U-Net model refines region boundaries through cross-linking, essential for accurate segmentation. This combination makes the model more efficient than previous methods by evaluating local details with global context understanding and makes it particularly suitable for complex anatomical segmentation tasks.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>State-of-the-art comparison.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Ref/Year</th>
<th align="left">Method</th>
<th align="left">Dice value</th>
<th align="left">IoU/Jaccard</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">[<xref ref-type="bibr" rid="B24">24</xref>]/2022</td>
<td align="center">Transfer learning encoders</td>
<td align="center">---</td>
<td align="center">0.84</td>
</tr>
<tr>
<td align="center">[<xref ref-type="bibr" rid="B25">25</xref>]/2022</td>
<td align="center">U-Net</td>
<td align="center">0.78</td>
<td align="center">---</td>
</tr>
<tr>
<td align="center">[<xref ref-type="bibr" rid="B26">26</xref>]/2022</td>
<td align="center">Mask RCNN</td>
<td align="center">0.51</td>
<td align="center">---</td>
</tr>
<tr>
<td align="center">[<xref ref-type="bibr" rid="B27">27</xref>]/2022</td>
<td align="center">U-Net and transfer learning models</td>
<td align="center">0.88</td>
<td align="center">0.88</td>
</tr>
<tr>
<td align="center">[<xref ref-type="bibr" rid="B28">28</xref>]/2022</td>
<td align="center">U-Net on 2.5 D</td>
<td align="center">0.36</td>
<td align="center">0.12</td>
</tr>
<tr>
<td align="center">[<xref ref-type="bibr" rid="B29">29</xref>]/2022</td>
<td align="center">U-Net with ResNet 50</td>
<td align="center">---</td>
<td align="center">---</td>
</tr>
<tr>
<td align="center">[<xref ref-type="bibr" rid="B30">30</xref>]/2022</td>
<td align="center">Ensemble learning</td>
<td align="center">0.91</td>
<td align="center">---</td>
</tr>
<tr>
<td align="center">Proposed Model</td>
<td align="center">Proposed Integrated Swin Transformer U-Net Model</td>
<td align="center">0.92</td>
<td align="center">0.84</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec sec-type="conclusion" id="s7">
<title>7 Conclusion</title>
<p>This study presents an integrated Swin Transformer U-Net model for segmenting intestinal lesions in MRI images, which is an essential task for developing radiology in cancer treatment. The well-designed model combines the global content learning capabilities of Swin Transformer with the detailed feature extraction capabilities of U-Net to provide optimal performance. Experimental results validated in gastrointestinal diseases at the University of Wisconsin-Madison showed that the model has high accuracy with low loss, high Dice coefficient, and IoU scores of 0.0949, 0.9190, and 0.8454, respectively. These results indicate that the proposed model can improve the accuracy of GI cancer treatment and provide radiation oncologists with a powerful tool for better treatment planning and patient care. Integrating these principles into clinical practice will lead to more efficient and effective radiation therapy, ultimately improving patient outcomes. We plan to refine the model&#x2019;s architecture for future enhancements to reduce computational complexity, allowing for more efficient real-time applications. We also aim to explore further multi-modal data integration, such as combining MRI with CT scans, to improve segmentation accuracy. Beyond GI tract segmentation, this model&#x2019;s framework could be adapted to other types of cancer and areas of medical imaging by fine-tuning its parameters to accommodate different tissue characteristics and imaging modalities. For instance, it could be adapted for lung or brain tumor segmentation by training on specialized datasets, enabling broader clinical applications across oncology.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s8">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/competitions/uw-madison-gi-tract-image-segmentation/data">https://www.kaggle.com/competitions/uw-madison-gi-tract-image-segmentation/data</ext-link>.</p>
</sec>
<sec sec-type="author-contributions" id="s9">
<title>Author contributions</title>
<p>NS: Conceptualization, Methodology, Software, Writing&#x2013;original draft, Writing&#x2013;review and editing. SG: Conceptualization, Methodology, Supervision, Writing&#x2013;original draft, Writing&#x2013;review and editing. AA: Conceptualization, Methodology, Project administration, Resources, Writing&#x2013;original draft, Writing&#x2013;review and editing. SB: Conceptualization, Formal Analysis, Methodology, Writing&#x2013;original draft, Writing&#x2013;review and editing. AA: Funding acquisition, Investigation, Project administration, Resources, Writing&#x2013;original draft, Writing&#x2013;review and editing. AR: Conceptualization, Methodology, Writing&#x2013;original draft, Writing&#x2013;review and editing.</p>
</sec>
<sec sec-type="funding-information" id="s10">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. This work was supported by King Saud University, Riyadh, Saudi Arabia, through researchers supporting project number RSP2024R498.</p>
</sec>
<sec sec-type="COI-statement" id="s11">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rawla</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Barsouk</surname>
<given-names>A</given-names>
</name>
</person-group>. <article-title>Epidemiology of gastric cancer: global trends, risk factors, and prevention</article-title>. <source>Gastroenterol Review/Przegl&#x105;d Gastroenterologiczny</source> (<year>2019</year>) <volume>14</volume>(<issue>1</issue>):<fpage>26</fpage>&#x2013;<lpage>38</lpage>. <pub-id pub-id-type="doi">10.5114/pg.2018.80001</pub-id>
</citation>
</ref>
<ref id="B2">
<label>2.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Meng</surname>
<given-names>MQH</given-names>
</name>
</person-group>. <article-title>Tumor recognition in wireless capsule endoscopy images using textural features and SVM-based feature selection</article-title>. <source>IEEE Trans Inf Tech Biomed</source> (<year>2012</year>) <volume>16</volume>(<issue>3</issue>):<fpage>323</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1109/TITB.2012.2185807</pub-id>
</citation>
</ref>
<ref id="B3">
<label>3.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Bao</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Geng</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Alkandari</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Polyp detection and radius measurement in small intestine using video capsule endoscopy</article-title>. In: <conf-name>2014 7th International Conference on Biomedical Engineering and Informatics</conf-name>; <conf-date>14-16 October 2014</conf-date>; <conf-loc>Dalian, China</conf-loc>. <publisher-name>IEEE</publisher-name> (<year>2014</year>). p. <fpage>237</fpage>&#x2013;<lpage>41</lpage>.</citation>
</ref>
<ref id="B4">
<label>4.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jaffray</surname>
<given-names>DA</given-names>
</name>
<name>
<surname>Gospodarowicz</surname>
<given-names>MK</given-names>
</name>
</person-group>. <article-title>Radiation therapy for cancer</article-title>. <source>Cancer Dis Control priorities</source> (<year>2015</year>) <volume>3</volume>:<fpage>239</fpage>&#x2013;<lpage>48</lpage>. <pub-id pub-id-type="doi">10.1596/978-1-4648-0349-9_ch14</pub-id>
</citation>
</ref>
<ref id="B5">
<label>5.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Shin</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Balasingham</surname>
<given-names>I</given-names>
</name>
</person-group> <article-title>Comparison of hand-craft feature based SVM and CNN based deep learning framework for automatic polyp classification</article-title>. In: <conf-name>2017 39th annual international conference of the IEEE engineering in medicine and biology society (EMBC)</conf-name>; <conf-date>11-15 July 2017</conf-date>; <conf-loc>Jeju, Korea (South)</conf-loc>. <publisher-name>IEEE</publisher-name> (<year>2017</year>) p. <fpage>3277</fpage>&#x2013;<lpage>80</lpage>.</citation>
</ref>
<ref id="B6">
<label>6.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>D</given-names>
</name>
<etal/>
</person-group> <article-title>Colorectal polyp segmentation using a fully convolutional neural network</article-title>. In: <conf-name>2017 10th international congress on image and signal processing, biomedical engineering and informatics (CISP-BMEI)</conf-name>; <conf-date>14-16 October 2017</conf-date>; <conf-loc>Shanghai, China</conf-loc>. <publisher-name>IEEE</publisher-name> (<year>2017</year>). p. <fpage>1</fpage>&#x2013;<lpage>5</lpage>.</citation>
</ref>
<ref id="B7">
<label>7.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Nguyen</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>SW</given-names>
</name>
</person-group>. <article-title>Colorectal segmentation using multiple encoder-decoder network in colonoscopy images</article-title>. In: <conf-name>2018 IEEE first international conference on artificial intelligence and knowledge engineering (AIKE)</conf-name>; <conf-date>26-28 September 2018</conf-date>; <conf-loc>Laguna Hills, CA, USA</conf-loc>. <publisher-name>IEEE</publisher-name> (<year>2018</year>) p. <fpage>208</fpage>&#x2013;<lpage>11</lpage>.</citation>
</ref>
<ref id="B8">
<label>8.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Meng</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Yao</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>X</given-names>
</name>
</person-group> <article-title>Biomedia ACM MM grand challenge 2019: using data enhancement to solve sample unbalance</article-title>. In: <source>Proceedings of the 27th ACM international conference on multimedia</source> (<year>2019</year>) p. <fpage>2588</fpage>&#x2013;<lpage>92</lpage>.</citation>
</ref>
<ref id="B9">
<label>9.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lilhore</surname>
<given-names>UK</given-names>
</name>
<name>
<surname>Poongodi</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Kaur</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Simaiya</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Algarni</surname>
<given-names>AD</given-names>
</name>
<name>
<surname>Elmannai</surname>
<given-names>H</given-names>
</name>
<etal/>
</person-group> <article-title>Hybrid model for detection of cervical cancer using causal analysis and machine learning techniques</article-title>. <source>Comput Math Methods Med</source> (<year>2022</year>) <volume>2022</volume>:<fpage>1</fpage>&#x2013;<lpage>17</lpage>. <pub-id pub-id-type="doi">10.1155/2022/4688327</pub-id>
</citation>
</ref>
<ref id="B10">
<label>10.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kukreja</surname>
<given-names>V</given-names>
</name>
<name>
<surname>Dhiman</surname>
<given-names>P</given-names>
</name>
</person-group>. <article-title>A Deep Neural Network based disease detection scheme for Citrus fruits</article-title>. In: <conf-name>2020 International conference on smart electronics and communication (ICOSEC)</conf-name>; <conf-date>10-12 September 2020</conf-date>; <conf-loc>Trichy, India</conf-loc>. <publisher-name>IEEE</publisher-name> (<year>2020</year>) p. <fpage>97</fpage>&#x2013;<lpage>101</lpage>.</citation>
</ref>
<ref id="B11">
<label>11.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Iqbal</surname>
<given-names>I</given-names>
</name>
<name>
<surname>Younus</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Walayat</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Kakar</surname>
<given-names>MU</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Automated multi-class classification of skin lesions through deep convolutional neural network with dermoscopic images</article-title>. <source>Comput Med Imaging graphics</source> (<year>2021</year>) <volume>88</volume>:<fpage>101843</fpage>. <pub-id pub-id-type="doi">10.1016/j.compmedimag.2020.101843</pub-id>
</citation>
</ref>
<ref id="B12">
<label>12.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Iqbal</surname>
<given-names>I</given-names>
</name>
<name>
<surname>Walayat</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Kakar</surname>
<given-names>MU</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Automated identification of human gastrointestinal tract abnormalities based on deep convolutional neural network with endoscopic images</article-title>. <source>Intell Syst Appl</source> (<year>2022</year>) <volume>16</volume>:<fpage>200149</fpage>. <pub-id pub-id-type="doi">10.1016/j.iswa.2022.200149</pub-id>
</citation>
</ref>
<ref id="B13">
<label>13.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Le</surname>
<given-names>NQK</given-names>
</name>
</person-group>. <article-title>Potential of deep representative learning features to interpret the sequence information in proteomics</article-title>. <source>Proteomics</source> (<year>2022</year>) <volume>22</volume>(<issue>1-2</issue>):<fpage>2100232</fpage>. <pub-id pub-id-type="doi">10.1002/pmic.202100232</pub-id>
</citation>
</ref>
<ref id="B14">
<label>14.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kha</surname>
<given-names>QH</given-names>
</name>
<name>
<surname>Tran</surname>
<given-names>TO</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>VN</given-names>
</name>
<name>
<surname>Than</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>NQK</given-names>
</name>
</person-group>. <article-title>An interpretable deep learning model for classifying adaptor protein complexes from sequence information</article-title>. <source>Methods</source> (<year>2022</year>) <volume>207</volume>:<fpage>90</fpage>&#x2013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1016/j.ymeth.2022.09.007</pub-id>
</citation>
</ref>
<ref id="B15">
<label>15.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Cao</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>Q</given-names>
</name>
<etal/>
</person-group> <article-title>Swin-unet: unet-like pure transformer for medical image segmentation</article-title>. In: <source>European conference on computer vision</source>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer Nature Switzerland</publisher-name> (<year>2022</year>) p. <fpage>205</fpage>&#x2013;<lpage>18</lpage>.</citation>
</ref>
<ref id="B16">
<label>16.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Hatamizadeh</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Nath</surname>
<given-names>V</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Roth</surname>
<given-names>HR</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>D</given-names>
</name>
</person-group>. <article-title>Swin unetr: swin transformers for semantic segmentation of brain tumors in mri images</article-title>. In: <source>International MICCAI brainlesion workshop</source>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name> (<year>2021</year>). p. <fpage>272</fpage>&#x2013;<lpage>84</lpage>.</citation>
</ref>
<ref id="B17">
<label>17.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>D</given-names>
</name>
</person-group>. <article-title>Ds-transunet: dual swin transformer u-net for medical image segmentation</article-title>. <source>IEEE Trans Instrum Meas</source> (<year>2022</year>) <volume>71</volume>:<fpage>1</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1109/tim.2022.3178991</pub-id>
</citation>
</ref>
<ref id="B18">
<label>18.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ganz</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Slabaugh</surname>
<given-names>G</given-names>
</name>
</person-group>. <article-title>Automatic segmentation of polyps in colonoscopic narrow-band imaging data</article-title>. <source>IEEE Trans Biomed Eng</source> (<year>2012</year>) <volume>59</volume>(<issue>8</issue>):<fpage>2144</fpage>&#x2013;<lpage>51</lpage>. <pub-id pub-id-type="doi">10.1109/TBME.2012.2195314</pub-id>
</citation>
</ref>
<ref id="B19">
<label>19.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Tavanapong</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Wong</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Oh</surname>
<given-names>JH</given-names>
</name>
<name>
<surname>De Groen</surname>
<given-names>PC</given-names>
</name>
</person-group>. <article-title>Polyp-alert: near real-time feedback during colonoscopy</article-title>. <source>Comp Methods Programs Biomed</source> (<year>2015</year>) <volume>120</volume>(<issue>3</issue>):<fpage>164</fpage>&#x2013;<lpage>79</lpage>. <pub-id pub-id-type="doi">10.1016/j.cmpb.2015.04.002</pub-id>
</citation>
</ref>
<ref id="B20">
<label>20.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>V&#xe1;zquez</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Bernal</surname>
<given-names>J</given-names>
</name>
<name>
<surname>S&#xe1;nchez</surname>
<given-names>FJ</given-names>
</name>
<name>
<surname>Fern&#xe1;ndez-Esparrach</surname>
<given-names>G</given-names>
</name>
<name>
<surname>L&#xf3;pez</surname>
<given-names>AM</given-names>
</name>
<name>
<surname>Romero</surname>
<given-names>A</given-names>
</name>
<etal/>
</person-group> <article-title>A benchmark for endoluminal scene segmentation of colonoscopy images</article-title>. <source>J Healthc Eng</source> (<year>2017</year>) <volume>2017</volume>:<fpage>1</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1155/2017/4037190</pub-id>
</citation>
</ref>
<ref id="B21">
<label>21.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brandao</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Zisimopoulos</surname>
<given-names>O</given-names>
</name>
<name>
<surname>Mazomenos</surname>
<given-names>E</given-names>
</name>
<name>
<surname>Ciuti</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Bernal</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Visentini-Scarzanella</surname>
<given-names>M</given-names>
</name>
<etal/>
</person-group> <article-title>Towards a computed-aided diagnosis system in colonoscopy: automatic polyp segmentation using convolution neural networks</article-title>. <source>J Med Robotics Res</source> (<year>2018</year>) <volume>3</volume>(<issue>02</issue>):<fpage>1840002</fpage>. <pub-id pub-id-type="doi">10.1142/s2424905x18400020</pub-id>
</citation>
</ref>
<ref id="B22">
<label>22.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dijkstra</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Sobiecki</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Bernal</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Telea</surname>
<given-names>AC</given-names>
</name>
</person-group>. <article-title>Towards a single solution for polyp detection, localization and segmentation in colonoscopy images</article-title>. <source>VISIGRAPP</source> (<year>2019</year>) <volume>4</volume>:<fpage>616</fpage>&#x2013;<lpage>25</lpage>. <pub-id pub-id-type="doi">10.5220/0007694906160625</pub-id>
</citation>
</ref>
<ref id="B23">
<label>23.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Banik</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Bhattacharjee</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Nasipuri</surname>
<given-names>M</given-names>
</name>
</person-group>. <article-title>A multiscale patch-based deep learning system for polyp segmentation</article-title>. In: <source>Advanced computing and systems for security</source>. <publisher-loc>Singapore</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2020</year>) p. <fpage>109</fpage>&#x2013;<lpage>19</lpage>.</citation>
</ref>
<ref id="B24">
<label>24.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Cong</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Qu</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>H</given-names>
</name>
<etal/>
</person-group> <article-title>Multiscale context-guided deep network for automated lesion segmentation with endoscopy images of gastrointestinal tract</article-title>. <source>IEEE J Biomed Health Inform</source> (<year>2020</year>) <volume>25</volume>(<issue>2</issue>):<fpage>514</fpage>&#x2013;<lpage>25</lpage>. <pub-id pub-id-type="doi">10.1109/jbhi.2020.2997760</pub-id>
</citation>
</ref>
<ref id="B25">
<label>25.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Galdran</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Carneiro</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Ballester</surname>
<given-names>MAG</given-names>
</name>
</person-group>. <article-title>Double encoder-decoder networks for gastrointestinal polyp segmentation</article-title>. In: <source>International conference on pattern recognition</source>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2021</year>) p. <fpage>293</fpage>&#x2013;<lpage>307</lpage>.</citation>
</ref>
<ref id="B26">
<label>26.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sharma</surname>
<given-names>M</given-names>
</name>
</person-group>. <article-title>Automated GI tract segmentation using deep learning</article-title>. <source>arXiv preprint arXiv:2206.11048</source> (<year>2022</year>). <pub-id pub-id-type="doi">10.48550/arXiv.2206.11048</pub-id>
</citation>
</ref>
<ref id="B27">
<label>27.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ye</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L</given-names>
</name>
</person-group>. <article-title>SIA-unet: a unet with sequence information for gastrointestinal tract segmentation</article-title>. In: <source>Pacific rim international conference on artificial intelligence</source>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2022</year>) p. <fpage>316</fpage>&#x2013;<lpage>26</lpage>.</citation>
</ref>
<ref id="B28">
<label>28.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Chou</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Roman</surname>
<given-names>E</given-names>
</name>
</person-group>. <source>GI tract image segmentation with U-net and mask R-CNN</source> (<year>2024</year>).</citation>
</ref>
<ref id="B29">
<label>29.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sharma</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Gupta</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Koundal</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Alyami</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Alshahrani</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Asiri</surname>
<given-names>Y</given-names>
</name>
<etal/>
</person-group> <article-title>U-net model with transfer learning model as a backbone for segmentation of gastrointestinal tract</article-title>. <source>Bioengineering</source> (<year>2023</year>) <volume>10</volume>(<issue>1</issue>):<fpage>119</fpage>. <pub-id pub-id-type="doi">10.3390/bioengineering10010119</pub-id>
</citation>
</ref>
<ref id="B30">
<label>30.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Multi-view unet for automated GI tract segmentation</article-title>. In: <conf-name>2022 5th International Conference on Pattern Recognition and Artificial Intelligence (PRAI)</conf-name>; <conf-date>19-21 August 2022</conf-date>; <conf-loc>Chengdu, China</conf-loc>. <publisher-name>IEEE</publisher-name> (<year>2022</year>) p. <fpage>1067</fpage>&#x2013;<lpage>72</lpage>.</citation>
</ref>
<ref id="B31">
<label>31.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chia</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Lui</surname>
<given-names>N</given-names>
</name>
</person-group>. <article-title>Gastrointestinal tract segmentation using multi-task learning</article-title> (<year>2024</year>).</citation>
</ref>
<ref id="B32">
<label>32.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Georgescu</surname>
<given-names>MI</given-names>
</name>
<name>
<surname>Ionescu</surname>
<given-names>RT</given-names>
</name>
<name>
<surname>Miron</surname>
<given-names>AI</given-names>
</name>
</person-group>. <article-title>Diversity-promoting ensemble for medical image segmentation</article-title>. <source>arXiv preprint arXiv:2210.12388</source> (<year>2022</year>). <pub-id pub-id-type="doi">10.48550/arXiv.2210.12388</pub-id>
</citation>
</ref>
<ref id="B33">
<label>33.</label>
<citation citation-type="web">
<collab>Kaggle</collab>. <article-title>UW-madison GI tract image segmentation</article-title> (<year>2024</year>). <comment>Available from: <ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/competitions/uw-madison-gi-tract-image-segmentation/data">https://www.kaggle.com/competitions/uw-madison-gi-tract-image-segmentation/data</ext-link> (Accessed July 1, 2024).</comment>
</citation>
</ref>
<ref id="B34">
<label>34.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z</given-names>
</name>
<etal/>
</person-group> <article-title>Swin transformer: hierarchical vision transformer using shifted windows</article-title>. In: <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name> (<year>2021</year>) p. <fpage>10012</fpage>&#x2013;<lpage>22</lpage>.</citation>
</ref>
<ref id="B35">
<label>35.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ronneberger</surname>
<given-names>O</given-names>
</name>
<name>
<surname>Fischer</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Brox</surname>
<given-names>T</given-names>
</name>
</person-group>. <article-title>U-net: convolutional networks for biomedical image segmentation</article-title>. In: <conf-name>Medical Image Computing and Computer-Assisted Intervention&#x2013;MICCAI 2015: 18th International Conference</conf-name>; <conf-date>October 5-9, 2015</conf-date>; <conf-loc>Munich, Germany</conf-loc>. <publisher-name>Springer International Publishing</publisher-name> (<year>2015</year>) p. <fpage>234</fpage>&#x2013;<lpage>41</lpage>.</citation>
</ref>
</ref-list>
</back>
</article>