<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2025.1618607</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Artificial Intelligence</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>AI-assisted anatomical structure recognition and segmentation via mamba-transformer architecture in abdominal ultrasound images</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Chang</surname> <given-names>Shih-Fang</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3039968/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Wu</surname> <given-names>Po-Yi</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Tsai</surname> <given-names>Ming-Chang</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Tseng</surname> <given-names>Vincent S.</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Wang</surname> <given-names>Chi-Chih</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1079724/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Information and Communications Research Laboratories, Industrial Technology Research Institute</institution>, <addr-line>Hsinchu</addr-line>, <country>Taiwan</country></aff>
<aff id="aff2"><sup>2</sup><institution>Department of Computer Science, National Yang Ming Chiao Tung University</institution>, <addr-line>Hsinchu</addr-line>, <country>Taiwan</country></aff>
<aff id="aff3"><sup>3</sup><institution>School of Medicine, Chung Shan Medical University</institution>, <addr-line>Taichung</addr-line>, <country>Taiwan</country></aff>
<aff id="aff4"><sup>4</sup><institution>Division of Gastroenterology and Hepatology, Department of Internal Medicine, Chung Shan Medical University Hospital</institution>, <addr-line>Taichung</addr-line>, <country>Taiwan</country></aff>
<author-notes>
<fn fn-type="edited-by" id="fn0001">
<p>Edited by: Tuan D. Pham, Queen Mary University of London, United Kingdom</p>
</fn>
<fn fn-type="edited-by" id="fn0002">
<p>Reviewed by: Tanvi Luthra, All India Institute of Medical Sciences, India</p>
<p>Mahendra Gawali, Sanjivani University, India</p>
</fn>
<corresp id="c001">&#x002A;Correspondence: Chi-Chih Wang, <email>bananaudwang@gmail.com</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>23</day>
<month>07</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>8</volume>
<elocation-id>1618607</elocation-id>
<history>
<date date-type="received">
<day>28</day>
<month>05</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>09</day>
<month>07</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2025 Chang, Wu, Tsai, Tseng and Wang.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Chang, Wu, Tsai, Tseng and Wang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec id="sec1">
<title>Background</title>
<p>Abdominal ultrasonography is a primary diagnostic tool for evaluating medical conditions within the abdominal cavity. Accurate determination of the relative locations of intra-abdominal organs and lesions based on anatomical features in ultrasound images is essential in diagnostic sonography. Recognizing and extracting anatomical landmarks facilitates lesion evaluation and enhances diagnostic interpretation. Recent artificial intelligence (AI) segmentation methods employing deep neural networks (DNNs) and transformers encounter challenges in balancing the preservation of feature-dependency information with computational efficiency, limiting their clinical applicability.</p>
</sec>
<sec id="sec2">
<title>Methods</title>
<p>The anatomical structure recognition framework, MaskHybrid, was developed using a private dataset comprising 34,711 abdominal ultrasound images of 2,063 patients from CSMUH. The dataset included abdominal organs and vascular structures (hepatic vein, inferior vena cava, portal vein, gallbladder, kidney, pancreas, spleen) and liver lesions (hepatic cyst, tumor). MaskHybrid adopted a mamba-transformer hybrid architecture consisting of an evolved backbone network, encoder, and corresponding decoder to capture long-range spatial dependencies and contextual information effectively, demonstrating improved image segmentation capabilities in visual tasks while mitigating the computational burden associated with the transformer-based attention mechanism.</p>
</sec>
<sec id="sec3">
<title>Results</title>
<p>Experiments on the retrospective dataset achieved a mean average precision (mAP) score of 74.13% for anatomical landmark segmentation in abdominal ultrasound images. Our proposed framework outperformed baselines across most organ and lesion types and effectively segmented challenging anatomical structures. Moreover, MaskHybrid exhibited a significantly shorter inference time (0.120&#x202F;&#x00B1;&#x202F;0.013&#x202F;s), achieving inference 2.5 times faster than AI models of similar size. Combining Mamba and transformer architectures, this hybrid design was well-suited for the timely segmentation of complex anatomical structures in abdominal ultrasonography, where accuracy and efficiency are critical in clinical practice.</p>
</sec>
<sec id="sec4">
<title>Conclusion</title>
<p>The proposed mamba-transformer hybrid recognition framework simultaneously detects and segments multiple abdominal organs and lesions in ultrasound images, achieving superior segmentation accuracy, visualization effect, and inference efficiency, thereby facilitating improved medical image interpretation and near real-time diagnostic sonography that meets clinical needs.</p>
</sec>
</abstract>
<kwd-group>
<kwd>anatomical structure</kwd>
<kwd>image segmentation</kwd>
<kwd>abdominal ultrasound</kwd>
<kwd>sonography</kwd>
<kwd>artificial intelligence</kwd>
<kwd>deep learning</kwd>
<kwd>transformer</kwd>
<kwd>state space models</kwd>
</kwd-group>
<contract-num rid="cn1">Q301AA3110</contract-num>
<contract-sponsor id="cn1">Industrial Technology Research Institute<named-content content-type="fundref-id">10.13039/501100003848</named-content></contract-sponsor>
<counts>
<fig-count count="6"/>
<table-count count="4"/>
<equation-count count="3"/>
<ref-count count="43"/>
<page-count count="14"/>
<word-count count="8010"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Medicine and Public Health</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec5">
<label>1</label>
<title>Introduction</title>
<p>Abdominal ultrasonography (US) is a primary diagnostic tool for evaluating medical conditions and discomfort within the abdominal cavity (<xref ref-type="bibr" rid="ref33">Tomizawa et al., 2017</xref>). Physicians frequently use the abdominal US to screen for lesions in abdominal organs, including the liver, gallbladder, kidneys, pancreas, spleen, and adjacent blood vessels, facilitating a comprehensive assessment of intra-abdominal structures. The deep location of abdominal organs within the body and their potential obscuration by bone structures or intestinal gas often result in partially captured organ images on abdominal ultrasound. Variations in ultrasound imaging equipment and systems further complicate image interpretation, posing significant assessment challenges. Despite its relatively lower image resolution compared to advanced medical imaging modalities such as computed tomography (CT) and magnetic resonance imaging (MRI), US remains irreplaceable for the timely detection of potentially life-threatening conditions such as acute abdomen and supports further diagnosis and intervention.</p>
<p>Identifying the relative location of abdominal organs and lesions based on anatomical or pathological features in US images is essential in diagnostic sonography. However, due to the inherent characteristics of ultrasound imaging, including blurred textures and indistinct organ boundaries, interpretation can be challenging, particularly for inexperienced physicians and inadequately trained technicians (<xref ref-type="bibr" rid="ref29">Reddy et al., 2021</xref>). With the rise of artificial intelligence (AI), deep neural network (DNN) techniques (<xref ref-type="bibr" rid="ref29">Reddy et al., 2021</xref>; <xref ref-type="bibr" rid="ref6">Cheng and Malhi, 2017</xref>; <xref ref-type="bibr" rid="ref8">Dandan et al., 2020</xref>; <xref ref-type="bibr" rid="ref37">Xu et al., 2018</xref>; <xref ref-type="bibr" rid="ref13">Hatture and Kadakol, 2021</xref>) have shown promise in facilitating object detection and instance segmentation of abdominal organs, reducing examination interpretation time in US images. Due to the time-consuming annotation process, most studies on the abdominal US have trained their AI models using small datasets, typically comprising only hundreds or thousands of labeled images (<xref ref-type="bibr" rid="ref32">Song, 2021</xref>). Consequently, extracting features from limited data to enhance model training is essential for improving generalizability in clinical applications.</p>
<p>Object detection and instance segmentation in US image analysis aims to identify regions of interest (ROI) as reference points for lesion assessment and aid in diagnostic interpretation. Therefore, accurate ROI extraction is critical for defining organs and lesion boundaries in abdominal US images. Historically, abdominal anatomical recognition heavily relied on the generation of hand-crafted image characteristics to expand feature dimension spaces. For instance, the light neural network, a time-sensitive attention-radial basis function network (TSA-RBFN), was designed to calculate distances within feature dimensions, aiding in segmenting and measuring inflamed gallbladder volumes associated with cholecystitis and gallstones (<xref ref-type="bibr" rid="ref23">Muneeswaran and Rajasekaran, 2018</xref>). Similarly, wavelet decomposition has been employed in high-resolution US images to enhance gallbladder localization and facilitate the detection of suspicious gallbladder polyps (<xref ref-type="bibr" rid="ref5">Chen et al., 2020</xref>). Active contour segmentation with wavelet filtering has also been further applied to liver disease classification (<xref ref-type="bibr" rid="ref17">Krishnan and Radhakrishnan, 2017</xref>). 
More recently, the continued advancement of deep learning has revolutionized abdominal US imaging applications (<xref ref-type="bibr" rid="ref3">Cai and Pfob, 2025</xref>), particularly in the automatic feature extraction and recognition of abdominal organs such as the kidney (<xref ref-type="bibr" rid="ref28">Ravishankar et al., 2017</xref>; <xref ref-type="bibr" rid="ref41">Yin et al., 2019</xref>; <xref ref-type="bibr" rid="ref40">Yin et al., 2020</xref>; <xref ref-type="bibr" rid="ref27">Peng et al., 2023</xref>; <xref ref-type="bibr" rid="ref26">Peng et al., 2023</xref>), prostate (<xref ref-type="bibr" rid="ref27">Peng et al., 2023</xref>; <xref ref-type="bibr" rid="ref16">Karimi et al., 2019</xref>; <xref ref-type="bibr" rid="ref19">Lei et al., 2019</xref>; <xref ref-type="bibr" rid="ref25">Orlando et al., 2020</xref>), gallbladder (<xref ref-type="bibr" rid="ref24">Obaid et al., 2023</xref>), and liver (<xref ref-type="bibr" rid="ref31">Ryu et al., 2021</xref>; <xref ref-type="bibr" rid="ref7">Dadoun et al., 2022</xref>; <xref ref-type="bibr" rid="ref22">M&#x0103;muleanu et al., 2022</xref>; <xref ref-type="bibr" rid="ref18">Lee et al., 2020</xref>; <xref ref-type="bibr" rid="ref1">Biswas et al., 2018</xref>; <xref ref-type="bibr" rid="ref36">Xi et al., 2021</xref>; <xref ref-type="bibr" rid="ref34">Turco et al., 2022</xref>).</p>
<p><xref ref-type="bibr" rid="ref31">Ryu et al. (2021)</xref> introduced a multi-task system based on the Visual Geometry Group Network (VGG-Net) for segmenting and classifying liver lesions in US images with user-provided click guidance. <xref ref-type="bibr" rid="ref28">Ravishankar et al. (2017)</xref> developed a shape-regularized U-Net (SR-UNet) segmentation framework that integrates shape priors into fully convolutional networks to enhance robustness against low contrast and artifacts. Strategies incorporating morphological information have also improved the effectiveness of DNN-based segmentation tasks. <xref ref-type="bibr" rid="ref41">Yin et al. (2019)</xref> and <xref ref-type="bibr" rid="ref40">Yin et al. (2020)</xref> developed a boundary distance regression network to improve the segmentation robustness against variations in kidney appearance. Peng et al. employed a contour extraction approach (<xref ref-type="bibr" rid="ref27">Peng et al., 2023</xref>) and an automatic searching polygon tracking method (<xref ref-type="bibr" rid="ref26">Peng et al., 2023</xref>) to address the challenges of unclear boundaries and diverse kidney shapes in US images. Similarly, Obaid (<xref ref-type="bibr" rid="ref24">Obaid et al., 2023</xref>) applied active contour segmentation integrated with DNN models to delineate organ boundaries and classify gallbladder disease. Additionally, blood vessels within the liver are critical anatomical landmarks in delineating the liver&#x2019;s anatomy and identifying adjacent organs, such as the pancreas. Deep learning (U-Net) and detection transformer (DETR) have been applied to the characteristic identification and lesion segmentation of liver diseases, including hepatic cysts and tumors (<xref ref-type="bibr" rid="ref31">Ryu et al., 2021</xref>; <xref ref-type="bibr" rid="ref7">Dadoun et al., 2022</xref>; <xref ref-type="bibr" rid="ref22">M&#x0103;muleanu et al., 2022</xref>). <xref ref-type="bibr" rid="ref18">Lee et al. 
(2020)</xref> utilized deep learning (VGG-Net) to predict the meta-analysis of histological data in viral hepatitis (METAVIR) score and classify liver fibrosis severity for screening and longitudinal assessment of US examinations. <xref ref-type="bibr" rid="ref1">Biswas et al. (2018)</xref> applied a deeper DNN structure (GoogLeNet) to characterize tissue in fatty liver disease and stratify normal and abnormal tissues. <xref ref-type="bibr" rid="ref36">Xi et al. (2021)</xref> employed a similar DNN architecture (ResNet) with pre-trained weights to distinguish between benign and malignant liver lesions. Advanced supervised multidirectional DNN mechanisms (3D V-Net series) (<xref ref-type="bibr" rid="ref19">Lei et al., 2019</xref>; <xref ref-type="bibr" rid="ref25">Orlando et al., 2020</xref>) were further employed to segment prostate volume for prostate cancer diagnostic applications. Despite having relatively shallow architecture with only a few dozen layers, these deep-learning models have outperformed radiologists in specific tasks.</p>
<p>Moreover, <xref ref-type="bibr" rid="ref34">Turco et al. (2022)</xref> interpreted the spatiotemporal features of vascular perfusion and characterized vascular structures in contrast-enhanced ultrasound (CEUS) to improve the precise characterization of focal liver lesions. <xref ref-type="bibr" rid="ref43">Zhang et al. (2024)</xref> proposed the SEG-LUS semantic segmentation model, incorporating multi-head self-attention to identify small ROIs, such as the inferior vena cava, portal vein branches, and hepatic artery, during clinical scanning. While current studies have reported Dice scores (0.826&#x2013;0.957; <xref ref-type="bibr" rid="ref28">Ravishankar et al., 2017</xref>; <xref ref-type="bibr" rid="ref27">Peng et al., 2023</xref>; <xref ref-type="bibr" rid="ref26">Peng et al., 2023</xref>; <xref ref-type="bibr" rid="ref43">Zhang et al., 2024</xref>) and diagnostic accuracies (83.5&#x2013;98.4%; <xref ref-type="bibr" rid="ref27">Peng et al., 2023</xref>; <xref ref-type="bibr" rid="ref24">Obaid et al., 2023</xref>; <xref ref-type="bibr" rid="ref18">Lee et al., 2020</xref>) for abdominal US, most research focuses on learning from US images with a single label per image in multi-organ scenes and subsequently inferring the organ with the highest probability. Such limitations hinder the applicability of these methods in clinical scenarios that require the simultaneous identification of multiple organs to enhance the diagnostic quality in abdominal ultrasound imaging.</p>
<p>This study aimed to develop an efficient AI-based anatomical recognition framework capable of automatically and simultaneously detecting and segmenting multiple anatomical landmarks from the abdominal US images, thereby enhancing generalizability and diagnostic accuracy in clinical practice.</p>
</sec>
<sec sec-type="materials|methods" id="sec6">
<label>2</label>
<title>Materials and methods</title>
<sec id="sec7">
<label>2.1</label>
<title>Study population</title>
<sec id="sec8">
<label>2.1.1</label>
<title>Participants</title>
<p>The retrospective dataset was obtained from outpatient examinations using ultrasound scanners from Toshiba, Hitachi, General Electric, Canon, and Siemens at Chung Shan Medical University Hospital (CSMUH) between April 2013 and May 2024. It included 34,711 B-type abdominal ultrasound images (format: JPG, size: 480 &#x00D7; 640 to 970 &#x00D7; 1,552 pixels) from 2,063 patients (male: 56.8%, female: 43.2%). This study underwent a medical ethics review and was approved by the CSMUH Institutional Review Board (IRB) (IRB No: CS2-22003), and all patient identities were anonymized before images were released, eliminating the need for informed consent from included patients.</p>
</sec>
<sec id="sec9">
<label>2.1.2</label>
<title>Data annotation</title>
<p>Given that the quality of collected US images can be affected by various health conditions of the abdominal organs, several internists specializing in hepatology and gastroenterology (with 8&#x2013;20&#x202F;years of professional experience) were invited to establish accurate annotations for a high-quality dataset. The dataset was annotated using an interactive labeling mechanism for image segmentation to facilitate AI-assisted recognition of organs and related lesions required for US examination, thereby reducing labor-intensive processes. After each US image set of the same patient case was randomly assigned to a physician, the physicians marked only the intersecting line segments to indicate potential organ regions. A polygon-based contouring foreground was then automatically generated using the optimized graph-based segmentation algorithm, GrabCut (<xref ref-type="bibr" rid="ref30">Rother et al., 2004</xref>; <xref ref-type="bibr" rid="ref38">Xu et al., 2017</xref>), which progressively enhances and streamlines the segmentation contour endpoints by considering color resemblance and spatial closeness, as illustrated in <xref ref-type="fig" rid="fig1">Figure 1</xref>. The ground truth was then established for the dataset for subsequent modeling of detection and segmentation tasks. This dataset involved seven abdominal organs and vascular structures, including the hepatic vein, inferior vena cava, portal vein, gallbladder, kidney, pancreas, and spleen, and two related liver lesions, including the hepatic cysts and tumors, comprising 6,332, 3,977, 16,202, 8,183, 5,858, 3,492, 1,358, 2,630, and 8,191 marks, respectively.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p><bold>(A)</bold> Line segments and <bold>(B)</bold> polygon-based contouring foregrounds were created with our own interactive labeling mechanism. Intersecting line segments annotated by physicians are used to indicate potential organ regions, and the GrabCut segmentation algorithm is then used to generate the ground truth automatically.</p>
</caption>
<graphic xlink:href="frai-08-1618607-g001.tif">
<alt-text content-type="machine-generated">Ultrasound images labeled A and B. Image A has green lines outlining a structure with two segments. Image B shows a different green outline encapsulating an area partially overlapping with the marked region in A.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec10">
<label>2.1.3</label>
<title>Dataset split</title>
<p>The entire private dataset of patients and US images was randomly divided into training, validation, and testing sets based on patient cases, comprising 21,039, 6,775, and 6,897 images of 1,240, 408, and 415 cases, respectively, with approximate ratios of 60, 20, and 20%. <xref ref-type="table" rid="tab1">Table 1</xref> and <xref ref-type="fig" rid="fig2">Figure 2</xref> present detailed information regarding the number of images and the distribution of annotations for anatomical structures.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>The training, validation, and testing sub-datasets for abdominal anatomical recognition modeling.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top" rowspan="2">Anatomical Landmarks</th>
<th align="center" valign="top" colspan="2">Training</th>
<th align="center" valign="top" colspan="2">Validation</th>
<th align="center" valign="top" colspan="2">Testing</th>
<th align="center" valign="top" colspan="3">Total</th>
</tr>
<tr>
<th align="center" valign="top">cases</th>
<th align="center" valign="top">images</th>
<th align="center" valign="top">cases</th>
<th align="center" valign="top">images</th>
<th align="center" valign="top">cases</th>
<th align="center" valign="top">images</th>
<th align="center" valign="top">cases</th>
<th align="center" valign="top">images</th>
<th align="center" valign="top">marks</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Hepatic vein</td>
<td align="center" valign="top">791</td>
<td align="center" valign="top">2,878</td>
<td align="center" valign="top">247</td>
<td align="center" valign="top">970</td>
<td align="center" valign="top">258</td>
<td align="center" valign="top">926</td>
<td align="center" valign="top">1,296</td>
<td align="center" valign="top">4,774</td>
<td align="center" valign="top">6,332</td>
</tr>
<tr>
<td align="left" valign="top">Inferior vena cava</td>
<td align="center" valign="top">673</td>
<td align="center" valign="top">2,340</td>
<td align="center" valign="top">220</td>
<td align="center" valign="top">827</td>
<td align="center" valign="top">224</td>
<td align="center" valign="top">805</td>
<td align="center" valign="top">1,117</td>
<td align="center" valign="top">3,972</td>
<td align="center" valign="top">3,977</td>
</tr>
<tr>
<td align="left" valign="top">Portal vein</td>
<td align="center" valign="top">1,100</td>
<td align="center" valign="top">7,893</td>
<td align="center" valign="top">352</td>
<td align="center" valign="top">2,561</td>
<td align="center" valign="top">366</td>
<td align="center" valign="top">2,586</td>
<td align="center" valign="top">1,818</td>
<td align="center" valign="top">13,040</td>
<td align="center" valign="top">16,202</td>
</tr>
<tr>
<td align="left" valign="top">Gallbladder</td>
<td align="center" valign="top">984</td>
<td align="center" valign="top">4,894</td>
<td align="center" valign="top">329</td>
<td align="center" valign="top">1,558</td>
<td align="center" valign="top">328</td>
<td align="center" valign="top">1,593</td>
<td align="center" valign="top">1,641</td>
<td align="center" valign="top">8,045</td>
<td align="center" valign="top">8,183</td>
</tr>
<tr>
<td align="left" valign="top">Kidney</td>
<td align="center" valign="top">1,034</td>
<td align="center" valign="top">3,564</td>
<td align="center" valign="top">346</td>
<td align="center" valign="top">1,149</td>
<td align="center" valign="top">343</td>
<td align="center" valign="top">1,144</td>
<td align="center" valign="top">1,723</td>
<td align="center" valign="top">5,857</td>
<td align="center" valign="top">5,858</td>
</tr>
<tr>
<td align="left" valign="top">Pancreas</td>
<td align="center" valign="top">806</td>
<td align="center" valign="top">2,098</td>
<td align="center" valign="top">265</td>
<td align="center" valign="top">662</td>
<td align="center" valign="top">265</td>
<td align="center" valign="top">730</td>
<td align="center" valign="top">1,336</td>
<td align="center" valign="top">3,490</td>
<td align="center" valign="top">3,492</td>
</tr>
<tr>
<td align="left" valign="top">Spleen</td>
<td align="center" valign="top">536</td>
<td align="center" valign="top">810</td>
<td align="center" valign="top">177</td>
<td align="center" valign="top">274</td>
<td align="center" valign="top">182</td>
<td align="center" valign="top">273</td>
<td align="center" valign="top">895</td>
<td align="center" valign="top">1,357</td>
<td align="center" valign="top">1,358</td>
</tr>
<tr>
<td align="left" valign="top">Hepatic cyst</td>
<td align="center" valign="top">382</td>
<td align="center" valign="top">1,421</td>
<td align="center" valign="top">130</td>
<td align="center" valign="top">408</td>
<td align="center" valign="top">119</td>
<td align="center" valign="top">384</td>
<td align="center" valign="top">631</td>
<td align="center" valign="top">2,213</td>
<td align="center" valign="top">2,630</td>
</tr>
<tr>
<td align="left" valign="top">Tumor</td>
<td align="center" valign="top">566</td>
<td align="center" valign="top">3,727</td>
<td align="center" valign="top">186</td>
<td align="center" valign="top">1,271</td>
<td align="center" valign="top">185</td>
<td align="center" valign="top">1,260</td>
<td align="center" valign="top">937</td>
<td align="center" valign="top">6,258</td>
<td align="center" valign="top">8,191</td>
</tr>
<tr>
<td align="left" valign="top">Total</td>
<td align="center" valign="top">1,240</td>
<td align="center" valign="top">21,039</td>
<td align="center" valign="top">408</td>
<td align="center" valign="top">6,775</td>
<td align="center" valign="top">415</td>
<td align="center" valign="top">6,897</td>
<td align="center" valign="top">2,063</td>
<td align="center" valign="top">34,711</td>
<td align="center" valign="top">56,223</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Each ultrasound image set of the same patient case was assigned to only one of the training, validation, and testing sub-datasets.</p>
</table-wrap-foot>
</table-wrap>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Number of patients and ultrasound images, and the distribution of anatomical structure annotations in the private dataset.</p>
</caption>
<graphic xlink:href="frai-08-1618607-g002.tif">
<alt-text content-type="machine-generated">Flowchart and bar graph displaying data from 2,063 patients divided into training, validation, and testing sets. The training set includes 1,240 patients with 21,039 images; the validation set has 408 patients with 6,775 images; the testing set has 415 patients with 6,897 images. The bar graph shows annotations of anatomical structures, with the gallbladder and portal vein having the highest annotations, followed by the kidney and tumors.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="sec11">
<label>2.2</label>
<title>Studies for algorithm development</title>
<p>In the pre-deep learning era, image segmentation for anatomical landmark recognition relied on hand-crafted features and classical computer vision techniques, which were limited by occlusion and overlapping objects. Currently, the use of DNNs and transformers to enhance segmentation accuracy and scalability represents one of the main trends in contemporary medical image analysis.</p>
<sec id="sec12">
<label>2.2.1</label>
<title>DNN-based segmentation</title>
<p>Mask R-CNN (<xref ref-type="bibr" rid="ref14">He et al., 2017</xref>) is a typical two-stage image segmentation design (object detection followed by pixel-by-pixel mask prediction) that deals with varying shapes and generalizes to several types of scenes, handling simple overlapping objects effectively. The YOLO (You Only Look Once) series (<xref ref-type="bibr" rid="ref2">Boesch, 2024</xref>) employs a single-stage detection paradigm and has evolved significantly in speed, accuracy, feature extraction, and computational efficiency. It evolved from its grid-based, single-stage detection (YOLOv1) to a multi-scale anchor-based approach (YOLOv2, YOLOv3), improving accuracy and robustness. CSPDarkNet, PANet, CIoU loss (YOLOv4), and anchor-free designs with model scaling (YOLOv6, YOLOv7) optimized speed and precision. PGI, GELAN (YOLOv9), and the removal of Non-Maximum Suppression (YOLOv10) enhanced detection and segmentation speed. The latest version, YOLOv11, integrates object detection, segmentation, pose estimation, oriented bounding boxes, and classification to advance performance. However, the trade-off between high-precision segmentation and computational efficiency in DNN-based segmentation limits its applicability in real-world scenarios (<xref ref-type="bibr" rid="ref39">Xu et al., 2024</xref>).</p>
</sec>
<sec id="sec13">
<label>2.2.2</label>
<title>Transformer-based segmentation</title>
<p>Recent advancements have used transformers for image segmentation, improving the understanding of the global context. The architecture consists of an encoder-decoder structure, where the encoder captures contextual information and the decoder generates the output sequence. The self-attention mechanism enables transformers to evaluate the relative importance of input elements, effectively capturing long-range dependencies that may be overlooked by recurrent neural networks (RNNs) and convolutional neural networks (CNNs). This capability is particularly beneficial for segmental tasks, where the relevant context may not be confined to regions locally in US images. DETR (<xref ref-type="bibr" rid="ref4">Carion et al., 2020</xref>) was the first transformer to use query embeddings, combining CNN feature extraction with transformer-based decoding for object detection and instance segmentation, achieving end-to-end but at a high computational cost. By incorporating query embedding refinements and enhanced attention mechanisms, DINO (DETR with improved deNoising anchOrs) (<xref ref-type="bibr" rid="ref42">Zhang et al., 2023</xref>) stabilized bipartite matching and contrastive query selection, enhancing feature learning and convergence speed in detection and segmentation tasks. This approach accelerated learning while maintaining high precision, forming the foundation for models like MaskDINO (<xref ref-type="bibr" rid="ref20">Li et al., 2023</xref>) for dense prediction tasks. MaskDINO integrated DETR-like object detection and the DINO structure with mask prediction capabilities to extend efficient semantic and panoptic segmentation advantages. With multi-scale features, self-attention, and contrastive denoising training, MaskDINO captured global dependencies, leading to improved accuracy and robustness compared to DNN-based approaches. 
However, these transformer-based segmentation approaches still encounter computational efficiency challenges, hindering their applicability in real-time applications without optimization.</p>
</sec>
<sec id="sec14">
<label>2.2.3</label>
<title>State space models (SSM)</title>
<p>SSM is a mathematical framework that models sequence or time series data by maintaining a hidden internal state that evolves based on input signals and past states, optimizing model inference speed while maintaining model effectiveness. SSM is widely used in signal processing, control systems, time series forecasting, and deep learning. Mamba (<xref ref-type="bibr" rid="ref11">Gu and Dao, 2024</xref>), a modern SSM-based sequence model, was introduced with gated state transitions to enhance expressiveness while maintaining efficiency. It features parallelizable recurrence, reducing memory overhead and improving long-sequence modeling. By leveraging input-dependent gating and efficient kernel parameterization, Mamba achieves transformer-level performance while being computationally efficient for NLP and vision tasks. Mamba-2 (<xref ref-type="bibr" rid="ref9">Dao and Gu, 2024</xref>) built upon Mamba, refining its gating mechanisms, adaptive state transitions, parallelized recurrence, and efficient parameterization, achieving better expressiveness, efficiency, and scalability for long-range dependency modeling and rivaling transformers while reducing computational overhead. On the other hand, MambaVision (<xref ref-type="bibr" rid="ref12">Hatamizadeh and Kautz, 2024</xref>) proposed a hybrid mamba-transformer backbone, adopting Mamba for vision tasks and offering an effective alternative to deep learning and transformers for image and video understanding.</p>
</sec>
</sec>
<sec id="sec15">
<label>2.3</label>
<title>MaskHybrid, the proposed framework</title>
<p>We developed an AI-based anatomical recognition framework that presents the mamba-transformer hybrid design to enhance segmentation accuracy, visualization effects, and inference efficiency while mitigating the computational burden. <xref ref-type="fig" rid="fig3">Figure 3</xref> illustrates the architecture of the proposed framework and its major components: the mamba-transformer backbone, the MaskHybrid hybrid encoder, and the corresponding decoder. The hybrid designs were primarily implemented at the backbone and encoder levels to enhance the model performance and visualization for anatomical recognition.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Architecture and components of the proposed framework, MaskHybrid, are based on MaskDINO and further extended (gray-shaded area) to enhance segmentation accuracy and inference efficiency.</p>
</caption>
<graphic xlink:href="frai-08-1618607-g003.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a machine learning model for ultrasound image processing. It outlines steps from input images through a Mamba-Transformer Backbone, which includes convolution blocks and Mamba blocks, to feature extraction and token optimization. Output includes masks, boxes, and classes. The system involves positional embeddings, transformer-based layers, and token matching and denoising processes.</alt-text>
</graphic>
</fig>
<sec id="sec16">
<label>2.3.1</label>
<title>Mamba-transformer backbone</title>
<p>In order to capture global dependencies while preserving the spatial structure of US images, the MambaVision-like architecture was employed to extract multi-scale features across different dimensions. The first two stages retained residual convolutional blocks for rapid feature extraction, while the subsequent two stages were modified to incorporate four layers of the Mamba-2-like blocks and followed by four layers of multi-head self-attention transformer blocks. This design preserves global context and long-range spatial dependencies in vision tasks, as shown in <xref ref-type="sec" rid="sec37">Supplementary Figure 1</xref>. At this stage, features were flattened and transformed at different scales for use in the subsequent hybrid encoder.</p>
</sec>
<sec id="sec17">
<label>2.3.2</label>
<title>MaskHybrid encoder</title>
<p>The hybrid encoder consisted of multiple repeated transformer-based encoder layers (<italic>N</italic>&#x202F;=&#x202F;6), each containing a multi-scale deformable self-attention block and two multilayer perceptron (MLP) blocks. Given the efficacy of Mamba designs in capturing long-range dependencies with a limited number of layers (<xref ref-type="bibr" rid="ref9">Dao and Gu, 2024</xref>), one of the intermediate layers was replaced by a Mamba-based layer, incorporating a Mamba-2-like block to substitute the self-attention process inherent in the transformer layer. The reference architecture of the encoder layers is shown in <xref ref-type="sec" rid="sec37">Supplementary Figure 2</xref>.</p>
</sec>
<sec id="sec18">
<label>2.3.3</label>
<title>MaskHybrid decoder</title>
<p>Unlike the encoder, which incorporated a mamba-based layer, the decoder comprised only multiple repeated transformer-based decoder layers (M&#x202F;=&#x202F;9), with each decoder layer including an additional multi-head cross-attention compared to the encoder layer. For small-token counts, such as feature tokens derived from our proposed encoder, the Mamba design using recurrent-like state-space updates may exhibit suboptimal learning due to its reliance on hidden state updates rather than pairwise token interactions. This limitation could reduce effectiveness due to weaker token-to-token interactions than transformers (<xref ref-type="bibr" rid="ref9">Dao and Gu, 2024</xref>). Therefore, the decoder was designed exclusively using transformer-based layers. The reference architecture of the decoder layers is shown in <xref ref-type="sec" rid="sec37">Supplementary Figure 3</xref>.</p>
</sec>
</sec>
<sec id="sec19">
<label>2.4</label>
<title>Performance evaluation</title>
<p>Average Precision (AP) is a performance metric used to measure the model capabilities, commonly employed in computer vision tasks like object detection and instance segmentation. In specific applications, AP can be further categorized based on the task at hand, such as box AP and mask AP. Box AP focuses on the overlap between the predicted bounding box and the ground truth box at a fixed intersection over union (IoU) threshold, calculating the area under the precision-recall curve (PR-AUC) by varying the confidence level. Mask AP (<xref ref-type="bibr" rid="ref14">He et al., 2017</xref>) is designed specifically for image segmentation tasks, emphasizing the overlap between the prediction and the ground truth masks at a specified IoU threshold. In this study, mask AP was used as the performance metric, and the mAP was then calculated as the mean of the mask AP values across all organ and lesion classes of abdominal anatomical landmarks, with the IoU threshold set to 0.15 and the confidence level set to 0.3, denoted as mAP15. Since a higher IoU threshold (such as IoU&#x202F;&#x2265;&#x202F;0.5) may lead to mistakenly excluding clinically reasonable predictions, a lower IoU threshold of 0.15 was determined for model training to appropriately mark anatomical structures and pathological features in a practical clinical setting.<disp-formula id="E1">
<mml:math id="M1">
<mml:mi mathvariant="italic">IoU</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext mathvariant="italic">Area of Overlap</mml:mtext>
<mml:mo stretchy="true">(</mml:mo>
<mml:mtext mathvariant="italic">prediction mask</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext mathvariant="italic">ground truth mask</mml:mtext>
<mml:mo stretchy="true">)</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mtext mathvariant="italic">Area of Union</mml:mtext>
<mml:mo stretchy="true">(</mml:mo>
<mml:mtext mathvariant="italic">prediction mask</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext mathvariant="italic">ground truth mask</mml:mtext>
<mml:mo stretchy="true">)</mml:mo>
</mml:mrow>
</mml:mfrac>
</mml:math>
</disp-formula><disp-formula id="E2">
<mml:math id="M2">
<mml:mi>A</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mtext mathvariant="italic">class</mml:mtext>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mtext mathvariant="italic">interp</mml:mtext>
</mml:msub>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</disp-formula>Where <italic>P<sub>interp</sub></italic>(<italic>r</italic>) is the interpolated precision at a certain recall level <italic>r</italic>, which is defined as the highest precision found for any recall level <italic>r&#x2019;</italic>&#x202F;&#x2265;&#x202F;<italic>r</italic>.<disp-formula id="E3">
<mml:math id="M3">
<mml:mi mathvariant="italic">mAP</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mi>n</mml:mi>
</mml:mfrac>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mtext mathvariant="italic">classes</mml:mtext>
</mml:msub>
<mml:mi>A</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mtext mathvariant="italic">class</mml:mtext>
</mml:msub>
</mml:math>
</disp-formula>Where <italic>AP<sub>class</sub></italic> is the average precision of each class, and n is the number of classes.</p>
</sec>
<sec id="sec20">
<label>2.5</label>
<title>Implementation detail</title>
<p>In this study, original US images were uniformly resized to a dimension of 1,024 &#x00D7; 1,024 pixels, subjected to random horizontal flipping, and augmented with slight scaling using large-scale jittering (LSJ) (<xref ref-type="bibr" rid="ref10">Ghiasi et al., 2021</xref>) prior to training. The ResNet-50 (<xref ref-type="bibr" rid="ref15">He et al., 2016</xref>) and Swin Transformer (Swin-T) (<xref ref-type="bibr" rid="ref21">Liu et al., 2021</xref>) were used as backbones for baseline establishment within MaskDINO, representing small and large-sized models, respectively. The MaskHybrid model utilized six encoder layers and nine decoder layers (<italic>N</italic>&#x202F;=&#x202F;6, M&#x202F;=&#x202F;9). The feature channels in both the encoder and decoder were maintained at 256, and the hidden dimension of the feed-forward neural network (FFN) was set to 2,048. The same loss functions as MaskDINO (L1 loss and GIOU loss for box loss, focal loss for classification loss, and cross-entropy loss and dice loss for mask loss) were leveraged for model convergence. Unlike the commonly used IoU threshold of 0.5 for balanced evaluation on public datasets, a threshold of 0.15 (<xref ref-type="bibr" rid="ref35">Wang et al., 2022</xref>) was chosen because some anatomical landmarks had not been fully annotated by experts, resulting in the actual annotated size being relatively small compared to the ground truth. Employing a lower threshold may reduce the number of false negatives and affect overall model performance; however, it is considered more appropriate for screening purposes in clinical settings.</p>
<p>The fixed input image size results in a constrained token size given by the backbone network to the subsequent encoder and decoder. The tokens of MaskDINO (ResNet), MaskDINO (Swin-T), and MaskHybrid models are 21,760, 21,760, and 22,528, respectively. Since these token sizes are comparable, the candidate models require similar computing resources, with no significant impact on execution time or speed. In addition, due to hardware constraints, the baseline MaskDINO and our MaskHybrid were trained for 10 epochs on an NVIDIA RTX A6000 GPU with an initial learning rate of 1e-4. A batch size of two was used for evaluation to ensure a fair comparison, given the high memory consumption of the Swin-T backbone. An early stopping mechanism was implemented to prevent overfitting. All experiments were conducted using the PyTorch framework in this study.</p>
</sec>
</sec>
<sec sec-type="results" id="sec21">
<label>3</label>
<title>Results</title>
<sec id="sec22">
<label>3.1</label>
<title>Main results of anatomical recognition</title>
<p>As presented in <xref ref-type="table" rid="tab2">Table 2</xref>, the experiments demonstrated the efficacy of incorporating mamba-transformer architectures in anatomical landmarks segmentation. Specifically, the MaskHybrid model outperformed MaskDINO baselines with ResNet-50 and Swin Transformer backbones across most abdominal organs and lesion types, achieving a superior mAP15 score (74.13% vs. 70.68 and 72.60%). For example, it achieved the highest AP scores for the gallbladder (91.79%), kidney (95.47%), pancreas (89.36%), spleen (86.19%), hepatic vein (60.71%) and hepatic cyst (55.48%), suggesting that our model excels at detecting both organs and vascular structures. It also showed consistent segmentation performance, achieving 40.94% mAP at an average AP of higher IoU thresholds from 0.5 to 0.95, surpassing the baselines MaskDINO (Swin-T and ResNet), as shown in <xref ref-type="sec" rid="sec37">Supplementary Table 1</xref>. Since this study aims to provide a screening-based toolkit to assist inexperienced physicians or in the medical settings of rural areas, a lower IoU threshold was adopted to better recognize anatomical structures and pathological features, despite performing relatively well at most IoU thresholds.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Image segmentation performance of MaskHybrid under mAP15 metric compared to MaskDINO baselines with ResNet-50 and Swin Transformer backbones.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top" rowspan="2">Models</th>
<th align="left" valign="top" rowspan="2">Dataset</th>
<th align="center" valign="top" rowspan="2">mAP15 (%)</th>
<th align="center" valign="top" colspan="9">Average Precision (%)</th>
</tr>
<tr>
<th align="center" valign="top">Hepatic vein</th>
<th align="center" valign="top">Inferior vena cava</th>
<th align="center" valign="top">Portal vein</th>
<th align="center" valign="top">Gall-bladder</th>
<th align="center" valign="top">Kidney</th>
<th align="center" valign="top">Pancreas</th>
<th align="center" valign="top">Spleen</th>
<th align="center" valign="top">Hepatic cyst</th>
<th align="center" valign="top">Tumor</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top" rowspan="2">MaskDINO (ResNet)</td>
<td align="left" valign="top">valid</td>
<td align="center" valign="middle">70.85</td>
<td align="center" valign="middle">59.92</td>
<td align="center" valign="middle">56.77</td>
<td align="center" valign="top">66.89</td>
<td align="center" valign="top">88.66</td>
<td align="center" valign="top">95.77</td>
<td align="center" valign="top">88.23</td>
<td align="center" valign="top">83.91</td>
<td align="center" valign="top">47.03</td>
<td align="center" valign="top">50.49</td>
</tr>
<tr>
<td align="left" valign="top">test</td>
<td align="center" valign="middle">70.68</td>
<td align="center" valign="middle">57.63</td>
<td align="center" valign="middle">63.61</td>
<td align="center" valign="top">64.38</td>
<td align="center" valign="top">88.85</td>
<td align="center" valign="top">94.44</td>
<td align="center" valign="top">86.30</td>
<td align="center" valign="top">82.29</td>
<td align="center" valign="top">51.46</td>
<td align="center" valign="top">47.15</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="2">MaskDINO (Swin-T)</td>
<td align="left" valign="top">valid</td>
<td align="center" valign="bottom">72.59</td>
<td align="center" valign="bottom">60.13</td>
<td align="center" valign="bottom"><bold>61.68</bold></td>
<td align="center" valign="bottom">69.50</td>
<td align="center" valign="bottom">90.01</td>
<td align="center" valign="bottom">96.71</td>
<td align="center" valign="bottom">89.54</td>
<td align="center" valign="bottom">83.19</td>
<td align="center" valign="bottom">47.43</td>
<td align="center" valign="bottom"><bold>55.12</bold></td>
</tr>
<tr>
<td align="left" valign="top">test</td>
<td align="center" valign="bottom">72.60</td>
<td align="center" valign="bottom">55.24</td>
<td align="center" valign="bottom"><bold>67.13</bold></td>
<td align="center" valign="bottom">68.29</td>
<td align="center" valign="bottom">91.58</td>
<td align="center" valign="bottom">95.29</td>
<td align="center" valign="bottom">85.47</td>
<td align="center" valign="bottom">83.74</td>
<td align="center" valign="bottom">53.27</td>
<td align="center" valign="bottom"><bold>53.39</bold></td>
</tr>
<tr>
<td align="left" valign="top" rowspan="2">MaskHybrid</td>
<td align="left" valign="top">valid</td>
<td align="center" valign="bottom"><bold>73.72</bold></td>
<td align="center" valign="bottom"><bold>64.35</bold></td>
<td align="center" valign="bottom">60.36</td>
<td align="center" valign="bottom"><bold>71.05</bold></td>
<td align="center" valign="bottom"><bold>90.63</bold></td>
<td align="center" valign="bottom"><bold>97.79</bold></td>
<td align="center" valign="bottom"><bold>91.64</bold></td>
<td align="center" valign="bottom"><bold>87.58</bold></td>
<td align="center" valign="bottom"><bold>48.55</bold></td>
<td align="center" valign="bottom">51.56</td>
</tr>
<tr>
<td align="left" valign="top">test</td>
<td align="center" valign="bottom"><bold>74.13</bold></td>
<td align="center" valign="bottom"><bold>60.71</bold></td>
<td align="center" valign="bottom">66.87</td>
<td align="center" valign="bottom"><bold>70.67</bold></td>
<td align="center" valign="bottom"><bold>91.79</bold></td>
<td align="center" valign="bottom"><bold>95.47</bold></td>
<td align="center" valign="bottom"><bold>89.36</bold></td>
<td align="center" valign="bottom"><bold>86.19</bold></td>
<td align="center" valign="bottom"><bold>55.48</bold></td>
<td align="center" valign="bottom">50.62</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The MaskHybrid framework improved the overall segmentation performance, reaching 74.13% mAP, surpassing the baselines (70.68&#x2013;72.60%). Compared to large-sized models of similar size, the MaskHybrid demonstrated comparable segmentation capabilities to the transformer-based object detection framework, MaskDINO (Swin-T), and it also had superior recognition ability and extensibility to small-sized model, MaskDINO (ResNet). The bold values represent the best-performing results among the models.</p>
</table-wrap-foot>
</table-wrap>
<p>The performance improvement is likely due to the enhanced contextual modeling provided by the mamba-transformer architecture, which supports longer-range dependencies and improved spatial reasoning. Furthermore, the MaskHybrid model showed significant improvements in segmenting challenging anatomical structures such as the hepatic vein, portal vein, and hepatic cyst. Larger organs (including the gallbladder, kidneys, pancreas, and spleen: 86.19&#x2013;95.47%) exhibited higher AP than blood vessels (60.71&#x2013;70.67%) due to their relatively larger volumes. These findings suggested that the mamba-transformer hybrid design effectively captured long-range spatial dependencies and contextual information, making it well-suited for complex ultrasound image segmentation tasks where accuracy and robustness are critical.</p>
<p>In addition, the MaskHybrid model, incorporating the mamba-transformer hybrid design, achieved the closest visualization effect to the ground truth regarding both annotation type and the number of recognized structures. In contrast, the MaskDINO baselines exhibited missed anatomical structures (hepatic vein in <xref ref-type="fig" rid="fig4">Figure 4A</xref> and portal vein in <xref ref-type="fig" rid="fig4">Figure 4B</xref>) or the erroneous identification of non-existent lesions (tumor in <xref ref-type="fig" rid="fig4">Figure 4A</xref>). However, these structures were correctly recognized in our MaskHybrid model. Detailed comparisons between models are provided in the <xref ref-type="sec" rid="sec37">Supplementary Material</xref>.</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Visualization comparison of baselines and our anatomical recognition model. MaskHybrid achieved the closest visualization effect to the ground truth regarding both annotation type and the number of recognized structures. <bold>(A)</bold> MaskDINO baselines missed the hepatic vein or erroneously identified a tumor. <bold>(B)</bold> MaskDINO baselines missed the portal vein.</p>
</caption>
<graphic xlink:href="frai-08-1618607-g004.tif">
<alt-text content-type="machine-generated">Ultrasound images in two panels labeled A and B show comparisons of various segmentation methods. Each panel contains four ultrasound scans marked as Ground Truth, MaskDINO (ResNet), MaskDINO (Swin-T), and MaskHybrid. Anatomical structures like the portal vein and inferior vena cava are outlined in different colors, highlighting differences in segmentation techniques.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec23">
<label>3.2</label>
<title>Ablation experiments</title>
<p>To further investigate the effect of the mamba-transformer hybrid design, ablation experiments were performed on the segmentation performance of anatomical structures with various backbone and encoder combinations. As presented in <xref ref-type="table" rid="tab3">Table 3</xref>, the hybrid architecture of MaskHybrid (Mamba-T) as the backbone with MaskDINO as the encoder achieved superior performance, yielding the highest mAP score of 74.13% among all configurations. This indicates that the hybrid architecture, particularly the Mamba-based backbone, is effective in capturing complex anatomical features and enhancing overall model accuracy. Specifically, replacing the Swin-T backbone in the baseline with our mamba-transformer architecture (Mamba-T) improved overall performance, demonstrating superior segmentation accuracy for nearly all anatomical structures.</p>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>Performance variation of different backbone and encoder combinations.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top" rowspan="2">backbone</th>
<th align="left" valign="top" rowspan="2">encoder</th>
<th align="center" valign="top" rowspan="2">mAP15 (%)</th>
<th align="center" valign="top" colspan="9">Average Precision (%)</th>
</tr>
<tr>
<th align="center" valign="top">Hepatic vein</th>
<th align="center" valign="top">Inferior vena cava</th>
<th align="center" valign="top">Portal vein</th>
<th align="center" valign="top">Gall-bladder</th>
<th align="center" valign="top">Kidney</th>
<th align="center" valign="top">Pancreas</th>
<th align="center" valign="top">Spleen</th>
<th align="center" valign="top">Hepatic cyst</th>
<th align="center" valign="top">Tumor</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">MaskDINO (ResNet)</td>
<td align="left" valign="middle">MaskDINO</td>
<td align="center" valign="middle">70.68</td>
<td align="center" valign="middle">57.63</td>
<td align="center" valign="middle">63.61</td>
<td align="center" valign="middle">64.38</td>
<td align="center" valign="middle">88.85</td>
<td align="center" valign="middle">94.44</td>
<td align="center" valign="middle">86.30</td>
<td align="center" valign="middle">82.29</td>
<td align="center" valign="middle">51.46</td>
<td align="center" valign="middle">47.15</td>
</tr>
<tr>
<td align="left" valign="middle">MaskDINO (Swin-T)</td>
<td align="left" valign="middle">MaskDINO</td>
<td align="center" valign="middle">72.60</td>
<td align="center" valign="middle">55.24</td>
<td align="center" valign="middle"><bold>67.13</bold></td>
<td align="center" valign="middle">68.29</td>
<td align="center" valign="middle">91.58</td>
<td align="center" valign="middle">95.29</td>
<td align="center" valign="middle">85.47</td>
<td align="center" valign="middle">83.74</td>
<td align="center" valign="middle">53.27</td>
<td align="center" valign="middle"><bold>53.39</bold></td>
</tr>
<tr>
<td align="left" valign="top">MaskHybrid (Mamba-T)</td>
<td align="left" valign="middle">MaskDINO</td>
<td align="center" valign="middle"><bold>74.13</bold></td>
<td align="center" valign="middle"><bold>60.71</bold></td>
<td align="center" valign="middle">66.87</td>
<td align="center" valign="middle">70.67</td>
<td align="center" valign="middle"><bold>91.79</bold></td>
<td align="center" valign="middle"><bold>95.47</bold></td>
<td align="center" valign="middle"><bold>89.36</bold></td>
<td align="center" valign="middle"><bold>86.19</bold></td>
<td align="center" valign="middle"><bold>55.48</bold></td>
<td align="center" valign="middle">50.62</td>
</tr>
<tr>
<td align="left" valign="middle">MaskHybrid (Mamba-T)</td>
<td align="left" valign="middle">MaskHybrid</td>
<td align="center" valign="middle">73.63</td>
<td align="center" valign="middle">60.24</td>
<td align="center" valign="middle">65.93</td>
<td align="center" valign="middle"><bold>70.92</bold></td>
<td align="center" valign="middle">91.64</td>
<td align="center" valign="middle">95.43</td>
<td align="center" valign="middle">87.92</td>
<td align="center" valign="middle">84.27</td>
<td align="center" valign="middle">54.51</td>
<td align="center" valign="middle">51.81</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The hybrid architecture with the mamba-transformer backbone achieved the highest model performance. The additional use of a MaskHybrid encoder can provide better visual explanations in specific clinical scenarios while still maintaining competitive performance. The bold values represent the best-performing results among the models.</p>
</table-wrap-foot>
</table-wrap>
<p>Notably, the MaskHybrid backbone paired with the MaskHybrid encoder achieved the highest AP for the portal vein (70.92%), suggesting that matching the backbone and encoder architecture might lead to better feature alignment and precision for certain structures. On the other hand, while the MaskDINO (Swin-T) backbone and MaskDINO encoder pair had a slightly lower mAP (72.60%), it produced the highest AP for the inferior vena cava (67.13%) and tumor (53.39%), indicating that attention-based models may still offer specific benefits for difficult or irregular structure regions. In contrast, the MaskDINO (ResNet) backbone and MaskDINO encoder pair showed the lowest overall performance, both in terms of mAP and per-class AP scores. This underperformance revealed the limitations of early DNN-based backbones such as ResNet in comparison to transformer-based and derivative models in performing complex medical segmentation tasks.</p>
<p>Additionally, incorporating the MaskHybrid encoder enhanced visual interpretation in some clinical scenarios while still maintaining similar competitive performance, validating the effectiveness of this novel approach. For instance, while candidate models approximated the location of anatomical structures, only the MaskHybrid with hybrid encoder correctly identified the hepatic vein in <xref ref-type="fig" rid="fig5">Figure 5A</xref> and showed a more comprehensive tumor distribution than MaskDINO (Swin-T) and MaskHybrid in <xref ref-type="fig" rid="fig5">Figure 5B</xref>.</p>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>Visualization comparison of MaskHybrid models with and without the hybrid encoder. Both <bold>(A)</bold> and <bold>(B)</bold> are segmentations of hepatic veins and tumors. MaskHybrid with the hybrid encoder correctly identified the hepatic vein in <bold>(A)</bold> and showed a more comprehensive tumor distribution in <bold>(B)</bold>.</p>
</caption>
<graphic xlink:href="frai-08-1618607-g005.tif">
<alt-text content-type="machine-generated">Ultrasound images in two panels, A and B, comparing lesion detection techniques. Panel A shows tumors and a portal vein marked in orange and blue, respectively. The upper row displays Ground Truth and MaskDINO (Swin-T). The lower row shows MaskHybrid and MaskHybrid (hybrid encoder). Panel B depicts multiple tumors with similar layout and labels. Each sub-image highlights the variation in detection and marking across different methods.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec sec-type="discussion" id="sec24">
<label>4</label>
<title>Discussion</title>
<p>In contemporary AI development, model accuracy remains a primary objective across all learning tasks. Driven by advancements in deep learning technologies, medical AI research frequently incorporates deeper and more complex network structures to demonstrate the capability to recognize anatomical structures comprehensively, which may sacrifice recognition efficiency.</p>
<sec id="sec25">
<label>4.1</label>
<title>Computational efficiency</title>
<p>Transformer-based models enhance accuracy through attention mechanisms that selectively focus on critical information. However, the computational complexity of the attention mechanism increases substantially with the number of features, rendering it computationally inefficient and challenging for real-time clinical applications. Given that practical AI applications frequently require timely responses, posing a challenge to balancing accuracy and efficiency, we utilized a hybrid design of Mamba and transformer architectures. This approach led to the development of an enhanced AI detection and segmentation framework, MaskHybrid, aimed at reducing inference latency while preserving model performance advantages. Previous studies (<xref ref-type="bibr" rid="ref9">Dao and Gu, 2024</xref>)&#x2013;(<xref ref-type="bibr" rid="ref12">Hatamizadeh and Kautz, 2024</xref>) have shown that hybrid architectures incorporating a limited number of Mamba layers alongside attention layers can achieve state-of-the-art evaluation metrics and visual representation. Consequently, we facilitated the hybrid design by modifying the model at the backbone and encoder levels to enhance the performance of the anatomical recognition model.</p>
</sec>
<sec id="sec26">
<label>4.2</label>
<title>Visualization effect</title>
<p>Visualizing image segmentation of intra-abdominal organs can be challenging due to disease symptoms. For example, large tumor areas can lead to poor organ recognition performance in segmentation models, as seen with the hepatic vein in the ground truth of <xref ref-type="sec" rid="sec37">Supplementary Figure 6B</xref>. Although images may be affected by associated lesions, the baseline MaskDINO and our MaskHybrid models performed well, accurately delineating tumor regions closest to the ground truth. Furthermore, MaskHybrid mitigated the issue of overlapping segmented regions in identical anatomical structures, resulting in superior overall visualization outcomes.</p>
</sec>
<sec id="sec27">
<label>4.3</label>
<title>Modeling limitation</title>
<p>In our retrospective dataset, ultrasound images with mild image conditions, like ascites, were included in the cohort and used for model training, so they could be well recognized. In contrast, images with substantial intestinal gas were regarded as poor echo windows and excluded at the beginning of the study; therefore, cases with severe gas conditions were outside the scope supported by our AI recognition models. Although the annotations in the training dataset were provided by medical experts, they may not always be perfectly accurate. Physicians might overlook certain organs or lesions in US images during the labeling process, such as the inferior vena cava in <xref ref-type="fig" rid="fig6">Figure 6A</xref>, the hepatic vein in <xref ref-type="fig" rid="fig6">Figure 6B</xref>, and the hepatic vein in <xref ref-type="fig" rid="fig6">Figure 6C</xref>. Such training data limitations can restrict training performance. However, our model demonstrates good performance by leveraging long-range dependencies of image features, effectively identifying anatomical structures missed in the ground truth, thereby successfully recognizing the hepatic vein in <xref ref-type="fig" rid="fig6">Figure 6C</xref>. Moreover, the model may exhibit errors due to a lack of axis information or incorrect probe orientation. This limitation in spatial recognition may lead to misinterpretations, such as the case in <xref ref-type="sec" rid="sec37">Supplementary Figure 8</xref>, where the liver was mistaken for the spleen due to incorrect left&#x2013;right orientation. Additionally, this study lacks external dataset validation.</p>
<fig position="float" id="fig6">
<label>Figure 6</label>
<caption>
<p>Recognition of unannotated anatomical structures from ground truth. Despite training data limitations, MaskHybrid still effectively identifies anatomical structures in the segmentation of the missing inferior vena cava in <bold>(A)</bold>, the missing hepatic veins in <bold>(B)</bold>, and the missing hepatic veins in <bold>(C)</bold>.</p>
</caption>
<graphic xlink:href="frai-08-1618607-g006.tif">
<alt-text content-type="machine-generated">Ultrasound images showing three panels (A, B, C) with four comparisons each: Ground Truth, MaskDINO (ResNet), MaskDINO (Swin-T), and MaskHybrid. Each panel highlights anatomical structures like the portal vein, hepatic vein, and inferior vena cava in green and blue overlays, demonstrating the segmentation results of different AI models.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec28">
<label>4.4</label>
<title>Inference time</title>
<p>We evaluated model execution time to ensure timely responses during inference for real-time scenarios. As presented in <xref ref-type="table" rid="tab4">Table 4</xref>, the results showed that MaskHybrid exhibited a significantly shorter inference time (0.120&#x202F;&#x00B1;&#x202F;0.013&#x202F;s) compared to MaskDINO (Swin-T) (0.304&#x202F;&#x00B1;&#x202F;0.019&#x202F;s) among similarly sized large AI models, achieving a speedup of more than 2.5 times. Although MaskDINO (ResNet) achieved the fastest inference time (0.117&#x202F;&#x00B1;&#x202F;0.013&#x202F;s), this improvement came at the potential cost of segmentation accuracy, as suggested by prior experiments. The slight increase in inference time from MaskHybrid to MaskHybrid with the hybrid encoder (0.122&#x202F;&#x00B1;&#x202F;0.014&#x202F;s) indicated that incorporating a Mamba-based layer design within the encoder introduced only a marginal computational overhead without significantly compromising efficiency. This finding highlighted the effectiveness of the hybrid architecture in reducing computational complexity while maintaining competitive performance, making our proposed framework well-suited for complex anatomical landmark segmentation tasks in the abdominal US, where accuracy and efficiency are critical in clinical practice.</p>
<table-wrap position="float" id="tab4">
<label>Table 4</label>
<caption>
<p>Model inference time of MaskHybrid compared to MaskDINO baselines.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model</th>
<th align="center" valign="top">Inference time (seconds)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">MaskDINO (ResNet)</td>
<td align="center" valign="middle">0.117&#x202F;&#x00B1;&#x202F;0.013</td>
</tr>
<tr>
<td align="left" valign="middle">MaskDINO (Swin-T)</td>
<td align="center" valign="middle">0.304&#x202F;&#x00B1;&#x202F;0.019</td>
</tr>
<tr>
<td align="left" valign="middle">MaskHybrid</td>
<td align="center" valign="middle">0.120&#x202F;&#x00B1;&#x202F;0.013</td>
</tr>
<tr>
<td align="left" valign="middle">MaskHybrid (hybrid encoder)</td>
<td align="center" valign="middle">0.122&#x202F;&#x00B1;&#x202F;0.014</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Among similarly sized large AI models, MaskHybrid performed inference more than 2.5 times faster than MaskDINO (Swin-T).</p>
</table-wrap-foot>
</table-wrap>
<p>Overall, the main focus of this pilot study is the recognition and segmentation of anatomical structures and pathological features. Our framework provides comparable execution speed to small-sized segmentation models while offering superior accuracy and visualization compared to common large-sized models, potentially enabling near real-time diagnostic sonography that meets clinical needs. In future work, follow-up studies will evaluate whether the proposed method can distinguish different types of tumors, and a small-scale reader or clinical usability study will be conducted to further evaluate the effectiveness of MaskHybrid in supporting physician interpretation in clinical scenarios.</p>
</sec>
</sec>
<sec sec-type="conclusions" id="sec29">
<label>5</label>
<title>Conclusion</title>
<p>In conclusion, the proposed AI-based anatomical recognition framework, MaskHybrid, achieved superior segmentation accuracy and visualization effect for the timely analysis of complex anatomical structures in ultrasound images. Experiments conducted on a retrospective dataset demonstrated the effectiveness and robustness of simultaneously detecting and segmenting multiple abdominal organs and lesions, particularly in challenging anatomical structures. This study is anticipated to facilitate improved diagnostic interpretation of abdominal ultrasound in the near future.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec30">
<title>Data availability statement</title>
<p>The data analyzed in this study is subject to the following licenses/restrictions: the ultrasound image dataset and corresponding metadata that support the findings of this study, including annotations and AI models, are not publicly available due to protocol and ethical restrictions of the Institutional Review Board of the Chung Shan Medical University Hospital. However, parts of anonymized data could be made available from the corresponding author upon reasonable request. Requests to access these datasets should be directed to Chi-Chih Wang, <email>bananaudwang@gmail.com</email>.</p>
</sec>
<sec sec-type="ethics-statement" id="sec31">
<title>Ethics statement</title>
<p>The studies involving humans were approved by Institutional Review Board of Chung Shan Medical University Hospital. The studies were conducted in accordance with the local legislation and institutional requirements. Written informed consent for participation was not required from the participants or the participants&#x2019; legal guardians/next of kin in accordance with the national legislation and institutional requirements.</p>
</sec>
<sec sec-type="author-contributions" id="sec32">
<title>Author contributions</title>
<p>S-FC: Visualization, Funding acquisition, Project administration, Conceptualization, Methodology, Writing &#x2013; review &#x0026; editing, Writing &#x2013; original draft, Investigation. P-YW: Software, Resources, Visualization, Formal analysis, Validation, Data curation, Writing &#x2013; review &#x0026; editing, Methodology, Conceptualization. M-CT: Data curation, Writing &#x2013; review &#x0026; editing. VT: Validation, Supervision, Writing &#x2013; review &#x0026; editing. C-CW: Formal analysis, Data curation, Validation, Writing &#x2013; review &#x0026; editing, Supervision.</p>
</sec>
<sec sec-type="funding-information" id="sec33">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research and/or publication of this article. The study was funded by Industrial Technology Research Institute (Grant No. Q301AA3110).</p>
</sec>
<ack>
<p>The authors would like to express sincere gratitude to Chung Shan Medical University and Chung Shan Medical University Hospital for providing ultrasound image data and insightful comments, and to the Industrial Technology Research Institute for valuable support and constructive feedback throughout the research.</p>
</ack>
<sec sec-type="COI-statement" id="sec34">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec35">
<title>Generative AI statement</title>
<p>The author(s) declare that no Gen AI was used in the creation of this manuscript.</p>
</sec>
<sec sec-type="disclaimer" id="sec36">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec37">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/frai.2025.1618607/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/frai.2025.1618607/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Biswas</surname> <given-names>M.</given-names></name> <name><surname>Kuppili</surname> <given-names>V.</given-names></name> <name><surname>Edla</surname> <given-names>D. R.</given-names></name> <name><surname>Suri</surname> <given-names>H. S.</given-names></name> <name><surname>Saba</surname> <given-names>L.</given-names></name> <name><surname>Marinhoe</surname> <given-names>R. T.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>Symtosis: a liver ultrasound tissue characterization and risk stratification in optimized deep learning paradigm</article-title>. <source>Comput. Methods Prog. Biomed.</source> <volume>155</volume>, <fpage>165</fpage>&#x2013;<lpage>177</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cmpb.2017.12.016</pub-id>, PMID: <pub-id pub-id-type="pmid">29512496</pub-id></citation></ref>
<ref id="ref2"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Boesch</surname> <given-names>G.</given-names></name></person-group>. (<year>2024</year>). YOLO explained: From v1 to v11. Available online at: viso.ai. <ext-link xlink:href="https://viso.ai/computer-vision/yolo-explained/" ext-link-type="uri">https://viso.ai/computer-vision/yolo-explained/</ext-link> (Accessed April 20, 2025).</citation></ref>
<ref id="ref3"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cai</surname> <given-names>L.</given-names></name> <name><surname>Pfob</surname> <given-names>A.</given-names></name></person-group> (<year>2025</year>). <article-title>Artificial intelligence in abdominal and pelvic ultrasound imaging: current applications</article-title>. <source>Abdom. Radiol.</source> <volume>50</volume>, <fpage>1775</fpage>&#x2013;<lpage>1789</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s00261-024-04640-x</pub-id>, PMID: <pub-id pub-id-type="pmid">39487919</pub-id></citation></ref>
<ref id="ref4"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Carion</surname> <given-names>N.</given-names></name> <name><surname>Massa</surname> <given-names>F.</given-names></name> <name><surname>Synnaeve</surname> <given-names>G.</given-names></name> <name><surname>Usunier</surname> <given-names>N.</given-names></name> <name><surname>Kirillov</surname> <given-names>A.</given-names></name> <name><surname>Zagoruyko</surname> <given-names>S.</given-names></name></person-group> (<year>2020</year>). &#x201C;End-to-end object detection with transformers.&#x201D; In <italic>European conference on computer vision&#x2013;ECCV 2020. Lecture Notes in Computer Science 12346</italic>, 213&#x2013;229.</citation></ref>
<ref id="ref5"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>T.</given-names></name> <name><surname>Tu</surname> <given-names>S.</given-names></name> <name><surname>Wang</surname> <given-names>H.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>F.</given-names></name> <name><surname>Jin</surname> <given-names>W.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Computer-aided diagnosis of gallbladder polyps based on high resolution ultrasonography</article-title>. <source>Comput. Methods Prog. Biomed.</source> <volume>185</volume>:<fpage>105118</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cmpb.2019.105118</pub-id>, PMID: <pub-id pub-id-type="pmid">31671340</pub-id></citation></ref>
<ref id="ref6"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cheng</surname> <given-names>P. M.</given-names></name> <name><surname>Malhi</surname> <given-names>H. S.</given-names></name></person-group> (<year>2017</year>). <article-title>Transfer learning with convolutional neural networks for classification of abdominal ultrasound images</article-title>. <source>J. Digit. Imaging</source> <volume>30</volume>, <fpage>234</fpage>&#x2013;<lpage>243</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10278-016-9929-2</pub-id>, PMID: <pub-id pub-id-type="pmid">27896451</pub-id></citation></ref>
<ref id="ref7"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dadoun</surname> <given-names>H.</given-names></name> <name><surname>Rousseau</surname> <given-names>A.-L.</given-names></name> <name><surname>Kerviler</surname> <given-names>E.</given-names></name> <name><surname>Correas</surname> <given-names>J.-M.</given-names></name> <name><surname>Tissier</surname> <given-names>A.-M.</given-names></name> <name><surname>Joujou</surname> <given-names>F.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Deep learning for the detection, localization, and characterization of focal liver lesions on abdominal US images</article-title>. <source>Radiol. Artif. Intell.</source> <volume>4</volume>:<fpage>110</fpage>. doi: <pub-id pub-id-type="doi">10.1148/ryai.210110</pub-id>, PMID: <pub-id pub-id-type="pmid">35652113</pub-id></citation></ref>
<ref id="ref8"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Dandan</surname> <given-names>L.</given-names></name> <name><surname>Huanhuan</surname> <given-names>M.</given-names></name> <name><surname>Yu</surname> <given-names>J.</given-names></name> <name><surname>Yi</surname> <given-names>S.</given-names></name></person-group> (<year>2020</year>). A multi-model organ segmentation method based on abdominal ultrasound image. In IEEE international conference on signal processing&#x2013;ICSP, 505&#x2013;510.</citation></ref>
<ref id="ref9"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Dao</surname> <given-names>T.</given-names></name> <name><surname>Gu</surname> <given-names>A.</given-names></name></person-group> (<year>2024</year>). &#x201C;Transformers are SSMs: generalized models and efficient algorithms through structured state space duality.&#x201D; In <italic>International Conference on Machine Learning&#x2013;ICML 2024. Proceedings of Machine Learning Research (PMLR) 235</italic>, pp. 10041&#x2013;10071.</citation></ref>
<ref id="ref10"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Ghiasi</surname> <given-names>G.</given-names></name> <name><surname>Cui</surname> <given-names>Y.</given-names></name> <name><surname>Srinivas</surname> <given-names>A.</given-names></name> <name><surname>Qian</surname> <given-names>R.</given-names></name> <name><surname>Lin</surname> <given-names>T.-Y.</given-names></name> <name><surname>Cubuk</surname> <given-names>E.D.</given-names></name> <etal/></person-group> (<year>2021</year>). &#x201C;Simple copy-paste is a strong data augmentation method for instance segmentation.&#x201D; In <italic>IEEE/CVF Conference on Computer Vision and Pattern Recognition&#x2013;CVPR 2021</italic>, 2917&#x2013;2927.</citation></ref>
<ref id="ref11"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Gu</surname> <given-names>A.</given-names></name> <name><surname>Dao</surname> <given-names>T.</given-names></name></person-group> (<year>2024</year>). &#x201C;Mamba: linear-time sequence modeling with selective state spaces.&#x201D; In Conference on Language Modeling&#x2013;COLM 2024.</citation></ref>
<ref id="ref12"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Hatamizadeh</surname> <given-names>A.</given-names></name> <name><surname>Kautz</surname> <given-names>J.</given-names></name></person-group> (<year>2024</year>). Mambavision: a hybrid mamba-transformer vision backbone. arXiv preprint [Preprint]. doi:10.48550/arXiv.2407.08083 (Accessed April 20, 2025).</citation></ref>
<ref id="ref13"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Hatture</surname> <given-names>S.M.</given-names></name> <name><surname>Kadakol</surname> <given-names>N.</given-names></name></person-group> (<year>2021</year>). &#x201C;Identification of intra-abdominal organs using deep learning techniques. In ICT analysis and applications.&#x201D; Lecture Notes in Networks and Systems, 154, 547&#x2013;554.</citation></ref>
<ref id="ref14"><citation citation-type="other"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Gkioxari</surname> <given-names>G.</given-names></name> <name><surname>Doll&#x00E1;r</surname> <given-names>P.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name></person-group>. (<year>2017</year>).&#x201D; Mask R-CNN.&#x201D; In <italic>IEEE International Conference on Computer Vision&#x2013;ICCV 2017</italic>, pp. 2980&#x2013;2988.</citation></ref>
<ref id="ref15"><citation citation-type="other"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Ren</surname> <given-names>S.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name></person-group> (<year>2016</year>). &#x201C;Deep residual learning for image recognition.&#x201D; In <italic>IEEE Conference on Computer Vision and Pattern Recognition&#x2013;CVPR 2016</italic>, 770&#x2013;778.</citation></ref>
<ref id="ref16"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Karimi</surname> <given-names>D.</given-names></name> <name><surname>Zeng</surname> <given-names>Q.</given-names></name> <name><surname>Mathur</surname> <given-names>P.</given-names></name> <name><surname>Avinash</surname> <given-names>A.</given-names></name> <name><surname>Mahdavi</surname> <given-names>S.</given-names></name> <name><surname>Spadinger</surname> <given-names>I.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Accurate and robust deep learning-based segmentation of the prostate clinical target volume in ultrasound images</article-title>. <source>Med. Image Anal.</source> <volume>57</volume>, <fpage>186</fpage>&#x2013;<lpage>196</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2019.07.005</pub-id>, PMID: <pub-id pub-id-type="pmid">31325722</pub-id></citation></ref>
<ref id="ref17"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Krishnan</surname> <given-names>K. R.</given-names></name> <name><surname>Radhakrishnan</surname> <given-names>S.</given-names></name></person-group> (<year>2017</year>). <article-title>Hybrid approach to classification of focal and diffused liver disorders using ultrasound images with wavelets and texture features</article-title>. <source>IET Image Process.</source> <volume>11</volume>, <fpage>530</fpage>&#x2013;<lpage>538</lpage>. doi: <pub-id pub-id-type="doi">10.1049/iet-ipr.2016.1072</pub-id></citation></ref>
<ref id="ref18"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lee</surname> <given-names>J. H.</given-names></name> <name><surname>Joo</surname> <given-names>I.</given-names></name> <name><surname>Kang</surname> <given-names>T. W.</given-names></name> <name><surname>Paik</surname> <given-names>Y. H.</given-names></name> <name><surname>Sinn</surname> <given-names>D. H.</given-names></name> <name><surname>Ha</surname> <given-names>S. Y.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Deep learning with ultrasonography: automated classification of liver fibrosis using a deep convolutional neural network</article-title>. <source>Eur. Radiol.</source> <volume>30</volume>, <fpage>1264</fpage>&#x2013;<lpage>1273</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s00330-019-06407-1</pub-id>, PMID: <pub-id pub-id-type="pmid">31478087</pub-id></citation></ref>
<ref id="ref19"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lei</surname> <given-names>Y.</given-names></name> <name><surname>Tian</surname> <given-names>S.</given-names></name> <name><surname>He</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>T.</given-names></name> <name><surname>Wang</surname> <given-names>B.</given-names></name> <name><surname>Patel</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Ultrasound prostate segmentation based on multidirectional deeply supervised V-net</article-title>. <source>Med. Phys.</source> <volume>46</volume>, <fpage>3194</fpage>&#x2013;<lpage>3206</lpage>. doi: <pub-id pub-id-type="doi">10.1002/mp.13577</pub-id>, PMID: <pub-id pub-id-type="pmid">31074513</pub-id></citation></ref>
<ref id="ref20"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>F.</given-names></name> <name><surname>Zhang</surname> <given-names>H.</given-names></name> <name><surname>Xu</surname> <given-names>H.-S.</given-names></name> <name><surname>Zhang</surname> <given-names>L.</given-names></name> <name><surname>Ni</surname> <given-names>L.</given-names></name> <name><surname>Shum</surname> <given-names>H.</given-names></name></person-group> (<year>2023</year>). &#x201C;Mask DINO: towards a unified transformer-based framework for object detection and segmentation.&#x201D; In <italic>IEEE/CVF conference on computer vision and pattern recognition&#x2013;CVPR 2023</italic>, 3041&#x2013;3050.</citation></ref>
<ref id="ref21"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Z.</given-names></name> <name><surname>Lin</surname> <given-names>Y.</given-names></name> <name><surname>Cao</surname> <given-names>Y.</given-names></name> <name><surname>Hu</surname> <given-names>H.</given-names></name> <name><surname>Wei</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2021</year>). &#x201C;Swin transformer: hierarchical vision transformer using shifted windows.&#x201D; In <italic>IEEE/CVF International Conference on Computer Vision&#x2013;ICCV 2021</italic>, 9992&#x2013;10002.</citation></ref>
<ref id="ref22"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>M&#x0103;muleanu</surname> <given-names>M.</given-names></name> <name><surname>Urhu&#x021B;</surname> <given-names>C. M.</given-names></name> <name><surname>S&#x0103;ndulescu</surname> <given-names>L. D.</given-names></name> <name><surname>Kamal</surname> <given-names>C.</given-names></name> <name><surname>P&#x0103;tra&#x0219;cu</surname> <given-names>A.-M.</given-names></name> <name><surname>Ionescu</surname> <given-names>A. G.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Deep learning algorithms in the automatic segmentation of liver lesions in ultrasound investigations</article-title>. <source>Life</source> <volume>12</volume>:<fpage>1877</fpage>. doi: <pub-id pub-id-type="doi">10.3390/life12111877</pub-id>, PMID: <pub-id pub-id-type="pmid">36431012</pub-id></citation></ref>
<ref id="ref23"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Muneeswaran</surname> <given-names>V.</given-names></name> <name><surname>Rajasekaran</surname> <given-names>M.P.</given-names></name></person-group> (<year>2018</year>). &#x201C;Gallbladder shape estimation using tree-seed optimization tuned radial basis function network for assessment of acute cholecystitis.&#x201D; In <italic>Intelligent engineering informatics. Advances in Intelligent Systems and Computing</italic>, 695, pp. 229&#x2013;239.</citation></ref>
<ref id="ref24"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Obaid</surname> <given-names>A. M.</given-names></name> <name><surname>Turki</surname> <given-names>A.</given-names></name> <name><surname>Bellaaj</surname> <given-names>H.</given-names></name> <name><surname>Ksantini</surname> <given-names>M.</given-names></name> <name><surname>AlTaee</surname> <given-names>A.</given-names></name> <name><surname>Alaerjan</surname> <given-names>A.</given-names></name></person-group> (<year>2023</year>). <article-title>Detection of gallbladder disease types using deep learning: an informative medical method</article-title>. <source>Diagnostics</source> <volume>13</volume>:<fpage>1744</fpage>. doi: <pub-id pub-id-type="doi">10.3390/diagnostics13101744</pub-id>, PMID: <pub-id pub-id-type="pmid">37238227</pub-id></citation></ref>
<ref id="ref25"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Orlando</surname> <given-names>N.</given-names></name> <name><surname>Gillies</surname> <given-names>D. J.</given-names></name> <name><surname>Gyacskov</surname> <given-names>I.</given-names></name> <name><surname>Romagnoli</surname> <given-names>C.</given-names></name> <name><surname>D&#x2019;Souza</surname> <given-names>D.</given-names></name> <name><surname>Fenster</surname> <given-names>A.</given-names></name></person-group> (<year>2020</year>). <article-title>Automatic prostate segmentation using deep learning on clinically diverse 3D transrectal ultrasound images</article-title>. <source>Med. Phys.</source> <volume>47</volume>, <fpage>2413</fpage>&#x2013;<lpage>2426</lpage>. doi: <pub-id pub-id-type="doi">10.1002/mp.14134</pub-id>, PMID: <pub-id pub-id-type="pmid">32166768</pub-id></citation></ref>
<ref id="ref26"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Peng</surname> <given-names>T.</given-names></name> <name><surname>Gu</surname> <given-names>Y.</given-names></name> <name><surname>Ruan</surname> <given-names>S.-J.</given-names></name> <name><surname>Wu</surname> <given-names>Q. J.</given-names></name> <name><surname>Cai</surname> <given-names>J.</given-names></name></person-group> (<year>2023</year>). <article-title>Novel solution for using neural networks for kidney boundary extraction in 2d ultrasound data</article-title>. <source>Biomol. Ther.</source> <volume>13</volume>:<fpage>1548</fpage>. doi: <pub-id pub-id-type="doi">10.3390/biom13101548</pub-id>, PMID: <pub-id pub-id-type="pmid">37892229</pub-id></citation></ref>
<ref id="ref27"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Peng</surname> <given-names>T.</given-names></name> <name><surname>Wu</surname> <given-names>Y.</given-names></name> <name><surname>Gu</surname> <given-names>Y.</given-names></name> <name><surname>Xu</surname> <given-names>D.</given-names></name> <name><surname>Wang</surname> <given-names>C.</given-names></name> <name><surname>Li</surname> <given-names>Q.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Intelligent contour extraction approach for accurate segmentation of medical ultrasound images</article-title>. <source>Front. Physiol.</source> <volume>14</volume>:<fpage>1177351</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fphys.2023.1177351</pub-id>, PMID: <pub-id pub-id-type="pmid">37675280</pub-id></citation></ref>
<ref id="ref28"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Ravishankar</surname> <given-names>H.</given-names></name> <name><surname>Venkataramani</surname> <given-names>R.</given-names></name> <name><surname>Thiruvenkadam</surname> <given-names>S.</given-names></name> <name><surname>Sudhakar</surname> <given-names>P.</given-names></name> <name><surname>Vaidya</surname> <given-names>V.</given-names></name></person-group> (<year>2017</year>). &#x201C;Learning and incorporating shape models for semantic segmentation.&#x201D; In <italic>Medical Image Computing and Computer Assisted Intervention&#x2013;MICCAI 2017. Lecture Notes in Computer Science 10433</italic>, 203&#x2013;211.</citation></ref>
<ref id="ref29"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Reddy</surname> <given-names>D. S.</given-names></name> <name><surname>Rajalakshmi</surname> <given-names>P.</given-names></name> <name><surname>Mateen</surname> <given-names>M. A.</given-names></name></person-group> (<year>2021</year>). <article-title>A deep learning based approach for classification of abdominal organs using ultrasound images</article-title>. <source>Biocybern. Biomed. Eng.</source> <volume>41</volume>, <fpage>779</fpage>&#x2013;<lpage>791</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bbe.2021.05.004</pub-id></citation></ref>
<ref id="ref30"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rother</surname> <given-names>C.</given-names></name> <name><surname>Kolmogorov</surname> <given-names>V.</given-names></name> <name><surname>Blake</surname> <given-names>A.</given-names></name></person-group> (<year>2004</year>). <article-title>Grabcut: interactive foreground extraction using iterated graph cuts</article-title>. <source>ACM Trans. Graph.</source> <volume>23</volume>, <fpage>309</fpage>&#x2013;<lpage>314</lpage>. doi: <pub-id pub-id-type="doi">10.1145/1015706.1015720</pub-id></citation></ref>
<ref id="ref31"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ryu</surname> <given-names>H.</given-names></name> <name><surname>Shin</surname> <given-names>S. Y.</given-names></name> <name><surname>Lee</surname> <given-names>J. Y.</given-names></name> <name><surname>Lee</surname> <given-names>K. M.</given-names></name> <name><surname>Kang</surname> <given-names>H.-J.</given-names></name> <name><surname>Yi</surname> <given-names>J.</given-names></name></person-group> (<year>2021</year>). <article-title>Joint segmentation and classification of hepatic lesions in ultrasound images using deep learning</article-title>. <source>Eur. Radiol.</source> <volume>31</volume>, <fpage>8733</fpage>&#x2013;<lpage>8742</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s00330-021-07850-9</pub-id>, PMID: <pub-id pub-id-type="pmid">33881566</pub-id></citation></ref>
<ref id="ref32"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Song</surname> <given-names>K. D.</given-names></name></person-group> (<year>2021</year>). <article-title>Current status of deep learning applications in abdominal ultrasonography</article-title>. <source>Ultrasonography</source> <volume>40</volume>, <fpage>177</fpage>&#x2013;<lpage>182</lpage>. doi: <pub-id pub-id-type="doi">10.14366/usg.20085</pub-id>, PMID: <pub-id pub-id-type="pmid">33242931</pub-id></citation></ref>
<ref id="ref33"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tomizawa</surname> <given-names>M.</given-names></name> <name><surname>Shinozaki</surname> <given-names>F.</given-names></name> <name><surname>Hasegawa</surname> <given-names>R.</given-names></name> <name><surname>Shirai</surname> <given-names>Y.</given-names></name> <name><surname>Motoyoshi</surname> <given-names>Y.</given-names></name> <name><surname>Sugiyama</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Abdominal ultrasonography for patients with abdominal pain as a first-line diagnostic imaging modality</article-title>. <source>Exp. Ther. Med.</source> <volume>13</volume>, <fpage>1932</fpage>&#x2013;<lpage>1936</lpage>. doi: <pub-id pub-id-type="doi">10.3892/etm.2017.4209</pub-id>, PMID: <pub-id pub-id-type="pmid">28565789</pub-id></citation></ref>
<ref id="ref34"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Turco</surname> <given-names>S.</given-names></name> <name><surname>Tiyarattanachai</surname> <given-names>T.</given-names></name> <name><surname>Ebrahimkheil</surname> <given-names>K.</given-names></name> <name><surname>Eisenbrey</surname> <given-names>J.</given-names></name> <name><surname>Kamaya</surname> <given-names>A.</given-names></name> <name><surname>Mischi</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Interpretable machine learning for characterization of focal liver lesions by contrast-enhanced ultrasound</article-title>. <source>IEEE Trans. Ultrason. Ferroelectr. Freq. Control</source> <volume>69</volume>, <fpage>1670</fpage>&#x2013;<lpage>1681</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TUFFC.2022.3161719</pub-id>, PMID: <pub-id pub-id-type="pmid">35320099</pub-id></citation></ref>
<ref id="ref35"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>T.-Y.</given-names></name> <name><surname>Chen</surname> <given-names>Y.-H.</given-names></name> <name><surname>Chen</surname> <given-names>J.-T.</given-names></name> <name><surname>Liu</surname> <given-names>J.-T.</given-names></name> <name><surname>Wu</surname> <given-names>P.-Y.</given-names></name> <name><surname>Chang</surname> <given-names>S.-Y.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Diabetic macular edema detection using end-to-end deep fusion model and anatomical landmark visualization on an edge computing device</article-title>. <source>Front. Med.</source> <volume>9</volume>:<fpage>851644</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fmed.2022.851644</pub-id>, PMID: <pub-id pub-id-type="pmid">35445051</pub-id></citation></ref>
<ref id="ref36"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xi</surname> <given-names>I. L.</given-names></name> <name><surname>Wu</surname> <given-names>J.</given-names></name> <name><surname>Guan</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>P. J.</given-names></name> <name><surname>Horii</surname> <given-names>S. C.</given-names></name> <name><surname>Soulen</surname> <given-names>M. C.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Deep learning for differentiation of benign and malignant solid liver lesions on ultrasonography</article-title>. <source>Abdom. Radiol.</source> <volume>46</volume>, <fpage>534</fpage>&#x2013;<lpage>543</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s00261-020-02564-w</pub-id>, PMID: <pub-id pub-id-type="pmid">32681268</pub-id></citation></ref>
<ref id="ref37"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>Z.</given-names></name> <name><surname>Huo</surname> <given-names>Y.</given-names></name> <name><surname>Park</surname> <given-names>J. H.</given-names></name> <name><surname>Landman</surname> <given-names>B.</given-names></name> <name><surname>Milkowski</surname> <given-names>A.</given-names></name> <name><surname>Grbic</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2018</year>). &#x201C;Less is more: simultaneous view classification and landmark detection for abdominal ultrasound images.&#x201D; In <italic>Medical Image Computing and Computer Assisted Intervention&#x2013;MICCAI 2018. Lecture Notes in Computer Science 11071</italic>, 711&#x2013;719.</citation></ref>
<ref id="ref38"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>N.</given-names></name> <name><surname>Price</surname> <given-names>B.</given-names></name> <name><surname>Cohen</surname> <given-names>S.</given-names></name> <name><surname>Yang</surname> <given-names>J.</given-names></name> <name><surname>Huang</surname> <given-names>T.</given-names></name></person-group> (<year>2017</year>). Deep grabcut for object selection. arXiv preprint [Preprint]. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1707.00243</pub-id> (Accessed April 20, 2025).</citation></ref>
<ref id="ref39"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>Y.</given-names></name> <name><surname>Quan</surname> <given-names>R.</given-names></name> <name><surname>Xu</surname> <given-names>W.</given-names></name> <name><surname>Huang</surname> <given-names>Y.</given-names></name> <name><surname>Chen</surname> <given-names>X.</given-names></name> <name><surname>Liu</surname> <given-names>F.</given-names></name></person-group> (<year>2024</year>). <article-title>Advances in medical image segmentation: a comprehensive review of traditional, deep learning and hybrid approaches</article-title>. <source>Bioengineering</source> <volume>11</volume>:<fpage>1034</fpage>. doi: <pub-id pub-id-type="doi">10.3390/bioengineering11101034</pub-id>, PMID: <pub-id pub-id-type="pmid">39451409</pub-id></citation></ref>
<ref id="ref40"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yin</surname> <given-names>S.</given-names></name> <name><surname>Peng</surname> <given-names>Q.</given-names></name> <name><surname>Li</surname> <given-names>H.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <name><surname>You</surname> <given-names>X.</given-names></name> <name><surname>Fischer</surname> <given-names>K.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Automatic kidney segmentation in ultrasound images using subsequent boundary distance regression and pixelwise classification networks</article-title>. <source>Med. Image Anal.</source> <volume>60</volume>:<fpage>101602</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.media.2019.101602</pub-id>, PMID: <pub-id pub-id-type="pmid">31760193</pub-id></citation></ref>
<ref id="ref41"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Yin</surname> <given-names>S.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <name><surname>Li</surname> <given-names>H.</given-names></name> <name><surname>Peng</surname> <given-names>Q.</given-names></name> <name><surname>You</surname> <given-names>X.</given-names></name> <name><surname>Furth</surname> <given-names>S. L.</given-names></name> <etal/></person-group>. (<year>2019</year>). &#x201C;Fully-automatic segmentation of kidneys in clinical ultrasound images using a boundary distance regression network.&#x201D; In <italic>IEEE International Symposium on Biomedical Imaging&#x2013;ISBI 2019</italic>. pp. 1741&#x2013;1744.</citation></ref>
<ref id="ref42"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>H.</given-names></name> <name><surname>Li</surname> <given-names>F.</given-names></name> <name><surname>Liu</surname> <given-names>S.</given-names></name> <name><surname>Zhang</surname> <given-names>L.</given-names></name> <name><surname>Su</surname> <given-names>H.</given-names></name> <name><surname>Zhu</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2023</year>). &#x201C;DINO: DETR with improved denoising anchor boxes for end-to-end object detection.&#x201D; In <italic>International Conference on Learning Representations&#x2013;ICLR</italic> 2023.</citation></ref>
<ref id="ref43"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>L.</given-names></name> <name><surname>Wu</surname> <given-names>X.</given-names></name> <name><surname>Zhang</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>Z.</given-names></name> <name><surname>Fan</surname> <given-names>Y.</given-names></name> <name><surname>Zheng</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>SEG-LUS: a novel ultrasound segmentation method for liver and its accessory structures based on multi-head self-attention</article-title>. <source>Comput. Med. Imaging Graph.</source> <volume>113</volume>:<fpage>102338</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compmedimag.2024.102338</pub-id>, PMID: <pub-id pub-id-type="pmid">38290353</pub-id></citation></ref>
</ref-list>
</back>
</article>