<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Bioeng. Biotechnol.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Bioengineering and Biotechnology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Bioeng. Biotechnol.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-4185</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1733689</article-id>
<article-id pub-id-type="doi">10.3389/fbioe.2025.1733689</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>CervSpineNet: a hybrid deep learning-based approach for the segmentation of cervical spinous processes</article-title>
<alt-title alt-title-type="left-running-head">Sawant et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fbioe.2025.1733689">10.3389/fbioe.2025.1733689</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Sawant</surname>
<given-names>Jay Sunil</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3258372"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Moukheiber</surname>
<given-names>Lama</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Nair</surname>
<given-names>Anupama</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3268474"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Mahajan</surname>
<given-names>Anubha</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3268601"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Byun</surname>
<given-names>Jaehui</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3302775"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Pichaimani</surname>
<given-names>Ishwarya</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yoon</surname>
<given-names>Sangwook T.</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Martin</surname>
<given-names>Christopher T.</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Mitchell</surname>
<given-names>Cassie S.</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/229833"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
</contrib>
</contrib-group>
<aff id="aff1">
<label>1</label>
<institution>Laboratory for Pathology Dynamics, Department of Biomedical Engineering, Georgia Institute of Technology and Emory University</institution>, <city>Atlanta</city>, <state>GA</state>, <country country="US">United States</country>
</aff>
<aff id="aff2">
<label>2</label>
<institution>Center for Machine Learning, Georgia Institute of Technology</institution>, <city>Atlanta</city>, <state>GA</state>, <country country="US">United States</country>
</aff>
<aff id="aff3">
<label>3</label>
<institution>School of Computer Science, Georgia Institute of Technology</institution>, <city>Atlanta</city>, <state>GA</state>, <country country="US">United States</country>
</aff>
<aff id="aff4">
<label>4</label>
<institution>Department of Orthopedic Surgery, Emory University</institution>, <city>Atlanta</city>, <state>GA</state>, <country country="US">United States</country>
</aff>
<aff id="aff5">
<label>5</label>
<institution>Department of Orthopedic Surgery, University of Minnesota</institution>, <city>Minneapolis</city>, <state>MN</state>, <country country="US">United States</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Cassie S. Mitchell, <email xlink:href="mailto:cassie.mitchell@bme.gatech.edu">cassie.mitchell@bme.gatech.edu</email>
</corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-01-19">
<day>19</day>
<month>01</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>13</volume>
<elocation-id>1733689</elocation-id>
<history>
<date date-type="received">
<day>27</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>10</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>18</day>
<month>12</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Sawant, Moukheiber, Nair, Mahajan, Byun, Pichaimani, Yoon, Martin and Mitchell.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Sawant, Moukheiber, Nair, Mahajan, Byun, Pichaimani, Yoon, Martin and Mitchell</copyright-holder>
<license>
<ali:license_ref start_date="2026-01-19">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Accurate segmentation of cervical spinous processes on lateral X-rays is essential for reliable anatomical landmarking, surgical planning, and longitudinal assessment of spinal deformity. However, no publicly available dataset provides pixel-level annotations of these structures, and manual delineation remains time-consuming and operator dependent. To address this gap, we curated an expert-labeled dataset of 500 cervical spine radiographs and developed CervSpineNet, a hybrid deep learning framework for automated spinous process segmentation.</p>
</sec>
<sec>
<title>Methods</title>
<p>CervSpineNet integrates a transformer-based encoder to capture global anatomical context with a lightweight convolutional decoder to refine local boundaries. Training used a compound loss function that combines Dice, Focal Tversky, Hausdorff distance transform, and Structural Similarity (SSIM) terms to jointly optimize region overlap, class balance, structural fidelity, and boundary accuracy. The model was trained and evaluated on three dataset variants: original images, contrast-enhanced images using CLAHE, and augmented images. Performance was benchmarked against four baselines: U-Net, DeepLabV3&#x002B;, the Segment Anything Model (SAM), and a text-guided SegFormer.</p>
</sec>
<sec>
<title>Results</title>
<p>Across all experimental settings, CervSpineNet consistently outperformed competing methods, achieving mean Dice coefficients above 0.93, IoU values above 0.87, and SSIM above 0.98, with substantially lower HD95 distances. The model demonstrated strong agreement with ground truth, with global MAE &#x2248; 0.005, and maintained efficient inference times of 5&#x2013;10 seconds per image. With a compact footprint of approximately 345 MB, CervSpineNet runs on standard clinical hardware and reduces manual annotation time by about 96%.</p>
</sec>
<sec>
<title>Discussion</title>
<p>These results indicate that combining transformer-driven global context with convolutional boundary refinement enables robust and reproducible spinous process segmentation on lateral cervical radiographs. By pairing an expert-annotated dataset with a high-performing, computationally efficient model, this work provides a scalable foundation for AI-assisted cervical spine analysis, supporting rapid segmentation for surgical evaluation, deformity monitoring, and large-scale retrospective studies in both research and clinical practice.</p>
</sec>
</abstract>
<kwd-group>
<kwd>artificial intelligence</kwd>
<kwd>automated musculoskeletal landmark detection</kwd>
<kwd>cervical spine segmentation</kwd>
<kwd>cervical spinous process dataset</kwd>
<kwd>deep learning in radiology</kwd>
<kwd>hybrid transformer&#x2013;CNN architecture</kwd>
<kwd>machine learning</kwd>
<kwd>radiology workflow automation</kwd>
</kwd-group>
<funding-group>
<award-group id="gs1">
<funding-source id="sp1">
<institution-wrap>
<institution>National Institutes of Health</institution>
<institution-id institution-id-type="doi" vocab="open-funder-registry" vocab-identifier="10.13039/open_funder_registry">10.13039/100000002</institution-id>
</institution-wrap>
</funding-source>
</award-group>
<award-group id="gs2">
<funding-source id="sp2">
<institution-wrap>
<institution>Directorate for Engineering</institution>
<institution-id institution-id-type="doi" vocab="open-funder-registry" vocab-identifier="10.13039/open_funder_registry">10.13039/100000084</institution-id>
</institution-wrap>
</funding-source>
</award-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. The work was supported by National Institutes of Health grant R35GM152245, the National Science Foundation CAREER award 1944247, and the Chan Zuckerberg Initiative award 253558 to CaM. The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</funding-statement>
</funding-group>
<counts>
<fig-count count="9"/>
<table-count count="5"/>
<equation-count count="20"/>
<ref-count count="61"/>
<page-count count="18"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Biomechanics</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>The cervical spine&#x2014;the upper segment of the vertebral column comprising seven articulating vertebrae (C1&#x2013;C7)&#x2014;plays a critical biomechanical and protective role. The craniocervical junction, formed by the atlas (C1) and axis (C2), enables head rotation and flexion&#x2013;extension, while the subaxial spine (C3&#x2013;C7) provides load-bearing stability and supports the weight of the head (<xref ref-type="bibr" rid="B24">Kaiser et al., 2025</xref>). Each vertebra features a posterior bony prominence called the spinous process, whose morphology varies substantially across levels and between individuals (<xref ref-type="bibr" rid="B14">Cramer, 2014</xref>). These spinous processes serve as essential anatomic landmarks for vertebral identification, muscle attachment, and radiologic orientation. Their visibility and geometry make them key indicators in diagnostic imaging, surgical navigation, and postoperative evaluation.</p>
<p>Accurate assessment of the cervical spinous processes is clinically significant in several contexts. Avulsion injuries such as clay-shoveler&#x2019;s fractures typically involve the lower cervical or upper thoracic processes and are common in activities requiring rapid spinal rotation (<xref ref-type="bibr" rid="B39">Posthuma de Boer et al., 2016</xref>). Similarly, congenital variants&#x2014;such as hypoplasia, dysplasia, or persistent apophyses&#x2014;may mimic fractures and lead to misinterpretation if not correctly recognized (<xref ref-type="bibr" rid="B42">Riew et al., 2010</xref>; <xref ref-type="bibr" rid="B16">Farooqi et al., 2017</xref>; <xref ref-type="bibr" rid="B20">Gould et al., 2020</xref>). Understanding these variants is vital for distinguishing developmental anomalies from acute pathology and for avoiding unnecessary interventions (<xref ref-type="bibr" rid="B11">Chen et al., 2006</xref>). Beyond diagnosis, spinous process delineation also assists in evaluating outcomes following a spinal surgery called anterior cervical discectomy and fusion (ACDF), which is a widely performed procedure for degenerative conditions of the cervical spine. Postoperative evaluation of fusion status, indicating successful bony union and mechanical stability between adjacent vertebrae, represents a critical clinical outcome. However, studies show that inter-observer variability in assessing postoperative fusion from dynamic radiographs can be substantial, often requiring supplemental computed tomography (CT) scans or quantitative methods (<xref ref-type="bibr" rid="B4">Ariyaratne et al., 2024</xref>). With ACDF case volumes projected to rise markedly through 2040 (<xref ref-type="bibr" rid="B37">Neifert et al., 2020</xref>), automated segmentation tools could streamline assessment and reduce dependence on manual measurements.</p>
<p>Semantic segmentation&#x2014;the pixel-level delineation of structures within images&#x2014;is central to computational imaging and biomedical analysis (<xref ref-type="bibr" rid="B22">Hollon et al., 2020</xref>; <xref ref-type="bibr" rid="B38">Ouyang et al., 2020</xref>). Manual segmentation remains the reference standard but is labor-intensive, time-consuming, and prone to inter-observer variability (<xref ref-type="bibr" rid="B30">Ma et al., 2024</xref>). Automated methods can dramatically reduce workload, improve consistency, and enable high-throughput image analysis (<xref ref-type="bibr" rid="B49">Wang et al., 2019</xref>). However, manual labeling of X-rays is particularly challenging due to overlapping anatomy, limited soft-tissue contrast, and the absence of public datasets for small bony structures such as spinous processes (<xref ref-type="bibr" rid="B49">Wang et al., 2019</xref>; <xref ref-type="bibr" rid="B51">Wang S. et al., 2021</xref>; <xref ref-type="bibr" rid="B17">Galbusera and Cina, 2024</xref>). While recent advances in deep learning have achieved outstanding results for CT and MRI segmentation (<xref ref-type="bibr" rid="B1">Ahmad et al., 2023</xref>; <xref ref-type="bibr" rid="B36">Muthukrishnan et al., 2024</xref>; <xref ref-type="bibr" rid="B45">Son et al., 2024</xref>; <xref ref-type="bibr" rid="B57">Yang et al., 2024</xref>), segmentation in plain radiographs remains relatively underexplored.</p>
<p>To date, there is no publicly available dataset containing cervical spine X-rays with corresponding pixel-level annotations of the spinous processes. Prior work suggests that certain vertebral levels, notably C1/C2 and C6/C7, are particularly difficult to identify using machine learning models (<xref ref-type="bibr" rid="B44">Shim et al., 2022</xref>; <xref ref-type="bibr" rid="B48">van Santbrink et al., 2025</xref>). Preliminary zero-shot experiments using transformer-based encoders such as SAM (<xref ref-type="bibr" rid="B10">Bucher et al., 2019</xref>) and conventional segmentation architectures like DeepLabV3&#x2b; produced unsatisfactory results. These findings underscore the need for a domain-specific dataset and a dedicated segmentation framework optimized for the cervical spine.</p>
<p>Recent literature highlights that effective medical image segmentation models must balance local boundary precision with global contextual awareness. Conventional convolutional neural networks (CNNs) excel at fine detail but are limited in modeling long-range dependencies, whereas transformer architectures capture global context at the expense of spatial granularity. Hybrid models that integrate these complementary strengths have demonstrated improved performance across biomedical imaging tasks. Approaches such as MedSAM and TransUNet combine transformer-based encoders with U-Net&#x2013;style decoders, yielding sharper boundaries and stronger structural consistency across diverse medical modalities (<xref ref-type="bibr" rid="B12">Chen J. et al., 2024</xref>; <xref ref-type="bibr" rid="B30">Ma et al., 2024</xref>). Similarly, hybrid convolution&#x2013;transformer systems in musculoskeletal imaging achieve strong Dice and IoU performance and near-expert agreement (<xref ref-type="bibr" rid="B56">Xu et al., 2022</xref>; <xref ref-type="bibr" rid="B21">Ham et al., 2023</xref>; <xref ref-type="bibr" rid="B33">Mostafa et al., 2024</xref>; <xref ref-type="bibr" rid="B2">Al-Antari et al., 2025</xref>; <xref ref-type="bibr" rid="B8">Bao et al., 2025</xref>). These developments support hybrid encoder&#x2013;decoder architectures as a robust design paradigm for accurate and data-efficient medical image segmentation.</p>
<p>Building on this foundation, the present study introduces CervSpineNet, a hybrid deep learning framework for automated segmentation of cervical spinous processes in lateral X-rays. The primary contributions are threefold:<list list-type="order">
<list-item>
<p>A new expert-labeled dataset of cervical spine radiographs with pixel-level binary masks of the spinous processes.</p>
</list-item>
<list-item>
<p>A hybrid transformer&#x2013;CNN architecture that integrates a ViT-B encoder for global context modeling with a traditional convolutional decoder with added layers for edge refinement.</p>
</list-item>
<list-item>
<p>A compound loss function that jointly optimizes region overlap, edge sharpness, class balance, and structural similarity.</p>
</list-item>
</list>
</p>
<p>CervSpineNet consistently achieved mean Dice coefficients exceeding 0.93 across experiments and demonstrated excellent agreement with ground-truth masks. Beyond accuracy, the framework reduces manual annotation time by approximately 96% and is efficient enough to run on standard hospital hardware. Together, the proposed dataset and model provide a scalable foundation for automated, high-fidelity cervical spine analysis, advancing both the methodological and translational frontiers of biomedical image segmentation.</p>
</sec>
<sec sec-type="methods" id="s2">
<label>2</label>
<title>Methodology</title>
<p>This section outlines the methodological framework used to develop and evaluate the proposed CervSpineNet model, from dataset curation and preprocessing to model training, benchmarking, and performance assessment. The overall workflow is illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Schematic overview of the CervSpineNet pipeline for automated cervical spinous process segmentation. The workflow includes dataset curation from the publicly available Cervical Spine X-ray Atlas (CSXA), manual annotation of spinous processes, and annotation-to-mask generation for binary ground-truth creation. Preprocessing steps include Contrast Limited Adaptive Histogram Equalization (CLAHE) and data augmentation to enhance variability and image contrast. The processed datasets are used for model training and inference, with results evaluated through quantitative metrics and visual comparison to expert-annotated ground truth.</p>
</caption>
<graphic xlink:href="fbioe-13-1733689-g001.tif">
<alt-text content-type="machine-generated">Flowchart depicting a data processing pipeline for cervical spine images, starting with dataset curation, including manual annotation and mask conversion. This is followed by CLAHE processing and data augmentation under data pre-processing. The data then undergoes model training. Results are obtained and used for inference, completing performance evaluation.</alt-text>
</graphic>
</fig>
<sec id="s2-1">
<label>2.1</label>
<title>Data acquisition</title>
<p>Prior work shows CNN-based segmentation is effective for vertebral bodies in radiographs and MRI (<xref ref-type="bibr" rid="B13">Chen Y. et al., 2024</xref>; <xref ref-type="bibr" rid="B53">Wang H. et al., 2025</xref>). However, no public dataset provides pixel-level masks of the cervical spinous processes, so we curated a task-specific corpus of lateral cervical spine X-rays and their corresponding binary masks.</p>
<p>We randomly sampled 500 PNG images from the Cervical Spine X-ray Atlas (CSXA) (<xref ref-type="bibr" rid="B41">Ran et al., 2024</xref>) (4,963 PNGs with JSON annotations; one image per patient) and manually annotated the spinous processes on a tablet with a stylus. Annotations were converted to binary masks using an OpenCV color-segmentation pipeline: images were converted to HSV; red hue bands (0&#xa0;&#xb0;&#x2013;10&#xa0;&#xb0;, 160&#xa0;&#xb0;&#x2013;180&#xa0;&#xb0;) corresponding to tracings were thresholded; masks from both bands were combined; contours were extracted and filled to produce clean binary ground-truth masks. All 500 image&#x2013;mask pairs were created with consistent dimensions. Trained annotators labeled the images; an expert spine surgeon (Dr. Sangwook T. Yoon) provided final review and approval. The annotation workflow is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Representative workflow for creating binary masks of the cervical spinous processes from lateral X-ray images. <bold>(A)</bold> Original cervical spine radiograph from the Cervical Spine X-ray Atlas (CSXA) dataset. <bold>(B)</bold> Manual annotation of individual spinous processes using a tablet and stylus. <bold>(C)</bold> Automated mask conversion using a color-based segmentation algorithm in HSV space, where annotated red regions are thresholded and converted to binary masks. The resulting ground-truth masks serve as pixel-level labels for model training and evaluation.</p>
</caption>
<graphic xlink:href="fbioe-13-1733689-g002.tif">
<alt-text content-type="machine-generated">Sequence of images showing a cervical spine X-ray analysis: (A) A raw X-ray image of the neck. (B) The same X-ray with vertebrae manually outlined in red. (C) A binary mask highlighting the segmented vertebrae in white against a black background. Blue arrows indicate the progression from one stage to the next.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2-2">
<label>2.2</label>
<title>Data bifurcation, pre-processing, and augmentation</title>
<p>To ensure fair evaluation and minimize overfitting (<xref ref-type="bibr" rid="B35">Muraina, 2022</xref>), we used an 80/20 split, allocating 400 images for training and 100 for testing (<xref ref-type="bibr" rid="B9">Bichri et al., 2024</xref>); the test set remained untouched throughout all experiments (<xref ref-type="bibr" rid="B29">Lones, 2021</xref>). For the training images, we generated three distinct dataset variants, each used to train a separate model configuration:<list list-type="order">
<list-item>
<p>Original dataset: the 400 unaltered radiographs and their corresponding masks.</p>
</list-item>
<list-item>
<p>CLAHE dataset: the 400 original radiographs processed with Contrast Limited Adaptive Histogram Equalization (CLAHE) to compensate for non-uniform intensity distributions common in radiographs. Specifically, CLAHE operates by dividing the image into small contextual regions, equalizing each region&#x2019;s histogram, and applying a clip limit to avoid noise amplification before recombining the tiles via bilinear interpolation (<xref ref-type="bibr" rid="B61">Zuiderveld, 1994</xref>). We used the standard OpenCV implementation.</p>
</list-item>
<list-item>
<p>Augmented dataset: the 400 original &#x2b; 400 augmented image&#x2013;mask pairs (800 total). To increase the effective training set size and improve model generalization (<xref ref-type="bibr" rid="B19">Goceri, 2023</xref>), augmentations&#x2014;affine transformations with rotations of &#xb1;10&#xa0;&#xb0; and &#xb1;45&#xa0;&#xb0; and translations of &#xb1;10 pixels along the x and y axes&#x2014;were applied to the original dataset. The larger &#xb1;45&#xa0;&#xb0; rotations were included intentionally, as several expert-provided sample radiographs contained substantial initial misalignment. Mask transformations used nearest-neighbor interpolation to preserve labels.</p>
</list-item>
</list>
</p>
<p>All three training variants (original, CLAHE, and augmented) were derived from the same underlying set of 400 cervical radiographs. These variants reflect different preprocessing pipelines applied to identical base images rather than independent datasets, thus enabling us to evaluate how raw, intensity-normalized, and synthetically diversified training distributions affected model performance.</p>
</sec>
<sec id="s2-3">
<label>2.3</label>
<title>Experimental setup</title>
<p>We trained and evaluated each model on the three dataset variants: Original (400 images); CLAHE (400 images); Augmented (800 images). The test set (n &#x3d; 100) remained constant across experiments. We compared GPU vs. CPU training/inference for efficiency. All experiments used a single system with PyTorch and an NVIDIA H100 Tensor Core GPU; the uniform environment ensured consistent timing and memory profiles. The H100&#x2019;s tensor cores and memory bandwidth supported large-batch training and stable convergence.</p>
<p>A test across five different images from the testing set shows that GPU inference averaged 5&#x2013;8&#xa0;s/image; CPU inference on an Intel&#xae; Xeon&#xae; Gold 6136 (2 &#xd7; 12 cores) averaged 9&#x2013;10&#xa0;s/image. The proposed CervSpineNet is computationally lightweight (&#x223c;345&#xa0;MB) and typically utilizes &#x223c;8&#x2013;9&#xa0;GPU cores during inference, enabling deployment without specialized hardware.</p>
</sec>
<sec id="s2-4">
<label>2.4</label>
<title>Proposed hybrid segmentation approach: CervSpineNet architecture</title>
<p>
<xref ref-type="fig" rid="F3">Figure 3</xref> shows the overall architecture and I/O. The encoder is transformer-based; the decoder is a lightweight U-Net&#x2013;style module with residual and squeeze-and-excitation (SE) blocks and bilinear up-sampling.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Schematic representation of the CervSpineNet hybrid architecture for automated segmentation of cervical spinous processes. The framework performs training and inference using paired cervical spine X-ray images and binary masks. CervSpineNet integrates a Vision Transformer (ViT-B) encoder for global contextual feature extraction with a U-Net&#x2013;style decoder for spatial detail recovery. Each stage of the decoder includes residual blocks (RB) to maintain gradient flow, squeeze-and-excitation blocks (SEB) for adaptive channel attention, and bilinear upsampling (BU) layers for smooth boundary reconstruction. Preprocessing involves image resizing and binarization, while the output mask is generated via sigmoid activation and optimized using the composite loss function combining Dice, Focal Tversky, Hausdorff Distance Transform, and SSIM terms.</p>
</caption>
<graphic xlink:href="fbioe-13-1733689-g003.tif">
<alt-text content-type="machine-generated">Diagram depicting a cervical spine segmentation model. It includes a flowchart of an X-ray image processed by a ViT-b Encoder with stages like patch embedding, positional encoding, and transformer blocks. The output latent feature map feeds into a decoder with stages increasing in resolution. Parallelly, GT and predicted masks are compared for loss computation using an optimizer labeled AdamW.</alt-text>
</graphic>
</fig>
<sec id="s2-4-1">
<label>2.4.1</label>
<title>Inputs and notations</title>
<p>We denote the input image by <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and the binary target mask by <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. Feature maps <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:msup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mo>&#xb7;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> with shape <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Convolutions use kernel <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and bias <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. Moreover, <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is convolution and <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mo>&#xb7;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is sigmoid.</p>
</sec>
<sec id="s2-4-2">
<label>2.4.2</label>
<title>Encoder: ViT-B</title>
<p>Vision transformers (ViT) capture global context and long-range dependencies valuable for medical segmentation (<xref ref-type="bibr" rid="B59">Zhang et al., 2024</xref>), but benefit from local refinements from downstream decoders (<xref ref-type="bibr" rid="B26">Khan and Khan, 2025</xref>). As a result, we base our encoder on the ViT-B backbone of the Segment Anything Model (SAM). Specifically, we load the ViT-B variant from the original SAM repository through the sam_model_registry interface using the official PyTorch implementation of SAM. The prompt encoder and mask decoder heads are eliminated from this model, leaving only the image encoder.</p>
<p>Given an input radiograph/x-ray image that is resized to <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>024</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>024</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and rescaled to [0,1], the SAM ViT-B image encoder processes the radiograph and yields a dense feature map <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:msup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> with 256 channels at a spatial resolution of <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:mn>64</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>64</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> as the output. This is given by <xref ref-type="disp-formula" rid="e1">Equation 1</xref>.<disp-formula id="e1">
<mml:math id="m12">
<mml:mrow>
<mml:msup>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mn>256</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>64</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>64</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
<p>Our unique U-Net-style decoder receives these feature maps and uses multi-scale upsampling to return the image to its original resolution.</p>
<p>The encoder is based on the SAM ViT-B image encoder, which we use as a generic feature extractor for cervical spine radiographs. Each input radiograph is first resized to 1,024 &#xd7; 1,024 and partitioned into a sequence of non-overlapping 16 &#xd7; 16 patches. These patches are linearly embedded into a high-dimensional token space, augmented with learned 2D positional encodings, and passed through a stack of multi-head self-attention and feedforward transformer blocks. Across these layers, the encoder aggregates information over the entire field-of-view, so that each token encodes both local appearance (e.g., vertebral boundaries) and long-range context (e.g., overall spinal alignment). Finally, the token sequence is reshaped back into a 2D feature map of size 64 &#xd7; 64 with 256 channels, which serves as the latent representation fed into our U-Net&#x2013;style decoder for pixel-wise mask prediction.</p>
<p>The encoder weights are initialized from the SA-1B pre-trained SAM ViT-B checkpoint and then fine-tuned end-to-end together with our decoder on the curated cervical spine dataset using the compound loss described in <xref ref-type="sec" rid="s2-4-4">Section 2.4.4</xref>. In contrast, we also train a complete SAM baseline (<xref ref-type="sec" rid="s2-5-3">Section 2.5.3</xref>) that keeps the original image encoder, prompt encoder, and mask decoder. In this baseline, the entire SAM model is optimized using the same training set, and centroids obtained from the ground-truth masks are utilized as point prompts.</p>
<p>To qualitatively assess whether the ViT-B encoder in CervSpineNet uses global contextual information, we generated three types of complementary attention maps for the trained hybrid model. During a forward pass, we cached the self-attention inputs to all transformer blocks and the final encoder feature map. First, we computed an attention-rollout map by creating the self-attention matrix for each block at the final token resolution, adding an identity term, normalizing rows, and multiplying the matrices across blocks; the resulting vector of token importances was reshaped to the encoder grid (&#x2248;64 &#xd7; 64). Second, we computed a query-centric attention map from the last block by selecting a query token centered on the mid-cervical spinous process and visualizing its attention weights to all other tokens. Third, we produced a Grad-CAM map on the output of the encoder by back-propagating the mean foreground prediction to the encoder feature map and aggregating gradient-weighted activations. All maps were upsampled to image resolution and overlaid with the radiograph for visual interpretation.</p>
</sec>
<sec id="s2-4-3">
<label>2.4.3</label>
<title>Decoder</title>
<p>U-Net decoders progressively restore spatial resolution and boundary fidelity (<xref ref-type="bibr" rid="B58">Yuan and Cheng, 2024</xref>). Our decoder stages include Conv &#x2b; ReLU &#x2192; Residual Block &#x2192; SE Block &#x2192; Bilinear Upsample &#x2192; 1 &#xd7; 1 Conv &#x2b; Sigmoid. SE blocks re-weight channels to enhance task-relevant features (<xref ref-type="bibr" rid="B50">Wang J. et al., 2021</xref>); residual blocks maintain gradient flow and stabilize training (<xref ref-type="bibr" rid="B5">Ashkani Chenarlogh et al., 2022</xref>).</p>
<sec id="s2-4-3-1">
<label>2.4.3.1</label>
<title>Convolution stage</title>
<p>
<disp-formula id="e2">
<mml:math id="m13">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>O</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>&#x2a;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
<xref ref-type="disp-formula" rid="e2">Equation 2</xref> denotes the convolution stage where <inline-formula id="inf12">
<mml:math id="m14">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the input feature map and <inline-formula id="inf13">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>O</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the resulting output feature map; <inline-formula id="inf14">
<mml:math id="m16">
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes convolution, <inline-formula id="inf15">
<mml:math id="m17">
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is the convolution kernel, <inline-formula id="inf16">
<mml:math id="m18">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the bias term, and <inline-formula id="inf17">
<mml:math id="m19">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the ReLU activation function.</p>
</sec>
<sec id="s2-4-3-2">
<label>2.4.3.2</label>
<title>Residual block</title>
<p>
<disp-formula id="e3">
<mml:math id="m20">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>B</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mn>2</mml:mn>
<mml:mo>&#x2a;</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>&#x2a;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>b</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>b</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
<p>The residual block depicted by <xref ref-type="disp-formula" rid="e3">Equation 3</xref> contains two stacked Conv &#x2b; ReLU layers with a skip connection. <inline-formula id="inf18">
<mml:math id="m21">
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf19">
<mml:math id="m22">
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> are the first and second convolution layers, and <inline-formula id="inf20">
<mml:math id="m23">
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf21">
<mml:math id="m24">
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> are the bias terms for each respective layer.</p>
</sec>
<sec id="s2-4-3-3">
<label>2.4.3.3</label>
<title>SE block&#x2014;squeeze (global average pooling)</title>
<p>
<disp-formula id="e4">
<mml:math id="m25">
<mml:mrow>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
<xref ref-type="disp-formula" rid="e4">Equation 4</xref> gives the squeeze formula for the SE block where <inline-formula id="inf22">
<mml:math id="m26">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the activation value at channel <inline-formula id="inf23">
<mml:math id="m27">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and spatial location <inline-formula id="inf24">
<mml:math id="m28">
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf25">
<mml:math id="m29">
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are the height and width of the feature map, and <inline-formula id="inf26">
<mml:math id="m30">
<mml:mrow>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the global descriptor for channel <inline-formula id="inf27">
<mml:math id="m31">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. This step compresses spatial information into a single number per channel.</p>
</sec>
<sec id="s2-4-3-4">
<label>2.4.3.4</label>
<title>SE block&#x2014;excitation (2-layer MLP)</title>
<p>
<disp-formula id="e5">
<mml:math id="m32">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>&#x3d5;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mn>1</mml:mn>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
<xref ref-type="disp-formula" rid="e5">Equation 5</xref> gives the excitation formula for the SE block where <inline-formula id="inf28">
<mml:math id="m33">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the vector of channel weights, <inline-formula id="inf29">
<mml:math id="m34">
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the vector of all <inline-formula id="inf30">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> values, <inline-formula id="inf31">
<mml:math id="m36">
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> is the first and <inline-formula id="inf32">
<mml:math id="m37">
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> is the second fully connected layer, and <inline-formula id="inf33">
<mml:math id="m38">
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mn>1</mml:mn>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes the linear transformation applied to the channel descriptor <inline-formula id="inf34">
<mml:math id="m39">
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. <inline-formula id="inf35">
<mml:math id="m40">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf36">
<mml:math id="m41">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are the Sigmoid and ReLU activations respectively.</p>
</sec>
<sec id="s2-4-3-5">
<label>2.4.3.5</label>
<title>Channel re-weighting</title>
<p>
<disp-formula id="e6">
<mml:math id="m42">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>Channel re-weighting is represented by <xref ref-type="disp-formula" rid="e6">Equation 6</xref>, where <inline-formula id="inf37">
<mml:math id="m43">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the weight from the excitation step, and <inline-formula id="inf38">
<mml:math id="m44">
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes element-wise multiplication with broadcasting of <inline-formula id="inf39">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> across all spatial locations <inline-formula id="inf40">
<mml:math id="m46">
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> in channel <inline-formula id="inf41">
<mml:math id="m47">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s2-4-3-6">
<label>2.4.3.6</label>
<title>Bilinear upsampling</title>
<p>
<disp-formula id="e7">
<mml:math id="m48">
<mml:mrow>
<mml:msup>
<mml:mi>F</mml:mi>
<mml:mo>&#x2296;</mml:mo>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>U</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>The output feature map <italic>F<sup>&#x2296;</sup>
</italic> after upsampling is given by <xref ref-type="disp-formula" rid="e7">Equation 7</xref>. <inline-formula id="inf43">
<mml:math id="m50">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf44">
<mml:math id="m51">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are the input feature map and number of channels respectively as mentioned earlier, and <inline-formula id="inf45">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>U</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is bilinear interpolation. Each stage of the decoder contains one <inline-formula id="inf46">
<mml:math id="m53">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>U</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> operation. These operations increase the spatial resolution of the feature map <inline-formula id="inf47">
<mml:math id="m54">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> by a factor of two in both height and width using bilinear interpolation.</p>
<p>Our decoder is U-Net&#x2013;inspired but intentionally departs from the textbook U-Net design. Because the encoder is a SAM ViT-B transformer that produces a single low-resolution feature map (256 channels at 64 &#xd7; 64), the decoder operates on this representation only and does not use multi-scale encoder&#x2013;decoder skip connections or a symmetric convolutional contracting path. Instead, it forms a purely expanding path with four stages of bilinear up-sampling followed by 3 &#xd7; 3 convolutions, residual blocks, and SE blocks that refine features at each scale before the final 1 &#xd7; 1 convolution and sigmoid.</p>
</sec>
</sec>
<sec id="s2-4-4">
<label>2.4.4</label>
<title>Loss function</title>
<p>Composite losses can improve robustness by balancing region overlap, boundary accuracy, and imbalance (<xref ref-type="bibr" rid="B46">Taghanaki et al., 2019</xref>; <xref ref-type="bibr" rid="B34">Mu et al., 2022</xref>). To jointly optimize region-wise overlap, class imbalance, boundary accuracy, and structural fidelity, we selected the four loss elements: Dice, Focal Tversky (FT), Hausdorff Distance Transform (HD95), and Structural Similarity Index (SSIM). The Hausdorff Distance Transform term specifically penalizes boundary misalignment, the SSIM term promotes structurally coherent, non-blurry masks that respect the overall spine morphology, Focal Tversky down-weights easy background pixels and concentrates learning on imbalanced, thin vertebral structures, and Dice loss offers a powerful region-overlap term.</p>
<p>Let <inline-formula id="inf48">
<mml:math id="m55">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> be the predicted probability map and <inline-formula id="inf49">
<mml:math id="m56">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> be the ground-truth mask over pixel set <inline-formula id="inf50">
<mml:math id="m57">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> with <inline-formula id="inf51">
<mml:math id="m58">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="|" close="|" separators="|">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and a small constant <inline-formula id="inf52">
<mml:math id="m59">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> for numerical stability.</p>
<p>The Dice Loss is given by <xref ref-type="disp-formula" rid="e8">Equation 8</xref>:<disp-formula id="e8">
<mml:math id="m60">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>
</p>
<p>For the Focal Tversky loss, we first compute the True Positive (TP), False Positive (FP) and False Negative (FN) as given in <xref ref-type="disp-formula" rid="e9">Equation 9</xref>:<disp-formula id="e9">
<mml:math id="m61">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mover accent="true">
<mml:mi mathvariant="normal">y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mover accent="true">
<mml:mi mathvariant="normal">y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi mathvariant="normal">y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
</p>
<p>Thus, the Tversky Index, represented by <xref ref-type="disp-formula" rid="e10">Equation 10</xref>, is given by<disp-formula id="e10">
<mml:math id="m62">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>
</p>
<p>And the Focal Tversky Loss, given by <xref ref-type="disp-formula" rid="e11">Equation 11</xref>, is computed as:<disp-formula id="e11">
<mml:math id="m63">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>T</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>where, <inline-formula id="inf53">
<mml:math id="m64">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.75</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.3</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>&#x3b3;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.75</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> in our experiments.</p>
<p>For the Hausdorff Distance Transform Loss, depicted by <xref ref-type="disp-formula" rid="e12">Equation 12</xref>, we form binarized masks, <inline-formula id="inf54">
<mml:math id="m65">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x2d8;</mml:mo>
</mml:mover>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf55">
<mml:math id="m66">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and compute their Euclidean distance transforms <inline-formula id="inf56">
<mml:math id="m67">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x2d8;</mml:mo>
</mml:mover>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf57">
<mml:math id="m68">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. For each pixel <inline-formula id="inf58">
<mml:math id="m69">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf59">
<mml:math id="m70">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x2d8;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the distance to the nearest boundary of <inline-formula id="inf60">
<mml:math id="m71">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x2d8;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>, and similarly for <inline-formula id="inf61">
<mml:math id="m72">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The loss averages the absolute difference between these distance fields:<disp-formula id="e12">
<mml:math id="m73">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="|" close="|" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x2d8;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>
</p>
<p>Finally, for the SSIM loss given by <xref ref-type="disp-formula" rid="e13">Equation 13</xref>, we compute the image-wise average SSIM following the standard definition (<xref ref-type="disp-formula" rid="e18">Equation 18</xref>), and define the loss as:<disp-formula id="e13">
<mml:math id="m74">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>
</p>
<p>After ablations with the composition of loss functions (<xref ref-type="sec" rid="s2-7-2">Section 2.7.2</xref>), the best results were obtained with a weighted sum of Dice, Focal Tversky, Hausdorff Distance Transform, and SSIM (<xref ref-type="disp-formula" rid="e14">Equation 14</xref>):<disp-formula id="e14">
<mml:math id="m75">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>0.5</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>0.3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>T</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>0.1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>0.1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>
</p>
<p>Based on our pilot experiments, we empirically selected the weights (0.5, 0.3, 0.1, 0.1). We prioritize Dice as the primary region-overlap term, give Focal Tversky a moderate weight to address class imbalance, and use smaller regularization weights for the boundary-focused HDT and SSIM terms to sharpen contours without destabilizing optimization.</p>
</sec>
</sec>
<sec id="s2-5">
<label>2.5</label>
<title>Baselines implemented: performance comparison</title>
<p>To avoid model-class bias and follow current benchmarking guidance (<xref ref-type="bibr" rid="B3">Antonelli et al., 2022</xref>; <xref ref-type="bibr" rid="B47">Tejani et al., 2024</xref>), we compared CervSpineNet to strong and architecturally diverse baselines (<xref ref-type="bibr" rid="B55">Wolfrath et al., 2024</xref>): DeepLabV3&#x2b;, U-Net, the full Segment Anything Model (SAM), and a text-guided SegFormer. All baselines and CervSpineNet were trained and evaluated on the same training/test split with identical pre-processing and data augmentations. U-Net, DeepLabV3&#x2b;, and text-guided SegFormer were trained on images resized to 512 &#xd7; 512, which is standard for CNN-based medical segmentation and keeps GPU memory requirements manageable. The SAM ViT-B and the proposed CervSpineNet were trained on 1,024 &#xd7; 1,024 inputs to match the native input resolution of the SAM image encoder. All predictions were resampled back to the original image resolution for evaluation. The batch size for every training experiment was kept at 1, and the models were trained for 50 epochs, selecting the checkpoint with the best testing metrics as the metrics saturated around epochs 40&#x2013;45 for all experiments. This ensured that the performance differences mainly reflect architectural choices rather than differences in data or optimization.</p>
<sec id="s2-5-1">
<label>2.5.1</label>
<title>DeepLabV3&#x2b;</title>
<p>DeepLabV3&#x2b; performs strongly across medical segmentation tasks (<xref ref-type="bibr" rid="B23">Hou et al., 2024</xref>; <xref ref-type="bibr" rid="B25">Ketenci &#xc7;ay et al., 2025</xref>). We implemented DeepLabV3&#x2b; with a ResNet-50 backbone pretrained on ImageNet. The single-channel radiographs were resized to 512 &#xd7; 512 pixels and duplicated to three channels. A 1-channel logit map that has been upsampled to the input resolution is produced by the model. An AdamW optimizer and BCE Loss were used to refine DeepLabV3&#x2b; on our cervical dataset.</p>
</sec>
<sec id="s2-5-2">
<label>2.5.2</label>
<title>U-net</title>
<p>U-Net remains a robust baseline across modalities (<xref ref-type="bibr" rid="B43">Ronneberger et al., 2015</xref>; <xref ref-type="bibr" rid="B15">Du et al., 2020</xref>; <xref ref-type="bibr" rid="B6">Azad et al., 2022</xref>). We created a 4-level U-Net from scratch using 64 initial filters as a convolutional baseline. The inputs were 512 &#xd7; 512. We used the AdamW optimizer with BCE Loss.</p>
</sec>
<sec id="s2-5-3">
<label>2.5.3</label>
<title>Segment anything model (SAM)</title>
<p>SAM is trained on &#x3e;1B masks (<xref ref-type="bibr" rid="B27">Kirillov et al., 2023</xref>) and, with task-specific prompting, can be adapted to medical images (<xref ref-type="bibr" rid="B32">Mazurowski et al., 2023</xref>). For SAM, we used the official ViT-B variant from the Meta Segment-Anything repository, loaded via the sam_model_registry. We kept the full architecture, namely image encoder, prompt encoder, mask decoder, and resized X-rays to 1,024 &#xd7; 1,024 as recommended. For each training image, we derived oracle point prompts by placing a single positive point at the centroid of each ground-truth spinous process mask. We passed these points through the prompt encoder and supervised the resulting masks with binary cross-entropy &#x2b; Dice loss. We then fine-tuned all SAM parameters with the AdamW optimizer for 50 epochs. At inference time, we used one positive centroid point per spinous process and placed a threshold for the output probabilities at 0.5. This setup follows recent SAM adaptations to medical imaging and represents a reasonable, strong SAM baseline for our task.</p>
</sec>
<sec id="s2-5-4">
<label>2.5.4</label>
<title>Vision-language model&#x2014;text guided SegFormer</title>
<p>We used MedCLIP (domain-adapted CLIP) (<xref ref-type="bibr" rid="B52">Wang et al., 2022</xref>) to encode text prompts describing the spinous processes; embeddings were L<sub>2</sub>-normalized and pooled to a fixed vector <inline-formula id="inf62">
<mml:math id="m76">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. SegFormer employs a hierarchical MiT encoder and lightweight MLP. We modulated the final encoder stage with the text embedding via a linear projection and channel-wise scaling; the decoder produced logits upsampled to input resolution and passed through sigmoid. For text guidance, we created brief, anatomy-focused prompts for every target structure and used MedCLIP to encode them once. Only the SegFormer parameters were optimized on our training set, and the text encoder was kept frozen. For the decoder to predict masks that were explicitly conditioned on the selected prompt, the resulting text embedding was broadcast to every pixel and used to modulate the final MiT feature map via channel-wise scaling. This configuration replicates current text-guided segmentation baselines and gives our hybrid model a fair, repeatable benchmark without requiring further task-specific language backbone tuning.</p>
</sec>
</sec>
<sec id="s2-6">
<label>2.6</label>
<title>Evaluation metrics</title>
<p>Following common practice (<xref ref-type="bibr" rid="B28">Li et al., 2025</xref>; <xref ref-type="bibr" rid="B60">Zhang et al., 2025</xref>), we report Dice, Intersection-over-Union (IoU), Structural Similarity (SSIM), Hausdorff Distance (HD95), and Volumetric Similarity (VS) on the held-out test set. <xref ref-type="disp-formula" rid="e15">Equations 15</xref>-<xref ref-type="disp-formula" rid="e17">17</xref> display the Dice, IoU and VS terms as implemented in this study.<disp-formula id="e15">
<mml:math id="m77">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#xb7;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#xb7;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(15)</label>
</disp-formula>
<disp-formula id="e16">
<mml:math id="m78">
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(16)</label>
</disp-formula>
<disp-formula id="e17">
<mml:math id="m79">
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>S</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mfenced open="|" close="|" separators="|">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="|" close="|" separators="|">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(17)</label>
</disp-formula>
</p>
<p>Here, TP/FP/TN/FN are computed from thresholded (0.5) binary predictions and ground-truth masks for each pixel.<list list-type="bullet">
<list-item>
<p>True Positive (TP) &#x2013; prediction &#x3d; 1, ground truth &#x3d; 1 (pixels correctly classified as foreground)</p>
</list-item>
<list-item>
<p>False Positive (FP) &#x2013; prediction &#x3d; 1, ground truth &#x3d; 0 (pixels incorrectly classified as foreground)</p>
</list-item>
<list-item>
<p>True Negative (TN) &#x2013; prediction &#x3d; 0, ground truth &#x3d; 0 (pixels correctly classified as background)</p>
</list-item>
<list-item>
<p>False Negative (FN) &#x2013; prediction &#x3d; 0, ground truth &#x3d; 1 (missed foreground pixels)</p>
</list-item>
</list>
</p>
<p>SSIM captures luminance, contrast, and structure similarity. We used the standard SSIM metric (<xref ref-type="bibr" rid="B7">Bakurov et al., 2022</xref>), defined for a predicted mask <inline-formula id="inf63">
<mml:math id="m80">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and ground truth mask <inline-formula id="inf64">
<mml:math id="m81">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> as shown in <xref ref-type="disp-formula" rid="e18">Equation 18</xref>:<disp-formula id="e18">
<mml:math id="m82">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:msub>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(18)</label>
</disp-formula>where, <inline-formula id="inf65">
<mml:math id="m83">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mi>y</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the mean pixel intensities of <inline-formula id="inf66">
<mml:math id="m84">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf67">
<mml:math id="m85">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> , <inline-formula id="inf68">
<mml:math id="m86">
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> are the corresponding variances, <inline-formula id="inf69">
<mml:math id="m87">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the cross-covariance between <inline-formula id="inf70">
<mml:math id="m88">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf71">
<mml:math id="m89">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>; <inline-formula id="inf72">
<mml:math id="m90">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> are small constants for stability.</p>
<p>HD95, as given by <xref ref-type="disp-formula" rid="e20">Equation 20</xref>, is the 95th percentile surface-to-surface distance between predicted and ground-truth objects (<xref ref-type="bibr" rid="B40">Ramachandran et al., 2023</xref>). Let <inline-formula id="inf73">
<mml:math id="m91">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> be sets of foreground boundary points with Euclidean distance <italic>d</italic>. The directed surface distance sets are calculated as in <xref ref-type="disp-formula" rid="e19">Equation 19</xref>:<disp-formula id="e19">
<mml:math id="m92">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:munder>
<mml:mi>min</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:munder>
<mml:mi>min</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>b</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(19)</label>
</disp-formula>
</p>
<p>Thus, for all distances, <inline-formula id="inf74">
<mml:math id="m93">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x222a;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>.<disp-formula id="e20">
<mml:math id="m94">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>D</mml:mi>
<mml:mn>95</mml:mn>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mn>95</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(20)</label>
</disp-formula>
</p>
</sec>
<sec id="s2-7">
<label>2.7</label>
<title>Ablation studies</title>
<p>Ablation experiments are essential in medical image segmentation, particularly for hybrid architectures, as they clarify the relative contributions of individual components to overall performance (<xref ref-type="bibr" rid="B18">Gao et al., 2021</xref>; <xref ref-type="bibr" rid="B54">Wang X. et al., 2025</xref>). To systematically characterize the behavior of CervSpineNet, we performed two sets of ablations: (i) architectural ablations isolating encoder and decoder contributions, and (ii) loss-function ablations assessing how each loss term influences error modes and structural fidelity.</p>
<sec id="s2-7-1">
<label>2.7.1</label>
<title>Architectural ablation experiments</title>
<p>To quantify how much performance arises from the encoder versus the decoder, we evaluated three architectural variants under identical training conditions, preprocessing, composite loss, and data splits.<list list-type="roman-lower">
<list-item>
<p>Pure CNN Encoder&#x2013;Decoder Baseline</p>
</list-item>
</list>
</p>
<p>We first constructed a fully convolutional baseline in which both the encoder and decoder are simple CNNs. The encoder comprises four convolution&#x2013;BatchNorm&#x2013;ReLU blocks, each followed by 2 &#xd7; 2 max pooling, reducing the spatial resolution from 1,024 &#xd7; 1,024 to 64 &#xd7; 64 while expanding feature channels from 3 to 256. The decoder is deliberately minimal, consisting of stacked 3 &#xd7; 3 convolutions with ReLU activations and bilinear upsampling, terminating in a 1 &#xd7; 1 sigmoid layer for mask generation. No residual connections, attention modules, or skip connections are used. This configuration provides a reference model that isolates the performance of a straightforward fully convolutional architecture.<list list-type="simple">
<list-item>
<p>ii. CNN Encoder &#x2b; Full Decoder</p>
</list-item>
</list>
</p>
<p>Next, we evaluated a model with the same CNN encoder as above but augmented with our full decoder. Each decoding stage includes a 3 &#xd7; 3 convolution, a residual block to enhance representational depth without compromising gradient flow, and a squeeze-and-excitation (SE) block for adaptive channel reweighting based on global context. Bilinear upsampling is applied at each stage. Because the encoder and training protocol remain constant, differences in performance directly reflect the contribution of residual pathways and channel attention to anatomical structure reconstruction.<list list-type="simple">
<list-item>
<p>iii. ViT-B Encoder (SAM) &#x2b; Simple Decoder</p>
</list-item>
</list>
</p>
<p>Finally, we substituted the CNN encoder with the Vision Transformer&#x2013;B (ViT-B) image encoder from Segment Anything, while retaining the simple decoder of the first experiment. The ViT-B encoder processes 1,024 &#xd7; 1,024 inputs to produce 256-channel 64 &#xd7; 64 feature maps enriched with long-range dependencies via self-attention. All transformer layers are unfrozen during training. Using the same composite loss, optimizer (AdamW), and data splits allows this ablation to cleanly assess the effect of replacing local convolutional features with global transformer-based representations, independent of decoder capacity.</p>
</sec>
<sec id="s2-7-2">
<label>2.7.2</label>
<title>Loss ablation experiments</title>
<p>We also performed a dedicated loss-function ablation using the full CervSpineNet architecture (ViT-B encoder &#x2b; residual/SE decoder). The optimizer, learning rate schedule, and data partitions were held constant across conditions. Loss terms were added progressively to address complementary error modes.</p>
<p>Beginning with a Dice-only baseline emphasizing volumetric overlap, we incorporated the Focal Tversky (FT) loss to penalize false negatives&#x2014;critical for foreground&#x2013;background imbalance in postoperative cervical spine X-rays. We then added the SSIM term to encourage structural and textural fidelity, particularly along vertebral margins and disc spaces. Finally, we introduced a differentiable Hausdorff distance surrogate (HD95) to explicitly penalize boundary deviations and outlier predictions.</p>
<p>This yielded five configurations&#x2014;Dice; Dice &#x2b; FT; Dice &#x2b; SSIM; Dice &#x2b; FT &#x2b; SSIM; and Dice &#x2b; FT &#x2b; SSIM &#x2b; HD95&#x2014;each trained for 50 epochs and evaluated on an identical test set using Dice, IoU, SSIM, HD95, and volumetric similarity metrics.</p>
</sec>
</sec>
</sec>
<sec sec-type="results" id="s3">
<label>3</label>
<title>Results</title>
<p>This section presents the outcomes of the experiments designed to evaluate the proposed CervSpineNet model and its comparative baselines. Quantitative results are reported across the original, CLAHE-enhanced, and augmented datasets, followed by statistical significance testing, qualitative visual assessments, and an analysis of time efficiency between automated and manual segmentation.</p>
<sec id="s3-1">
<label>3.1</label>
<title>Quantitative comparison</title>
<p>Mean performance metrics (Dice, IoU, SSIM, HD95, VS) on the testing data were computed for all models across the three dataset variants: the original set (<xref ref-type="table" rid="T1">Table 1</xref>), CLAHE-enhanced images (<xref ref-type="table" rid="T2">Table 2</xref>), and the augmented dataset (<xref ref-type="table" rid="T3">Table 3</xref>).</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Quantitative evaluation of segmentation models on the original dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Models</th>
<th align="center">Dice</th>
<th align="center">IoU</th>
<th align="center">SSIM</th>
<th align="center">HD95</th>
<th align="center">VS</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">SAM</td>
<td align="center">0.8553</td>
<td align="center">0.7532</td>
<td align="center">0.9752</td>
<td align="center">25.744</td>
<td align="center">0.9132</td>
</tr>
<tr>
<td align="left">U-net</td>
<td align="center">0.9013</td>
<td align="center">0.8226</td>
<td align="center">0.969</td>
<td align="center">
<bold>3.1296</bold>
</td>
<td align="center">0.9726</td>
</tr>
<tr>
<td align="left">Text-guided SegFormer</td>
<td align="center">0.9266</td>
<td align="center">0.8641</td>
<td align="center">0.9778</td>
<td align="center">4.2251</td>
<td align="center">0.9819</td>
</tr>
<tr>
<td align="left">DeepLabV3&#x2b;</td>
<td align="center">0.9287</td>
<td align="center">0.8676</td>
<td align="center">0.9781</td>
<td align="center">4.0418</td>
<td align="center">
<bold>0.9831</bold>
</td>
</tr>
<tr>
<td align="left">CervSpineNet</td>
<td align="center">
<bold>0.9315</bold>
</td>
<td align="center">
<bold>0.8726</bold>
</td>
<td align="center">
<bold>0.9831</bold>
</td>
<td align="center">3.3549</td>
<td align="center">0.9818</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Mean Dice, IoU, SSIM, HD95, and Volumetric Similarity (VS) scores are reported on the held-out test set (n &#x3d; 100). CervSpineNet achieved the highest Dice and SSIM, indicating strong overlap and structural fidelity, while maintaining a low HD95 (indicating strong boundary accuracy).</p>
</fn>
<fn>
<p>Across all tables, numeric values in bold font indicate the best mean score yielded.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Quantitative evaluation on the CLAHE-enhanced dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Models</th>
<th align="center">Dice</th>
<th align="center">IoU</th>
<th align="center">SSIM</th>
<th align="center">HD95</th>
<th align="center">VS</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">SAM</td>
<td align="center">0.8246</td>
<td align="center">0.7103</td>
<td align="center">0.9733</td>
<td align="center">27.555</td>
<td align="center">0.8929</td>
</tr>
<tr>
<td align="left">U-net</td>
<td align="center">0.9033</td>
<td align="center">0.8268</td>
<td align="center">0.9707</td>
<td align="center">3.4191</td>
<td align="center">0.9598</td>
</tr>
<tr>
<td align="left">Text-guided SegFormer</td>
<td align="center">0.9250</td>
<td align="center">0.8614</td>
<td align="center">0.9776</td>
<td align="center">4.2898</td>
<td align="center">0.9778</td>
</tr>
<tr>
<td align="left">DeepLabV3&#x2b;</td>
<td align="center">0.9260</td>
<td align="center">0.8631</td>
<td align="center">0.9778</td>
<td align="center">4.7765</td>
<td align="center">
<bold>0.9806</bold>
</td>
</tr>
<tr>
<td align="left">CervSpineNet</td>
<td align="center">
<bold>0.9313</bold>
</td>
<td align="center">
<bold>0.8722</bold>
</td>
<td align="center">
<bold>0.9829</bold>
</td>
<td align="center">
<bold>2.6561</bold>
</td>
<td align="center">0.9777</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Performance comparison of five segmentation models on contrast-normalized images. CervSpineNet maintained consistent superiority across Dice, SSIM, and HD95, demonstrating robustness to illumination and contrast variations typical of radiographs.</p>
</fn>
<fn>
<p>Across all tables, numeric values in bold font indicate the best mean score yielded.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Quantitative evaluation on the augmented dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Models</th>
<th align="center">Dice</th>
<th align="center">IoU</th>
<th align="center">SSIM</th>
<th align="center">HD95</th>
<th align="center">VS</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">SAM</td>
<td align="center">0.7955</td>
<td align="center">0.6698</td>
<td align="center">0.9722</td>
<td align="center">28.8584</td>
<td align="center">0.8435</td>
</tr>
<tr>
<td align="left">U-net</td>
<td align="center">0.9099</td>
<td align="center">0.8367</td>
<td align="center">0.9712</td>
<td align="center">3.0973</td>
<td align="center">0.9721</td>
</tr>
<tr>
<td align="left">Text-guided SegFormer</td>
<td align="center">0.9289</td>
<td align="center">0.8679</td>
<td align="center">0.9781</td>
<td align="center">4.0540</td>
<td align="center">0.9820</td>
</tr>
<tr>
<td align="left">DeepLabV3&#x2b;</td>
<td align="center">0.9303</td>
<td align="center">0.8704</td>
<td align="center">0.9784</td>
<td align="center">3.7779</td>
<td align="center">
<bold>0.9838</bold>
</td>
</tr>
<tr>
<td align="left">CervSpineNet</td>
<td align="center">
<bold>0.9326</bold>
</td>
<td align="center">
<bold>0.8744</bold>
</td>
<td align="center">
<bold>0.9832</bold>
</td>
<td align="center">
<bold>2.3806</bold>
</td>
<td align="center">0.9833</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Results of all segmentation models after data augmentation with rotations and translations. CervSpineNet again produced the best or near-best mean scores across all metrics, confirming generalization under data diversity.</p>
</fn>
<fn>
<p>Across all tables, numeric values in bold font indicate the best mean score yielded.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>Repeated trials with identical hyperparameters produced standard deviations in the range 0.001&#x2013;0.005 for Dice, IoU, SSIM, and VS and &#x223c;0.3&#x2013;0.6 for HD95, confirming the stability and reproducibility of model performance across the testing data. <xref ref-type="fig" rid="F4">Figure 4</xref> shows the distribution of Dice coefficients for all models evaluated on the 100 test images under the three preprocessing conditions: (A) original data, (B) CLAHE-enhanced data, and (C) augmented data. Across all three dataset variants, CervSpineNet exhibited the narrowest spread and highest median Dice scores, indicating robustness to preprocessing variations and the radiographic variability that these variants are designed to reflect.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Distribution of Dice coefficients across test datasets for different preprocessing conditions. Boxplots show per-image Dice scores (n &#x3d; 100) for all segmentation models evaluated on <bold>(A)</bold> the original dataset, <bold>(B)</bold> the CLAHE-enhanced dataset, and <bold>(C)</bold> the augmented dataset. CervSpineNet demonstrates the highest median Dice and the smallest interquartile range across all three data variants, indicating superior segmentation accuracy and stability relative to baseline models.</p>
</caption>
<graphic xlink:href="fbioe-13-1733689-g004.tif">
<alt-text content-type="machine-generated">Box plots showing Dice coefficient distributions for different segmentation models across three datasets: Original, CLAHE, and Augmented. Models include DeepLabV3+, CervSpineNet, SAM, SegFormer, and U-Net. Each plot presents results from zero to one with varying distributions.</alt-text>
</graphic>
</fig>
<p>CervSpineNet achieved the highest or near-highest mean values for nearly every metric, with particularly strong performance in SSIM and HD95, which reflect structural accuracy and boundary precision, respectively. To determine whether these improvements were statistically significant, formal non-parametric tests were conducted as described in the following section.</p>
<p>A summary of the mean test-set performance for all four architectural configurations adopted in this study is shown in <xref ref-type="table" rid="T4">Table 4</xref>.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Architectural Ablation Experiments and the metrics yielded: This table follows a structure similar to that of <xref ref-type="table" rid="T1">Tables 1</xref>&#x2013;<xref ref-type="table" rid="T3">3</xref> and demonstrates a comparison between the different ablation experiments performed with the hybrid model architecture, utilizing the same metrics as used previously.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Experiment</th>
<th align="center">Dice</th>
<th align="center">IoU</th>
<th align="center">SSIM</th>
<th align="center">HD95</th>
<th align="center">VS</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Pure CNN &#x2b; basic decoder</td>
<td align="center">0.8516</td>
<td align="center">0.7616</td>
<td align="center">0.9828</td>
<td align="center">33.9749</td>
<td align="center">0.9222</td>
</tr>
<tr>
<td align="center">Pure CNN &#x2b; full decoder</td>
<td align="center">0.8852</td>
<td align="center">0.8021</td>
<td align="center">0.9849</td>
<td align="center">20.175</td>
<td align="center">0.9534</td>
</tr>
<tr>
<td align="center">ViT-b encoder &#x2b; basic decoder</td>
<td align="center">0.9307</td>
<td align="center">0.8713</td>
<td align="center">
<bold>0.9886</bold>
</td>
<td align="center">8.8335</td>
<td align="center">0.9804</td>
</tr>
<tr>
<td align="center">ViT-b encoder &#x2b; full decoder (CervSpineNet)</td>
<td align="center">
<bold>0.9315</bold>
</td>
<td align="center">
<bold>0.8726</bold>
</td>
<td align="center">0.9831</td>
<td align="center">
<bold>3.3549</bold>
</td>
<td align="center">
<bold>0.9818</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Across all tables, numeric values in bold font indicate the best mean score yielded.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>Finally, the results of the loss ablation experiment are demonstrated in <xref ref-type="table" rid="T5">Table 5</xref>. The different sets of loss functions used and their metric scores are depicted in this table. The experiments demonstrated in <xref ref-type="table" rid="T4">Tables 4</xref>, <xref ref-type="table" rid="T5">5</xref> also yielded minute variations in the range of 0.002&#x2013;0.004 for Dice, IoU, SSIM, and VS and variations of &#x223c;0.45&#x2013;0.70 for HD95.</p>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Loss Ablation Experiments and the metrics yielded: different combinations of loss functions and their results.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Experiment</th>
<th align="center">Dice</th>
<th align="center">IoU</th>
<th align="center">SSIM</th>
<th align="center">HD95</th>
<th align="center">VS</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Dice</td>
<td align="center">0.9153</td>
<td align="center">0.8457</td>
<td align="center">0.9815</td>
<td align="center">5.8591</td>
<td align="center">0.9694</td>
</tr>
<tr>
<td align="center">Dice &#x2b; FT</td>
<td align="center">0.9288</td>
<td align="center">0.868</td>
<td align="center">0.9826</td>
<td align="center">3.9504</td>
<td align="center">0.9743</td>
</tr>
<tr>
<td align="center">Dice &#x2b; SSIM</td>
<td align="center">0.931</td>
<td align="center">0.8719</td>
<td align="center">0.9832</td>
<td align="center">3.8787</td>
<td align="center">0.9812</td>
</tr>
<tr>
<td align="center">Dice &#x2b; FT &#x2b; SSIM</td>
<td align="center">0.9286</td>
<td align="center">0.868</td>
<td align="center">0.9829</td>
<td align="center">5.0694</td>
<td align="center">0.979</td>
</tr>
<tr>
<td align="center">Dice &#x2b; FT &#x2b; SSIM &#x2b; HD95 (CervSpineNet)</td>
<td align="center">
<bold>0.9315</bold>
</td>
<td align="center">
<bold>0.8726</bold>
</td>
<td align="center">
<bold>0.9831</bold>
</td>
<td align="center">
<bold>3.3549</bold>
</td>
<td align="center">
<bold>0.9818</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>FT indicates Focal Tversky Loss; SSIM is Structural Similarity Index Loss; HD95 represents 95th percentile of Hausdorff Distance Loss.</p>
</fn>
<fn>
<p>Across all tables, numeric values in bold font indicate the best mean score yielded.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3-2">
<label>3.2</label>
<title>Statistical significance analysis of model performances</title>
<p>Prior studies highlight the robustness of the Wilcoxon Signed-Rank test and the Friedman test, which detects overall performance differences across multiple models evaluated on the same samples, for evaluating segmentation algorithms (<xref ref-type="bibr" rid="B33">Mostafa et al., 2024</xref>). We compared five segmentation models (SAM, U-Net, text-guided SegFormer, DeepLabV3&#x2b;, and CervSpineNet) using five metrics (Dice, IoU, SSIM, HD95, and VS) across the same per-image testing scores.</p>
<p>The Friedman test assessed overall model differences for each metric, followed by pairwise Wilcoxon signed-rank tests with Bonferroni correction (p &#x3c; 0.05). Across all metrics, significant overall differences were observed (Friedman p &#x3c; 0.001).</p>
<p>CervSpineNet significantly outperformed SAM and U-Net (corrected p &#x3c; 0.001) in every metric and dataset variant. It also exceeded DeepLabV3&#x2b; in SSIM, HD95, and Volumetric Similarity while maintaining comparable Dice and IoU (corrected p &#x2248; 0.05). Against the text-guided SegFormer, CervSpineNet achieved statistically higher performance in all metrics except VS, where the difference was not significant. These results confirm that the proposed hybrid model, CervSpineNet, provides robust and consistent segmentation.</p>
</sec>
<sec id="s3-3">
<label>3.3</label>
<title>Qualitative inference visualization and error analysis</title>
<p>Visual inspection of unseen test radiographs further corroborated the quantitative findings (<xref ref-type="fig" rid="F5">Figure 5</xref>). Panels <bold>(A)</bold>, <bold>(B)</bold>, and <bold>(C)</bold> show representative segmentation results from the original, CLAHE-enhanced, and augmented datasets, respectively, comparing <bold>CervSpineNet</bold> with all baseline models. Across all preprocessing conditions, <bold>CervSpineNet</bold> produced the most accurate and anatomically consistent delineations, with smoother contours, sharper boundaries, and markedly fewer false or incomplete predictions than either CNN-only or transformer-only architectures. These qualitative outcomes parallel the quantitative improvements reported in <xref ref-type="sec" rid="s3-1">Section 3.1</xref>, illustrating how the hybrid architecture captures global contextual features while preserving fine boundary details.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Qualitative comparison of model predictions on representative test radiographs. Representative examples <bold>(A&#x2013;C)</bold> show three randomly selected test radiographs and their corresponding segmentation results for each baseline model and the proposed CervSpineNet. As evidenced by the red markings, CervSpineNet demonstrates superior boundary sharpness and anatomical conformity of the spinous processes compared with U-Net, DeepLabV3&#x2b;, SAM, and the text-guided SegFormer. The red circles display the visual prediction errors made by the baselines when compared to the ground-truth mask.</p>
</caption>
<graphic xlink:href="fbioe-13-1733689-g005.tif">
<alt-text content-type="machine-generated">Comparison of cervical spine segmentation methods across three panels labeled A, B, and C. Each panel includes an original X-ray, followed by segmentation results from six methods: Labels, SAM, U-Net, Segformer, DeepLabV3+, and CervSpineNet. SAM, U-Net, Segformer, and DeepLabV3+ images have regions circled in red, indicating areas of interest or errors. CervSpineNet presents a clean segmentation without annotations.</alt-text>
</graphic>
</fig>
<p>To examine segmentation fidelity in more detail, <xref ref-type="fig" rid="F6">Figure 6</xref> visualizes per-pixel discrepancies between CervSpineNet predictions and expert ground-truth masks. In these heatmaps, uncolored regions indicate perfect agreement, while increasing color intensity reflects greater deviation from the annotated boundaries. False positives represent regions erroneously predicted as foreground outside the annotated structure, false negatives correspond to missed spinous process pixels within the ground-truth region, and thin color bands along the contours indicate minor boundary mismatches or thickness differences.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Qualitative error analysis and per-image Dice, HD95, and mean absolute error (MAE) for high-quality CervSpineNet predictions. Representative examples show original cervical spine X-rays, expert ground-truth (GT) masks, CervSpineNet-predicted masks, and corresponding error heatmaps. Colored overlays highlight mis-segmented regions, where greater intensity denotes larger deviation from the GT boundaries. Quantitative values include Dice, HD95, Global MAE (overall pixel-wise deviation) and Foreground-only MAE (error within annotated regions). Lower values indicate higher structural fidelity and more accurate boundary reconstruction.</p>
</caption>
<graphic xlink:href="fbioe-13-1733689-g006.tif">
<alt-text content-type="machine-generated">X-ray analysis of cervical spine images with ground truth and CervSpineNet predictions. Each row shows an original X-ray, corresponding segmented masks, predicted outcomes, and error heatmaps. Metrics include Dice, HD95, Global MAE, and Foreground-only MAE, indicating segmentation accuracy and error measurements.</alt-text>
</graphic>
</fig>
<p>A complementary quantitative error assessment was performed using Dice, HD95, and Mean Absolute Error (MAE) calculated on continuous (non-binarized) probability maps to preserve boundary sensitivity. <xref ref-type="fig" rid="F6">Figure 6</xref> demonstrates a sample of radiographs with good model predictions backed by the metrics used for comparison. On the other hand, <xref ref-type="fig" rid="F7">Figure 7</xref> depicts some edge cases where the model failed to perform well due to a variety of reasons, often resulting in over/under prediction of structures or significant disagreement from ground truth masks.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Qualitative error analysis and per-image Dice, HD95, and mean absolute error (MAE) for low-quality CervSpineNet predictions. It is similar to <xref ref-type="fig" rid="F6">Figure 6</xref> in terms of the results displayed, but this figure demonstrates the failure modes of the hybrid model and includes some edge-case radiographs that cause the model to predict masks poorly and yield below-par metric results.</p>
</caption>
<graphic xlink:href="fbioe-13-1733689-g007.tif">
<alt-text content-type="machine-generated">Comparison table showing cervical spine X-ray images, Ground Truth (GT) masks, CervSpineNet predicted masks, and error heatmaps across three cases. Dice, HD95, Global MAE, and Foreground-only MAE scores are provided for each case, reflecting segmentation accuracy metrics.</alt-text>
</graphic>
</fig>
<p>A pixel-by-pixel error heatmap between the ground-truth spinous process mask and the model prediction is superimposed on the lateral cervical X-ray in <xref ref-type="fig" rid="F6">Figures 6</xref>, <xref ref-type="fig" rid="F7">7</xref>. Areas where prediction and ground truth agree appear like the original image, whereas colored bands around the spinous processes mark disagreement: cooler colors indicate small or no error, and warmer colors (green&#x2013;yellow&#x2013;red) highlight increasing mismatch, with the brightest regions corresponding to the largest segmentation errors. In these figures, the Global MAE captures the average pixel-wise deviation across the entire image, reflecting overall segmentation fidelity, whereas the Foreground-only MAE quantifies deviation exclusively within annotated spinous process regions, emphasizing local structural precision. Both metrics were computed at the native image resolution, where lower MAE values indicate stronger correspondence with expert annotations. CervSpineNet consistently achieved the lowest Global and Foreground-only MAE scores, confirming high boundary accuracy and strong structural alignment across the test set.</p>
<p>
<xref ref-type="fig" rid="F6">Figure 6</xref> shows three representative test radiographs with high-quality CervSpineNet segmentations. In these typical cases, spinous process masks from C1&#x2013;C7 closely follow the cortical margins with minimal over- or under-segmentation. These examples support the good overall performance observed in the aggregate metrics by achieving high quantitative agreement with the reference annotations (Dice &#x2248; 0.93, HD95 &#x2248; 3, Global MAE &#x2248; 0.005, and Foreground-only MAE &#x2248; 0.05).</p>
<p>
<xref ref-type="fig" rid="F7">Figure 7</xref> illustrates three characteristic failure modes encountered by CervSpineNet. In Case 1, adjacent spinous processes are extremely close and partially overlapping, causing the model to merge two levels (C3&#x2013;C4) into a single elongated mask and consequently overestimate the superior spinous process. In Case 2, a fractured spinous process produces an irregular and discontinuous contour. While the expert ground truth precisely outlines the fracture margins, the model predicts a smoother, unbroken process, resulting in reduced Dice scores and a prominent error band in the heatmap. In Case 3, the C7 spinous process is scarcely visible due to low contrast and soft-tissue overlap at the cervicothoracic junction; although the model accurately segments C1&#x2013;C6, it fails to detect C7 entirely, yielding a level-specific false negative. Together, these examples indicate that severe pathology, low bone&#x2013;soft-tissue contrast, and tightly packed vertebral levels are the conditions under which CervSpineNet is most vulnerable. Future work will prioritize augmenting the training set with such challenging cases and exploring level-aware constraints or post-processing strategies to mitigate mask merging and missed levels.</p>
<p>
<xref ref-type="fig" rid="F8">Figure 8</xref> presents attention visualizations for three representative held-out test cases&#x2014;two typical high-quality segmentations and one challenging example with substantial shoulder overlap. In the well-segmented cases, both attention-rollout and Grad-CAM overlays highlight a continuous chain of vertebral bodies and spinous processes spanning the upper through lower cervical levels. This pattern indicates that the transformer encoder integrates information across the full cervical column rather than relying solely on local image patches. Similarly, the query-centric attention map, when the query token is placed on a mid-cervical spinous process, assigns high attention weights to both adjacent and more distant vertebral levels, reinforcing evidence of long-range contextual reasoning. In contrast, the difficult case shows more spatially diffuse attention that partially shifts toward the overlapping shoulder and soft-tissue regions, corresponding to the observed segmentation errors. Collectively, these qualitative results support the conclusion that CervSpineNet&#x2019;s transformer encoder leverages global anatomical context along the cervical spine.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Transformer Attention Maps for ViT-B. Attention maps generated for three test cases; two with good (1, 2) and one with bad metrics and predicted masks (3). <bold>(a)</bold> is the original radiograph, <bold>(b)</bold> is the hybrid model predicted mask, <bold>(c)</bold> is the query centric map, <bold>(d)</bold> represents the attention-rollout map, and <bold>(e)</bold> depicts the Grad-CAM map.</p>
</caption>
<graphic xlink:href="fbioe-13-1733689-g008.tif">
<alt-text content-type="machine-generated">Row of three sets of 5 cervical spine images. In each set: a) X-ray of the neck. b) Segmented areas in white on a black background. c) Neural network heat map showing likely vertebrae positions. d) Detailed heat map indicating vertebrae features. e) Final processed image with key structures highlighted in red and yellow.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3-4">
<label>3.4</label>
<title>Comparison of manual and automated segmentation efficiency</title>
<p>We conducted a systematic analysis to compare the time required for manual versus automated segmentation of cervical spinous processes. For manual annotation, three annotators&#x2014;two trained annotators and one expert spine surgeon&#x2014;outlined the spinous processes on a predefined set of radiographs using a tablet interface. The total manual annotation time was determined by averaging the duration recorded across all three annotators. This process was divided into two stages: (1) annotation, measuring the average time required to trace the spinous processes, and (2) mask generation, recording the time needed to transform these annotations into binary masks using the color-based segmentation algorithm described in <xref ref-type="sec" rid="s2-1">Section 2.1</xref>.</p>
<p>
<xref ref-type="fig" rid="F9">Figure 9</xref> compares the time required for manual versus automated segmentation. <xref ref-type="fig" rid="F9">Figure 9A</xref> summarizes the average annotation time per image for all manual annotators&#x2014;two annotators and one expert&#x2014;alongside the automated CervSpineNet model. Manual annotation times ranged from 46 to 98&#xa0;s, with an overall average of 72&#xa0;s, whereas CervSpineNet produced segmentations in only &#x223c;7.5&#xa0;s per image. <xref ref-type="fig" rid="F9">Figure 9B</xref> shows total processing time, combining annotation and mask-generation stages. Manual processing required approximately 3&#x2013;4&#xa0;min per image (mean &#x3d; 210&#xa0;s). In contrast, automated segmentation using CervSpineNet took only 5&#x2013;10&#xa0;s (mean &#x3d; 7.5&#xa0;s), corresponding to an overall &#x223c;96.43% reduction in total processing time.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>
<bold>(A)</bold> Horizontal bar chart showing average annotation time per image for each manual annotator (Expert, Annotator 1, Annotator 2, and Annotator Group Average) compared with automated segmentation using CervSpineNet. Manual annotation required 46&#x2013;98&#xa0;s per image on average, whereas automated inference required only &#x223c;7.5&#xa0;s. <bold>(B)</bold> Vertical bar chart illustrating total segmentation time (annotation &#x2b; mask generation) for manual versus automated approaches. Manual processing required approximately 3&#x2013;4&#xa0;min per image (mean &#x3d; 210&#xa0;s), while CervSpineNet produced binary masks in 5&#x2013;10&#xa0;s (mean &#x3d; 7.5&#xa0;s), representing a &#x223c;96.43% reduction in total processing time. Error bars indicate standard deviation.</p>
</caption>
<graphic xlink:href="fbioe-13-1733689-g009.tif">
<alt-text content-type="machine-generated">Panel A shows a bar chart comparing the annotation time in seconds across different methods: CervSpineNet (7.5 s), Annotator Group Average (72 s), Annotator 2 (46 s), Annotator 1 (72 s), and Expert Annotator (98 s). Panel B presents a bar chart showing total time per image, with Expert Annotator taking 210 seconds and CervSpineNet taking 7.5 seconds, indicating a 96% reduction in time.</alt-text>
</graphic>
</fig>
<p>Moreover, for inter-rater variability and to compare the model performance to human variability, we selected three of the test set images that the expert and trained annotators had already annotated and compared those segmentations to the masks predicted by the hybrid model using standard overlap and boundary metrics: Dice coefficient, Intersection-over-Union (IoU), 95th-percentile Hausdorff distance (HD95, in pixels), structural similarity index (SSIM), and volumetric similarity (VS).</p>
<p>Human&#x2013;human agreement was high: expert vs. human 1 and expert vs. human 2 reached mean Dice scores of 0.91 (IoU &#x2248; 0.83&#x2013;0.84, HD95 &#x2248; 6.3 pixels, SSIM &#x2248; 0.97, VS &#x2248; 0.96&#x2013;0.98), while human 1 vs. human 2 yielded a similar Dice of 0.92 (IoU 0.85, HD95 6.24 pixels, SSIM 0.97, VS 0.98). On the same images, the proposed model reached a Dice of 0.92 vs. the expert (0.924), with higher IoU (0.86), lower HD95 (5.71 pixels), SSIM of 0.98, and VS of 0.99. Model vs. human 1 and model vs. human 2 comparisons were also in the same range: Dice &#x2248; 0.92, IoU &#x2248; 0.86, HD95 &#x2248; 6.8 and 5.5 pixels, SSIM &#x2248; 0.98. These results collectively indicate that model segmentations are as consistent with the expert as those of the additional human raters, and considering boundary accuracy, HD95 is slightly better than the average human&#x2013;human variability on this sample, though we acknowledge this was based on three test cases.</p>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<label>4</label>
<title>Discussion</title>
<p>This section interprets the results of the study in the context of existing literature, emphasizing the methodological advances, clinical relevance, and potential impact of the developed CervSpineNet model. The discussion also outlines the limitations of the current work and identifies future directions for improving performance and generalizability.</p>
<sec id="s4-1">
<label>4.1</label>
<title>Overview and model interpretation</title>
<p>This study introduces an automated segmentation framework for delineating cervical spinous processes from lateral spine radiographs. Because no public dataset existed for this task and manual annotation is labor-intensive, both a curated dataset and a novel hybrid model, CervSpineNet, were developed. Early zero-shot testing with standard segmentation models produced inconsistent and inaccurate delineations, confirming the need for a dedicated dataset capable of capturing the subtle morphology of spinous processes.</p>
<p>The CervSpineNet combines a Vision Transformer (ViT-B) encoder for global contextual modeling with a U-Net&#x2013;style convolutional decoder for fine-grained boundary reconstruction (<xref ref-type="bibr" rid="B12">Chen J. et al., 2024</xref>; <xref ref-type="bibr" rid="B2">Al-Antari et al., 2025</xref>). This hybrid design maintains full spatial resolution by remaining fully convolutional, ensuring that predicted masks match input dimensions&#x2014;a practical advantage for radiological workflows. Integrated residual and squeeze-and-excitation blocks selectively enhance task-relevant features and stabilize training. Collectively, these design elements allow the model to generate binary masks that closely align with ground truth and exhibit lower structural error, as confirmed across all dataset variants.</p>
<p>Although CervSpineNet follows the overall concept of a transformer&#x2013;CNN hybrid, its architecture is specifically tailored to the demands of cervical spine X-ray segmentation. We decouple the SAM ViT-B image encoder and fine-tune it on 1,024 &#xd7; 1,024 radiographs, leveraging its large-scale pretraining to capture long-range anatomical context along the full C1&#x2013;C7 column. On top of these rich features, we employ a compact U-Net style decoder with residual and squeeze-and-excitation blocks, sharpening local boundaries and preserving thin, curved spinous process shapes. Training is driven by a compound loss that combines Dice overlap, Focal Tversky, Hausdorff distance&#x2013;transform and SSIM terms to jointly encourage correct vertebral coverage, suppression of false positives and accurate contour geometry. In our experiments (<xref ref-type="sec" rid="s3-1">Section 3.1</xref>, <xref ref-type="table" rid="T1">Tables 1</xref>&#x2013;<xref ref-type="table" rid="T3">3</xref>), this SAM-based hybrid consistently outperforms standard U-Net, DeepLabV3&#x2b;, a fine-tuned full SAM model and a text-guided SegFormer, indicating that the proposed combination of encoder, decoder and loss function confers a tangible advantage for this specific small-structure segmentation task.</p>
<p>Overall, the architectural ablations demonstrate that both encoder choice and decoder design substantially influence segmentation performance, albeit in complementary ways. Enhancing the decoder with residual and squeeze-and-excitation blocks consistently improves overlap, volumetric, and boundary metrics, indicating that richer decoding capacity is critical for refining fine spinal structures and suppressing noise. In contrast, replacing a purely convolutional encoder with a ViT-B transformer encoder yields the largest performance gains&#x2014;particularly for boundary accuracy&#x2014;highlighting the importance of long-range contextual reasoning and global shape modeling in cervical spine anatomy. Together, these results validate the design of CervSpineNet: the combination of a transformer-based encoder with an enriched decoder provides measurable advantages, while simpler components remain competitive options for lower-complexity or resource-constrained settings.</p>
<p>Loss ablations similarly show that region-based and structure-aware supervision outperforms Dice alone. Adding either Focal Tversky or SSIM to Dice produces consistent improvements in Dice, IoU, and SSIM, underscoring the value of class-imbalance handling and structural consistency for stable mask learning. The Dice &#x2b; SSIM configuration is particularly effective for overlap and volumetric similarity, reflecting the benefit of enforcing local textural coherence in postoperative spine X-rays. These trends support our use of a compound loss: Dice and FT promote reliable region overlap and recall, SSIM preserves fine anatomical detail, and the HD95 surrogate sharpens boundaries by explicitly penalizing outlier deviations.</p>
</sec>
<sec id="s4-2">
<label>4.2</label>
<title>Performance evaluation and clinical relevance</title>
<p>Across the original, CLAHE-enhanced, and augmented datasets, CervSpineNet consistently outperformed CNN-based and transformer-based baselines on Dice, IoU, SSIM, HD95, and volumetric similarity metrics. Statistical testing confirmed that these improvements were significant, highlighting the robustness and reproducibility of the approach. Notably, the model accurately segmented difficult lower cervical levels (C6&#x2013;C7), which are often challenging even for experts, underscoring its ability to generalize to low-contrast or ambiguous regions.</p>
<p>To the best of our knowledge, this work provides the first public dataset and segmentation framework dedicated to 2D X-ray&#x2013;based cervical spinous process delineation. Accurate and efficient segmentation of these structures has broad clinical value. In Anterior Cervical Discectomy and Fusion (ACDF), precise localization of spinous processes supports surgical planning and fusion evaluation (<xref ref-type="bibr" rid="B37">Neifert et al., 2020</xref>; <xref ref-type="bibr" rid="B31">Martin et al., 2025</xref>). Beyond ACDF, automatic spinous process segmentation could aid in the assessment of vertebral alignment after trauma, the monitoring of spinal deformities such as scoliosis or kyphosis, and image-guided navigation during minimally invasive procedures.</p>
<p>From a workflow standpoint, CervSpineNet offers substantial efficiency gains. It reduces average manual segmentation time from approximately 3&#x2013;4&#xa0;min per image to 5&#x2013;10&#xa0;s, representing a &#x223c;96% time reduction. The lightweight architecture (&#x223c;345&#xa0;MB) and CPU-compatible inference make the model suitable for clinical integration without specialized hardware. Collectively, these features position CervSpineNet as a practical, reproducible tool for advancing AI-assisted cervical spine imaging and analysis.</p>
</sec>
<sec id="s4-3">
<label>4.3</label>
<title>Limitations and future work</title>
<p>Despite strong performance, several limitations warrant consideration. First, training the hybrid architecture is more computationally demanding than lighter CNN baselines due to the transformer encoder and multi-stage decoding. Model performance may also be sensitive to threshold selection, class imbalance, and annotation noise, and the fixed-size image resizing may introduce subtle aspect-ratio bias. Although CervSpineNet performs well on internal data, external validation across different scanners, institutions, and acquisition protocols remains necessary to assess generalization and potential domain shift. Additionally, while we report strong segmentation metrics (Dice, IoU, HD95, etc.), we did not directly evaluate downstream clinical indices&#x2014;such as spinous-process&#x2013;derived angular measures or deformity parameters. Establishing links between CervSpineNet&#x2019;s segmentations and clinically meaningful quantitative metrics will be essential for demonstrating translational impact.</p>
<p>Future optimization efforts should investigate encoder freezing, mixed-precision training, gradient checkpointing, or knowledge distillation to reduce computational burden. Boundary accuracy may be further improved through boundary-aware decoders, attention-gated skip connections, or uncertainty-based quality control mechanisms. Expanding the dataset to incorporate diverse imaging protocols, patient populations, and post-operative presentations will enhance robustness. Extending the framework to multi-view radiographs or 3D modalities (CT, MRI) may also enrich anatomical representation and support more complex clinical tasks.</p>
<p>Prospective clinical validation and workflow-integrated deployment studies will be critical for determining real-world utility in surgical planning, postoperative monitoring, and routine spine care. Beyond surgical settings, automated spinous-process segmentation may facilitate longitudinal deformity surveillance, trauma assessment, and AI-assisted spine navigation. These directions position CervSpineNet as a promising foundation for scalable, clinically relevant spine-imaging solutions.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s5">
<label>5</label>
<title>Conclusion</title>
<p>This study presents CervSpineNet, a hybrid transformer&#x2013;CNN framework for automated segmentation of cervical spinous processes on lateral X-ray images, along with the first curated dataset developed specifically for this task. By coupling a ViT-B encoder that captures global anatomical context with a lightweight convolutional decoder optimized for fine structural reconstruction, CervSpineNet achieves strong performance, with mean Dice scores exceeding 0.93 and SSIM values above 0.98 across multiple dataset variants. Statistical comparisons demonstrate significant improvements over established baselines, and the model reduces manual annotation time by approximately 96%, producing accurate binary masks within 5&#x2013;10&#xa0;s on standard clinical hardware.</p>
<p>With its high accuracy, compact computational footprint, and openly accessible dataset, CervSpineNet offers a practical and reproducible foundation for future clinical integration and methodological research. The framework has potential applications in spine imaging, surgical planning, postoperative monitoring, and quantitative radiology, and it provides a scalable platform for advancing automated analysis of cervical spine anatomy.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="ethics-statement" id="s7">
<title>Ethics statement</title>
<p>Ethical approval was not required for the study involving humans in accordance with the local legislation and institutional requirements. Written informed consent to participate in this study was not required from the participants or the participants&#x2019; legal guardians/next of kin in accordance with the national legislation and the institutional requirements.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>JS: Validation, Formal Analysis, Writing &#x2013; review and editing, Software, Data curation, Writing &#x2013; original draft, Conceptualization, Visualization, Methodology. LM: Visualization, Writing &#x2013; original draft, Validation, Formal Analysis, Writing &#x2013; review and editing, Data curation, Software, Methodology. AN: Writing &#x2013; original draft, Data curation, Writing &#x2013; review and editing, Validation, Methodology, Formal Analysis, Visualization, Software. AM: Methodology, Formal Analysis, Software, Writing &#x2013; review and editing, Data curation, Conceptualization. JB: Writing &#x2013; review and editing, Visualization, Formal Analysis, Writing &#x2013; original draft, Methodology, Data curation. IP: Writing &#x2013; review and editing, Data curation, Methodology. SY: Data curation, Validation, Conceptualization, Writing &#x2013; review and editing. ChM: Data curation, Resources, Writing &#x2013; review and editing, Conceptualization. CaM: Funding acquisition, Writing &#x2013; review and editing, Conceptualization, Writing &#x2013; original draft, Investigation, Resources, Project administration, Formal Analysis, Validation, Supervision, Visualization, Methodology.</p>
</sec>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s11">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/505223/overview">Christopher G. Provatidis</ext-link>, National Technical University of Athens, Greece</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/485878/overview">Zifei Liang</ext-link>, New York University, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3277503/overview">Moxin Zhao</ext-link>, Chinese Academy of Sciences (CAS), China</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ahmad</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Strand</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Sparres&#xe4;ter</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Tarai</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lundstr&#xf6;m</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Bergstr&#xf6;m</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Automatic segmentation of large-scale CT image datasets for detailed body composition analysis</article-title>. <source>BMC Bioinforma.</source> <volume>24</volume>, <fpage>346</fpage>. <pub-id pub-id-type="doi">10.1186/s12859-023-05462-2</pub-id>
<pub-id pub-id-type="pmid">37723444</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Al-Antari</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Al-Tam</surname>
<given-names>R. M.</given-names>
</name>
<name>
<surname>Al-Hejri</surname>
<given-names>A. M.</given-names>
</name>
<name>
<surname>Al-Huda</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Y&#x131;ld&#x131;r&#x131;m</surname>
<given-names>&#xd6;.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>A hybrid segmentation and classification CAD framework for automated myocardial infarction prediction from MRI images</article-title>. <source>Sci. Rep.</source> <volume>15</volume>, <fpage>14196</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-025-98893-1</pub-id>
<pub-id pub-id-type="pmid">40269099</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Antonelli</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Reinke</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bakas</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Farahani</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Kopp-Schneider</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Landman</surname>
<given-names>B. A.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>The medical segmentation decathlon</article-title>. <source>Nat. Commun.</source> <volume>13</volume>, <fpage>4128</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-022-30695-9</pub-id>
<pub-id pub-id-type="pmid">35840566</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ariyaratne</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Jenko</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Iyengar</surname>
<given-names>K. P.</given-names>
</name>
<name>
<surname>Davies</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Azzopardi</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Hughes</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Anatomy and pathologies of the spinous process</article-title>. <source>Diseases</source> <volume>12</volume>, <fpage>302</fpage>. <pub-id pub-id-type="doi">10.3390/diseases12120302</pub-id>
<pub-id pub-id-type="pmid">39727632</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ashkani Chenarlogh</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Shabanzadeh</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ghelich Oghli</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sirjani</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Farzin Moghadam</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Akhavan</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Clinical target segmentation using a novel deep neural network: double attention Res-U-Net</article-title>. <source>Sci. Rep.</source> <volume>12</volume>, <fpage>6717</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-022-10429-z</pub-id>
<pub-id pub-id-type="pmid">35468984</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="web">
<person-group person-group-type="author">
<name>
<surname>Azad</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Khodapanah Aghdam</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Rauland</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Haddadi Avval</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bozorgpour</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Medical image segmentation review: the success of U-Net</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://ui.adsabs.harvard.edu/abs/2022arXiv221114830A">https://ui.adsabs.harvard.edu/abs/2022arXiv221114830A</ext-link> (Accessed November 01, 2022)</comment>.</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bakurov</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Buzzelli</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Schettini</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Castelli</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Vanneschi</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Structural similarity index (SSIM) revisited: a data-driven approach</article-title>. <source>Expert Syst. Appl.</source> <volume>189</volume>, <fpage>116087</fpage>. <pub-id pub-id-type="doi">10.1016/j.eswa.2021.116087</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>W.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>Deep ensemble learning-driven fully automated multi-structure segmentation for precision craniomaxillofacial surgery</article-title>. <source>Front. Bioeng. Biotechnol.</source> <volume>13</volume>, <fpage>1580502</fpage>. <pub-id pub-id-type="doi">10.3389/fbioe.2025.1580502</pub-id>
<pub-id pub-id-type="pmid">40406586</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bichri</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Chergui</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hain</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Investigating the impact of train/test split ratio on the performance of pre-trained models with custom datasets</article-title>. <source>Int. J. Adv. Comput. Sci. Appl.</source> <volume>15</volume>. <pub-id pub-id-type="doi">10.14569/ijacsa.2024.0150235</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="web">
<person-group person-group-type="author">
<name>
<surname>Bucher</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Vu</surname>
<given-names>T.-H.</given-names>
</name>
<name>
<surname>Cord</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>P&#xe9;rez</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Zero-shot semantic segmentation</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://ui.adsabs.harvard.edu/abs/2019arXiv190600817B">https://ui.adsabs.harvard.edu/abs/2019arXiv190600817B</ext-link> (Accessed June 01, 2019)</comment>.</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>J. J.</given-names>
</name>
<name>
<surname>Branstetter</surname>
<given-names>B. F. T.</given-names>
</name>
<name>
<surname>Welch</surname>
<given-names>W. C.</given-names>
</name>
</person-group> (<year>2006</year>). <article-title>Multiple posterior vertebral fusion abnormalities: a case report and review of the literature</article-title>. <source>AJR Am. J. Roentgenol.</source> <volume>186</volume>, <fpage>1256</fpage>&#x2013;<lpage>1259</lpage>. <pub-id pub-id-type="doi">10.2214/AJR.04.1874</pub-id>
<pub-id pub-id-type="pmid">16632715</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Mei</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>Q.</given-names>
</name>
<etal/>
</person-group> (<year>2024a</year>). <article-title>TransUNet: rethinking the U-Net architecture design for medical image segmentation through the lens of transformers</article-title>. <source>Med. Image Anal.</source> <volume>97</volume>, <fpage>103280</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2024.103280</pub-id>
<pub-id pub-id-type="pmid">39096845</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Mo</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Readie</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ligozio</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Mandal</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Jabbar</surname>
<given-names>F.</given-names>
</name>
<etal/>
</person-group> (<year>2024b</year>). <article-title>VertXNet: an ensemble method for vertebral body segmentation and identification from cervical and lumbar spinal X-rays</article-title>. <source>Sci. Rep.</source> <volume>14</volume>, <fpage>3341</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-023-49923-3</pub-id>
<pub-id pub-id-type="pmid">38336974</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Cramer</surname>
<given-names>G. D.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>General characteristics of the spine</article-title>,&#x201d; in <source>Clinical anatomy of the spine, spinal cord, and Ans</source> (<publisher-name>Elsevier</publisher-name>), <fpage>15</fpage>&#x2013;<lpage>64</lpage>.</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Du</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhan</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Medical image segmentation based on U-Net: a review</article-title>. <source>J. Imaging Sci. Technol.</source> <volume>64</volume>, <fpage>020508</fpage>. <pub-id pub-id-type="doi">10.2352/j.imagingsci.technol.2020.64.2.020508</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Farooqi</surname>
<given-names>R. R.</given-names>
</name>
<name>
<surname>Mehmood</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kotwal</surname>
<given-names>H. A.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Hyperplasia of lamina and spinous process of C5 vertebrae and associated hemivertebra at C4 level</article-title>. <source>J. Orthop. Case Rep.</source> <volume>7</volume>, <fpage>79</fpage>&#x2013;<lpage>81</lpage>. <pub-id pub-id-type="doi">10.13107/jocr.2250-0685.698</pub-id>
<pub-id pub-id-type="pmid">28630847</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Galbusera</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Cina</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Image annotation and curation in radiology: an overview for machine learning practitioners</article-title>. <source>Eur. Radiol. Exp.</source> <volume>8</volume>, <fpage>11</fpage>. <pub-id pub-id-type="doi">10.1186/s41747-023-00408-y</pub-id>
<pub-id pub-id-type="pmid">38316659</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Gao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Metaxas</surname>
<given-names>D. N.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>UTNet: a hybrid transformer architecture for medical image segmentation</article-title>,&#x201d; in <source>Medical image computing and computer assisted intervention &#x2013; MICCAI 2021</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>61</fpage>&#x2013;<lpage>71</lpage>.</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Goceri</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Medical image data augmentation: techniques, comparisons and interpretations</article-title>. <source>Artif. Intell. Rev.</source> <volume>56</volume>, <fpage>1</fpage>&#x2013;<lpage>45</lpage>. <pub-id pub-id-type="doi">10.1007/s10462-023-10453-z</pub-id>
<pub-id pub-id-type="pmid">37362888</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gould</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Sohail</surname>
<given-names>O. A.</given-names>
</name>
<name>
<surname>Haines</surname>
<given-names>C. M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Anterior cervical discectomy and fusion: techniques, complications, and future directives</article-title>. <source>Semin. Spine Surg.</source> <volume>32</volume>, <fpage>100772</fpage>. <pub-id pub-id-type="doi">10.1016/j.semss.2019.100772</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ham</surname>
<given-names>D.-W.</given-names>
</name>
<name>
<surname>Choi</surname>
<given-names>Y.-S.</given-names>
</name>
<name>
<surname>Yoo</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Park</surname>
<given-names>S.-M.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>K.-S.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Measurement of interspinous motion in dynamic cervical radiographs using a deep learning-based segmentation model</article-title>. <source>J. Neurosurg. Spine</source> <volume>39</volume>, <fpage>329</fpage>&#x2013;<lpage>334</lpage>. <pub-id pub-id-type="doi">10.3171/2023.5.SPINE23293</pub-id>
<pub-id pub-id-type="pmid">37327141</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hollon</surname>
<given-names>T. C.</given-names>
</name>
<name>
<surname>Pandian</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Adapa</surname>
<given-names>A. R.</given-names>
</name>
<name>
<surname>Urias</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Save</surname>
<given-names>A. V.</given-names>
</name>
<name>
<surname>Khalsa</surname>
<given-names>S. S. S.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Near real-time intraoperative brain tumor diagnosis using stimulated Raman histology and deep neural networks</article-title>. <source>Nat. Med.</source> <volume>26</volume>, <fpage>52</fpage>&#x2013;<lpage>58</lpage>. <pub-id pub-id-type="doi">10.1038/s41591-019-0715-9</pub-id>
<pub-id pub-id-type="pmid">31907460</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hou</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>LM-DeeplabV3&#x2b;: a lightweight image segmentation algorithm based on multi-scale feature interaction</article-title>. <source>Appl. Sci.</source> <volume>14</volume>, <fpage>1558</fpage>. <pub-id pub-id-type="doi">10.3390/app14041558</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Kaiser</surname>
<given-names>J. T.</given-names>
</name>
<name>
<surname>Reddy</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Launico</surname>
<given-names>M. V.</given-names>
</name>
<name>
<surname>Lugo-Pico</surname>
<given-names>J. G.</given-names>
</name>
</person-group> (<year>2025</year>). &#x201c;<article-title>Anatomy, head and neck: cervical vertebrae</article-title>,&#x201d; in <source>StatPearls</source> (<publisher-loc>Treasure Island (FL)</publisher-loc>: <publisher-name>StatPearls Publishing LLC</publisher-name>).</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ketenci &#xc7;ay</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Ye&#x15f;il</surname>
<given-names>&#xc7;.</given-names>
</name>
<name>
<surname>&#xc7;ay</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Y&#x131;lmaz</surname>
<given-names>B. G.</given-names>
</name>
<name>
<surname>&#xd6;z&#xe7;ini</surname>
<given-names>F. H.</given-names>
</name>
<name>
<surname>&#x130;lg&#xfc;y</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>DeepLabv3&#x2b; method for detecting and segmenting apical lesions on panoramic radiography</article-title>. <source>Clin. Oral Investig.</source> <volume>29</volume>, <fpage>101</fpage>. <pub-id pub-id-type="doi">10.1007/s00784-025-06156-0</pub-id>
<pub-id pub-id-type="pmid">39888441</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Khan</surname>
<given-names>A. R.</given-names>
</name>
<name>
<surname>Khan</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Multi-axis vision transformer for medical image segmentation</article-title>. <source>Eng. Appl. Artif. Intell.</source> <volume>158</volume>, <fpage>111251</fpage>. <pub-id pub-id-type="doi">10.1016/j.engappai.2025.111251</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="web">
<person-group person-group-type="author">
<name>
<surname>Kirillov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Mintun</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Ravi</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Mao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Rolland</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gustafson</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Segment anything</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://ui.adsabs.harvard.edu/abs/2023arXiv230402643K">https://ui.adsabs.harvard.edu/abs/2023arXiv230402643K</ext-link> (Accessed April 01, 2023)</comment>.</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Multi-label conditioned diffusion for cardiac MR image augmentation and segmentation</article-title>. <source>Bioeng. (Basel)</source> <volume>12</volume>, <fpage>812</fpage>. <pub-id pub-id-type="doi">10.3390/bioengineering12080812</pub-id>
<pub-id pub-id-type="pmid">40868325</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lones</surname>
<given-names>M. A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>How to avoid machine learning pitfalls: a guide for academic researchers</article-title>. <source>arXiv [cs.LG]</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2108.02497</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>You</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Segment anything in medical images</article-title>. <source>Nat. Commun.</source> <volume>15</volume>, <fpage>654</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-024-44824-z</pub-id>
<pub-id pub-id-type="pmid">38253604</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Martin</surname>
<given-names>C. T.</given-names>
</name>
<name>
<surname>Yoon</surname>
<given-names>S. T.</given-names>
</name>
<name>
<surname>Alluri</surname>
<given-names>R. K.</given-names>
</name>
<name>
<surname>Benzel</surname>
<given-names>E. C.</given-names>
</name>
<name>
<surname>Bono</surname>
<given-names>C. M.</given-names>
</name>
<name>
<surname>Cho</surname>
<given-names>S. K.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>How reliable is the assessment of fusion status following ACDF using dynamic flexion-extension radiographs?</article-title> <source>Glob. Spine J.</source> <volume>15</volume>, <fpage>2450</fpage>&#x2013;<lpage>2457</lpage>. <pub-id pub-id-type="doi">10.1177/21925682241303107</pub-id>
<pub-id pub-id-type="pmid">39639494</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mazurowski</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Konz</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Segment anything model for medical image analysis: an experimental study</article-title>. <source>Med. Image Anal.</source> <volume>89</volume>, <fpage>102918</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2023.102918</pub-id>
<pub-id pub-id-type="pmid">37595404</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mostafa</surname>
<given-names>R. R.</given-names>
</name>
<name>
<surname>Khedr</surname>
<given-names>A. M.</given-names>
</name>
<name>
<surname>Aghbari</surname>
<given-names>Z. A.</given-names>
</name>
<name>
<surname>Afyouni</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Kamel</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Ahmed</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Medical image segmentation approach based on hybrid adaptive differential evolution and crayfish optimizer</article-title>. <source>Comput. Biol. Med.</source> <volume>180</volume>, <fpage>109011</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.109011</pub-id>
<pub-id pub-id-type="pmid">39146840</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Mu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>The combined focal cross entropy and dice loss function for segmentation of protein secondary structures from cryo-EM 3D density maps</article-title>,&#x201d; in <conf-name>2022 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)</conf-name> (<publisher-name>IEEE</publisher-name>).</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Muraina</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Ideal dataset splitting ratios in machine learning algorithms: general concerns for data scientists and data analysts</article-title>,&#x201d; in <conf-name>7th International Mardin Artuklu Scientific Researches Conference</conf-name>. (<publisher-loc>Mardin, Turkey</publisher-loc>).</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Muthukrishnan</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Jaipurkar</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Damodaran</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Continuum topological derivative - a novel application tool for segmentation of CT and MRI images</article-title>. <source>Neuroimage Rep.</source> <volume>4</volume>, <fpage>100215</fpage>. <pub-id pub-id-type="doi">10.1016/j.ynirp.2024.100215</pub-id>
<pub-id pub-id-type="pmid">40568569</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Neifert</surname>
<given-names>S. N.</given-names>
</name>
<name>
<surname>Martini</surname>
<given-names>M. L.</given-names>
</name>
<name>
<surname>Yuk</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Mcneill</surname>
<given-names>I. T.</given-names>
</name>
<name>
<surname>Caridi</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Steinberger</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Predicting trends in cervical spinal surgery in the United States from 2020 to 2040</article-title>. <source>World Neurosurg.</source> <volume>141</volume>, <fpage>e175</fpage>&#x2013;<lpage>e181</lpage>. <pub-id pub-id-type="doi">10.1016/j.wneu.2020.05.055</pub-id>
<pub-id pub-id-type="pmid">32416237</pub-id>
</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ouyang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Ghorbani</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Ebinger</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Langlotz</surname>
<given-names>C. P.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Video-based AI for beat-to-beat assessment of cardiac function</article-title>. <source>Nature</source> <volume>580</volume>, <fpage>252</fpage>&#x2013;<lpage>256</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-020-2145-8</pub-id>
<pub-id pub-id-type="pmid">32269341</pub-id>
</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Posthuma De Boer</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Van Wulfften Palthe</surname>
<given-names>A. F. Y.</given-names>
</name>
<name>
<surname>Stadhouder</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bloemers</surname>
<given-names>F. W.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>The clay shoveler&#x2019;s fracture: a case report and review of the literature</article-title>. <source>J. Emerg. Med.</source> <volume>51</volume>, <fpage>292</fpage>&#x2013;<lpage>297</lpage>. <pub-id pub-id-type="doi">10.1016/j.jemermed.2016.03.020</pub-id>
<pub-id pub-id-type="pmid">27262733</pub-id>
</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ramachandran</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Eswarlal</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Lehman</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Colbert</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Assessment of optimizers and their performance in autosegmenting lung tumors</article-title>. <source>J. Med. Phys.</source> <volume>48</volume>, <fpage>129</fpage>&#x2013;<lpage>135</lpage>. <pub-id pub-id-type="doi">10.4103/jmp.jmp_54_23</pub-id>
<pub-id pub-id-type="pmid">37576091</pub-id>
</mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ran</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Qin</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Qin</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>A high-quality dataset featuring classified and annotated cervical spine X-ray atlas</article-title>. <source>Sci. Data</source> <volume>11</volume>, <fpage>625</fpage>. <pub-id pub-id-type="doi">10.1038/s41597-024-03383-0</pub-id>
<pub-id pub-id-type="pmid">38871800</pub-id>
</mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Riew</surname>
<given-names>K. D.</given-names>
</name>
<name>
<surname>Ecker</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Dettori</surname>
<given-names>J. R.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Anterior cervical discectomy and fusion for the management of axial neck pain in the absence of radiculopathy or myelopathy</article-title>. <source>Evid. Based Spine Care J.</source> <volume>1</volume>, <fpage>45</fpage>&#x2013;<lpage>50</lpage>. <pub-id pub-id-type="doi">10.1055/s-0030-1267067</pub-id>
<pub-id pub-id-type="pmid">22956927</pub-id>
</mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ronneberger</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Fischer</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Brox</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2015</year>). <source>U-Net: Convolutional networks for biomedical image segmentation</source>. <publisher-name>Springer International Publishing</publisher-name>, <fpage>234</fpage>&#x2013;<lpage>241</lpage>.</mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shim</surname>
<given-names>J.-H.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>W. S.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>K. G.</given-names>
</name>
<name>
<surname>Yee</surname>
<given-names>G. T.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>Y. J.</given-names>
</name>
<name>
<surname>Jeong</surname>
<given-names>T. S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Evaluation of U-Net models in automated cervical spine and cranial bone segmentation using X-ray images for traumatic atlanto-occipital dislocation diagnosis</article-title>. <source>Sci. Rep.</source> <volume>12</volume>, <fpage>21438</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-022-23863-w</pub-id>
<pub-id pub-id-type="pmid">36509842</pub-id>
</mixed-citation>
</ref>
<ref id="B45">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Son</surname>
<given-names>W. J.</given-names>
</name>
<name>
<surname>Ahn</surname>
<given-names>S. J.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>J. Y.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Automated brain segmentation on computed tomographic images using perceptual loss based convolutional neural networks</article-title>. <source>Investig. Magn. Reson. Imaging</source> <volume>28</volume>, <fpage>193</fpage>. <pub-id pub-id-type="doi">10.13104/imri.2024.0023</pub-id>
</mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Taghanaki</surname>
<given-names>S. A.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kevin Zhou</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Georgescu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Sharma</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Combo loss: handling input and output imbalance in multi-organ segmentation</article-title>. <source>Comput. Med. Imaging Graph.</source> <volume>75</volume>, <fpage>24</fpage>&#x2013;<lpage>33</lpage>. <pub-id pub-id-type="doi">10.1016/j.compmedimag.2019.04.005</pub-id>
<pub-id pub-id-type="pmid">31129477</pub-id>
</mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tejani</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>Klontzas</surname>
<given-names>M. E.</given-names>
</name>
<name>
<surname>Gatti</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Mongan</surname>
<given-names>J. T.</given-names>
</name>
<name>
<surname>Moy</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Park</surname>
<given-names>S. H.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Checklist for artificial intelligence in medical imaging (CLAIM): 2024 update</article-title>. <source>Radiol. Artif. Intell.</source> <volume>6</volume>, <fpage>e240300</fpage>. <pub-id pub-id-type="doi">10.1148/ryai.240300</pub-id>
<pub-id pub-id-type="pmid">38809149</pub-id>
</mixed-citation>
</ref>
<ref id="B48">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Van Santbrink</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Schuermans</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Cerfonteijn</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Breeuwer</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Smeets</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Van Santbrink</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>AI-assisted image recognition of cervical spine vertebrae in dynamic X-ray recordings</article-title>. <source>Bioeng. (Basel)</source> <volume>12</volume>, <fpage>679</fpage>. <pub-id pub-id-type="doi">10.3390/bioengineering12070679</pub-id>
<pub-id pub-id-type="pmid">40722371</pub-id>
</mixed-citation>
</ref>
<ref id="B49">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Zuluaga</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Pratt</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Patel</surname>
<given-names>P. A.</given-names>
</name>
<name>
<surname>Aertsen</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>DeepIGeoS: a deep interactive geodesic framework for medical image segmentation</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>41</volume>, <fpage>1559</fpage>&#x2013;<lpage>1572</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2018.2840695</pub-id>
<pub-id pub-id-type="pmid">29993532</pub-id>
</mixed-citation>
</ref>
<ref id="B50">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lv</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2021a</year>). <article-title>SAR-U-Net: squeeze-and-excitation block and atrous spatial pyramid pooling based residual U-Net for automatic liver segmentation in computed tomography</article-title>. <source>Comput. Methods Programs Biomed.</source> <volume>208</volume>, <fpage>106268</fpage>. <pub-id pub-id-type="doi">10.1016/j.cmpb.2021.106268</pub-id>
<pub-id pub-id-type="pmid">34274611</pub-id>
</mixed-citation>
</ref>
<ref id="B51">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2021b</year>). <article-title>Annotation-efficient deep learning for automatic medical image segmentation</article-title>. <source>Nat. Commun.</source> <volume>12</volume>, <fpage>5915</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-021-26216-9</pub-id>
<pub-id pub-id-type="pmid">34625565</pub-id>
</mixed-citation>
</ref>
<ref id="B52">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Agarwal</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>MedCLIP: contrastive learning from unpaired medical images and text</article-title>. <source>Proc. Conf. Empir. Methods Nat. Lang. Process</source> <volume>2022</volume>, <fpage>3876</fpage>&#x2013;<lpage>3887</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2022.emnlp-main.256</pub-id>
<pub-id pub-id-type="pmid">39144675</pub-id>
</mixed-citation>
</ref>
<ref id="B53">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Xiao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Dou</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2025a</year>). <article-title>Cervical vertebral body segmentation in X-ray and magnetic resonance imaging based on YOLO-UNet: automatic segmentation approach and available tool</article-title>. <source>Digit. Health</source> <volume>11</volume>, <fpage>20552076251347695</fpage>. <pub-id pub-id-type="doi">10.1177/20552076251347695</pub-id>
<pub-id pub-id-type="pmid">40469781</pub-id>
</mixed-citation>
</ref>
<ref id="B54">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Mu</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2025b</year>). <article-title>Lightweight multi-stage aggregation transformer for robust medical image segmentation</article-title>. <source>Med. Image Anal.</source> <volume>103</volume>, <fpage>103569</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2025.103569</pub-id>
<pub-id pub-id-type="pmid">40279826</pub-id>
</mixed-citation>
</ref>
<ref id="B55">
<mixed-citation publication-type="web">
<person-group person-group-type="author">
<name>
<surname>Wolfrath</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Wolfrath</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Banerjee</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kothari</surname>
<given-names>A. N.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Stronger baseline models &#x2013; a key requirement for aligning machine learning research with clinical utility</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://ui.adsabs.harvard.edu/abs/2024arXiv240912116W">https://ui.adsabs.harvard.edu/abs/2024arXiv240912116W</ext-link> (Accessed September 01, 2024)</comment>.</mixed-citation>
</ref>
<ref id="B56">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>A medical image segmentation method based on multi-dimensional statistical features</article-title>. <source>Front. Neurosci.</source> <volume>16</volume>, <fpage>1009581</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2022.1009581</pub-id>
<pub-id pub-id-type="pmid">36188458</pub-id>
</mixed-citation>
</ref>
<ref id="B57">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Application of MRI image segmentation algorithm for brain tumors based on improved YOLO</article-title>. <source>Front. Neurosci.</source> <volume>18</volume>, <fpage>1510175</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2024.1510175</pub-id>
<pub-id pub-id-type="pmid">39840016</pub-id>
</mixed-citation>
</ref>
<ref id="B58">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yuan</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Medical image segmentation with UNet-based multi-scale context fusion</article-title>. <source>Sci. Rep.</source> <volume>14</volume>, <fpage>15687</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-024-66585-x</pub-id>
<pub-id pub-id-type="pmid">39468067</pub-id>
</mixed-citation>
</ref>
<ref id="B59">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Hei</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Automatic medical image segmentation with vision transformer</article-title>. <source>Appl. Sci. (Basel)</source> <volume>14</volume>, <fpage>2741</fpage>. <pub-id pub-id-type="doi">10.3390/app14072741</pub-id>
</mixed-citation>
</ref>
<ref id="B60">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Mao</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>A real-time cell image segmentation method based on multi-scale feature fusion</article-title>. <source>Bioengineering</source> <volume>12</volume>, <fpage>843</fpage>. <pub-id pub-id-type="doi">10.3390/bioengineering12080843</pub-id>
<pub-id pub-id-type="pmid">40868356</pub-id>
</mixed-citation>
</ref>
<ref id="B61">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Zuiderveld</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>1994</year>). &#x201c;<article-title>Contrast limited adaptive histogram equalization</article-title>,&#x201d; in <source>Graphics gems IV</source> (<publisher-name>Academic Press Professional, Inc.</publisher-name>), <fpage>474</fpage>&#x2013;<lpage>485</lpage>.</mixed-citation>
</ref>
</ref-list>
</back>
</article>