<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Oncol.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Oncology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Oncol.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2234-943X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fonc.2026.1731007</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Assessing the robustness and clinical evaluation of a deep-learning segmentation model for head and neck cancer</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Schanne</surname><given-names>Daniel H.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn003"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2801120/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Cuenot</surname><given-names>L&#xe9;andre</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="fn003"><sup>&#x2020;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Br&#xfc;ningk</surname><given-names>Sarah</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Reyes</surname><given-names>Mauricio</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="fn004"><sup>&#x2021;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
<contrib contrib-type="author" corresp="yes" equal-contrib="yes">
<name><surname>Elicin</surname><given-names>Olgun</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<xref ref-type="author-notes" rid="fn004"><sup>&#x2021;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/726171/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Radiation Oncology, Inselspital, Bern University Hospital and University of Bern</institution>, <city>Bern</city>,&#xa0;<country country="CH">Switzerland</country></aff>
<aff id="aff2"><label>2</label><institution>ARTORG Center for Biomedical Engineering Research, University of Bern</institution>, <city>Bern</city>,&#xa0;<country country="CH">Switzerland</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Olgun Elicin, <email xlink:href="mailto:olgun.elicin@insel.ch">olgun.elicin@insel.ch</email></corresp>
<fn fn-type="equal" id="fn003">
<label>&#x2020;</label>
<p>These authors have contributed equally to this work and share first authorship</p></fn>
<fn fn-type="equal" id="fn004">
<label>&#x2021;</label>
<p>These authors have contributed equally to this work and share last authorship</p></fn>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-13">
<day>13</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>16</volume>
<elocation-id>1731007</elocation-id>
<history>
<date date-type="received">
<day>23</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>28</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>22</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Schanne, Cuenot, Br&#xfc;ningk, Reyes and Elicin.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Schanne, Cuenot, Br&#xfc;ningk, Reyes and Elicin</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-13">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Background and purpose</title>
<p>Deep learning (DL)-based autosegmentation has improved delineation of organs at risk in radiotherapy for head and neck cancer (HNC). However, automated segmentation of gross tumor volumes (GTVp, GTVn) remains challenging, and robustness under real-world imaging conditions is insufficiently characterized. This study evaluates the robustness and clinical usability of a DL-based PET/CT segmentation model for HNC under clinically relevant perturbations.</p>
</sec>
<sec>
<title>Materials and methods</title>
<p>A 3D Dynamic U-Net was trained on the public HECKTOR 2022 dataset (474 training, 50 test cases). Synthetic perturbations (noise, blur, ghosting, bias-field, spike noise, and motion) were applied to PET and CT images at varying severity levels, generating 36 variants per patient. Segmentation quality was measured using Dice score, Hausdorff Distance, and accuracy. Clinical usability was assessed for 50 baseline and 18 perturbed cases by two clinicians using a five-point Likert scale. Radiomic features were correlated with robustness metrics.</p>
</sec>
<sec>
<title>Results</title>
<p>Baseline Dice scores were 0.766 (GTVp) and 0.698 (GTVn). Performance dropped significantly under spike noise and bias-field artifacts, especially for GTVn. Clinical usability remained high for GTVp (77.8%) but declined to 27.9% for GTVn under severe perturbations. Lesion volume and surface complexity positively correlated with robustness degradation, while high PET contrast offered protective effects against certain perturbations.</p>
</sec>
<sec>
<title>Conclusion</title>
<p>DL-based PET/CT segmentation models for HNC show strong baseline performance and robustness for primary tumors. However, nodal tumor segmentation remains vulnerable to specific image artifacts. Enhancing robustness through targeted data augmentation and validation under variable conditions is essential for clinical integration.</p>
</sec>
</abstract>
<kwd-group>
<kwd>autosegmentation</kwd>
<kwd>deep learning</kwd>
<kwd>head and neck cancer</kwd>
<kwd>PET/CT</kwd>
<kwd>robustness</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="4"/>
<table-count count="2"/>
<equation-count count="0"/>
<ref-count count="27"/>
<page-count count="10"/>
<word-count count="4665"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Head and Neck Cancer</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Head and neck cancer (HNC) is the sixth most common malignancy globally, with radiotherapy (RT) constituting one of the primary therapeutic modalities (<xref ref-type="bibr" rid="B1">1</xref>). RT planning is complex and requires precise delineation of target volumes and organs at risk (OAR), traditionally a manual and labor-intensive task performed by radiation oncologists (<xref ref-type="bibr" rid="B2">2</xref>&#x2013;<xref ref-type="bibr" rid="B4">4</xref>). Recent advancements in deep learning (DL)-based autosegmentation models have significantly impacted this step, achieving reliable OAR segmentation with few manual corrections when no gross changes in anatomy are present (<xref ref-type="bibr" rid="B5">5</xref>). However, DL segmentation of target structures remains challenging. Gross tumor volumes (GTV) often present as irregularly shaped lesions that can cross anatomical boundaries and infiltrate adjacent structures, complicating automatic segmentation. Clinical target volumes (CTV), which encompass regions of suspected microscopic tumor spread, rely on extensive domain knowledge of tumor biology and anatomical spread patterns, posing additional hurdles for automated methods. Anatomical changes following pretreatment procedures, such as surgery or chemotherapy, further complicate accurate delineation (<xref ref-type="bibr" rid="B6">6</xref>, <xref ref-type="bibr" rid="B7">7</xref>).</p>
<p>Despite these challenges, significant progress has been achieved in DL-based autosegmentation for HNC (<xref ref-type="bibr" rid="B8">8</xref>, <xref ref-type="bibr" rid="B9">9</xref>). Recent literature demonstrates promising results, particularly regarding the integration of multimodal imaging like computed tomography (CT) and fluorodeoxyglucose positron emission tomography (FDG-PET). Recently, the HECKTOR (Head and Neck Tumor segmentation and Outcome prediction in PET/CT images) challenges have played a pivotal role in driving this progress by providing standardized, annotated datasets and evaluating model robustness and clinical relevance. For instance, in the HECKTOR 2022 challenge, the winning ensemble achieved Dice similarity coefficients of 0.788 for primary tumors (GTVp) and nodal metastases (GTVn) segmentation, underscoring the capability of advanced DL architectures to approach clinical standards in segmentation accuracy (<xref ref-type="bibr" rid="B10">10</xref>).</p>
<p>Despite these successes, critical gaps remain. First, the robustness of DL segmentation models under realistic clinical perturbations, such as anatomical changes, patient movement, imaging noise, or varying acquisition protocols, remains insufficiently characterized. Model performance can decline when faced with data of different quality, from changed imaging equipment, or from new sources, underscoring the need for rigorous robustness evaluations. Recent work suggests that aggressive data augmentation can improve network resilience to imaging variability (<xref ref-type="bibr" rid="B11">11</xref>, <xref ref-type="bibr" rid="B12">12</xref>). Furthermore, conventional DL metrics, such as the Dice coefficient or Hausdorff distance (HD), might not fully capture the clinical utility and acceptability of segmentation outputs from the clinician&#x2019;s perspective (<xref ref-type="bibr" rid="B13">13</xref>). For clinical implementation, a DL model must demonstrate not only strong quantitative performance but also high clinical usability and robustness to image quality perturbations commonly encountered in clinical routine.</p>
<p>This study addresses these gaps by rigorously evaluating the performance and robustness of a 3D Dynamic U-Net-based DL segmentation model trained on the publicly available HECKTOR 2022 PET/CT dataset (<uri xlink:href="https://hecktor.grand-challenge.org/Data/">https://hecktor.grand-challenge.org/Data/</uri>). Specifically, we assess the robustness of the segmentation performance under various synthetic perturbations representing clinically relevant image degradation. Additionally, we correlate traditional DL metrics with clinical grading by experienced HNC-specialist radiation oncologists, providing direct insights into the clinical relevance and usability of DL segmentation outputs. By systematically analyzing the correlation between image-derived radiomic features and segmentation robustness, we also aim to identify lesion-specific factors influencing model stability.</p>
<p>In contrast to performance-driven studies that primarily focus on improving algorithms, our goal here is to establish a clear and reproducible characterization of model robustness under realistic perturbations. By defining the boundaries of current state-of-the-art segmentation in head and neck cancer, we provide a factual basis for future methodological work on mitigation strategies, while keeping the present study focused on systematic evaluation and clinical relevance.</p>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Dataset and reference contours</title>
<p>We used the public MICCAI HECKTOR 2022 PET/CT dataset (524 patients from nine European/North American centers: <ext-link ext-link-type="uri" xlink:href="https://hecktor.grand-challenge.org/Data/">https://hecktor.grand-challenge.org/Data/</ext-link>). Expert-contoured structures of the GTVp and GTVn provided in the dataset served as the reference standard. Because the data are fully anonymized, additional ethics approval was waived.</p>
<p>A stratified 90/10 split yielded a development cohort of 474 patients and an internal hold-out cohort of 50 patients. Development images were trained in five-fold cross-validation; the 50 hold-out cases were reserved exclusively for robustness and clinical-grading experiments. These were categorized into two groups: one consisting of baseline images (without perturbations), representing 32 images, and the other comprising perturbed images modified by the three perturbations yielding the largest performance decrease. For each perturbation and modality, three cases were selected, resulting in a total of six cases for each of the three perturbations, accounting for the final 18 cases (i.e., 3 cases &#xd7; 3 perturbations &#xd7; 2 modalities).</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Pre&#x2212;processing and augmentation</title>
<p>Aspects of the preprocessing procedure and further data augmentation were adapted from the methods employed by the winning team of the HECKTOR Challenge 2022 (<xref ref-type="bibr" rid="B10">10</xref>). In brief, PET volumes were first resampled to the native CT matrix (524 &#xd7; 524 px) and then both modalities were interpolated to 1 mm&#xb3; isotropic voxels. A head&#x2212;centered crop of 200 &#xd7; 200 &#xd7; &#x2264; 310 voxels removed irrelevant lower&#x2212;body anatomy.</p>
<p>CT densities were clipped to &#xb1;3 SD, min&#x2212;max scaled to [0, 1]; PET SUVs were z&#x2212;normalized. Training data underwent random affine jitter, flips, and CT&#x2212;only density transforms (Gaussian noise, smoothing, contrast, shift). All augmentations were applied with an occurrence probability of 20%. Training patches measured 192 &#xd7; 192 &#xd7; 192 voxels and were centered based on labels, with a 10% probability of being centered on background, 45% on primary tumors, and 45% on nodal tumors. In cases where only one tumor type was present, the sampling probability for that tumor was increased to 90%.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Network and training</title>
<p>A 3D Dynamic U-Net, implemented in MONAI (<xref ref-type="bibr" rid="B14">14</xref>), was adopted for the task. The model features six encoder&#x2013;decoder stages, with imaging information inputted via two channels (CT, PET), and yields three probability maps (background, GTVp, GTVn). Additionally, batch normalization was implemented in conjunction with residual blocks to enhance training stability and model performance. The model was trained using a hybrid loss function that combined Dice and cross-entropy losses. A five-fold cross-validation strategy was used following common practices to enhance model generalization. Training parameters were set based on literature reports of previous winning entries of the HECKTOR challenge: AdamW optimizer, learning rate of 1e-4, weight decay of 3e-5, batch size of 2. Training lasted 100 epochs per fold with mixed-precision floating point on a workstation-class NVIDIA A100 GPU. To ensure the final model was robust and representative of state-of-the-art performance, inference was performed using an ensemble of the five models trained during the 5-fold cross-validation. The final segmentation masks were generated by averaging the softmax probability maps from all five folds before applying the 0.5 threshold (obtained through the validation set). This ensembling strategy aligns with the methodology of top-performing teams in the HECKTOR challenge.</p>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Inference</title>
<p>Each test volume was forwarded once (no test&#x2212;time augmentation). Soft&#x2212;max probabilities were thresholded at 0.5; small, isolated components were retained, as validation showed &lt; 2% false positives.</p>
</sec>
<sec id="s2_5">
<label>2.5</label>
<title>Robustness protocol</title>
<p>Six TorchIO (<xref ref-type="bibr" rid="B15">15</xref>) perturbations, Gaussian and spike noise, bias-field, motion, blur, ghosting, were applied at three severity levels to CT and PET separately, creating 36 variants per patient plus the baseline. Segmentation quality was measured with Dice, HD95%, sensitivity, specificity, and accuracy. Dice loss was defined as 1 &#x2212; 2|P &#x2229; Y| / (|P| + |Y|), where P is the prediction and Y is the ground truth.</p>
<p>Inspired by Boone et&#xa0;al. (<xref ref-type="bibr" rid="B16">16</xref>), we assessed model robustness as &#x394;Dice = Dice_baseline &#x2212; Dice_perturbed. P-values from paired Wilcoxon tests were Benjamini&#x2013;Hochberg corrected.</p>
</sec>
<sec id="s2_6">
<label>2.6</label>
<title>Correlation based on texture analysis</title>
<p>Thirteen shape/density descriptors (volume, surface, boundary length, compactness, centroid distance, CT/PET variability, CT/PET contrast, SUVmax, mean CT number, regions, entropy) were extracted with scikit&#x2212;image. Pearson coefficients (&#x3c1;) between each property and &#x394;Dice were computed for every perturbation.</p>
</sec>
<sec id="s2_7">
<label>2.7</label>
<title>Clinical grading study</title>
<p>Two radiation oncologists with 12 and 17 years of experience in treating HNC graded segmentation usability on a five-point Likert scale (1 = unusable, 2 = requires significant modifications, 3 = requires some modifications, 4 = requires minor modifications, 5 = fully acceptable). All 32 baseline cases and 18 representative perturbed cases (the three most deleterious artefacts) were reviewed.</p>
</sec>
<sec id="s2_8">
<label>2.8</label>
<title>Statistical analyses</title>
<p>Inter&#x2212;observer agreement used weighted Cohen&#x2019;s &#x3ba;. To better account for the ordinal nature of the Likert scale, relationships between quantitative metrics and clinical grades were assessed using Spearman&#x2019;s rank correlation (&#x3c1;), consistent with the analysis. For the exploratory radiomics analysis, correlations between image features and robustness metrics were computed using Pearson&#x2019;s coefficients, with p-values adjusted for multiple comparisons using the Benjamini-Hochberg procedure to control the false discovery rate. Code is available at <ext-link ext-link-type="uri" xlink:href="https://github.com/Leandre354/ECEProject">https://github.com/Leandre354/ECEProject</ext-link> for reproducibility.</p>
</sec>
</sec>
<sec id="s3" sec-type="results">
<label>3</label>
<title>Results</title>
<sec id="s3_1">
<label>3.1</label>
<title>Dataset and the patient cohort</title>
<p>The training and test dataset included 524 patients with oropharyngeal cancer drawn from multi-institutional (n=9) cohorts. Median age was 61 years in training (IQR 54&#x2013;67) and 60 years in testing (IQR 55&#x2013;64), and the cohorts were predominantly male (82% in both groups). The high HPV-positivity rate (81-95%) further reflects the contemporary profile of HPV-associated oropharyngeal cancers (<xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>).</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Summary of patient demographics and clinical characteristics in the HECKTOR training and testing cohorts.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Variable</th>
<th valign="middle" align="left">Training n (%)</th>
<th valign="middle" align="left">Baseline test n (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Median age (IQR)</td>
<td valign="middle" align="left">61 (54-67)</td>
<td valign="middle" align="left">60 (55-64)</td>
</tr>
<tr>
<td valign="middle" align="left">Male sex</td>
<td valign="middle" align="left">401 (81.8%)</td>
<td valign="middle" align="left">26 (81.2%)</td>
</tr>
<tr>
<td valign="middle" align="left">Stage I-II</td>
<td valign="middle" align="left">258 (54.5%)</td>
<td valign="middle" align="left">20 (62.5%)</td>
</tr>
<tr>
<td valign="middle" align="left">Stage III-IV</td>
<td valign="middle" align="left">216 (45.5%)</td>
<td valign="middle" align="left">12 (37.5%)</td>
</tr>
<tr>
<td valign="middle" align="left">N0</td>
<td valign="middle" align="left">49 (10.3%)</td>
<td valign="middle" align="left">2 (6.3%)</td>
</tr>
<tr>
<td valign="middle" align="left">N1</td>
<td valign="middle" align="left">73 (15.4%)</td>
<td valign="middle" align="left">7 (21.9%)</td>
</tr>
<tr>
<td valign="middle" align="left">N2*</td>
<td valign="middle" align="left">202 (42.6%)</td>
<td valign="middle" align="left">14 (43.8%)</td>
</tr>
<tr>
<td valign="middle" align="left">N2a</td>
<td valign="middle" align="left">7 (1.5%)</td>
<td valign="middle" align="left">1 (3.1%)</td>
</tr>
<tr>
<td valign="middle" align="left">N2b</td>
<td valign="middle" align="left">71 (15.0%)</td>
<td valign="middle" align="left">4 (12.5%)</td>
</tr>
<tr>
<td valign="middle" align="left">N2c</td>
<td valign="middle" align="left">41 (8.6%)</td>
<td valign="middle" align="left">2 (6.3%)</td>
</tr>
<tr>
<td valign="middle" align="left">N3</td>
<td valign="middle" align="left">31 (6.5%)</td>
<td valign="middle" align="left">2 (6.3%)</td>
</tr>
<tr>
<td valign="middle" align="left">HPV positive</td>
<td valign="middle" align="left">260 (81.3%)</td>
<td valign="middle" align="left">19 (95%)</td>
</tr>
<tr>
<td valign="middle" align="left">HPV negative</td>
<td valign="middle" align="left">60 (18.8%)</td>
<td valign="middle" align="left">1 (5%)</td>
</tr>
<tr>
<td valign="middle" align="left">HPV information missing</td>
<td valign="middle" align="left">154</td>
<td valign="middle" align="left">12</td>
</tr>
<tr>
<td valign="middle" align="left">Tobacco use</td>
<td valign="middle" align="left">96 (50.8%)</td>
<td valign="middle" align="left">12 (85.7%)</td>
</tr>
<tr>
<td valign="middle" align="left">No tobacco use</td>
<td valign="middle" align="left">93 (49.2%)</td>
<td valign="middle" align="left">2 (14.3%)</td>
</tr>
<tr>
<td valign="middle" align="left">Missing tobacco use</td>
<td valign="middle" align="left">285</td>
<td valign="middle" align="left">18</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>AJCC/UICC 7th edition was used for staging.</p></fn>
<fn>
<p>Data are presented as counts with percentages or median with interquartile range (IQR). Missing data counts are reported for key clinical variables.</p></fn>
<fn>
<p>*Incomplete N2 subcategorization in the public dataset.</p></fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Baseline segmentation performance</title>
<p>Cross-validation yielded a median Dice of 0.766 (IQR 0.195) for GTVp and 0.698 (IQR 0.313) for GTVn (development cohort median [IQR] 0.689 [0.353] and 0.719 [0.337], respectively). On the 50-patient hold-out set, the network reproduced these scores within &#xb1;0.01. <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref> summarizes lesion-level statistics and segmentation failure counts. Median HD95% on the 50-case hold-out were 9.2 mm (IQR: 14.3) for GTVp and 17.6 mm (IQR: 62.5) for GTVn.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Baseline lesion&#x2212;level performance.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Structure</th>
<th valign="middle" align="left">N cases</th>
<th valign="middle" align="left">Mean dice</th>
<th valign="middle" align="left">Median dice</th>
<th valign="middle" align="left">IQR_Dice</th>
<th valign="middle" align="left">False negatives</th>
<th valign="middle" align="left">False positives</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">GTVp</td>
<td valign="middle" align="center">50</td>
<td valign="middle" align="center">0.678</td>
<td valign="middle" align="center">0.766</td>
<td valign="middle" align="center">0.195</td>
<td valign="middle" align="center">4 (8%)</td>
<td valign="middle" align="center">3 (6%)</td>
</tr>
<tr>
<td valign="middle" align="left">GTVn</td>
<td valign="middle" align="center">50</td>
<td valign="middle" align="center">0.614</td>
<td valign="middle" align="center">0.698</td>
<td valign="middle" align="center">0.313</td>
<td valign="middle" align="center">5 (10%)</td>
<td valign="middle" align="center">5 (10%)</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>GTVp, primary tumor gross total volume; GTVn, metastatic lymph node gross total volume.</p></fn>
<fn>
<p>Values are lesion&#x2212;level. &#x201c;False Negatives&#x201d; = Dice 0 (no overlap). &#x201c;False Positives&#x201d; = reference volume 0; the network produced non&#x2212;zero voxels.</p></fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Effect of synthetic perturbations and drivers of robustness</title>
<p>Across all modalities and structures, blur, ghosting, and rigid motion artifacts produced a negligible median &#x394;Dice (&lt; 0.05, <xref ref-type="supplementary-material" rid="SM1"><bold>Supplementary Table&#xa0;1</bold></xref>). In contrast, spike noise and bias&#x2212;field shifts were most deleterious, especially for GTVn, occasionally erasing the whole structure. Median &#x394;Dice was markedly higher for GTVn than for GTVp (<xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>, <xref ref-type="supplementary-material" rid="SM1"><bold>Supplementary Figures&#xa0;1</bold></xref>-<xref ref-type="supplementary-material" rid="SM1"><bold>3</bold></xref>).</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Perturbations applied to nodal tumor contours in the CT. Each of the three box plots corresponds to one of three levels of perturbation severity, providing a visual representation of how segmentation performance is affected at increasing degrees of perturbation.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-16-1731007-g001.tif">
<alt-text content-type="machine-generated">Grid of thirty-five boxplots compares segmentation performance metrics across seven conditions: BASE, BLUR, NOISE, GHOST, SPIKE, BIAS, and MOTION. Metrics shown are DICE, Hausdorff, Sensitivity, Specificity, and Accuracy, plus their changes under artifacts. Each column represents a different artifact type or baseline, and each row a different metric.</alt-text>
</graphic></fig>
<p>Lesion size drove susceptibility to high&#x2212;variance artefacts: volume, surface area, and boundary length correlated positively with &#x394;Dice under spike and high&#x2212;variance noise (&#x3c1; &#x2264; 0.62; p &lt; 2e-6). Entropy and compactness showed smaller, yet significant, associations (&#x3c1; &#x2264; 0.39; p = 0.005). On the other hand, higher PET signal contrast was mildly protective against the Dice losses produced by blur and motion artefacts (&#x3c1; &#x2264; 0.31; p = 0.028), although this association did not remain statistically significant after Benjamini-Hochberg correction. No descriptor explained variability under ghosting.</p>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Clinical usability, observer agreement, and metric-grade linkage</title>
<p>On unperturbed scans, 79.7% of GTVp and 73.4% of GTVn were rated clinically usable (average of both observers with &#x2265; 3 points, <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>). Perturbations reduced GTVn usability to an average of 27.9%, whereas GTVp usability remained high (77.8%), as presented in <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Sankey diagram illustrating inter-observer variability in clinical usability scoring. The flow of cases represents the correspondence between usability scores assigned by Observer 1 (left) and Observer 2 (right) for primary tumors (GTVp) and metastatic lymph nodes (GTVn) on the unperturbed test set. The width of the bands is proportional to the number of patients. Colors correspond to the assigned Likert score (1 = Red/Unusable to 5 = Green/Fully Acceptable). Straight horizontal bands indicate agreement between observers, while crossing bands highlight discordant scoring.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-16-1731007-g002.tif">
<alt-text content-type="machine-generated">Sankey diagrams display observer agreement between two observers for primary tumor (GTVp, left) and nodal tumor (GTVn, right) ratings, with rating categories from one to five colored from red to green, and flows depicting the correspondence between observer scores.</alt-text>
</graphic></fig>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Sankey diagram illustrating inter-observer usability scoring under image perturbations. The diagram displays the flow of clinical usability scores between Observer 1 (left) and Observer 2 (right) for primary tumors (GTVp) and metastatic lymph nodes (GTVn) on the perturbed test subset. This subset includes images degraded by the three most severe artifacts (spike noise, bias field, and motion). Band width is proportional to the number of cases, and colors represent the Likert score (1 = Red/Unusable to 5 = Green/Fully Acceptable). A shift toward lower usability scores (red/orange) is evident compared to the baseline, particularly for nodal volumes, highlighting the sensitivity of these segmentations to image quality degradation.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-16-1731007-g003.tif">
<alt-text content-type="machine-generated">Side-by-side Sankey diagrams compare observer ratings for perturbed cases of primary tumor (GTVp) and nodal tumor (GTVn), with color-coded bands showing rating distribution shifts between Observer 1 and Observer 2 for scores one to five.</alt-text>
</graphic></fig>
<p>Inter&#x2212;observer agreement (<xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>) was moderate on baseline images for GTVp (quadratic &#x3ba; = 0.66) and GTVn (quadratic &#x3ba; = 0.64), and increased on the perturbed subset (quadratic &#x3ba; = 0.83 for both GTVp and GTVn). To further investigate the nature of the disagreements, we analyzed the grading flow between observers (<xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>). This visualization reveals that disagreements were not random; Observer 2 was systematically stricter in evaluating nodal targets (GTVn) compared to Observer 1, frequently assigning lower usability scores to the same contours. This suggests that the reported usability rates for nodal volumes are conservative estimates.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Spearman&#x2019;s rank analysis between Likert scores and classical segmentation metrics for Observer 1 (Top) and Observer 2 (Bottom) for primary (GTVp) and nodal (GTVn) tumor volumes.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-16-1731007-g004.tif">
<alt-text content-type="machine-generated">Two heatmaps display Spearman correlation coefficients between Likert ratings and various metrics, separated by Observer 1 and Observer 2 for both primary and nodal categories. Color intensity ranges from red for positive correlations to blue for negative, with values indicated in each cell and a color bar legend ranging from negative one to one provided on the right.</alt-text>
</graphic></fig>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>Qualitative failure modes</title>
<p>Spike and bias artefacts either erased the lesion completely or fragmented it into noise-like islands. Missed lesions (Dice=0) occurred in five cases affecting GTVn (10%) and four GTVp (8%), while false positive detections (reference volume = 0 but predicted volume &gt; 0) occurred in five cases affecting GTVn (10%) and three GTVp (6%). Additionally, severe motion perturbation in PET produced spurious focal uptake that mimicked pathological cervical nodes, likely due to misregistration artifacts that shift normal physiological uptake (e.g. sternocleidomastoid or laryngeal muscles) into the nodal region.</p>
<p>A detailed master&#x2019;s thesis of the project can be downloaded via <ext-link ext-link-type="uri" xlink:href="https://github.com/Leandre354/ECEProject/blob/main/Doc/MscThesis.pdf">https://github.com/Leandre354/ECEProject/blob/main/Doc/MscThesis.pdf</ext-link>, containing additional information, tables and figures.</p>
</sec>
</sec>
<sec id="s4" sec-type="discussion">
<label>4</label>
<title>Discussion</title>
<p>Our study assessed the robustness and clinical applicability of a deep learning-based autosegmentation model for GTVp and GTVn segmentation in HNC using the publicly available MICCAI HECKTOR 2022 PET/CT dataset. Several key findings emerged from our analyses:</p>
<p>Firstly, the segmentation model demonstrated robust performance, overall, particularly against common imaging perturbations such as blurring, ghosting, and rigid motion artifacts. These perturbations induced negligible losses in segmentation accuracy, as quantified by the median &#x394;Dice values consistently below 0.05. However, the model exhibited marked vulnerability to spike noise and bias-field artifacts, especially in GTVn segmentation. These artifacts frequently caused segmentation failures, either completely erasing lesion predictions or fragmenting them into clinically irrelevant islands. Notably, this disproportionate impact on GTVn is consistent with the general observation that smaller, lower-contrast targets are more challenging to segment reliably (<xref ref-type="bibr" rid="B9">9</xref>). These results underscore the importance of further improving model robustness through targeted data augmentation strategies designed to mimic and counteract noise and density distortions. Incorporating stronger and more diverse augmentation has been shown to bolster model performance under such conditions in other imaging contexts (<xref ref-type="bibr" rid="B11">11</xref>, <xref ref-type="bibr" rid="B12">12</xref>). Future work should focus on augmenting the training process with simulated spike noise and bias-field corruptions to harden the model against these failure modes.</p>
<p>While our work does not propose new mitigation strategies, its novelty lies in systematically quantifying the vulnerabilities of a clinically relevant segmentation task under controlled and reproducible perturbations. To our knowledge, this is the first robustness study in head and neck cancer autosegmentation that directly correlates algorithmic performance with clinician grading. By precisely documenting which perturbations critically impact performance (e.g., spike noise, bias-field), we establish a factual basis for future methodological improvements. We deliberately chose to focus this study on robust characterization rather than intervention, so that subsequent works can build on these findings with targeted augmentation, uncertainty estimation, or task-tailored architectures. In this sense, our contribution is complementary to performance-oriented studies and provides an essential reference for understanding model limitations in real-world clinical deployment.</p>
<p>Secondly, we identified certain radiomic properties of lesions that influenced segmentation robustness. In our analysis, larger lesions (greater volume and surface area) and those with higher shape complexity (e.g. entropy of density) showed increased susceptibility to perturbations, particularly to spike noise and high-variance noise. This somewhat counter-intuitive finding suggests that while larger tumors are easier to segment under normal conditions, they present a larger canvas on which noise can induce errors (for instance, by creating false fragmentations within an otherwise continuous volume). By contrast, lesions with higher inherent FDG uptake contrast were less affected by blurring and motion, presumably because a strong tumor&#x2013;background signal helps the model maintain accurate boundaries even when images are slightly degraded. These lesion-specific insights can help guide model refinement. They indicate a need for customized training or augmentation strategies that account for tumor characteristics (size, heterogeneity, contrast) to enhance robustness. For example, additional noise-augmentation could be specifically applied to larger tumors during training, and networks could be conditioned or stratified based on lesion volume so that GTVn (which are smaller and more uniform) are handled by models optimized for those properties. However, any added benefit of such an approach remains speculative.</p>
<p>Thirdly, our clinical grading analysis provides a direct real-world context for these quantitative results. Approximately 80% of GTVp and 73% of GTVn were rated as clinically acceptable (score &#x2265;3 on a 5-point Likert scale) by experienced radiation oncologists on unperturbed images. This high baseline acceptability is in line with other recent studies, in which most auto-segmentations required only minor or no edits&#x200b; (<xref ref-type="bibr" rid="B17">17</xref>). Perturbations, however, had a pronounced effect on clinical usability for GTVn, reducing the fraction of clinically usable GTVn to ~28% under severe degradation, whereas GTVp remained largely robust (&#x2248;78% usable) even with added artifacts. This discrepancy underscores the more challenging nature of GTVn segmentation, likely due to smaller lesion size, lower inherent contrast, and greater anatomical variability in lymph node regions&#x200b;. It also emphasizes the need for heightened attention and possibly dedicated modeling approaches for nodal structures. Indeed, researchers have found that using task-tailored models (e.g. separate networks focused on lymph node levels) can achieve expert-level delineation for GTVn (<xref ref-type="bibr" rid="B17">17</xref>). The average Spearman correlation we observed between traditional segmentation metrics (Dice) and the experts&#x2019; usability scores (0.67 for GTVp and 0.5 for GTVn) further validates the utility of these metrics as proxies for quality. That said, quantitative metrics alone cannot fully replace clinical judgment and there are cases in our cohort where an adequate Dice score corresponds to a clinically unacceptable contour placement (for example due to violation of a critical structure boundary). Recent work comparing automated metrics to human perception confirms that conventional overlap measures correlate only moderately with expert assessments of contour quality (<xref ref-type="bibr" rid="B13">13</xref>). 
Moreover, recent work (<xref ref-type="bibr" rid="B18">18</xref>) has also shown a low correlation between geometry-based metrics, such as Dice, and dosimetry. This highlights the importance of incorporating expert review in the loop and potentially developing new metrics that better capture clinically relevant errors (<xref ref-type="bibr" rid="B5">5</xref>), and dosimetric implications.</p>
<p>In comparison to the existing literature, our results reinforce and extend findings from recent HECKTOR challenges and other multi-institutional studies. Our achieved Dice coefficients on the hold-out test set (approximately 0.77 for primary tumors and 0.70 for nodal tumors) align closely with previously reported segmentation performance by state-of-the-art models on similar PET/CT tasks. For example, the top-performing algorithms in the HECKTOR 2022 challenge obtained an average DSC of ~0.80 for GTVp and ~0.78 for GTVn&#x200b; (<xref ref-type="bibr" rid="B10">10</xref>), and a recent multi-center study reported DSC in the range 0.71&#x2013;0.78 for primary GTV delineation (<xref ref-type="bibr" rid="B9">9</xref>). This concordance suggests that modern DL architectures, such as the 3D Dynamic U-Net used here, are an important step towards the level of accuracy needed for clinical adoption&#x200b; (<xref ref-type="bibr" rid="B19">19</xref>). Notably, our use of a two-channel 3D U-Net is conceptually in line with the nnU-Net framework, which has demonstrated robust generalization across numerous segmentation benchmarks by automatically configuring U-Net models to a given task (<xref ref-type="bibr" rid="B19">19</xref>). However, our study goes beyond prior works by explicitly quantifying robustness under controlled perturbations and by directly correlating algorithm performance with clinician ratings. These analyses provide novel insights that typical challenge reports (which often focus only on clean-scan Dice scores) do not capture, namely, how and why a model might fail in real-world settings and how well its output would be received by end-users. By clarifying these points, we highlight the remaining challenges that must be addressed for reliable deployment of autosegmentation in routine RT planning (e.g. handling image noise and variability, and ensuring outputs meet clinical quality standards).</p>
<p>Several limitations of our work should be acknowledged. First, the absence of MRI data is a notable shortcoming, given that MRI is superior for delineating primary tumor extent in many HNC cases (especially for soft-tissue and perineural infiltration) (<xref ref-type="bibr" rid="B20">20</xref>&#x2013;<xref ref-type="bibr" rid="B22">22</xref>). Our PET/CT-only model may thus miss subtleties that an MRI-enhanced model could capture. Future studies should prioritize multimodal imaging integration, particularly incorporating MRI, to further improve segmentation completeness and accuracy for structures where MRI offers additional contrast. Second, our robustness evaluation was performed entirely within the HECKTOR 2022 multi-institutional dataset, using an internal hold-out set rather than a fully independent external cohort. Hence, an external validation remains an important next step to confirm robustness across unseen acquisition protocols. Third, the inconsistent availability of certain patient- and HNC-specific parameters (especially HPV status and smoking history) in the public dataset prevented us from performing subgroup analyses. Such analyses could be informative (e.g. HPV-positive oropharyngeal tumors might be easier or harder to segment due to different morphology and texture), and their absence may limit the generalizability of our conclusions across different patient populations. Fourth, because our development and validation were conducted on a single multicenter public challenge dataset, it remains to be confirmed that the performance and robustness observed will transfer to another, independent external data source. Prior studies have shown that even top-performing models can experience performance degradation when applied to new hospitals or scanner settings (<xref ref-type="bibr" rid="B23">23</xref>). 
To ensure true generalizability, our model would therefore have to be evaluated on external datasets (for example, from other institutions or prospective trials), and potentially fine-tuned, to verify that its accuracy and robustness hold beyond the HECKTOR cohort&#x200b;. Fifth, the clinical evaluation of perturbed cases was performed on a subset of images selected to represent the most severe artifacts (&#x201c;stress testing&#x201d;). This selection introduces a bias towards lower performance and does not necessarily reflect the distribution of image quality found in routine clinical practice. Therefore, the reported usability rates under perturbation should be interpreted as a lower bound of the model&#x2019;s resilience in worst-case scenarios.</p>
<p>Another limitation concerns the clinical usability study, which relied on the evaluations of two experienced radiation oncologists from the same institution. While this provides valuable expert input, inter-observer variability in HNC contouring is known to be substantial, and including a larger panel of raters could have captured a broader range of clinical practice. Nevertheless, both observers had &gt;10 years of clinical experience in HNC radiotherapy, providing a reliable reference for assessing usability in this initial study.</p>
<p>Furthermore, our analysis was restricted to gross tumor volumes (GTVp and GTVn). Several studies and commercially available software packages have already demonstrated high clinical adoption of OAR autosegmentation in head and neck RT. Future work will therefore extend our robustness analysis to combined target and OAR segmentation within the same pipeline, which would provide a more comprehensive assessment of clinical utility. Finally, we emphasize that geometric metrics like Dice do not always predict clinical or dosimetric significance. As we recently demonstrated in brain tumor segmentation, evaluators often struggle to estimate the dosimetric impact of contouring variations based on geometry alone (<xref ref-type="bibr" rid="B24">24</xref>). For example, a complete nodal erasure represents a major dosimetric miss, whereas minor surface irregularities may have negligible therapeutic consequences. Therefore, future validation should extend beyond geometric robustness to assess downstream dosimetric effects and integrate automated quality assurance to streamline clinical workflows (<xref ref-type="bibr" rid="B25">25</xref>).</p>
<p>In conclusion, our study confirms the promising clinical utility of DL-based autosegmentation models for HNC while highlighting existing robustness challenges and areas for further improvement. Future research directions should emphasize multimodal imaging integration (e.g. adding MRI for primary tumor delineation), tailored augmentation strategies to increase robustness, and comprehensive clinical validation on independent external cohorts to confirm generalizability. Availability of accurate and complete clinical data will be a prerequisite to achieve this goal. Additionally, combining accurate segmentation with downstream classification models for high-risk features could enhance clinical decision-making. For instance, coupling such a segmentation approach with a convolutional neural networks classifier for extranodal extension may allow automated detection of nodal extracapsular spread, an application where recent studies have shown promising results on CT imaging (<xref ref-type="bibr" rid="B26">26</xref>, <xref ref-type="bibr" rid="B27">27</xref>). By addressing these next steps, we move closer to reliable and clinically applicable automated segmentation tools that can streamline RT planning and improve patient care.</p>
</sec>
</body>
<back>
<sec id="s5" sec-type="data-availability">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <uri xlink:href="https://hecktor.grand-challenge.org/Data/">https://hecktor.grand-challenge.org/Data/</uri>. The code is available here: <uri xlink:href="https://github.com/Leandre354/ECEProject">https://github.com/Leandre354/ECEProject</uri>.</p></sec>
<sec id="s6" sec-type="ethics-statement">
<title>Ethics statement</title>
<p>Ethical approval was not required for the study involving humans in accordance with the local legislation and institutional requirements.</p></sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>DS: Writing &#x2013; original draft, Methodology, Formal Analysis, Visualization, Writing &#x2013; review &amp; editing, Conceptualization. LC: Writing &#x2013; original draft, Software, Data curation, Visualization, Writing &#x2013; review &amp; editing, Validation, Formal Analysis. SB: Writing &#x2013; original draft, Writing &#x2013; review &amp; editing, Supervision, Methodology. MR: Visualization, Writing &#x2013; original draft, Conceptualization, Methodology, Validation, Project administration, Writing &#x2013; review &amp; editing, Supervision. OE: Writing &#x2013; original draft, Supervision, Validation, Writing &#x2013; review &amp; editing.</p></sec>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s10" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was used in the creation of this manuscript. Because English is not the authors&#x2019; native language, the large language model ChatGPT 5 by OpenAI was used during the preparation of this work to check grammar and rephrase some sentences to improve clarity. After using this tool, the authors reviewed and edited the content as needed and take full responsibility for the content of the publication.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s11" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<sec id="s12" sec-type="supplementary-material">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fonc.2026.1731007/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fonc.2026.1731007/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="DataSheet1.docx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document"/></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Fuereder</surname> <given-names>T</given-names></name>
</person-group>. 
<article-title>Essential news of current guidelines: head and neck squamous cell carcinoma</article-title>. <source>memo</source>. (<year>2022</year>) <volume>15</volume>:<page-range>278&#x2013;81</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s12254-022-00842-5</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<label>2</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Brouwer</surname> <given-names>CL</given-names></name>
<name><surname>Steenbakkers</surname> <given-names>RJHM</given-names></name>
<name><surname>Bourhis</surname> <given-names>J</given-names></name>
<name><surname>Budach</surname> <given-names>W</given-names></name>
<name><surname>Grau</surname> <given-names>C</given-names></name>
<name><surname>Gr&#xe9;goire</surname> <given-names>V</given-names></name>
<etal/>
</person-group>. 
<article-title>CT-based delineation of organs at risk in the head and neck region: DAHANCA, EORTC, GORTEC, HKNPCSG, NCIC CTG, NCRI, NRG Oncology and TROG consensus guidelines</article-title>. <source>Radiotherapy Oncol</source>. (<year>2015</year>) <volume>117</volume>:<fpage>83</fpage>&#x2013;<lpage>90</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.radonc.2015.07.041</pub-id>, PMID: <pub-id pub-id-type="pmid">26277855</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<label>3</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gr&#xe9;goire</surname> <given-names>V</given-names></name>
<name><surname>Ang</surname> <given-names>K</given-names></name>
<name><surname>Budach</surname> <given-names>W</given-names></name>
<name><surname>Grau</surname> <given-names>C</given-names></name>
<name><surname>Hamoir</surname> <given-names>M</given-names></name>
<name><surname>Langendijk</surname> <given-names>JA</given-names></name>
<etal/>
</person-group>. 
<article-title>Delineation of the neck node levels for head and neck tumors: a 2013 update. DAHANCA, EORTC, HKNPCSG, NCIC CTG, NCRI, RTOG, TROG consensus guidelines</article-title>. <source>Radiotherapy Oncol</source>. (<year>2014</year>) <volume>110</volume>:<page-range>172&#x2013;81</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.radonc.2013.10.010</pub-id>, PMID: <pub-id pub-id-type="pmid">24183870</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<label>4</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gr&#xe9;goire</surname> <given-names>V</given-names></name>
<name><surname>Evans</surname> <given-names>M</given-names></name>
<name><surname>Le</surname> <given-names>QT</given-names></name>
<name><surname>Bourhis</surname> <given-names>J</given-names></name>
<name><surname>Budach</surname> <given-names>V</given-names></name>
<name><surname>Chen</surname> <given-names>A</given-names></name>
<etal/>
</person-group>. 
<article-title>Delineation of the primary tumour Clinical Target Volumes (CTV-P) in laryngeal, hypopharyngeal, oropharyngeal and oral cavity squamous cell carcinoma: AIRO, CACA, DAHANCA, EORTC, GEORCC, GORTEC, HKNPCSG, HNCIG, IAG-KHT, LPRHHT, NCIC CTG, NCRI, NRG Oncolog</article-title>. <source>Radiotherapy Oncol</source>. (<year>2018</year>) <volume>126</volume>:<fpage>3</fpage>&#x2013;<lpage>24</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.radonc.2017.10.016</pub-id>, PMID: <pub-id pub-id-type="pmid">29180076</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<label>5</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Nikolov</surname> <given-names>S</given-names></name>
<name><surname>Blackwell</surname> <given-names>S</given-names></name>
<name><surname>Zverovitch</surname> <given-names>A</given-names></name>
<name><surname>Mendes</surname> <given-names>R</given-names></name>
<name><surname>Livne</surname> <given-names>M</given-names></name>
<name><surname>De Fauw</surname> <given-names>J</given-names></name>
<etal/>
</person-group>. 
<article-title>Clinically applicable segmentation of head and neck anatomy for radiotherapy: deep learning algorithm development and validation study</article-title>. <source>J Med Internet Res</source>. (<year>2021</year>) <volume>23</volume>:<elocation-id>e26151</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.2196/26151</pub-id>, PMID: <pub-id pub-id-type="pmid">34255661</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<label>6</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Evans</surname> <given-names>M</given-names></name>
<name><surname>Bonomo</surname> <given-names>P</given-names></name>
<name><surname>Chan</surname> <given-names>PC</given-names></name>
<name><surname>Chua</surname> <given-names>MLK</given-names></name>
<name><surname>Eriksen</surname> <given-names>JG</given-names></name>
<name><surname>Hunter</surname> <given-names>K</given-names></name>
<etal/>
</person-group>. 
<article-title>Post-operative radiotherapy for oral cavity squamous cell carcinoma: Review of the data guiding the selection and the delineation of post-operative target volumes</article-title>. <source>Radiotherapy Oncol</source>. (<year>2025</year>) <volume>207</volume>:<fpage>110880</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.radonc.2025.110880</pub-id>, PMID: <pub-id pub-id-type="pmid">40194704</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<label>7</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Salama</surname> <given-names>JK</given-names></name>
<name><surname>Haddad</surname> <given-names>RI</given-names></name>
<name><surname>Kies</surname> <given-names>MS</given-names></name>
<name><surname>Busse</surname> <given-names>PM</given-names></name>
<name><surname>Dong</surname> <given-names>L</given-names></name>
<name><surname>Brizel</surname> <given-names>DM</given-names></name>
<etal/>
</person-group>. 
<article-title>Clinical practice guidance for radiotherapy planning after induction chemotherapy in locoregionally advanced head-and-neck cancer</article-title>. <source>Int J Radiat Oncol Biol Phys</source>. (<year>2009</year>) <volume>75</volume>:<page-range>725&#x2013;33</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ijrobp.2008.11.059</pub-id>, PMID: <pub-id pub-id-type="pmid">19362781</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<label>8</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Naser</surname> <given-names>MA</given-names></name>
<name><surname>van Dijk</surname> <given-names>LV</given-names></name>
<name><surname>He</surname> <given-names>R</given-names></name>
<name><surname>Wahid</surname> <given-names>KA</given-names></name>
<name><surname>Fuller</surname> <given-names>CD</given-names></name>
</person-group>. 
<article-title>Tumor segmentation in patients with head and neck cancers using deep learning based-on multi-modality PET/CT images</article-title>. In: <source>Lecture Notes in Computer Science</source>. 
<publisher-name>Springer International Publishing</publisher-name>, <publisher-loc>Cham</publisher-loc>. p. <fpage>85</fpage>&#x2013;<lpage>98</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-030-67194-5_10</pub-id>, PMID: <pub-id pub-id-type="pmid">33724743</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<label>9</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>Y</given-names></name>
<name><surname>Lombardo</surname> <given-names>E</given-names></name>
<name><surname>Huang</surname> <given-names>L</given-names></name>
<name><surname>Avanzo</surname> <given-names>M</given-names></name>
<name><surname>Fanetti</surname> <given-names>G</given-names></name>
<name><surname>Franchin</surname> <given-names>G</given-names></name>
<etal/>
</person-group>. 
<article-title>Comparison of deep learning networks for fully automated head and neck tumor delineation on multi-centric PET/CT images</article-title>. <source>Radiat Oncol</source>. (<year>2024</year>) <volume>19</volume>:<page-range>1&#x2013;13</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s13014-023-02388-0</pub-id>, PMID: <pub-id pub-id-type="pmid">38191431</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<label>10</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Andrearczyk</surname> <given-names>V</given-names></name>
<name><surname>Oreiller</surname> <given-names>V</given-names></name>
<name><surname>Abobakr</surname> <given-names>M</given-names></name>
<name><surname>Akhavanallaf</surname> <given-names>A</given-names></name>
<name><surname>Balermpas</surname> <given-names>P</given-names></name>
<name><surname>Boughdad</surname> <given-names>S</given-names></name>
<etal/>
</person-group>. 
<article-title>Overview of the HECKTOR challenge at MICCAI 2022: automatic head and neck tumor segmentation and outcome prediction in PET/CT</article-title>. In: <source>Lecture Notes in Computer Science</source>. 
<publisher-name>Springer Nature Switzerland</publisher-name>, <publisher-loc>Cham</publisher-loc>. p. <fpage>1</fpage>&#x2013;<lpage>30</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-031-27420-6_1</pub-id>, PMID: <pub-id pub-id-type="pmid">37195050</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<label>11</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Buddenkotte</surname> <given-names>T</given-names></name>
<name><surname>Buchert</surname> <given-names>R</given-names></name>
</person-group>. 
<article-title>Unrealistic data augmentation improves the robustness of deep learning&#x2013;based classification of dopamine transporter SPECT against variability between sites and between cameras</article-title>. <source>J Nucl Med</source>. (<year>2024</year>) <volume>65</volume>:<page-range>1463&#x2013;6</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.2967/jnumed.124.267570</pub-id>, PMID: <pub-id pub-id-type="pmid">39054285</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<label>12</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Oreiller</surname> <given-names>V</given-names></name>
<name><surname>Andrearczyk</surname> <given-names>V</given-names></name>
<name><surname>Jreige</surname> <given-names>M</given-names></name>
<name><surname>Boughdad</surname> <given-names>S</given-names></name>
<name><surname>Elhalawani</surname> <given-names>H</given-names></name>
<name><surname>Castelli</surname> <given-names>J</given-names></name>
<etal/>
</person-group>. 
<article-title>Head and neck tumor segmentation in PET/CT: The HECKTOR challenge</article-title>. <source>Med Image Analysis</source>. (<year>2022</year>) <volume>77</volume>:<fpage>102336</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.media.2021.102336</pub-id>, PMID: <pub-id pub-id-type="pmid">35016077</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<label>13</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Kofler</surname> <given-names>F</given-names></name>
<name><surname>Ezhov</surname> <given-names>I</given-names></name>
<name><surname>Isensee</surname> <given-names>F</given-names></name>
<name><surname>Balsiger</surname> <given-names>F</given-names></name>
<name><surname>Berger</surname> <given-names>C</given-names></name>
<name><surname>Koerner</surname> <given-names>M</given-names></name>
<etal/>
</person-group>. 
<article-title>Are we using appropriate segmentation metrics? Identifying correlates of human expert perception for CNN training beyond rolling the DICE coefficient</article-title>. <source>Mach Learn Biomed Imaging</source>. (<year>2023</year>) <volume>2</volume>:<fpage>27</fpage>&#x2013;<lpage>71</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.59275/j.melba.2023-dg1f</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<label>14</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Cardoso</surname> <given-names>MJ</given-names></name>
<name><surname>Li</surname> <given-names>W</given-names></name>
<name><surname>Brown</surname> <given-names>R</given-names></name>
<name><surname>Ma</surname> <given-names>N</given-names></name>
<name><surname>Kerfoot</surname> <given-names>E</given-names></name>
<name><surname>Wang</surname> <given-names>Y</given-names></name>
<etal/>
</person-group>. 
<article-title>MONAI: An open-source framework for deep learning in healthcare</article-title>. <source>arXiv</source>. (<year>2022</year>). Available online at: <uri xlink:href="https://arxiv.org/abs/2211.02701">https://arxiv.org/abs/2211.02701</uri> (Accessed <date-in-citation content-type="access-date">July 16, 2025</date-in-citation>).
</mixed-citation>
</ref>
<ref id="B15">
<label>15</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>P&#xe9;rez-Garc&#xed;a</surname> <given-names>F</given-names></name>
<name><surname>Sparks</surname> <given-names>R</given-names></name>
<name><surname>Ourselin</surname> <given-names>S</given-names></name>
</person-group>. 
<article-title>TorchIO: A Python library for efficient loading, preprocessing, augmentation and patch-based sampling of medical images in deep learning</article-title>. <source>Comput Methods Programs Biomedicine</source>. (<year>2021</year>) <volume>208</volume>:<fpage>106236</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.cmpb.2021.106236</pub-id>, PMID: <pub-id pub-id-type="pmid">34311413</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<label>16</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Boone</surname> <given-names>L</given-names></name>
<name><surname>Biparva</surname> <given-names>M</given-names></name>
<name><surname>Mojiri Forooshani</surname> <given-names>P</given-names></name>
<name><surname>Ramirez</surname> <given-names>J</given-names></name>
<name><surname>Masellis</surname> <given-names>M</given-names></name>
<name><surname>Bartha</surname> <given-names>R</given-names></name>
<etal/>
</person-group>. 
<article-title>ROOD-MRI: Benchmarking the robustness of deep learning segmentation models to out-of-distribution and corrupted data in MRI</article-title>. <source>NeuroImage</source>. (<year>2023</year>) <volume>278</volume>:<fpage>120289</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.neuroimage.2023.120289</pub-id>, PMID: <pub-id pub-id-type="pmid">37495197</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<label>17</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Weissmann</surname> <given-names>T</given-names></name>
<name><surname>Huang</surname> <given-names>Y</given-names></name>
<name><surname>Fischer</surname> <given-names>S</given-names></name>
<name><surname>Roesch</surname> <given-names>J</given-names></name>
<name><surname>Mansoorian</surname> <given-names>S</given-names></name>
<name><surname>Ayala Gaona</surname> <given-names>H</given-names></name>
<etal/>
</person-group>. 
<article-title>Deep learning for automatic head and neck lymph node level delineation provides expert-level accuracy</article-title>. <source>Front Oncol</source>. (<year>2023</year>) <volume>13</volume>:<elocation-id>1115258</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fonc.2023.1115258</pub-id>, PMID: <pub-id pub-id-type="pmid">36874135</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<label>18</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Poel</surname> <given-names>R</given-names></name>
<name><surname>R&#xfc;fenacht</surname> <given-names>E</given-names></name>
<name><surname>Hermann</surname> <given-names>E</given-names></name>
<name><surname>Scheib</surname> <given-names>S</given-names></name>
<name><surname>Manser</surname> <given-names>P</given-names></name>
<name><surname>Aebersold</surname> <given-names>DM</given-names></name>
<etal/>
</person-group>. 
<article-title>The predictive value of segmentation metrics on dosimetry in organs at risk of the brain</article-title>. <source>Med Image Analysis</source>. (<year>2021</year>) <volume>73</volume>:<fpage>102161</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.media.2021.102161</pub-id>, PMID: <pub-id pub-id-type="pmid">34293536</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<label>19</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Isensee</surname> <given-names>F</given-names></name>
<name><surname>Jaeger</surname> <given-names>PF</given-names></name>
<name><surname>Kohl</surname> <given-names>SAA</given-names></name>
<name><surname>Petersen</surname> <given-names>J</given-names></name>
<name><surname>Maier-Hein</surname> <given-names>KH</given-names></name>
</person-group>. 
<article-title>nnU-Net: a self-configuring method for deep learning-based biomedical image segmentation</article-title>. <source>Nat Methods</source>. (<year>2021</year>) <volume>18</volume>:<page-range>203&#x2013;11</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41592-020-01008-z</pub-id>, PMID: <pub-id pub-id-type="pmid">33288961</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<label>20</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ahmed</surname> <given-names>M</given-names></name>
<name><surname>Schmidt</surname> <given-names>M</given-names></name>
<name><surname>Sohaib</surname> <given-names>A</given-names></name>
<name><surname>Kong</surname> <given-names>C</given-names></name>
<name><surname>Burke</surname> <given-names>K</given-names></name>
<name><surname>Richardson</surname> <given-names>C</given-names></name>
<etal/>
</person-group>. 
<article-title>The value of magnetic resonance imaging in target volume delineation of base of tongue tumours &#x2013; A study using flexible surface coils</article-title>. <source>Radiotherapy Oncol</source>. (<year>2010</year>) <volume>94</volume>:<page-range>161&#x2013;7</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.radonc.2009.12.021</pub-id>, PMID: <pub-id pub-id-type="pmid">20096947</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<label>21</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Adjogatse</surname> <given-names>D</given-names></name>
<name><surname>Petkar</surname> <given-names>I</given-names></name>
<name><surname>Reis Ferreira</surname> <given-names>M</given-names></name>
<name><surname>Kong</surname> <given-names>A</given-names></name>
<name><surname>Lei</surname> <given-names>M</given-names></name>
<name><surname>Thomas</surname> <given-names>C</given-names></name>
<etal/>
</person-group>. 
<article-title>The impact of interactive MRI-based radiologist review on radiotherapy target volume delineation in head and neck cancer</article-title>. <source>AJNR Am J Neuroradiol</source>. (<year>2023</year>) <volume>44</volume>:<page-range>192&#x2013;8</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.3174/ajnr.A7773</pub-id>, PMID: <pub-id pub-id-type="pmid">36702503</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<label>22</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Biau</surname> <given-names>J</given-names></name>
<name><surname>Dunet</surname> <given-names>V</given-names></name>
<name><surname>Lapeyre</surname> <given-names>M</given-names></name>
<name><surname>Simon</surname> <given-names>C</given-names></name>
<name><surname>Ozsahin</surname> <given-names>M</given-names></name>
<name><surname>Gr&#xe9;goire</surname> <given-names>V</given-names></name>
<etal/>
</person-group>. 
<article-title>Practical clinical guidelines for contouring the trigeminal nerve (V) and its branches in head and neck cancers</article-title>. <source>Radiotherapy Oncol</source>. (<year>2019</year>) <volume>131</volume>:<fpage>192</fpage>&#x2013;<lpage>201</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.radonc.2018.08.020</pub-id>, PMID: <pub-id pub-id-type="pmid">30206021</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<label>23</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Kann</surname> <given-names>BH</given-names></name>
<name><surname>Hicks</surname> <given-names>DF</given-names></name>
<name><surname>Payabvash</surname> <given-names>S</given-names></name>
<name><surname>Mahajan</surname> <given-names>A</given-names></name>
<name><surname>Du</surname> <given-names>J</given-names></name>
<name><surname>Gupta</surname> <given-names>V</given-names></name>
<etal/>
</person-group>. 
<article-title>Multi-institutional validation of deep learning for pretreatment identification of extranodal extension in head and neck squamous cell carcinoma</article-title>. <source>JCO</source>. (<year>2020</year>) <volume>38</volume>:<page-range>1304&#x2013;11</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1200/JCO.19.02031</pub-id>, PMID: <pub-id pub-id-type="pmid">31815574</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<label>24</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Willmann</surname> <given-names>J</given-names></name>
<name><surname>Kamath</surname> <given-names>A</given-names></name>
<name><surname>Poel</surname> <given-names>R</given-names></name>
<name><surname>Riggenbach</surname> <given-names>E</given-names></name>
<name><surname>Mose</surname> <given-names>L</given-names></name>
<name><surname>Bertholet</surname> <given-names>J</given-names></name>
<etal/>
</person-group>. 
<article-title>Predicting the impact of target volume contouring variations on the organ at risk dose: results of a qualitative survey</article-title>. <source>Radiotherapy Oncol</source>. (<year>2025</year>) <volume>210</volume>:<fpage>110999</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.radonc.2025.110999</pub-id>, PMID: <pub-id pub-id-type="pmid">40581214</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<label>25</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Poel</surname> <given-names>R</given-names></name>
<name><surname>Kamath</surname> <given-names>A</given-names></name>
<name><surname>Ermi&#x15f;</surname> <given-names>E</given-names></name>
<name><surname>Willmann</surname> <given-names>J</given-names></name>
<name><surname>R&#xfc;fenacht</surname> <given-names>E</given-names></name>
<name><surname>Andratschke</surname> <given-names>N</given-names></name>
<etal/>
</person-group>. 
<article-title>A dual-layer quality assurance approach leveraging dose prediction for efficient review of automated contours of organs at risk in the brain in radiotherapy</article-title>. <source>Phys Imaging Radiat Oncol</source>. (<year>2025</year>) <volume>36</volume>:<fpage>100888</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.phro.2025.100888</pub-id>, PMID: <pub-id pub-id-type="pmid">41492344</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<label>26</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Kann</surname> <given-names>BH</given-names></name>
<name><surname>Aneja</surname> <given-names>S</given-names></name>
<name><surname>Loganadane</surname> <given-names>GV</given-names></name>
<name><surname>Kelly</surname> <given-names>JR</given-names></name>
<name><surname>Smith</surname> <given-names>SM</given-names></name>
<name><surname>Decker</surname> <given-names>RH</given-names></name>
<etal/>
</person-group>. 
<article-title>Pretreatment identification of head and neck cancer nodal metastasis and extranodal extension using deep learning neural networks</article-title>. <source>Sci Rep</source>. (<year>2018</year>) <volume>8</volume>:<page-range>1&#x2013;11</page-range>. Available online at: <uri xlink:href="https://www.nature.com/articles/s41598-018-32441-y">https://www.nature.com/articles/s41598-018-32441-y</uri> (Accessed <date-in-citation content-type="access-date">July 16, 2025</date-in-citation>). PMID: <pub-id pub-id-type="pmid">30232350</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<label>27</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Kann</surname> <given-names>BH</given-names></name>
<name><surname>Likitlersuang</surname> <given-names>J</given-names></name>
<name><surname>Bontempi</surname> <given-names>D</given-names></name>
<name><surname>Ye</surname> <given-names>Z</given-names></name>
<name><surname>Aneja</surname> <given-names>S</given-names></name>
<name><surname>Bakst</surname> <given-names>R</given-names></name>
<etal/>
</person-group>. 
<article-title>Screening for extranodal extension in HPV-associated oropharyngeal carcinoma: evaluation of a CT-based deep learning algorithm in patient data from a multicentre, randomised de-escalation trial</article-title>. <source>Lancet Digital Health</source>. (<year>2023</year>) <volume>5</volume>:<page-range>e360&#x2013;9</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/S2589-7500(23)00046-8</pub-id>, PMID: <pub-id pub-id-type="pmid">37087370</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1449379">Emma Gangemi</ext-link>, Hospital Physiotherapy Institutes (IRCCS), Italy</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1006326">Ruth McLauchlan</ext-link>, Imperial College London, United Kingdom</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3295400">Sandeep Singh</ext-link>, Rajiv Gandhi Cancer Institute and Research Center, India</p></fn>
</fn-group>
</back>
</article>