<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Oncol.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Oncology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Oncol.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2234-943X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fonc.2025.1637198</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Enhancing online adaptive radiotherapy with uncertainty based segmentation error and out-of-distribution detection</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>van Lente</surname><given-names>Marissa</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2911653/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Pluim</surname><given-names>Josien</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Fransson</surname><given-names>Samuel</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3143969/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Strand</surname><given-names>Robin</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/299873/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Tilly</surname><given-names>David</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1998507/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Biomedical Engineering, Eindhoven University of Technology</institution>, <city>Eindhoven</city>,&#xa0;<country country="NL">Netherlands</country></aff>
<aff id="aff2"><label>2</label><institution>Department of Medical Imaging, Radboud University Medical Center</institution>, <city>Nijmegen</city>,&#xa0;<country country="NL">Netherlands</country></aff>
<aff id="aff3"><label>3</label><institution>Department of Medical Physics, Uppsala University Hospital</institution>, <city>Uppsala</city>,&#xa0;<country country="SE">Sweden</country></aff>
<aff id="aff4"><label>4</label><institution>Department of Surgical Sciences, Uppsala University</institution>, <city>Uppsala</city>,&#xa0;<country country="SE">Sweden</country></aff>
<aff id="aff5"><label>5</label><institution>Department of Information Technology, Uppsala University</institution>, <city>Uppsala</city>,&#xa0;<country country="SE">Sweden</country></aff>
<aff id="aff6"><label>6</label><institution>Department of Immunology, Genetics and Pathology, Uppsala University</institution>, <city>Uppsala</city>,&#xa0;<country country="SE">Sweden</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Marissa van Lente, <email xlink:href="mailto:marissavlente@gmail.com">marissavlente@gmail.com</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-01-14">
<day>14</day>
<month>01</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>15</volume>
<elocation-id>1637198</elocation-id>
<history>
<date date-type="received">
<day>28</day>
<month>05</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>15</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>30</day>
<month>10</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 van Lente, Pluim, Fransson, Strand and Tilly.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>van Lente, Pluim, Fransson, Strand and Tilly</copyright-holder>
<license>
<ali:license_ref start_date="2026-01-14">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Purpose</title>
<p>Anatomical segmentation is one of the biggest sources of uncertainty in the online adaptive radiotherapy workflow. The aim of this study was to investigate the relation between the estimated uncertainty in deep learning (DL)-based segmentation and the correctness of the segmentations. In addition, the ability to capture out-of-distribution (OOD) data with uncertainty estimation was tested.</p>
</sec>
<sec>
<title>Materials and methods</title>
<p>The Monte Carlo dropout method was applied to estimate the uncertainty of a DL model for magnetic resonance (MR)-guided radiotherapy prostate cancer images, trained to segment the clinical target volume (CTV), bladder, and rectum. The training/validation set consisted of 151 T2 MR scans from 26 patients, while the test set consisted of 65 scans from 10 patients. Predictive entropy (PE) was used to capture predictive (model and data) uncertainty. The PE distributions for correct and incorrect predictions were used to find a threshold value. Predicted segmentations with PE values above this threshold value were allocated to the &#x201c;uncertain group,&#x201d; and those below to the &#x201c;certain group.&#x201d; Dice scores were computed for both groups, using manual segmentations as ground truth. Mutual information (MI) was additionally used to capture epistemic (model) uncertainty as a means to separate in-distribution (ID) from OOD data. Balanced steady-state free precession MRI scans of 10 healthy volunteers were used as OOD data.</p>
</sec>
<sec>
<title>Results</title>
<p>The segmentation model obtained Dice scores of 85.7% for the CTV, 94.8% for the bladder, and 86.6% for the rectum. The highest PE values were found at the segmentation borders. Higher PE threshold values resulted in better separation between the certain and uncertain groups. This shows the ability to detect incorrect predictions with uncertainty estimation. A 100% separation between ID and OOD data was achieved with MI.</p>
</sec>
<sec>
<title>Conclusion</title>
<p>Uncertainty estimation from a DL-based segmentation model was seen to correlate with Dice scores for segmentation of MR-guided radiotherapy prostate cancer images. This implies that uncertainty estimation could be used to label the quality of the segmentations in the online adaptive radiotherapy workflow. Preliminary results showed that uncertainty estimation could be used to distinguish between ID and OOD data.</p>
</sec>
</abstract>
<kwd-group>
<kwd>uncertainty estimation</kwd>
<kwd>machine learning</kwd>
<kwd>radiotherapy</kwd>
<kwd>MR-Linac</kwd>
<kwd>prostate cancer</kwd>
<kwd>Monte Carlo dropout</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="5"/>
<table-count count="2"/>
<equation-count count="2"/>
<ref-count count="17"/>
<page-count count="8"/>
<word-count count="4447"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Radiation Oncology</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>In radiotherapy (RT), one of the biggest sources of uncertainty comes from segmentation uncertainty, regardless of whether segmentation is done manually or with deep learning (DL) (<xref ref-type="bibr" rid="B1">1</xref>). An obvious step to reduce erroneous segmentation is to develop a high-performing DL model with high segmentation accuracy. However, DL segmentation models will inevitably not be perfect due to limited training data and observer variability; thus, it is important to study how estimated uncertainties in the model predictions can be of value. For example, erroneous predictions should have high uncertainty values, such that they can be flagged. Consequently, in a human-in-the-loop workflow, clinicians could more readily identify erroneous predictions and manually adjust the segmentations. Incorporating uncertainty estimation, therefore, has the potential to both enhance confidence that radiotherapy is accurately targeted, i.e., ensuring tumor coverage while sparing organs at risk, and improve efficiency by supporting clinicians in refining DL predictions.</p>
<p>Prostate cancer is the most common cancer indication in Sweden,<xref ref-type="fn" rid="fn1"><sup>1</sup></xref> and radiotherapy is a cornerstone treatment modality for several prostate cancer groups. The most advanced workflow is called online adaptive radiotherapy and has been implemented using the MR-Linac at Uppsala University Hospital since 2019. This workflow means that diagnostic quality magnetic resonance (MR) images, utilizing the superior soft tissue contrast compared to imaging at conventional treatment machines, are acquired at each daily session with the patient in treatment position. Consequently, the treatment plans are adapted based on the segmentations of the acquired images of the patient in treatment position immediately prior to irradiation. The complex workflow with the patient in treatment position requires fast and accurate segmentation. Supporting clinicians to perform accurate and efficient refinement of the DL segmentation is of utmost importance to make online adaptive radiotherapy available to more patients.</p>
<p>Uncertainty in machine learning can be divided into epistemic uncertainty (EU) and aleatoric uncertainty (AU). EU is also called model uncertainty, as it is a measurement of the uncertainty in the DL model parameters. The EU can be decreased by increasing the training data size, but also by using more homogeneous training and test data. AU, or data uncertainty, comes from observation and scanner noise. Collecting more data does not decrease this type of uncertainty, but increasing the scanner precision does. The combination of EU and AU is called predictive uncertainty, generally portraying the confidence of a prediction (<xref ref-type="bibr" rid="B2">2</xref>, <xref ref-type="bibr" rid="B3">3</xref>).</p>
<p>Several methods exist for uncertainty estimation, including Bayesian neural networks (BNNs) that make use of the predictive distributions in neural networks (<xref ref-type="bibr" rid="B3">3</xref>). However, Bayesian methods come with high computational costs, partially due to inference (i.e., integrating over model parameters) (<xref ref-type="bibr" rid="B3">3</xref>). Variational inference can also be used to approximate the posterior distribution in complex models, but the computational cost remains high (<xref ref-type="bibr" rid="B4">4</xref>). Instead, a less computationally expensive method, called the Monte Carlo dropout (MCD), can be used, as it has been proven to be a good approximation of traditional BNNs (<xref ref-type="bibr" rid="B4">4</xref>). MCD is easy to implement in most existing segmentation models, as long as the architecture contains dropout layers. Dropout layers were originally introduced to reduce the risk of overfitting by randomly dropping units during training (<xref ref-type="bibr" rid="B5">5</xref>, <xref ref-type="bibr" rid="B6">6</xref>). MCD works by activating dropout both at training and inference time and by performing multiple stochastic forward passes through the model, resulting in the generation of stochastic predictions. The stochastic predictions can be seen as samples from a probabilistic distribution, and with enough samples, an estimate of the distribution is obtained. With this probability distribution, the estimated uncertainty of the output can be quantified with various metrics and visualized in uncertainty maps. The dropout probability determines how many neurons are randomly dropped. In turn, the dropout probability influences how well the BNN posterior is approximated, and thus, it also affects the predictive uncertainty (<xref ref-type="bibr" rid="B3">3</xref>).</p>
<p>Previous research on uncertainty estimation for segmentation in the field of radiotherapy reveals that high uncertainties were detected at difficult and visually ambiguous structures (<xref ref-type="bibr" rid="B7">7</xref>, <xref ref-type="bibr" rid="B8">8</xref>). However, thorough analysis of the correlation between the correctness of the prediction and the estimated uncertainty is often lacking (<xref ref-type="bibr" rid="B9">9</xref>). In addition, uncertainty estimation is often only tested on in-distribution test data. When a segmentation model is wrongfully applied to out-of-distribution (OOD) data, e.g., data that significantly differ from the training data, there is a substantial risk of incorrect predictions. These types of errors should also be captured by uncertainty estimation.</p>
<p>The aim of this research was therefore to investigate the relationship between uncertainty estimation using MCD and the correctness of the predicted segmentations for MR-Linac prostate cancer images. In addition, the ability to capture OOD data with uncertainty estimation is tested.</p>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Patient data</title>
<p>All patients in the in-distribution (ID) dataset received radiotherapy treatment at the MR-Linac at Uppsala University Hospital in Uppsala, Sweden. The dataset consists of 216 T2-weighted MRI scans of 36 prostate cancer patients. Technical specifications of the scans can be found in <xref ref-type="supplementary-material" rid="SM1"><bold>Supplementary Material S1</bold></xref>. Ethical approval was given by the Swedish Ethical Review Authority (2019-03050) to use the data in retrospective studies, such as this one. Informed consent was obtained from all subjects. Each patient received between two and seven fractions. For all scans, the segmentations of the clinical target volume (CTV), bladder, and rectum were taken from the treatment plans and manually corrected by one medical physicist with 5 years of experience. The CTV was delineated as the entire prostate without seminal vesicles or a seminal vesicle base, and it was the target for radiotherapy treatment. The patient data were split into sets of 124 scans from 22 patients, 27 scans from 4 patients, and 65 scans from 10 patients for training, validation, and testing, respectively. The scans were kept paired for each patient.</p>
<p>The out-of-distribution data consisted of steady-state free precession MRI scans of 10 healthy volunteers with manual segmentation of the prostate, bladder, and rectum. Ethical approval was given by the Swedish Ethical Review Authority (2021-00831) to use the data in retrospective studies. Informed consent was obtained from all subjects. The segmentations were made by the same physicist who corrected the patient data. The exact scan settings can be found in <xref ref-type="supplementary-material" rid="SM1"><bold>Supplementary Material S3</bold></xref>. An example of the visual difference between ID and OOD scans can be found in <xref ref-type="supplementary-material" rid="SM1"><bold>Supplementary Material S5</bold></xref>, <xref ref-type="supplementary-material" rid="SM1"><bold>Supplementary Figure S4</bold></xref>.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Uncertainty estimation with Monte Carlo dropout</title>
<p>The uncertainty estimation with Monte Carlo dropout was realized using dropout at training time and at inference time. Multiple predictions from a probabilistic distribution are obtained by passing the input image <italic>T</italic> times through the segmentation network with MCD. The number of stochastic forward passes was set to <italic>T</italic>&#xa0;=&#xa0;50, following (<xref ref-type="bibr" rid="B10">10</xref>), as this led to stabilization of accuracy. The actual model prediction for one input is computed by averaging over the <italic>T</italic> output predictions. This model prediction is given as a probability for belonging to a specific class. The estimated uncertainty for the averaged prediction can be quantified and visualized, for example, with uncertainty maps, by using two uncertainty metrics: predictive entropy (PE) and mutual information (MI). PE captures a combination of epistemic and aleatoric uncertainty, i.e., predictive uncertainty, while MI captures epistemic uncertainty. PE is the entropy of the predictive distribution, and it is computed with <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>:</p>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mrow><mml:mover accent="true"><mml:mi>&#x210d;</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mi>c</mml:mi></mml:munder><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mfrac><mml:mn>1</mml:mn><mml:mi>T</mml:mi></mml:mfrac><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mi>t</mml:mi></mml:munder><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mover accent="true"><mml:mi>w</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy="false">|</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mtext>log&#xa0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mfrac><mml:mn>1</mml:mn><mml:mi>T</mml:mi></mml:mfrac><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mi>t</mml:mi></mml:munder><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mover accent="true"><mml:mi>w</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy="false">|</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>c</italic> ranges over all classes; <italic>T</italic> is the number of stochastic forward passes (Monte Carlo samples); <inline-formula>
<mml:math display="inline" id="im1"><mml:mrow><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mover accent="true"><mml:mi>w</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>y</mml:mi><mml:mo>|</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> is the softmax probability of output <italic>y</italic> being in class <italic>c</italic>, given an input <inline-formula>
<mml:math display="inline" id="im2"><mml:mi>x</mml:mi></mml:math></inline-formula>; and <inline-formula>
<mml:math display="inline" id="im3"><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>w</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> are the model parameters on the <italic>t</italic>th Monte Carlo sample (<xref ref-type="bibr" rid="B3">3</xref>). Mutual information is computed with <xref ref-type="disp-formula" rid="eq2">Equation 2</xref>:</p>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mrow><mml:mover accent="true"><mml:mi mathvariant="double-struck">I</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:mover accent="true"><mml:mi>&#x210d;</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mo>+</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>T</mml:mi></mml:mfrac><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mi>t</mml:mi></mml:mrow></mml:munder><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mover accent="true"><mml:mi>w</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>y</mml:mi><mml:mo>|</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mtext>log&#xa0;</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mover accent="true"><mml:mi>w</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>y</mml:mi><mml:mo>|</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo>.</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>The MI between the predictive distribution and the posterior over network weights can be expressed as the predictive entropy minus expected entropy (<xref ref-type="bibr" rid="B3">3</xref>). The expected entropy is the mean of the entropy of the predictions given the parameters over the posterior distribution (<xref ref-type="bibr" rid="B11">11</xref>). For both PE and MI, it holds that a low value equals low uncertainty and a high value equals high uncertainty.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Segmentation model</title>
<p>The U-Net has been shown to perform very well in the field of medical image segmentation (<xref ref-type="bibr" rid="B12">12</xref>). In this research, a 2D U-Net architecture with five levels was used for the segmentation of the CTV, bladder, and rectum. Two convolutional layers were applied in every stage. The first layer contained 30 channels. With max pooling, the number of channels was doubled in the consecutive encoding layers to 480 channels in the bottleneck layer. Spatial concrete dropout was employed following each convolution in both the encoder and decoder, resulting in a mean dropout probability of 0.09. This type of dropout was used to find the optimal probabilities with regard to both uncertainty estimation and model accuracy for each layer (<xref ref-type="bibr" rid="B13">13</xref>). As an activation function, leaky ReLU was used with a slope of 1e&#x2212;2. The final layer of the network provided a softmax output, giving a pixel&#x2019;s probability of belonging to each of the three classes or to the background, which could be considered the fourth class. The model was trained for 20 epochs using a batch size of 8. The Adam optimizer (<xref ref-type="bibr" rid="B14">14</xref>) was employed with a learning rate of 1e&#x2212;4. To improve generalization, data augmentation was applied in the form of left&#x2212;right flipping with a probability of 50%. The cross-entropy loss function was used during training. More details about the hyperparameter settings can be found in <xref ref-type="supplementary-material" rid="SM1"><bold>Supplementary Material S2</bold></xref>. The final predicted segmentation masks were obtained by averaging over all 50 Monte Carlo samples per input image and binarizing the averaged masks. Binarization was done by mapping the probability values to one for the class that had the highest probability and zero for the remaining three classes for each pixel. 
The segmentation model performance is quantified using the Dice similarity coefficient (DSC), the 95th percentile Hausdorff distance (HD95), and the mean surface distance (MSD).</p>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Uncertainty estimation error detection</title>
<p>To investigate the relation between the estimated uncertainty and the correctness of the predicted segmentations, the distributions of the PE per type of classification [i.e., true positive (TP), true negative (TN), false positive (FP), and false negative (FN)] were used for each class. The per-class PE values were computed by omitting the class summation in <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>. For example, pixels were classified as TP for the CTV class if they were segmented as CTV in the ground truth and predicted as CTV by the model. TN meant that pixels were correctly predicted as background. FP for the CTV class meant that pixels were falsely predicted as CTV when they belonged to the bladder, rectum, or background class in the ground truth. Lastly, FN meant that pixels were incorrectly predicted as background. With these distributions, threshold values for the PE could be selected to distinguish as well as possible between correct and incorrect predictions. Local (i.e., pixel-based) predictions of the segmentations with PE values above this threshold value were assigned to the &#x201c;uncertain group&#x201d; and those below to the &#x201c;certain group,&#x201d; similar to the research of Alves et&#xa0;al. (<xref ref-type="bibr" rid="B15">15</xref>). Following this separation, the performances within the uncertain group and the certain group were quantified using the DSC by comparing the predictions with the ground truth segmentations. Here, the DSC was computed by making use of the TP, FP, and FN counts per class. If the correctness and the uncertainty of the prediction are indeed linked, the performance within the certain group should be much higher than the performance within the uncertain group. As a result, the predictions of the uncertain group could be flagged to focus the attention of the clinician on those pixels that are most likely classified incorrectly.</p>
</sec>
<sec id="s2_5">
<label>2.5</label>
<title>Out-of-distribution detection</title>
<p>In addition to testing how well segmentation errors can be detected with MCD, the ability to detect OOD data was tested. As described above, the OOD data had a different contrast than the ID data. This choice for OOD data is realistic, since steady-state free precession MRI scans are also used in radiotherapy. Hypothetically, the developed segmentation model could be wrongly applied to these types of scans in practice. With OOD data, the epistemic uncertainty of a segmentation model is expected to increase, which is reflected by a slight increase in PE and a relatively large increase in MI for OOD compared to ID data (<xref ref-type="bibr" rid="B16">16</xref>). Thus, to test if these OOD scans can be detected, the trained model was used to predict the segmentations and make global estimates of the uncertainties. The mean MI value (for all classes combined) of each 3D scan was computed for both the OOD and ID data and compared between the groups to explore if they could be distinguished with uncertainty estimation on a global scale.</p>
</sec>
</sec>
<sec id="s3" sec-type="results">
<label>3</label>
<title>Results</title>
<sec id="s3_1">
<label>3.1</label>
<title>Segmentation model performance</title>
<p>The performance of the segmentation model on the in-distribution test set is summarized in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>. For the CTV, bladder, and rectum, the DSC scores were 85.7%, 94.8%, and 86.6%, respectively. The performance on the out-of-distribution test set can be found in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>. For the CTV (i.e., healthy prostate), bladder, and rectum, the DSC scores were 64.6%, 78.3%, and 63.7%, respectively.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Segmentation performance on in-distribution test data.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Metrics</th>
<th valign="middle" align="left">CTV</th>
<th valign="middle" align="left">Bladder</th>
<th valign="middle" align="left">Rectum</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">DSC (%)</td>
<td valign="middle" align="left">85.65 &#xb1; 3.90</td>
<td valign="middle" align="left">94.81 &#xb1; 2.46</td>
<td valign="middle" align="left">86.60 &#xb1; 4.32</td>
</tr>
<tr>
<td valign="middle" align="left">HD95 (mm)</td>
<td valign="middle" align="left">4.43 &#xb1; 1.56</td>
<td valign="middle" align="left">3.24 &#xb1; 3.96</td>
<td valign="middle" align="left">6.64 &#xb1; 5.66</td>
</tr>
<tr>
<td valign="middle" align="left">MSD (mm)</td>
<td valign="middle" align="left">1.55 &#xb1; 0.50</td>
<td valign="middle" align="left">0.99 &#xb1; 0.64</td>
<td valign="middle" align="left">1.39 &#xb1; 0.82</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>CTV, clinical target volume; DSC, Dice similarity coefficient; HD95, 95th percentile Hausdorff distance; MSD, mean surface distance. </p></fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Segmentation performance on out-of-distribution test data.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Metrics</th>
<th valign="middle" align="left">CTV</th>
<th valign="middle" align="left">Bladder</th>
<th valign="middle" align="left">Rectum</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">DSC (%)</td>
<td valign="middle" align="left">64.55 &#xb1; 16.45</td>
<td valign="middle" align="left">78.33 &#xb1; 14.37</td>
<td valign="middle" align="left">63.69 &#xb1; 13.49</td>
</tr>
<tr>
<td valign="middle" align="left">HD95 (mm)</td>
<td valign="middle" align="left">13.34 &#xb1; 17.79</td>
<td valign="middle" align="left">4.84 &#xb1; 3.11</td>
<td valign="middle" align="left">7.37 &#xb1; 3.50</td>
</tr>
<tr>
<td valign="middle" align="left">MSD (mm)</td>
<td valign="middle" align="left">2.18 &#xb1; 0.93</td>
<td valign="middle" align="left">1.40 &#xb1; 0.79</td>
<td valign="middle" align="left">1.97 &#xb1; 0.84</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>CTV, clinical target volume; DSC, Dice similarity coefficient; HD95, 95th percentile Hausdorff distance; MSD, mean surface distance. </p></fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Uncertainty threshold for error detection</title>
<p>The uncertainty of the predictions for the in-distribution data was locally quantified using PE. The pixel-wise PE values were calculated per class. For every pixel, the predicted segmentation was classified as TP, TN, FP, or FN for every class separately. This classification resulted in distributions of the PE values for TP, TN, FP, and FN for each class. The distributions can be found in <xref ref-type="supplementary-material" rid="SM1"><bold>Supplementary Material S4</bold></xref>. For all three classes, at least 70% of wrong predictions (FN and FP) had PE values above 0.10, often centered more strongly around a value of 0.35, with at least 30% having exactly that value. At least 97% of correct predictions (TP and TN) had PE values of 0.05 or below. To create groups of certain and uncertain predictions, PE thresholds of 0.30 to 0.36 with increments of 0.01 were selected based on the distinct distributions per classification type. The DSC scores for the certain and uncertain groups with varying thresholds are shown in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>, per class. The shaded areas represent one standard deviation. For all thresholds and for all three classes, the performance of the certain group remains much higher than the uncertain group. By increasing the PE threshold, the performance of the uncertain group quickly drops, while it only slightly drops for the certain group. The difference in performance between the two groups thus increases further with higher PE.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Dice scores for the certain (blue) and uncertain (red) groups with varying thresholds, per class. Shaded areas represent one standard deviation. CTV, clinical target volume; PE, predictive entropy.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1637198-g001.tif">
<alt-text content-type="machine-generated">Three line graphs show Dice scores against PE threshold values for CTV, Bladder, and Rectum. Each graph has two lines representing the Certain and Uncertain groups. Scores for the Certain group remain high across all thresholds, while the Uncertain group's scores decrease significantly. At a threshold of 0.36, the Certain group scores are 0.87, 0.96, and 0.88, respectively, while the Uncertain group's scores drop to 0.06, 0.04, and 0.01.</alt-text>
</graphic></fig>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Uncertainty maps</title>
<p>The estimated predictive uncertainty could also be visualized using uncertainty maps. Three examples of input images, their ground truth, model predictions, prediction error, and uncertainty map can be seen in <xref ref-type="fig" rid="f2"><bold>Figures&#xa0;2</bold></xref>&#x2013;<xref ref-type="fig" rid="f4"><bold>4</bold></xref>. The prediction error is the difference between the average prediction after binarization and the manual segmentation. Two examples (<xref ref-type="fig" rid="f2"><bold>Figures&#xa0;2</bold></xref>, <xref ref-type="fig" rid="f4"><bold>4</bold></xref>) contain relatively accurate predictions, while the third example (<xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>) contains more erroneously predicted pixels. In <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>, the predictions are accurate and low in uncertainty, with the exception of the border pixels. In <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>, the CTV and bladder predictions are not completely correct. The uncertainty is high in the entire CTV and around the border of the bladder. The rectum is predicted accurately and with low uncertainty. In <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>, both the bladder and rectum are predicted well, but the uncertainty of the prediction for the rectum is high. In general, the uncertainty maps show higher PE values around the borders of the structures.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Example of an accurate and certain prediction for the CTV and rectum <bold>(A)</bold>, showing original MRI with a crop box delineating the area magnified in the following panels: ground truth segmentation for the CTV and rectum <bold>(B)</bold>, predicted segmentation for the CTV and rectum <bold>(C)</bold>, prediction error <bold>(D)</bold>, and predictive entropy uncertainty map for all classes <bold>(E)</bold>.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1637198-g002.tif">
<alt-text content-type="machine-generated">MRI and image processing results with five panels: A shows an MRI scan with a highlighted prostate region. B and C display binary masks of the prostate in white against a black background. D presents contour lines over the shape. E features a heatmap with a color scale from zero to 0.3 on the right side.</alt-text>
</graphic></fig>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Example of bladder and CTV predictions with erroneous areas and high uncertainty and accurate rectum prediction with low uncertainty <bold>(A)</bold>, showing the original MRI with a crop box delineating the area magnified in the following panels: ground truth segmentation for the bladder, CTV, and rectum <bold>(B)</bold>, predicted segmentation for the bladder, CTV, and rectum <bold>(C)</bold>, prediction error <bold>(D)</bold>, and predictive entropy uncertainty map for all classes <bold>(E)</bold>.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1637198-g003.tif">
<alt-text content-type="machine-generated">MRI scans and segmented images. Panel A shows an MRI with a highlighted area. Panels B and C display binary segmentation masks. Panel D presents outlined regions from the segmentation in green. Panel E features a colormap with intensity values ranging from zero to zero point three.</alt-text>
</graphic></fig>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Example of accurate bladder and rectum predictions with large uncertainty for the rectum <bold>(A)</bold>, showing the original MRI with a crop box delineating the area magnified in the following panels: ground truth segmentation for the bladder, CTV, and rectum <bold>(B)</bold>, predicted segmentation for the bladder, CTV, and rectum <bold>(C)</bold>, prediction error <bold>(D)</bold>, and predictive entropy uncertainty map for all classes <bold>(E)</bold>.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1637198-g004.tif">
<alt-text content-type="machine-generated">MRI scans illustrating prostate segmentation. Image A shows the original scan with a marked region. Images B and C display binary segmentation masks. Image D shows contour overlays of the segmentation. Image E presents a probability map with a color bar indicating values from zero to 0.3.</alt-text>
</graphic></fig>
<p>In practice, the uncertainty maps could be used by highlighting all pixels that were categorized as uncertain, after carefully selecting the correct PE threshold for that specific use case. An example of such visualization is shown in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5F</bold></xref>, together with the original MRI, ground truth segmentation, DL prediction, error in prediction, and full uncertainty map. Here, uncertain pixels are indicated with the two darkest shades. In this example, a PE threshold of 0.30 is used. The darkest shade is used to flag pixels that are predicted as the structure of interest but with high uncertainty, hinting at a potential false positive. The second darkest shade is used to show which pixels are predicted as background with high uncertainty, hinting at a potential false negative. The two lightest shades indicate low uncertainty predictions for the structure of interest (referred to as &#x201c;certain positive&#x201d;) and for the background (referred to as &#x201c;certain negative&#x201d;).</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Example of how uncertain pixels may be flagged, showing the original MRI <bold>(A)</bold> with a crop box delineating the area magnified in the following panels: ground truth segmentation for the bladder and rectum <bold>(B)</bold>, predicted segmentation for the CTV, bladder, and rectum <bold>(C)</bold>, prediction error <bold>(D)</bold>, PE uncertainty map for all classes <bold>(E)</bold>, and prediction with pixels flagged as uncertain in the two darkest shades using a predictive entropy threshold of 0.30 <bold>(F)</bold>.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fonc-15-1637198-g005.tif">
<alt-text content-type="machine-generated">MRI analysis with six panels labeled A to F. Panel A shows a rectal MRI image with a highlighted section. Panels B and C display segmented images, highlighting anatomical features with white areas. Panel D shows outlines of the same features in green and purple. Panel E provides a heatmap with a color scale from 0.0 to 0.3. Panel F illustrates a probability map with a grayscale color bar indicating certain and uncertain categories for positive and negative areas.</alt-text>
</graphic></fig>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Out-of-distribution detection</title>
<p>The global mean MI value of every 3D scan was computed for the ID and OOD data. The exact distribution of MI values can be found in <xref ref-type="supplementary-material" rid="SM1"><bold>Supplementary Material S5</bold></xref>. The OOD data resulted in significantly higher MI values, portraying higher epistemic uncertainty. A 100% separation between the ID and OOD samples was possible using the mean MI. In addition, the distribution of mean PE values was added in <xref ref-type="supplementary-material" rid="SM1"><bold>Supplementary Material S5</bold></xref>, showing that the assumption that PE values also increase for OOD samples holds true. In clinical practice, this would mean that global uncertainty estimation could be applied to distinguish ID from OOD samples. Local uncertainty estimation could then be used to flag potential errors on ID scans.</p>
</sec>
</sec>
<sec id="s4" sec-type="discussion">
<label>4</label>
<title>Discussion</title>
<p>The current research showed a clear relationship between the estimated uncertainty from DL-based segmentation using Monte Carlo dropout and ground truth segmentation of prostate cancer patient MR images.</p>
<p>It was shown that PE could be used to predict the quality of the DL-based segmentation measured in DSC, on a pixel level. Sorting pixels into uncertain/certain categories based on a PE threshold revealed that the certain group had a significantly higher DSC than the uncertain group.</p>
<p>The performance of the DL segmentation model in this research is comparable to DL models in previous studies on prostate cancer segmentation in MR, with less than 5% difference in DSC scores for all structures (<xref ref-type="bibr" rid="B17">17</xref>). Here, the focus was on uncertainty estimation, and therefore, it was not the primary objective to obtain state-of-the-art segmentation results. However, training the segmentation model on 3D rather than 2D input could have increased the performance. By adding cross-slice information, the relationship between the correctness of the DL prediction and the uncertainty could be altered, which should be investigated. This 3D approach could, however, increase the inference time due to the need for multiple forward passes with the MCD method, possibly making&#xa0;it&#xa0;less translatable to clinical use, especially in online adaptive radiotherapy.</p>
<p>The estimated uncertainty offers the possibility to direct the clinician&#x2019;s attention to areas where special attention is needed, as demonstrated in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref>. Since segmentation borders are always uncertain, some attention should be directed to them. However, clinicians will always face some ambiguous decision-making about the exact pixels that belong to the borders. It is also important to focus attention on larger areas with high uncertainty, such as shown in <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>. This has the potential to speed up the editing of the DL-based segmentations, especially important in online adaptive radiotherapy with the patient in treatment position, where segmentation is a time-critical step. This should be explored in a future study to test if segmentation time, including refinement (and possibly quality), improves when uncertain areas are highlighted.</p>
<p>In this study, manual corrections by one medical expert were used as ground truth segmentations. This limits the study as it reduces the validity of the segmentation model&#x2019;s performance evaluation. However, with multiple experts performing multiple segmentations, random errors are introduced. Capturing these random errors with uncertainty estimation would not be possible. To be able to investigate the relationship between uncertainty estimation and the correctness of the predictions, this type of error was therefore purposely avoided.</p>
<p>The application of the network to out-of-distribution data resulted in significantly higher estimated uncertainties as quantified with mutual information. This suggests that MI could indeed be used as a quality assurance tool to detect when the network is used for segmentation tasks it has not been trained for. However, there are two limitations of the OOD detection analysis. Firstly, the used OOD data differed from the ID data in two ways. The biggest difference is in the contrast settings of the scans, but the use of healthy volunteers for the OOD data also meant that an anatomical difference of the prostate is present. Additionally, patients had larger bladder volumes and a smaller variety in rectum sizes due to a predefined drinking schedule before treatment. Secondly, only one OOD dataset was analyzed. Thus, testing for other data types should be performed before drawing more general conclusions. Datasets with other scan settings, severe imaging artifacts, different patient groups (e.g., rectum patients), or different anatomy (e.g., female pelvis scans) should be additionally tested as OOD data.</p>
<p>Another limitation of this study is the relatively small in-distribution dataset, consisting of 216 scans from 36 patients. By making use of several fraction scans of each patient, this issue is somewhat alleviated, even though the intrafractional variation for a single patient is smaller than the interpatient variability.</p>
</sec>
<sec id="s5" sec-type="conclusions">
<label>5</label>
<title>Conclusions</title>
<p>A deep learning-based segmentation model with integrated uncertainty estimation was developed for magnetic resonance images of prostate cancer patients. The results demonstrated that the estimated uncertainty was linked with segmentation correctness, indicating its potential to highlight regions requiring clinical review. Furthermore, the model exhibited significantly higher uncertainty when applied to out-of-distribution data, suggesting its utility in detecting scenarios where the model is used outside its intended use.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>The patient scans used in this research may not be publicly shared and are available upon reasonable request. Requests to access these datasets should be directed to <email xlink:href="mailto:david.tilly@igp.uu.se">david.tilly@igp.uu.se</email>.</p></sec>
<sec id="s7" sec-type="ethics-statement">
<title>Ethics statement</title>
<p>The studies involving humans were approved by Swedish Ethical Review Authority (2019&#x2013;03050) and Swedish Ethical Review Authority (2021-00831). The studies were conducted in accordance with the local legislation and institutional requirements. The participants provided their written informed consent to participate in this study.</p></sec>
<sec id="s8" sec-type="author-contributions">
<title>Author contributions</title>
<p>ML: Investigation, Visualization, Validation, Software, Project administration, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing, Methodology. JP: Methodology, Conceptualization, Writing &#x2013; review &amp; editing, Supervision. SF: Software, Writing &#x2013; review &amp; editing, Data curation, Methodology. RS: Supervision, Writing &#x2013; review &amp; editing, Methodology, Resources, Data curation. DT: Resources, Methodology, Writing &#x2013; review &amp; editing, Supervision.</p></sec>
<sec id="s10" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>DT was part-time employed by Elekta MR-Linac vendor during the time the research was conducted, but the current work was&#xa0;performed solely as part of DT&#x2019;s position at Uppsala University Hospital.</p>
<p>The remaining author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s11" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s12" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<sec id="s13" sec-type="supplementary-material">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fonc.2025.1637198/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fonc.2025.1637198/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="DataSheet1.pdf" id="SM1" mimetype="application/pdf"/></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Segedin</surname> <given-names>B</given-names></name>
<name><surname>Petric</surname> <given-names>P</given-names></name>
</person-group>. 
<article-title>Uncertainties in target volume delineation in radiotherapy&#x2013;are they relevant and what can we do about them</article-title>? <source>Radiol Oncol</source>. (<year>2016</year>) <volume>50</volume>:<page-range>254&#x2013;62</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1515/raon-2016-0023</pub-id>, PMID: <pub-id pub-id-type="pmid">27679540</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<label>2</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Mukhoti</surname> <given-names>J</given-names></name>
<name><surname>Gal</surname> <given-names>Y</given-names></name>
</person-group>. 
<article-title>Evaluating bayesian deep learning methods for semantic segmentation</article-title>. <source>arXiv preprint arXiv:1811.12709</source>. (<year>2018</year>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1811.12709</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<label>3</label>
<mixed-citation publication-type="thesis">
<person-group person-group-type="author">
<name><surname>Gal</surname> <given-names>Y</given-names></name>
</person-group>. <source>Uncertainty in Deep Learning</source>. 
<publisher-name>University of Cambridge</publisher-name>, <publisher-loc>Cambridge, United Kingdom</publisher-loc> (<year>2016</year>).
</mixed-citation>
</ref>
<ref id="B4">
<label>4</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gal</surname> <given-names>Y</given-names></name>
<name><surname>Ghahramani</surname> <given-names>Z</given-names></name>
</person-group>. 
<article-title>Dropout as a bayesian approximation</article-title>. <source>Appendix</source>. (<year>2016</year>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1506.02157</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<label>5</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hinton</surname> <given-names>GE</given-names></name>
<name><surname>Srivastava</surname> <given-names>N</given-names></name>
<name><surname>Krizhevsky</surname> <given-names>A</given-names></name>
<name><surname>Sutskever</surname> <given-names>I</given-names></name>
<name><surname>Salakhutdinov</surname> <given-names>RR</given-names></name>
</person-group>. 
<article-title>Improving neural networks by preventing co-adaptation of feature detectors</article-title>. arXiv preprint arXiv:1207.0580v1. (<year>2012</year>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1207.0580</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<label>6</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Srivastava</surname> <given-names>N</given-names></name>
<name><surname>Hinton</surname> <given-names>G</given-names></name>
<name><surname>Krizhevsky</surname> <given-names>A</given-names></name>
<name><surname>Sutskever</surname> <given-names>I</given-names></name>
<name><surname>Salakhutdinov</surname> <given-names>R</given-names></name>
</person-group>. 
<article-title>Dropout: a simple way to prevent neural networks from overfitting</article-title>. <source>J Mach Learn Res</source>. (<year>2014</year>) <volume>15</volume>:<page-range>1929&#x2013;58</page-range>.
</mixed-citation>
</ref>
<ref id="B7">
<label>7</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>van Rooij</surname> <given-names>W</given-names></name>
<name><surname>Verbakel</surname> <given-names>WF</given-names></name>
<name><surname>Slotman</surname> <given-names>BJ</given-names></name>
<name><surname>Dahele</surname> <given-names>M</given-names></name>
</person-group>. 
<article-title>Using spatial probability maps to highlight potential inaccuracies in deep learning-based contours: facilitating online adaptive radiation therapy</article-title>. <source>Adv Radiat Oncol</source>. (<year>2021</year>) <volume>6</volume>:<elocation-id>100658</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.adro.2021.100658</pub-id>, PMID: <pub-id pub-id-type="pmid">33778184</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<label>8</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Min</surname> <given-names>H</given-names></name>
<name><surname>Dowling</surname> <given-names>J</given-names></name>
<name><surname>Jameson</surname> <given-names>MG</given-names></name>
<name><surname>Cloak</surname> <given-names>K</given-names></name>
<name><surname>Faustino</surname> <given-names>J</given-names></name>
<name><surname>Sidhom</surname> <given-names>M</given-names></name>
<etal/>
</person-group>. 
<article-title>Clinical target volume delineation quality assurance for mri-guided prostate radiotherapy using deep learning with uncertainty estimation</article-title>. <source>Radiotherapy Oncol</source>. (<year>2023</year>) <volume>186</volume>:<elocation-id>109794</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.radonc.2023.109794</pub-id>, PMID: <pub-id pub-id-type="pmid">37414257</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<label>9</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wahid</surname> <given-names>KA</given-names></name>
<name><surname>Kaffey</surname> <given-names>ZY</given-names></name>
<name><surname>Farris</surname> <given-names>DP</given-names></name>
<name><surname>Humbert-Vidan</surname> <given-names>L</given-names></name>
<name><surname>Moreno</surname> <given-names>AC</given-names></name>
<name><surname>Rasmussen</surname> <given-names>M</given-names></name>
<etal/>
</person-group>. 
<article-title>Artificial intelligence uncertainty quantification in radiotherapy applications- a scoping review</article-title>. <source>Radiotherapy Oncol</source>. (<year>2024</year>) <volume>201</volume>:<elocation-id>110542</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.radonc.2024.110542</pub-id>, PMID: <pub-id pub-id-type="pmid">39299574</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<label>10</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Milan&#xe9;s-Hermosilla</surname> <given-names>D</given-names></name>
<name><surname>Trujillo Codorni&#xfa;</surname> <given-names>R</given-names></name>
<name><surname>L&#xf3;pez-Baracaldo</surname> <given-names>R</given-names></name>
<name><surname>Sagar&#xf3;-Zamora</surname> <given-names>R</given-names></name>
<name><surname>Delisle-Rodriguez</surname> <given-names>D</given-names></name>
<name><surname>Villarejo-Mayor</surname> <given-names>JJ</given-names></name>
<etal/>
</person-group>. 
<article-title>Monte carlo dropout for uncertainty estimation and motor imagery classification</article-title>. <source>Sensors</source>. (<year>2021</year>) <volume>21</volume>:<elocation-id>7241</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s21217241</pub-id>, PMID: <pub-id pub-id-type="pmid">34770553</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<label>11</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Smith</surname> <given-names>L</given-names></name>
<name><surname>Gal</surname> <given-names>Y</given-names></name>
</person-group>. 
<article-title>Understanding measures of uncertainty for adversarial example detection</article-title>. <source>arXiv preprint arXiv:1803.08533</source>. (<year>2018</year>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1803.08533</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<label>12</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Cusumano</surname> <given-names>D</given-names></name>
<name><surname>Boldrini</surname> <given-names>L</given-names></name>
<name><surname>Dhont</surname> <given-names>J</given-names></name>
<name><surname>Fiorino</surname> <given-names>C</given-names></name>
<name><surname>Green</surname> <given-names>O</given-names></name>
<name><surname>G&#xfc;ng&#xf6;r</surname> <given-names>G</given-names></name>
<etal/>
</person-group>. 
<article-title>Artificial intelligence in magnetic resonance guided radiotherapy: Medical and physical considerations on state of art and future perspectives</article-title>. <source>Physica Med</source>. (<year>2021</year>) <volume>85</volume>:<page-range>175&#x2013;91</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ejmp.2021.05.010</pub-id>, PMID: <pub-id pub-id-type="pmid">34022660</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<label>13</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gal</surname> <given-names>Y</given-names></name>
<name><surname>Hron</surname> <given-names>J</given-names></name>
<name><surname>Kendall</surname> <given-names>A</given-names></name>
</person-group>. 
<article-title>Concrete dropout</article-title>. <source>Adv Neural Inf Process Syst</source>. (<year>2017</year>) <volume>30</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1705.07832</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<label>14</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Diederik</surname> <given-names>PK</given-names></name>
</person-group>. 
<article-title>Adam: A method for stochastic optimization</article-title>. <source>arXiv preprint arXiv:1412.6980</source>. (<year>2014</year>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1412.6980</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<label>15</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Alves</surname> <given-names>N</given-names></name>
<name><surname>Bosma</surname> <given-names>JS</given-names></name>
<name><surname>Venkadesh</surname> <given-names>KV</given-names></name>
<name><surname>Jacobs</surname> <given-names>C</given-names></name>
<name><surname>Saghir</surname> <given-names>Z</given-names></name>
<name><surname>de Rooij</surname> <given-names>M</given-names></name>
<etal/>
</person-group>. 
<article-title>Prediction variability to identify reduced AI performance in cancer diagnosis at MRI and CT</article-title>. <source>Radiology</source>. (<year>2023</year>) <volume>308</volume>:<fpage>e230275</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1148/radiol.230275</pub-id>, PMID: <pub-id pub-id-type="pmid">37724961</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<label>16</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Kirsch</surname> <given-names>A</given-names></name>
<name><surname>Mukhoti</surname> <given-names>J</given-names></name>
<name><surname>van Amersfoort</surname> <given-names>J</given-names></name>
<name><surname>Torr</surname> <given-names>HP</given-names></name>
<name><surname>Gal</surname> <given-names>Y</given-names></name>
</person-group>. 
<article-title>On pitfalls in OOD detection: Predictive entropy considered harmful</article-title>. <source>Presented at ICML 2021 Workshop Uncertainty Robustness Deep Learn</source>. (<year>2021</year>).
</article-title>
</mixed-citation>
</ref>
<ref id="B17">
<label>17</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Elguindi</surname> <given-names>S</given-names></name>
<name><surname>Zelefsky</surname> <given-names>MJ</given-names></name>
<name><surname>Jiang</surname> <given-names>J</given-names></name>
<name><surname>Veeraraghavan</surname> <given-names>H</given-names></name>
<name><surname>Deasy</surname> <given-names>JO</given-names></name>
<name><surname>Hunt</surname> <given-names>MA</given-names></name>
<etal/>
</person-group>. 
<article-title>Deep learning-based auto-segmentation of targets and organs-at-risk for magnetic resonance imaging only planning of prostate radiotherapy</article-title>. <source>Phys Imaging Radiat Oncol</source>. (<year>2019</year>) <volume>12</volume>:<page-range>80&#x2013;6</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.phro.2019.11.006</pub-id>, PMID: <pub-id pub-id-type="pmid">32355894</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1592485">Tomas Janssen</ext-link>, The Netherlands Cancer Institute (NKI), Netherlands</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3121434">Charlotte Brouwer</ext-link>, University Medical Center Groningen, Netherlands</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3228987">Hans Meine</ext-link>, Fraunhofer Institute for Digital Medicine (MEVIS), Germany</p></fn>
</fn-group>
<fn-group>
<fn id="fn1"><label>1</label>
<p><uri xlink:href="https://www.cancerfonden.se/">https://www.cancerfonden.se/</uri></p></fn>
</fn-group>
</back>
</article>