<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article article-type="research-article" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Reprod. Health</journal-id><journal-title-group>
<journal-title>Frontiers in Reproductive Health</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Reprod. Health</abbrev-journal-title></journal-title-group>
<issn pub-type="epub">2673-3153</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frph.2026.1778326</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Investigating discrepancies in accuracy, agreement and interpretability for single-frame embryo classification tasks conducted by embryologists and deep learning models</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes"><name><surname>Kakulavarapu</surname><given-names>Radhika</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="cor1">&#x002A;</xref><uri xlink:href="https://loop.frontiersin.org/people/3332431/overview"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role></contrib>
<contrib contrib-type="author"><name><surname>Delbarre</surname><given-names>Erwan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="an1"><sup>&#x2020;</sup></xref><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role></contrib>
<contrib contrib-type="author"><name><surname>Sharma</surname><given-names>Akriti</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Jahanlu</surname><given-names>David</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/2769477/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Riegler</surname><given-names>Michael A.</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/608379/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" 
vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role></contrib>
<contrib contrib-type="author"><name><surname>Haugen</surname><given-names>Trine B.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/1014378/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Iliceto</surname><given-names>Mario</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role></contrib>
<contrib contrib-type="author"><name><surname>Stensen</surname><given-names>Mette H.</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role></contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Life Sciences and Health, Faculty of Health Sciences, OsloMet &#x2013; Oslo Metropolitan University</institution>, <city>Oslo</city>, <country country="NO">Norway</country></aff>
<aff id="aff2"><label>2</label><institution>Volvat Spiren</institution>, <city>Oslo</city>, <country country="NO">Norway</country></aff>
<aff id="aff3"><label>3</label><institution>Simula Research Laboratory</institution>, <city>Oslo</city>, <country country="NO">Norway</country></aff>
<aff id="aff4"><label>4</label><institution>Department of Social Work, Faculty of Social Science, Child Welfare and Social Policy, OsloMet &#x2013; Oslo Metropolitan University</institution>, <city>Oslo</city>, <country country="NO">Norway</country></aff>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label><bold>Correspondence:</bold> Radhika Kakulavarapu <email xlink:href="mailto:radhikak@oslomet.no">radhikak@oslomet.no</email>; <email xlink:href="mailto:radhika.kakulavarapu@volvat.no">radhika.kakulavarapu@volvat.no</email></corresp>
<fn fn-type="present-address" id="an1"><label>&#x2020;</label><p>Present address: Erwan Delbarre, Department of Biotechnology, Faculty of Applied Ecology, Agricultural Sciences and Biotechnology, University of Inland Norway, Elverum, Norway</p></fn>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-03-03"><day>03</day><month>03</month><year>2026</year></pub-date>
<pub-date publication-format="electronic" date-type="collection"><year>2026</year></pub-date>
<volume>8</volume><elocation-id>1778326</elocation-id>
<history>
<date date-type="received"><day>30</day><month>12</month><year>2025</year></date>
<date date-type="rev-recd"><day>30</day><month>01</month><year>2026</year></date>
<date date-type="accepted"><day>05</day><month>02</month><year>2026</year></date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2026 Kakulavarapu, Delbarre, Sharma, Jahanlu, Riegler, Haugen, Iliceto and Stensen.</copyright-statement>
<copyright-year>2026</copyright-year><copyright-holder>Kakulavarapu, Delbarre, Sharma, Jahanlu, Riegler, Haugen, Iliceto and Stensen</copyright-holder><license><ali:license_ref start_date="2026-03-03">https://creativecommons.org/licenses/by/4.0/</ali:license_ref><license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p></license>
</permissions>
<abstract><sec><title>Introduction</title>
<p>Artificial intelligence tools show promise in supporting clinical decision making, but their safe use requires evaluation of not only accuracy, but also agreement with experts and interpretability of model decisions. The aim of this study was to evaluate the accuracy and agreement of human embryologists and deep learning models in embryo stage classification, and to explore interpretability through explainable artificial intelligence.</p>
</sec><sec><title>Methods</title>
<p>A retrospective, single-center study used single-frame embryo images (<italic>n</italic>&#x2009;&#x003D;&#x2009;245) classified according to developmental stage by three embryologists and two deep learning models, ResNet-34 and VGG16. Accuracy and agreement among all operators was evaluated, along with an assessment of interpretability with regards to model-generated explanations for spatial attention.</p>
</sec><sec><title>Results</title>
<p>Embryologists achieved higher accuracy (89.9&#x0025;) than ResNet-34 (78.8&#x0025;, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001) and VGG16 (74.3&#x0025;, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001), while overall agreement with the reference standard remained excellent for all operators (<italic>&#x03BA;</italic>&#x2265;0.932). Stage-wise agreement was consistently stronger among embryologists than DL models (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.778&#x2013;0.952 vs. 0.385&#x2013;0.681). ResNet-34 Grad-CAMs were rated biologically relevant more often than VGG16 (89&#x0025; vs. 59&#x0025;, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001), yet interpretability did not consistently align with accuracy. Analysis of spatial overlap between model generated explanations was weak and observed to be lowest at the blastocyst stage, despite perfect model accuracy.</p>
</sec><sec><title>Conclusions</title>
<p>These findings highlight the need for evaluation frameworks that integrate accuracy, agreement and interpretability to support safe and transparent development of artificial intelligence tools in assisted reproduction technology.</p>
</sec>
</abstract>
<kwd-group>
<kwd>accuracy</kwd>
<kwd>agreement</kwd>
<kwd>deep learning</kwd>
<kwd>embryo assessment</kwd>
<kwd>interpretability</kwd>
</kwd-group><funding-group><award-group id="gs1"><funding-source id="sp1"><institution-wrap><institution>Norges Forskningsr&#x00E5;d</institution><institution-id institution-id-type="doi" vocab="open-funder-registry" vocab-identifier="10.13039/open_funder_registry">10.13039/501100005416</institution-id></institution-wrap></funding-source><award-id rid="sp1">288727</award-id></award-group><funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This study was funded by the Research Council of Norway (Project Number 288727).</funding-statement></funding-group><counts>
<fig-count count="5"/>
<table-count count="2"/><equation-count count="0"/><ref-count count="70"/><page-count count="13"/><word-count count="0"/></counts><custom-meta-group><custom-meta><meta-name>section-at-acceptance</meta-name><meta-value>Assisted Reproduction</meta-value></custom-meta></custom-meta-group>
</article-meta>
</front>
<body><sec id="s1" sec-type="intro"><label>1</label><title>Introduction</title>
<p>Consistent monitoring of embryo development, whether conducted at discrete time-points or with the aid of time-lapse technology (TLT), represents a routine and critical task within assisted reproductive technology (ART). Considering the established relationship between embryo morphology and reproductive success (<xref ref-type="bibr" rid="B1">1</xref>&#x2013;<xref ref-type="bibr" rid="B3">3</xref>), a visual assessment of embryo morphology is paramount towards the birth of a healthy child. To this end, TLT systems have enhanced the descriptive resolution of embryo monitoring, leading to its widespread clinical adoption (<xref ref-type="bibr" rid="B4">4</xref>&#x2013;<xref ref-type="bibr" rid="B6">6</xref>). However, despite these advancements, morphological assessments remain time-consuming (<xref ref-type="bibr" rid="B7">7</xref>) and prone to variability across clinics (<xref ref-type="bibr" rid="B8">8</xref>), protocols (<xref ref-type="bibr" rid="B9">9</xref>, <xref ref-type="bibr" rid="B10">10</xref>) and operators (<xref ref-type="bibr" rid="B11">11</xref>). As a means of addressing these limitations, several commercially available artificial intelligence (AI) systems have experienced a sharp increase in utilization (<xref ref-type="bibr" rid="B12">12</xref>&#x2013;<xref ref-type="bibr" rid="B14">14</xref>), with demonstrated improvements in cost- and time-efficiency within clinical settings (<xref ref-type="bibr" rid="B15">15</xref>). 
Indeed, the application of various deep learning (DL) models, particularly convolutional neural networks (CNN) have been successfully implemented in classification tasks (<xref ref-type="bibr" rid="B16">16</xref>&#x2013;<xref ref-type="bibr" rid="B20">20</xref>), embryo ranking strategies (<xref ref-type="bibr" rid="B21">21</xref>, <xref ref-type="bibr" rid="B22">22</xref>), prediction of morphokinetics (<xref ref-type="bibr" rid="B23">23</xref>) and reproductive viability (<xref ref-type="bibr" rid="B24">24</xref>&#x2013;<xref ref-type="bibr" rid="B26">26</xref>). However, as numerous studies suggest the equal (<xref ref-type="bibr" rid="B27">27</xref>&#x2013;<xref ref-type="bibr" rid="B29">29</xref>), if not improved (<xref ref-type="bibr" rid="B25">25</xref>, <xref ref-type="bibr" rid="B30">30</xref>) predictive performance of DL models compared to embryologists, reaching a clinical consensus on model evaluation remains an ongoing effort.</p>
<p>Model performance is often evaluated based on accuracy, which provides a measure of how closely a model prediction aligns with a known outcome (<xref ref-type="bibr" rid="B31">31</xref>). While reflective of some clinical tasks, such as embryo stage classification, several publications acknowledge the misleading properties of reporting model accuracy alone, particularly within the imbalanced datasets available in medical science (<xref ref-type="bibr" rid="B31">31</xref>&#x2013;<xref ref-type="bibr" rid="B33">33</xref>). Consequently, as AI-powered tools gain traction within the field of ART, models are being evaluated with a greater range of performance indicators (<xref ref-type="bibr" rid="B5">5</xref>, <xref ref-type="bibr" rid="B31">31</xref>, <xref ref-type="bibr" rid="B34">34</xref>), including a lens of agreement (<xref ref-type="bibr" rid="B21">21</xref>, <xref ref-type="bibr" rid="B29">29</xref>, <xref ref-type="bibr" rid="B35">35</xref>&#x2013;<xref ref-type="bibr" rid="B37">37</xref>). Providing a measure of consistency to clinical gold-standards, benchmarking DL model performance against embryologist performance has become a common method in rationalizing the use of AI systems within <italic>in-vitro</italic> fertilization (IVF) laboratories (<xref ref-type="bibr" rid="B13">13</xref>). Practically, however, a major barrier to adoption has been attributed to difficulties in interpreting how a model has reached a particular decision. Thus, the lack of transparency within &#x201C;black box&#x201D; DL architectures, where decisions can carry significant ethical, emotional and medical implications, leads to mistrust, misinterpretation and poor clinical integration (<xref ref-type="bibr" rid="B38">38</xref>, <xref ref-type="bibr" rid="B39">39</xref>). 
Moreover, an incomplete understanding of model predictions raises concerns about whether predictions are based on biologically meaningful features, or spurious correlations (<xref ref-type="bibr" rid="B40">40</xref>). Remarkably, very few studies have examined whether embryologists and DL models reach the same decisions based on shared interpretive features (<xref ref-type="bibr" rid="B36">36</xref>), thereby representing a notable gap in the effective evaluation of clinical AI (<xref ref-type="bibr" rid="B41">41</xref>).</p>
<p>Building upon these observations, explainable artificial intelligence (XAI) is emerging to bridge the gap between accuracy and agreement (<xref ref-type="bibr" rid="B5">5</xref>, <xref ref-type="bibr" rid="B38">38</xref>, <xref ref-type="bibr" rid="B42">42</xref>&#x2013;<xref ref-type="bibr" rid="B44">44</xref>), a distinction that is critical when model classifications must be justified or explained. XAI methods aim to increase interpretability by visualizing regions or features that contribute strongly to resultant predictions (<xref ref-type="bibr" rid="B43">43</xref>, <xref ref-type="bibr" rid="B45">45</xref>). One such method, Gradient-weighted Class Activation Mapping (Grad-CAM), generates local, gradient-based saliency maps, known as heatmaps, that visualize model attention upon classification tasks (<xref ref-type="bibr" rid="B46">46</xref>&#x2013;<xref ref-type="bibr" rid="B48">48</xref>). Another occlusion-based approach perturbs the input image to create Local Interpretable Model-agnostic Explanations (LIME), which outline salient input regions that contribute towards changes in the class of a given model (<xref ref-type="bibr" rid="B19">19</xref>, <xref ref-type="bibr" rid="B49">49</xref>). Despite the growing interest in XAI, applications of Grad-CAM (<xref ref-type="bibr" rid="B37">37</xref>, <xref ref-type="bibr" rid="B47">47</xref>, <xref ref-type="bibr" rid="B48">48</xref>, <xref ref-type="bibr" rid="B50">50</xref>&#x2013;<xref ref-type="bibr" rid="B52">52</xref>), LIME (<xref ref-type="bibr" rid="B53">53</xref>, <xref ref-type="bibr" rid="B54">54</xref>), or a combination of XAI methods (<xref ref-type="bibr" rid="B19">19</xref>, <xref ref-type="bibr" rid="B49">49</xref>, <xref ref-type="bibr" rid="B51">51</xref>) still lack comprehensive evaluation by domain experts. 
In fact, most existing work focuses on predictive outcomes such as blastocyst quality or implantation potential, without deeper evaluation of spatial attention patterns (<xref ref-type="bibr" rid="B13">13</xref>, <xref ref-type="bibr" rid="B15">15</xref>). Consequently, reports that demonstrate alignment between model accuracy, agreement and domain expert interpretability remain scarce.</p>
<p>This study aims to address these gaps by evaluating the accuracy and agreement of clinical embryologists and two DL models, ResNet-34 and VGG16, in the classification of single-point images representing embryonic developmental stages. The choice of DL models in this study is based on our previous publication (<xref ref-type="bibr" rid="B49">49</xref>) and represents some of the most frequently used architectures for embryo development and assessment tasks (<xref ref-type="bibr" rid="B13">13</xref>). Crucially, beyond model performance alone, we apply Grad-CAM to each model, thereby creating &#x201C;explanations&#x201D; that are evaluated for biological relevance and interpretability by embryologists. Additionally, we apply LIME explanations to the same models in order to quantitatively evaluate the degree of spatial overlap between independent XAI techniques. The selection of local, <italic>post-hoc</italic> XAI methods such as Grad-CAM and LIME are intended to cater to the equally local or individual differences observed within embryo cohorts, thereby allowing insight into specific predictions, rather than global markers. By investigating classification patterns, domain expert assessments and explainability-based evaluations, this study aims to provide deeper insight into the reliability of AI-driven embryo stage classification, thereby contributing to the responsible development and implementation of AI tools in ART.</p>
</sec>
<sec id="s2" sec-type="methods"><label>2</label><title>Methods</title>
<sec id="s2a"><label>2.1</label><title>Data collection</title>
<p>Data from 531 embryos were obtained retrospectively from 526 women undergoing assisted reproductive technology (ART) treatment at a single fertility clinic in Oslo, Norway, between October 2013 and February 2019. All fertilized zygotes, inseminated by either <italic>in-vitro</italic> fertilization (IVF) or intracytoplasmic sperm injection (ICSI), were placed inside individual culture chambers within an EmbryoScope&#x2122; (Vitrolife, Denmark) time-lapse incubator (5&#x0025; O<sub>2</sub>, 6&#x0025; CO<sub>2</sub>, 89&#x0025; N<sub>2</sub>). Embryos were cultured and monitored within the EmbryoScope&#x2122; for a maximum of 5 days. Annotations were manually performed by three separate operators using EmbryoViewer&#x2122; (Vitrolife, Denmark) and denoted the following stages: start of 2-cell (t2), 3-cell (t3), 4-cell (t4), 5-cell (t5), 8-cell (t8), 9-cell stage (t9), morula stage (tM), and formation of a full blastocyst (tB). Manual annotations by embryologists made use of all available focal planes during routine embryo assessment, consistent with current guidelines for good practice (<xref ref-type="bibr" rid="B55">55</xref>). These annotations were used to identify the corresponding frame number, thereby informing the extraction of frames that were morphologically appropriate and unambiguous to each developmental stage of interest. As described previously in Sharma et al. (<xref ref-type="bibr" rid="B49">49</xref>), in order to make the dataset more robust, frames from the central and peripheral focal planes were randomly extracted 1&#x2013;3 frames after the annotated timing and used for model training. The independent test set contained images of embryos at the central plane only.</p>
</sec>
<sec id="s2b"><label>2.2</label><title>Deep learning algorithms</title>
<p>ResNet-34 and VGG16 were used in this study based on calculated performance metrics (<xref ref-type="sec" rid="s11">Supplementary Table S1</xref>), the training and fine-tuning of which is previously described (<xref ref-type="bibr" rid="B49">49</xref>). Briefly, 8-bit images (500&#x2009;&#x00D7;&#x2009;500) were obtained from the EmbryoScope&#x2122; and resized for both models (224&#x2009;&#x00D7;&#x2009;224). ImageNet base weights were utilized, and fine-tuning was conducted using the Adam Optimizer (learning rate 0.001). Standard normalization was applied, and no data augmentation techniques were used. For model training, images were extracted from a total of 350 embryos, from which 335 successfully developed to the blastocyst stage. Likewise, images were extracted from 150 different embryos for fine-tuning, and 31 embryos for the independent test set (<xref ref-type="fig" rid="F1">Figure&#x00A0;1A</xref>). Of note, embryos were distributed across each dataset prior to extraction of developmental stage frames to ensure the same embryo did not recur, thereby preventing data leakage. All classification results from this study represent data from the independent test set alone.</p>
<fig id="F1" position="float"><label>Figure&#x00A0;1</label>
<caption><p>Overview of methodology. <bold>(A)</bold> Data distribution and methodology. Extracted frames representing stages of embryo development from 500 individual embryos, including the 2-cell, 3-cell, 4-cell, 5-cell, 8-cell, 9-cell, morula and blastocyst stages, were used to train, validate and fine-tune ResNet-34 and VGG16. An independent test set of extracted frames representing the remaining 31 embryos was presented to three embryologists, VGG16 and ResNet-34 for classification of embryo development stages. Further generation of explainable outputs by each DL model was conducted by means of Grad-CAM and LIME. Grad-CAM outputs were additionally evaluated by the same embryologists for relevance to biologically relevant areas. <bold>(B)</bold> Visualization of overlap between XAI outputs using binary masks. Each frame presented to ResNet-34 and VGG16 generated explainable outputs using Grad-CAM and LIME. Binary masks were created using a custom Python script from each XAI output, per frame, and were overlaid to calculate intersection over union (IoU). IoU is presented as a proportion (&#x0025;) for each frame.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="frph-08-1778326-g001.tif"><alt-text content-type="machine-generated">Panel A illustrates the workflow for classifying embryo developmental stages using extracted microscopic images and deep learning models (VGG16, ResNet-34), with validation by embryologists and explainable AI outputs like Grad-CAM, LIME, and binary masks. Panel B compares ResNet-34 and VGG16 visualization outputs on embryo images, displaying original images, heatmaps, binary masks, and overlay masks with intersection over union scores of 17.7 percent and 31.5 percent respectively.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2c"><label>2.3</label><title>Embryologists&#x0027; evaluation of Grad-CAM outputs</title>
<p>To assess the spatial focus of DL models in classification tasks, features used to classify each image were visualized using Grad-CAM, the configuration of which is previously described (<xref ref-type="bibr" rid="B49">49</xref>). The resultant color grading ranged from areas of high relevance (red) to low relevance (blue), for each model. These Grad-CAM outputs were independently evaluated by three embryologists (<xref ref-type="fig" rid="F1">Figure&#x00A0;1A</xref>), each of whom rated the relevance of each Grad-CAM computation as &#x201C;good&#x201D; (identified areas of high relevance are localized within biologically relevant structures/areas), &#x201C;poor&#x201D; (identified areas of high relevance did not correspond to the correct or any biologically relevant structures/areas), or &#x201C;intermediate&#x201D; (identified areas of high relevance corresponded to both relevant and irrelevant biological structures/areas). These assessment groups were developed following several focused discussions with embryologists that considered the range of patterns displayed by each Grad-CAM output and were re-grouped into simplistic assessment criteria following operator consensus (<xref ref-type="sec" rid="s11">Supplementary Material 2</xref>). Embryologists were blinded to the classification result by the DL models. Proportions of &#x201C;good&#x201D;, &#x201C;intermediate&#x201D; and &#x201C;poor&#x201D; assessments were calculated per developmental stage for each DL model.</p>
</sec>
<sec id="s2d"><label>2.4</label><title>LIME explanations and calculation of spatial overlap</title>
<p>All images from the independent test set were further assessed using Local Interpretable Model-agnostic Explanations (LIME), the configuration of which is previously described (<xref ref-type="bibr" rid="B49">49</xref>). Only regions contributing positively to the predicted embryo stage were generated, where superpixels were denoted by yellow boundaries. To ascertain the extent of spatial overlap between both XAI techniques, binary masks were created for each explanation (<xref ref-type="fig" rid="F1">Figure&#x00A0;1B</xref>). A custom Python script was created to process Grad-CAM and LIME outputs, generate binary masks and calculate intersection over union (IoU), available on GitHub (Faiga91/IoU-Interpretability-Analysis; commit 065de77). For Grad-CAM masks, color segmentation was used to establish upper and lower bounds corresponding to red and cyan hues, respectively. In this manner, the range of pixels within the identified color bounds were masked as white, corresponding to contributive values. On the other hand, blue hues were considered non-contributive and remained black. For LIME masks, yellow superpixel contours were used to create a base mask, which was then filled and merged into a single, white mask. Pixels outside of the yellow boundaries were converted into black, non-contributive pixels. Thus, to ascertain the level of spatial overlap between each binary mask, an IoU score was calculated between 0 and 1. Values of 0 denoted no spatial overlap between Grad-CAM and LIME explanations, whereas values of 1 represented complete spatial overlap between XAI outputs. IoU values are presented as proportion (&#x0025;) of overlap between XAI outputs.</p>
</sec>
<sec id="s2e"><label>2.5</label><title>Statistical analysis</title>
<p>All statistical analyses were conducted using IBM SPSS Statistics (version 29). Embryo-level predictions from each rater and model were encoded as ordinal variables ranging from 1 to 7, corresponding to the developmental stages: 2-cell, 3-cell, 4-cell, 5-cell, 8 or 9 cell, morula, and blastocyst, respectively. Upon initial classification assessment by embryologists, misclassification of 8-cell embryos as 9-cell embryos, and vice versa, was frequently observed. This pilot finding led to the merging of these groups to mimic clinical practice in the grading of these stages. Classification accuracy was calculated as the proportion of embryos correctly identified relative to the reference standard. Agreement between each rater/model and the reference standard was evaluated using quadratic-weighted Cohen&#x0027;s kappa (<italic>&#x03BA;</italic>) and 95&#x0025; confidence intervals for <italic>&#x03BA;</italic> were estimated. Inter-rater reliability was assessed through pairwise quadratic-weighted Cohen&#x0027;s <italic>&#x03BA;</italic> across all raters and models. Additionally, Fleiss&#x0027; <italic>&#x03BA;</italic> was calculated to summarize agreement among the three embryologists. Stage-specific performance was examined by computing accuracy within each developmental stage subgroup. Confusion matrices were generated to visualize misclassification patterns and identify stages with elevated error rates. Comparative analysis of overall classification accuracy among raters and models was performed using Cochran&#x0027;s <italic>Q</italic> test. Where significant differences were observed, <italic>post-hoc</italic> pairwise comparisons were conducted using McNemar&#x0027;s test, with Holm-adjusted <italic>p</italic>-values applied to control for multiple testing. Chi-squared analyses were used to compare proportions of qualitative assessments. A Wilcoxon matched-pairs signed-rank test was used to assess stage-wise differences in IoU, between models. 
<italic>P</italic>-values less than 0.05 were considered significant.</p>
</sec>
<sec id="s2f"><label>2.6</label><title>Ethical considerations</title>
<p>All data used in this study was anonymized and approved by the Regional Committee for Medical and Health Research Ethics &#x2013; South-East Norway (2018/477, REC South-East).</p>
</sec>
</sec>
<sec id="s3" sec-type="results"><label>3</label><title>Results</title>
<sec id="s3a"><label>3.1</label><title>Accuracy observed across raters</title>
<p>A total of 245 images representing various stages of embryo development from 31 embryos were assessed by three clinical embryologists (E1, E2, E3) and two DL models, ResNet-34 and VGG16. Collectively, embryologists accurately identified 89.9&#x0025; (<italic>n</italic>&#x2009;&#x003D;&#x2009;661) of presented images according to developmental stage. In comparison, ResNet-34 and VGG16 accurately classified 78.8&#x0025; (<italic>n</italic>&#x2009;&#x003D;&#x2009;193) and 74.3&#x0025; (<italic>n</italic>&#x2009;&#x003D;&#x2009;182), respectively, evidently displaying higher error rates than human operators. Proportions of incorrect classifications were indeed found to be stage-dependent (<xref ref-type="fig" rid="F2">Figure&#x00A0;2</xref>). Although all raters achieved perfect accuracy at the blastocyst stage and minimal errors at the morula stage, the highest misclassification rate within embryologists occurred at the 3-cell (18&#x0025;) and 5-cell stages (17&#x0025;). Notably, the majority of these misclassifications occurred within adjacent developmental stages, where 11&#x0025; of 3-cell embryos and 10&#x0025; of 5-cell embryos were misclassified as 4-cell embryos (<xref ref-type="fig" rid="F2">Figure&#x00A0;2D</xref>). Similar trends in misclassification were observed for DL models, where 50&#x0025; and 44&#x0025; of 5-cell embryos were accurately identified with VGG16 and ResNet-34, respectively. Strikingly, adjacent cell-stage errors were observed at higher proportions for both DL models compared to embryologists, ranging from 3&#x0025; to 50&#x0025; for ResNet-34 (<xref ref-type="fig" rid="F2">Figure&#x00A0;2E</xref>), and between 3&#x0025; and 45&#x0025; for VGG16 (<xref ref-type="fig" rid="F2">Figure&#x00A0;2F</xref>). An overall significant difference in classification accuracy among the five operators was found [<italic>Q</italic>(4)&#x2009;&#x003D;&#x2009;60.77, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001]. 
Although <italic>post-hoc</italic> pairwise comparisons (Holm-adjusted) revealed no differences in accuracy between the three embryologists (E1&#x2013;E2: <italic>p</italic>&#x2009;&#x003D;&#x2009;0.629; E1&#x2013;E3: <italic>p</italic>&#x2009;&#x003D;&#x2009;0.845; E2&#x2013;E3: <italic>p</italic>&#x2009;&#x003D;&#x2009;1.000), the accuracy of each embryologist differed from both deep learning models (<italic>p</italic>&#x2009;&#x003C;&#x2009;0.001). However, no differences in accuracy were found between ResNet-34 and VGG16 (<italic>p</italic>&#x2009;&#x003D;&#x2009;0.144).</p>
<fig id="F2" position="float"><label>Figure&#x00A0;2</label>
<caption><p>Confusion matrices for all raters. Each matrix shows the proportion of embryos in each true developmental stage that were classified into each predicted stage by <bold>(A)</bold> Embryologist 1 (E1), <bold>(B)</bold> Embryologist 2 (E2), <bold>(C)</bold> Embryologist 3 (E3), <bold>(D)</bold> All embryologists (E1, E2, E3 combined), <bold>(E)</bold> ResNet-34 and <bold>(F)</bold> VGG16. Darker shades of red indicate higher proportions of accuracy (correctly classified). Correct classifications are represented along the diagonal, while off-diagonal cells indicate misclassifications. Proportions are rounded up to the nearest whole value.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="frph-08-1778326-g002.tif"><alt-text content-type="machine-generated">Confusion matrix graphic with six panels labeled A to F, each showing classification results for embryonic stages using Embryologists (A-D), ResNet-34 (E), and VGG16 (F). Diagonal cells have high values, indicating stronger accuracy for true classifications in most cases.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3b"><label>3.2</label><title>Agreement between raters</title>
<p>Agreement with the reference standard was close to perfect for all raters and models (<xref ref-type="table" rid="T1">Table&#x00A0;1</xref>). Embryologists demonstrated slightly higher agreement with the reference standard (<italic>&#x03BA;</italic> range: 0.969&#x2013;0.976), however both DL models also showed excellent agreement (ResNet-34: <italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.932; VGG16: <italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.940). All <italic>&#x03BA;</italic> values were statistically significant (<italic>p</italic>&#x2009;&#x003C;&#x2009;0.001), with overlapping confidence intervals between embryologists and models. Perfect agreement was observed at the blastocyst stage (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;1.000) for all raters, leading to its removal from the present analysis. For all remaining stages, agreement among the three embryologists was high, with an overall Fleiss&#x0027; <italic>&#x03BA;</italic> of 0.878 (95&#x0025; CI: 0.848&#x2013;0.908, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001), indicating excellent reliability across human operators. Fleiss&#x0027; <italic>&#x03BA;</italic> was only used to assess agreement among the three embryologists, whereas Cohen&#x0027;s <italic>&#x03BA;</italic> was sufficient to make comparisons between two operators. As such, pairwise quadratic weighted Cohen&#x0027;s kappa between all operators indicated high agreement (<xref ref-type="table" rid="T2">Table&#x00A0;2</xref>).</p>
<table-wrap id="T1" position="float"><label>Table&#x00A0;1</label>
<caption><p>Quadratic-weighted Cohen&#x0027;s <italic>&#x03BA;</italic> values between each rater/model and the reference standard (ground truth).</p></caption>
<table>
<colgroup>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Operator</th>
<th valign="top" align="center">Weighted kappa (<italic>&#x03BA;</italic>)</th>
<th valign="top" align="center">95&#x0025;CI</th>
<th valign="top" align="center"><italic>p</italic>-value</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">E1</td>
<td valign="top" align="center">0.969</td>
<td valign="top" align="center">0.948&#x2013;0.990</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
<tr>
<td valign="top" align="left">E2</td>
<td valign="top" align="center">0.974</td>
<td valign="top" align="center">0.954&#x2013;0.993</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
<tr>
<td valign="top" align="left">E3</td>
<td valign="top" align="center">0.976</td>
<td valign="top" align="center">0.957&#x2013;0.995</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
<tr>
<td valign="top" align="left">ResNet-34</td>
<td valign="top" align="center">0.932</td>
<td valign="top" align="center">0.893&#x2013;0.971</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
<tr>
<td valign="top" align="left">VGG16</td>
<td valign="top" align="center">0.940</td>
<td valign="top" align="center">0.912&#x2013;0.968</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="TF1"><p>Embryologists (E1, E2, E3) and deep learning models (ResNet-34, VGG16) displayed excellent agreement to ground truth.</p></fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T2" position="float"><label>Table&#x00A0;2</label>
<caption><p>Pairwise quadratic-weighted Cohen&#x0027;s &#x03BA; values between and within operators.</p></caption>
<table>
<colgroup>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Comparison of operators</th>
<th valign="top" align="center">Weighted kappa (&#x03BA;)</th>
<th valign="top" align="center">95&#x0025;CI</th>
<th valign="top" align="center"><italic>p</italic>-value</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">E1-E2</td>
<td valign="top" align="center">0.974</td>
<td valign="top" align="center">0.958&#x2013;0.991</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
<tr>
<td valign="top" align="left">E1-E3</td>
<td valign="top" align="center">0.974</td>
<td valign="top" align="center">0.960&#x2013;0.988</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
<tr>
<td valign="top" align="left">E2-E3</td>
<td valign="top" align="center">0.983</td>
<td valign="top" align="center">0.975&#x2013;0.991</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
<tr>
<td valign="top" align="left">ResNet-E1</td>
<td valign="top" align="center">0.915</td>
<td valign="top" align="center">0.871&#x2013;0.959</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
<tr>
<td valign="top" align="left">ResNet-E2</td>
<td valign="top" align="center">0.928</td>
<td valign="top" align="center">0.885&#x2013;0.970</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
<tr>
<td valign="top" align="left">ResNet-E3</td>
<td valign="top" align="center">0.926</td>
<td valign="top" align="center">0.884&#x2013;0.968</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
<tr>
<td valign="top" align="left">VGG-E1</td>
<td valign="top" align="center">0.930</td>
<td valign="top" align="center">0.899&#x2013;0.960</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
<tr>
<td valign="top" align="left">VGG-E2</td>
<td valign="top" align="center">0.938</td>
<td valign="top" align="center">0.910&#x2013;0.967</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
<tr>
<td valign="top" align="left">VGG-E3</td>
<td valign="top" align="center">0.934</td>
<td valign="top" align="center">0.901&#x2013;0.966</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
<tr>
<td valign="top" align="left">VGG-ResNet</td>
<td valign="top" align="center">0.937</td>
<td valign="top" align="center">0.899&#x2013;0.976</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="TF2"><p>Embryologists (E1, E2, E3) and deep learning models (ResNet, VGG). Kappa (&#x03BA;) values are presented for each comparison, along with 95&#x0025; confidence intervals (CI) and the corresponding <italic>p</italic>-value.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>Stage-specific agreement among embryologists was consistently strong, ranging from 0.778 (95&#x0025; CI: 0.706&#x2013;0.851, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001) for 3-cell embryos to 0.952 for 2-cell embryos (95&#x0025; CI: 0.879&#x2013;1.024, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001) (<xref ref-type="fig" rid="F3">Figure&#x00A0;3A</xref>). In contrast, negative and non-significant <italic>&#x03BA;</italic> values were found upon 2-cell (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;&#x2212;0.054, 95&#x0025; CI: &#x2212;0.143&#x2013;0.034, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.696) and 3-cell embryo classification by DL models (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;&#x2212;0.005, 95&#x0025; CI: &#x2212;0.496&#x2013;0.485, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.975), suggesting insufficient evidence of agreement between ResNet-34 and VGG16 within the earliest embryo divisions. On the other hand, moderate agreement between the DL models was observed for 4-cell embryos (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.427, 95&#x0025; CI: 0.214&#x2013;0.640, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.015) onwards. Notably, DL models displayed substantial agreement at best, upon classifying the 5-cell stage embryo (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.681, 95&#x0025; CI: 0.503&#x2013;0.860, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001). Nonetheless, Cohen&#x0027;s weighted <italic>&#x03BA;</italic> values were found to be consistently lower than <italic>&#x03BA;</italic> values for embryologist agreement at all stages of embryo development, and with larger confidence intervals (<xref ref-type="sec" rid="s11">Supplementary Table S3</xref>).</p>
<fig id="F3" position="float"><label>Figure&#x00A0;3</label>
<caption><p>Agreement and interpretability analyses. <bold>(A)</bold> Stage-wise agreement of embryo classification. Fleiss&#x2019; multirater kappa (<italic>&#x03BA;</italic>) was used to assess stage-wise classification agreement among the three embryologists. Cohen&#x0027;s weighted kappa (<italic>&#x03BA;</italic>) was used to assess stage-wise classification agreement between ResNet-34 and VGG16. Kappa (<italic>&#x03BA;</italic>) values are denoted by orange circles for embryologists, and blue squares for deep learning (DL) models. Error bars represent 95&#x0025; confidence intervals for each <italic>&#x03BA;</italic> value. Grey squares represent analyses where the calculated <italic>&#x03BA;</italic> value was not found to be significant (<italic>p</italic>&#x2009;&#x003E;&#x2009;0.05). <bold>(B)</bold> Grad-CAM explanations generated by ResNet-34. Examples of Grad-CAM outputs generated by ResNet-34 are presented in each row for the 3-cell, 5-cell, 8- or 9-cell embryos, morulae and blastocysts. Columns display one example of embryos evaluated as &#x201C;good&#x201D;, &#x201C;intermediate&#x201D; or &#x201C;poor&#x201D; for each relevant embryo stage. If an &#x201C;intermediate&#x201D; or &#x201C;poor&#x201D; evaluation was not assigned at any particular stage, the figure states &#x201C;not observed&#x201D; (n.o). White asterisks on selected frames indicate only one observation of the Grad-CAM evaluation at the relevant embryo development stage. <bold>(C)</bold> Grad-CAM explanations generated by VGG16. Examples of Grad-CAM outputs generated by VGG16 are presented in each row for the 3-cell, 5-cell, 8- or 9-cell embryos, morulae and blastocysts. Columns display one example of embryos evaluated as &#x201C;good&#x201D;, &#x201C;intermediate&#x201D; or &#x201C;poor&#x201D; for each relevant embryo stage. 
If an &#x201C;intermediate&#x201D; or &#x201C;poor&#x201D; evaluation was not assigned at any particular stage, the figure states &#x201C;not observed&#x201D; (n.o). <bold>(D)</bold> Overall evaluation of Grad-CAM explanations by embryologists. The overall proportion of Grad-CAM explanations evaluated as &#x201C;good&#x201D; and &#x201C;poor&#x201D; for each deep learning (DL) model. Red bars represent &#x201C;good&#x201D; evaluations from ResNet-34 and VGG16, whereas blue bars represent &#x201C;poor&#x201D; evaluations by both models. Proportions (&#x0025;) are listed at the top of each bar. <italic>P</italic>-values were calculated using a chi-square analysis and are displayed using comparison lines.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="frph-08-1778326-g003.tif"><alt-text content-type="machine-generated">Panel A presents a bar graph comparing weighted kappa agreement between embryologists and deep learning models across various embryo development stages. Panel B shows heatmap overlays from the ResNet-34 model classifying embryos as good, intermediate, or poor, with some entries marked as not observed. Panel C displays similar heatmaps for the VGG16 model. Panel D provides a bar graph comparing the proportion of Grad-CAM explanations rated as good or poor by embryologists for each model, showing higher agreement and clarity for ResNet-34.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3c"><label>3.3</label><title>Interpretability of Grad-CAM explanations</title>
<p>Grad-CAM outputs were generated by ResNet-34 and VGG16 (<xref ref-type="fig" rid="F3">Figure&#x00A0;3B</xref>). These outputs were presented to embryologists and assessed for relevance to biologically appropriate areas as &#x201C;good&#x201D;, &#x201C;intermediate&#x201D; or &#x201C;poor&#x201D;. Overall, we observed greater proportions of &#x201C;good&#x201D; classifications assigned to ResNet-34 Grad-CAM outputs (89&#x0025;) compared to those generated by VGG16 (59&#x0025;, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001, <xref ref-type="fig" rid="F3">Figure&#x00A0;3D</xref>). Notably, very few Grad-CAM explanations (1&#x0025;, <italic>n</italic>&#x2009;&#x003D;&#x2009;3) stemming from ResNet-34 were evaluated as &#x201C;poor&#x201D;. In fact, a larger proportion of VGG16 Grad-CAM outputs were assessed as &#x201C;poor&#x201D; (27&#x0025;, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001), suggesting that embryologists interpreted ResNet-34 to focus on more biologically relevant areas within embryo images, compared to VGG16.</p>
<sec id="s3c1"><label>3.3.1</label><title>Stage-wise assessment of Grad-CAM outputs</title>
<p>Across all stages of development, with the exception of the blastocyst stage, embryologists assessed ResNet-34 Grad-CAMs as &#x201C;good&#x201D; at higher proportions than VGG16 generated Grad-CAM explanations (<xref ref-type="fig" rid="F4">Figure&#x00A0;4A</xref>; <xref ref-type="sec" rid="s11">Supplementary Table S4</xref>). The proportion of blastocyst-stage embryos assessed as &#x201C;good&#x201D; were comparable between ResNet-34 (91&#x0025;) and VGG16 (90&#x0025;). In contrast, &#x201C;poor&#x201D; assessments were observed at higher proportions for VGG16 generated Grad-CAM outputs than ResNet-34 Grad-CAM outputs for 2-cell (48&#x0025; vs. 0&#x0025;, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001), 3-cell (29&#x0025; vs. 3&#x0025;, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.012), 4-cell (25&#x0025; vs. 3&#x0025;, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.026) and 8- or 9-cell embryos (13&#x0025; vs. 0&#x0025;, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.006) (<xref ref-type="fig" rid="F4">Figure&#x00A0;4B</xref>). Strikingly, 70&#x0025; of VGG16 generated Grad-CAM outputs were assigned as &#x201C;poor&#x201D; for morula stage embryos in particular, indicating a majority of all VGG16 Grad-CAM outputs did not focus on biologically meaningful areas upon morula stage classification. Likewise, proportions of &#x201C;intermediate&#x201D; assessments did not vary between ResNet-34 and VGG16 for any embryo stages, except the morula stage (<xref ref-type="fig" rid="F4">Figure&#x00A0;4C</xref>; <xref ref-type="sec" rid="s11">Supplementary Table S4</xref>). In fact, a higher proportion of ResNet-34 generated Grad-CAM explanations for the morula were assigned as &#x201C;intermediate&#x201D; (43&#x0025;), compared to VGG16 (10&#x0025;, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.007). On the other hand, Grad-CAM explanations at the blastocyst stage presented from either model did not result in any &#x201C;poor&#x201D; evaluations from embryologists.</p>
<fig id="F4" position="float"><label>Figure&#x00A0;4</label>
<caption><p>Evaluation of biological relevance and resultant accuracy. <bold>(A)</bold> &#x201C;Good&#x201D; Grad-CAM evaluations according to embryo stage. Bars indicate proportion (&#x0025;) of images in each stage of embryo development that were assessed by embryologists for relevance of biological focus. Solid red bars represent Grad-CAM explanations generated by ResNet-34, while checkered bars represent VGG16 Grad-CAM explanations. Proportions are listed to the right of each relevant bar. <italic>P</italic>-values are listed using comparison lines for significant comparisons only, assessed using chi-squared tests. No comparison lines or <italic>p</italic>-values are listed where <italic>p</italic>&#x2009;&#x003E;&#x2009;0.05. <bold>(B)</bold> &#x201C;Poor&#x201D; Grad-CAM evaluations according to embryo stage. Bars indicate proportion (&#x0025;) of images in each stage of embryo development that were assessed by embryologists for relevance of biological focus. Solid blue bars represent Grad-CAM explanations generated by ResNet-34, while checkered bars represent VGG16 Grad-CAM explanations. Proportions are listed to the right of each relevant bar. <italic>P</italic>-values are listed using comparison lines for significant comparisons only, assessed using chi-squared tests. No comparison lines or <italic>p</italic>-values are listed where <italic>p</italic>&#x2009;&#x003E;&#x2009;0.05. <bold>(C)</bold> &#x201C;Intermediate&#x201D; Grad-CAM evaluations at the morula stage. Bars indicate proportion (&#x0025;) of images in each stage of embryo development that were assessed by embryologists for relevance of biological focus. Solid orange bars represent Grad-CAM explanations generated by ResNet-34, while checkered bars represent VGG16 Grad-CAM explanations. Proportions are listed above each relevant bar. <italic>P</italic>-values are listed using comparison lines, assessed using chi-squared tests. 
<bold>(D)</bold> Proportions of correctly classified images by ResNet-34 according to spatial focus group. Red circles represent the proportion of images that were classified as &#x201C;good&#x201D; by embryologists and correctly classified by ResNet-34. Orange squares represent the proportion of images that were classified as &#x201C;intermediate&#x201D; by embryologists and correctly classified by ResNet-34. Blue triangles represent the proportion of images that were classified as &#x201C;poor&#x201D; by embryologists and correctly classified by ResNet-34. Asterisks colored in orange or blue indicate too few instances to allow statistical comparison for the proportion of embryo images evaluated as &#x201C;intermediate&#x201D; or &#x201C;poor&#x201D;, respectively. No comparison lines or <italic>p</italic>-values are listed where <italic>p</italic>&#x2009;&#x003E;&#x2009;0.05. <bold>(E)</bold> Proportions of correctly classified images by VGG16 according to spatial focus group. Red circles represent the proportion of images that were classified as &#x201C;good&#x201D; by embryologists and correctly classified by VGG16. Orange squares represent the proportion of images that were classified as &#x201C;intermediate&#x201D; by embryologists and correctly classified by VGG16. Blue triangles represent the proportion of images that were classified as &#x201C;poor&#x201D; by embryologists and correctly classified by VGG16. Asterisks colored in orange or blue indicate too few instances to allow statistical comparison for the proportion of embryo images evaluated as &#x201C;intermediate&#x201D; or &#x201C;poor&#x201D;, respectively. <italic>P</italic>-values are listed using comparison lines for significant comparisons only. No comparison lines or <italic>p</italic>-values are listed where <italic>p</italic>&#x2009;&#x003E;&#x2009;0.05.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="frph-08-1778326-g004.tif"><alt-text content-type="machine-generated">Composite scientific figure comparing ResNet-34 and VGG16 models using Grad-CAM outputs across embryo development stages. Panels A, B, and C show bar graphs of Grad-CAM outputs assessed as \"Good,\" \"Poor,\" and \"Intermediate,\" respectively, with proportions and statistical significance. Panel D displays a scatter plot for ResNet-34 showing proportions of correctly classified images by category (\"Good,\" \"Intermediate,\" \"Poor\") at each stage. Panel E presents a similar scatter plot for VGG16. Key data points, counts, and statistical values are annotated on the graphs.</alt-text>
</graphic>
</fig>
<p>To further ascertain whether domain expert evaluation of spatial focus aligned with observed accuracy, the proportion of correctly classified images assessed as &#x201C;good&#x201D;, &#x201C;intermediate&#x201D; and &#x201C;poor&#x201D; were illustrated for ResNet-34 and VGG16 (<xref ref-type="fig" rid="F4">Figures&#x00A0;4D,E</xref>). Indeed, our findings illustrate greater accuracy within &#x201C;good&#x201D; ResNet-34 Grad-CAM outputs, particularly for 2-cell (97&#x0025;) and 3-cell (72&#x0025;) embryos (<xref ref-type="fig" rid="F4">Figure&#x00A0;4D</xref>). However, despite 100&#x0025; of 5-cell stage ResNet-34 Grad-CAMs being evaluated as &#x201C;good&#x201D; by embryologists, only 43&#x0025; of these instances were accurately classified. Moreover, there were no observed differences in accuracy between ResNet-34 Grad-CAMs assessed as &#x201C;good&#x201D; or &#x201C;intermediate&#x201D; for the 8- or 9-cell stage (79&#x0025; vs. 60&#x0025;, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.322), morula stage (100&#x0025; vs. 92&#x0025;, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.448) or blastocyst stage (100&#x0025; vs. 100&#x0025;, <italic>p</italic>&#x2009;&#x003E;&#x2009;0.999).</p>
<p>Remarkably, greater accuracy was observed for &#x201C;poor&#x201D; VGG16 generated Grad-CAM outputs at the 2-cell stage (100&#x0025;) compared to &#x201C;good&#x201D; outputs (67&#x0025;, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.031, <xref ref-type="fig" rid="F4">Figure&#x00A0;4E</xref>). Although this was the only comparison indicating a statistically significant difference in accuracy between spatial focus groups, comparable proportions of accuracy were consistently observed at the 3-cell (Good: 78&#x0025; vs. Poor: 56&#x0025;, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.372), 4-cell (Good: 47&#x0025; vs. Poor: 50&#x0025;, <italic>p</italic>&#x2009;&#x003E;&#x2009;0.999) and 5-cell stages (Good: 62&#x0025; vs. Intermediate: 40&#x0025;, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.613). Notably, 95&#x0025; of morula-stage Grad-CAMs evaluated with &#x201C;poor&#x201D; biological relevance were correctly classified, compared to 83&#x0025; accuracy within frames assessed as &#x201C;good&#x201D;. Similarly, proportions of accurately classified frames within &#x201C;good&#x201D; (73&#x0025;), &#x201C;intermediate&#x201D; (67&#x0025;) and &#x201C;poor&#x201D; (75&#x0025;) assessments were comparable. Moreover, 100&#x0025; accuracy was observed at the blastocyst stage by both ResNet-34 (<xref ref-type="fig" rid="F4">Figure&#x00A0;4D</xref>) and VGG16 (<xref ref-type="fig" rid="F4">Figure&#x00A0;4E</xref>) regardless of qualitative assessment as &#x201C;good&#x201D; or &#x201C;intermediate&#x201D;.</p>
</sec>
</sec>
<sec id="s3d"><label>3.4</label><title>Spatial overlap with LIME explanations</title>
<p>Binary masks were generated to calculate intersection over union (IoU) scores and quantitatively assess the degree of overlap between Grad-CAM and LIME explanations. A Wilcoxon matched-pairs signed-rank test demonstrated higher IoU for ResNet-34 explanations (26.6&#x0025;, 95&#x0025; CI: 25.4&#x2013;28.4) compared to VGG16 (25.2&#x0025;, 95&#x0025; CI: 21.0&#x2013;28.8, W&#x2009;&#x003D;&#x2009;&#x2212;6,091, median difference&#x2009;&#x003D;&#x2009;&#x2212;0.0236, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.006), suggesting Grad-CAM and LIME outputs from ResNet-34 were more spatially consistent across XAI methods, compared to VGG16. Yet, upon investigating the median distribution of IoU between embryo development stages (<xref ref-type="fig" rid="F5">Figure&#x00A0;5A</xref>), we found no significant differences in median IoU between models among any of the cleavage stages (2-cell to 8- or 9-cell embryos). On the other hand, spatial overlap between VGG16 generated XAI outputs was significantly lower at both the morula (0&#x0025;, 95&#x0025; CI: 0.0&#x2013;4.0, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001) and blastocyst stages (17.0&#x0025;, 95&#x0025; CI: 11.0&#x2013;21.0, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.006) compared to ResNet-34 generated outputs (<italic>Morula</italic>: 22.5&#x0025;, 95&#x0025; CI: 21.0&#x2013;26.0; <italic>Blastocyst</italic>: 21.0&#x0025;, 95&#x0025; CI: 19.0&#x2013;27.0).</p>
<fig id="F5" position="float"><label>Figure&#x00A0;5</label>
<caption><p>Intersection over union (IoU) between Grad-CAM and LIME explanations. Median IoU distribution among embryo development stages. Embryo development stages listed along the <italic>x</italic>-axis are ordinally coded to represent the 2-cell embryo (2), 3-cell embryo (3), 4-cell embryo (4), 5-cell embryo (5), 8- or 9-cell embryo (8), morula (1) and blastocyst (0). Blue circles represent median IoU values calculated for ResNet-34 generated explanations, whereas green triangles represent median IoU values calculated for VGG16 generated explanations. Error bars for each node represent 95&#x0025; confidence intervals. <italic>P</italic>-values were calculated using a Wilcoxon matched pairs signed-rank test and listed using comparison lines between significantly different nodes only. No comparison lines or <italic>p</italic>-values are listed where <italic>p</italic>&#x2009;&#x003E;&#x2009;0.05. <bold>(A)</bold> Overall median IoU distribution per embryo stage. <bold>(B)</bold> Median IoU distribution for Grad-CAM outputs evaluated as &#x201C;good&#x201D;. <bold>(C)</bold> Median IoU distribution for Grad-CAM outputs evaluated as &#x201C;poor&#x201D;. Statistical tests could not be reliably conducted as too few ResNet-34 Grad-CAM outputs were assessed as &#x201C;poor&#x201D;.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="frph-08-1778326-g005.tif"><alt-text content-type="machine-generated">Three-panel line graph compares the median intersection over union (IoU) between Grad-CAM and LIME for ResNet and VGG neural networks across embryo development stages. Panel A presents overall results, B shows &#x201C;good&#x201D; embryos, and C displays &#x201C;poor&#x201D; embryos. VGG generally exhibits higher IoU values than ResNet, with greater divergence in &#x201C;good&#x201D; embryos at early stages and similar or lower consistency in &#x201C;poor&#x201D; embryos, especially at later stages. Error bars represent variability, and key p-values are indicated for significant differences.</alt-text>
</graphic>
</fig>
<p>Furthermore, Grad-CAM explanations evaluated as &#x201C;good&#x201D; were assessed for their corresponding overlap with LIME explanations, per developmental stage (<xref ref-type="fig" rid="F5">Figure&#x00A0;5B</xref>). Despite higher proportions of ResNet-34 Grad-CAMs being evaluated as &#x201C;good&#x201D; by embryologists, median IoU scores indicated less spatial overlap with corresponding LIME explanations, compared to VGG16 counterparts. Grad-CAM outputs generated by VGG16 at the 2-cell stage indicated the highest spatial overlap with LIME (52.4&#x0025;, 95&#x0025; CI: 36.2&#x2013;59.1&#x0025;), which was observed to reduce with each subsequent stage of embryo development, to the lowest degree of spatial overlap at the blastocyst stage (17.3&#x0025;, 95&#x0025; CI: 10.7&#x2013;25.9&#x0025;). Comparatively, explanations generated by ResNet-34 showed similar overlap between both XAI outputs from 2-cell embryos (28.7&#x0025;, 95&#x0025; CI: 22.3&#x2013;35.0&#x0025;) to 8 or 9-cell embryos (29.2&#x0025;, 95&#x0025; CI: 23.8&#x2013;32.9&#x0025;). Nonetheless, IoU values for ResNet-34 and VGG16 were comparable from the 4-cell stage to the blastocyst stage. Strikingly, despite both DL models perfectly classifying blastocyst stage embryos, the median IoU values for ResNet-34 and VGG16 are the lowest among all other embryo stages (<italic>ResNet-34</italic>: 20.2&#x0025;, 95&#x0025; CI: 18.5&#x2013;27.3; <italic>VGG16</italic>: 17.3&#x0025;, 95&#x0025; CI: 10.7&#x2013;25.9). Moreover, upon calculating the level of overlap among &#x201C;poor&#x201D; Grad-CAM explanations (<xref ref-type="fig" rid="F5">Figure&#x00A0;5C</xref>), no spatial overlap was observed from VGG16 at the morula stage, suggesting the observed Grad-CAM explanations did not intersect with corresponding LIME super pixels from the same embryos.</p>
</sec>
</sec>
<sec id="s4" sec-type="discussion"><label>4</label><title>Discussion</title>
<p>This study investigated differences in accuracy and inter-rater agreement between human operators and DL models during embryo stage classification. Embryologists, individually and collectively (89.9&#x0025;), achieved greater accuracy in embryo stage classification compared to ResNet-34 (78.8&#x0025;, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001) and VGG16 (74.3&#x0025;, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001). Despite these observed differences in accuracy, all raters demonstrated excellent overall agreement with the reference standard, and with other operators. Contrastingly, further analyses conducted at the developmental stage level revealed lower agreement between DL models, thereby suggesting variations in underlying classification performance that may not be represented by accuracy assessments alone. Further divergence between models was detected upon assessing the biological relevance of Grad-CAM explanations. Although our findings indicate that Grad-CAM explanations generated by ResNet-34 were more frequently deemed biologically meaningful (89&#x0025;) compared to their VGG16 Grad-CAM counterparts (59&#x0025;, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001), trends of spatial overlap with LIME explanations suggest that the spatial focus of models may vary by embryo stage, and by the XAI method utilized. Taken together, these findings demonstrate that accuracy, agreement and interpretability are metrics that may not be consistently aligned. Instead, these performance indicators may represent distinct facets of model behavior that may be used to improve trustworthiness and further development of AI-driven assessment tools.</p>
<p>Although accuracy remains the most frequently reported metric within embryo classification models, clinical reliability is often established by measuring agreement between operators (<xref ref-type="bibr" rid="B56">56</xref>). Therefore, we evaluated inter-operator agreement using quadratic-weighted Cohen&#x0027;s kappa (<italic>&#x03BA;</italic>) as an additional model performance indicator (<xref ref-type="bibr" rid="B57">57</xref>). Intriguingly, despite reduced classification accuracy by both DL models, their agreement to the reference standard remained high (<italic>ResNet-34</italic>: <italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.932, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001; <italic>VGG16</italic>: <italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.940, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001). Similarly, pairwise comparisons indicated excellent agreement between both DL models, and each individual embryologist. Based on this finding, we speculate the increased weighted <italic>&#x03BA;</italic> may not be representative of the high proportions of adjacent-cell stage errors by DL models. Although the use of a quadratic-weighted Cohen&#x0027;s <italic>&#x03BA;</italic> mirrors the ordinal nature of embryo development, this method of assessing agreement penalizes adjacent cell-stage errors <italic>less</italic> than misclassifications at more distant stages. This notion was reinforced upon investigating per-class agreement between operators, where DL models displayed moderate to fair agreement (<italic>&#x03BA;</italic> range: 0.385&#x2013;0.681) across embryo stages, compared to consistently high embryologist agreement (<italic>&#x03BA;</italic> range: 0.778&#x2013;0.952). Notably, the calculated <italic>&#x03BA;</italic> value was unstable at the 2-cell and 3-cell stages, indicating this analysis may be underpowered and requires cautious interpretation. 
Nonetheless, this divergence revealed that while ResNet-34 and VGG16 achieve comparable classification accuracy, DL models may perform with different intrinsic reasoning that contributes towards unstable agreement (<xref ref-type="bibr" rid="B35">35</xref>). In fact, our findings align with another study reporting low inter-AI agreement (<xref ref-type="bibr" rid="B21">21</xref>), which remains, to the best of our knowledge, the only investigation reporting agreement metrics between AI-systems. Thus, unlike many studies that focus primarily on performance (<xref ref-type="bibr" rid="B12">12</xref>, <xref ref-type="bibr" rid="B13">13</xref>, <xref ref-type="bibr" rid="B58">58</xref>, <xref ref-type="bibr" rid="B59">59</xref>), our results join recent efforts in highlighting the importance of understanding <italic>how</italic> models reach their decisions.</p>
<p>Recognizing the reduced agreement between ResNet-34 and VGG16 classifications, we further examined embryologists&#x0027; assessment of Grad-CAM heatmaps to ascertain differences in model spatial attention. Overall, ResNet-34 Grad-CAM outputs were rated as biologically relevant more frequently (89&#x0025;) compared to VGG16 generated heatmaps (59&#x0025;, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001). Furthermore, consistently &#x201C;good&#x201D; assessments of ResNet-34 Grad-CAMs at the embryo stage-level implies that the model may prioritize features more congruent with embryologist decision-making than VGG16. However, this interpretive advantage was not always aligned with accuracy. At the 5-cell stage, 100&#x0025; of ResNet-34 Grad-CAMs were rated as &#x201C;good&#x201D;, yet only 43&#x0025; of classifications were correct, implying that well-rated explanations of a qualitative nature may not reliably reflect the features driving model predictions (<xref ref-type="bibr" rid="B51">51</xref>, <xref ref-type="bibr" rid="B60">60</xref>, <xref ref-type="bibr" rid="B61">61</xref>). Crucially, these findings raise the concern of interpretability without accuracy, and empirically validate several concerns raised by van Royen et al. (<xref ref-type="bibr" rid="B39">39</xref>) regarding the deployment of XAI in clinical settings. In fact, our results exemplify the risk of &#x201C;illusory understanding&#x201D; that arises when embryologists are presented with visually convincing <italic>post-hoc</italic> explanations that appear plausible, but may obscure underlying discrepancies in model behavior (<xref ref-type="bibr" rid="B62">62</xref>). In such cases, explanatory visualizations may actually impede clinical skepticism by overinterpretations of causality and an increased risk of confirmation bias (<xref ref-type="bibr" rid="B39">39</xref>). 
Conversely, VGG16 achieved high accuracy at the morula stage (94&#x0025;) despite 70&#x0025; of its Grad-CAM outputs being rated &#x201C;poor,&#x201D; thereby reflecting accuracy without interpretability. Despite a dearth of literature surrounding these discrepancies within embryo-related tasks, correct predictions that arise from spurious or non-biological features are known to undermine trust and implementation within a range of healthcare settings (<xref ref-type="bibr" rid="B60">60</xref>, <xref ref-type="bibr" rid="B61">61</xref>, <xref ref-type="bibr" rid="B63">63</xref>, <xref ref-type="bibr" rid="B64">64</xref>). Importantly, the observed divergence in interpretability between ResNet-34 and VGG16 Grad-CAM outputs occurred despite similar overall accuracy, thus demonstrating that accuracy and interpretability may represent distinct dimensions of model behavior rather than interchangeable indicators of reliability. Taken together, our findings underscore the need for caution when integrating saliency-based XAI tools into embryo assessment workflows, where explanation quality metrics should be evaluated independently from model reliability, instead of a proxy for model performance. This highlights a meaningful gap between model performance and interpretive agreement that warrants further research in embryo-related tasks.</p>
<p>Further, we expanded our investigation to determine whether independent XAI methods, Grad-CAM and LIME, converged on salient features using IoU-based spatial overlap. Overall, we observed that ResNet-34 exhibited a greater degree of median overlap between Grad-CAM and LIME explanations (26.6&#x0025;) compared to VGG16 (25.2&#x0025;, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.006). Upon stage-wise assessment, however, significant variations in median overlap were only observed at the morula (<italic>ResNet-34</italic>: 22.5&#x0025; vs. <italic>VGG16</italic>: 0&#x0025;, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001) and blastocyst stage (<italic>ResNet-34</italic>: 21.0&#x0025; vs. <italic>VGG16:</italic> 17.0&#x0025;, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.006). Given that both models achieved high accuracy at these stages, the limited spatial overlap between explanations is unexpected and prompts reconsideration of whether saliency-based spatial attention serves as a reliable proxy for classification accuracy. Indeed, based on embryologists&#x2019; qualitative assessment of Grad-CAM outputs, the majority of VGG16 explanations at the morula stage were &#x201C;poor&#x201D;. Yet, the complete absence of overlap with LIME at this stage suggests that while Grad-CAM visualizations appeared uninformative to embryologists, LIME may have captured alternative, potentially relevant regions of interest. Even so, perfect classification accuracy at the blastocyst stage encapsulating mostly &#x201C;good&#x201D; Grad-CAM explanations, resulted in the lowest IoU for both models. Importantly, this highlights that XAI tools may capture different, potentially arbitrary, features of the same input (<xref ref-type="bibr" rid="B64">64</xref>, <xref ref-type="bibr" rid="B65">65</xref>), rather than converging on causal factors driving classification decisions. 
Furthermore, this raises concerns about whether XAI methodologies offer genuine mechanistic insight that is grounded in model reasoning, or simply provide a veneer of interpretability that satisfies human intuitions (<xref ref-type="bibr" rid="B63">63</xref>). Further, we speculate that within the range of developmental stages explored here, the morula and blastocyst present with the most distinct embryo morphologies, at relatively low numbers. This raises concerns regarding overfitting, which may have also contributed to their increased accuracy (<xref ref-type="bibr" rid="B66">66</xref>). It is also important to acknowledge that LIME is inherently stochastic, such that super pixel boundaries may vary across runs, thereby limiting the stability of IoU-based comparisons (<xref ref-type="bibr" rid="B67">67</xref>, <xref ref-type="bibr" rid="B68">68</xref>). Collectively, these findings illustrate that agreement between <italic>post-hoc</italic> XAI methods is not necessarily informed by accuracy but may prove useful in the enhancement of human-perceived interpretability.</p>
<p>Collectively, this work provides a systematic evaluation of accuracy, agreement and interpretability within embryo-stage classification, demonstrating that each component represents an independent dimension of model performance that may diverge within practical tasks. Notably, we highlight that accuracy may arise without biologically grounded interpretive focus, but also that observed interpretive focus lacks application-based and functionality-grounded evaluation (<xref ref-type="bibr" rid="B69">69</xref>). Granting clinical reliability remains largely based on inter-operator agreement, our findings highlight how agreement metrics can quantitatively and qualitatively vary between equally performing models (<xref ref-type="bibr" rid="B35">35</xref>). To our knowledge, this is the first investigation to evaluate embryo-level XAI outputs through direct embryologist review, thereby establishing domain expert focus within interpretability assessments, rather than model performance metrics alone. Notwithstanding, this study harbors several limitations. First, only single-frame images were extracted from retrospectively annotated TLT recordings for the purpose of classification, which may not fully capture the complexity and variability of dynamic embryo development. Additionally, the inclusion of embryos with the most complete morphokinetic annotations upon data collection, in comparison to arrested embryos for example, may have introduced selection bias. Moreover, subjective evaluation of Grad-CAM explanations by embryologists may have introduced confirmation bias towards the biological relevance of generated heatmaps. Similarly, although morphological classification is strongly associated with reproductive potential, it is not equivalent to the primary end-goal of live birth, which limits its clinical generalizability. 
As such, further research should prioritize the use of large, multicenter datasets to quantitatively and qualitatively interpret model behaviors that are tailored towards the intended domain. For example, as studies strive to create objective measures of explainability, the quantitative use of saliency map-based segmentation shows promise within user interpretability (<xref ref-type="bibr" rid="B35">35</xref>, <xref ref-type="bibr" rid="B60">60</xref>, <xref ref-type="bibr" rid="B70">70</xref>). However, our findings caution the calibration of trust within AI predictions based on the perceived quality of model explanations alone, as the observed dissociation between XAI interpretability and accuracy may, paradoxically, lead to further compromised decision-making than &#x201C;black box&#x201D; models. Overall, establishing robust standards and alignment between accuracy, agreement and interpretability will be essential for the responsible implementation of AI-tools within IVF laboratories.</p>
</sec>
</body>
<back>
<sec id="s5" sec-type="data-availability"><title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s6" sec-type="ethics-statement"><title>Ethics statement</title>
<p>The studies involving humans were approved by Regional Committee for Medical and Health Research Ethics &#x2013; South-East Norway (2018/477, REC South-East). The studies were conducted in accordance with the local legislation and institutional requirements. Written informed consent for participation was not required from the participants or the participants&#x0027; legal guardians/next of kin in accordance with the national legislation and institutional requirements.</p>
</sec>
<sec id="s7" sec-type="author-contributions"><title>Author contributions</title>
<p>RK: Writing &#x2013; original draft, Formal analysis, Writing &#x2013; review &#x0026; editing, Investigation, Methodology, Data curation, Visualization, Conceptualization, Validation. ED: Supervision, Writing &#x2013; original draft, Formal analysis, Investigation, Data curation, Conceptualization, Methodology, Visualization. AS: Methodology, Investigation, Software, Formal analysis, Conceptualization, Validation, Data curation, Writing &#x2013; review &#x0026; editing. DJ: Formal analysis, Validation, Data curation, Methodology, Investigation, Writing &#x2013; review &#x0026; editing. MR: Data curation, Validation, Methodology, Conceptualization, Supervision, Software, Writing &#x2013; review &#x0026; editing, Investigation, Resources, Funding acquisition. TH: Resources, Funding acquisition, Project administration, Supervision, Investigation, Writing &#x2013; review &#x0026; editing. MI: Formal analysis, Writing &#x2013; review &#x0026; editing, Investigation, Methodology. MS: Methodology, Supervision, Resources, Data curation, Conceptualization, Investigation, Writing &#x2013; review &#x0026; editing, Funding acquisition.</p>
</sec>
<ack><title>Acknowledgments</title>
<p>We thank the clinical embryologists at Volvat Spiren for their contribution towards data interpretation, embryo annotation and interpretability assessments. We gratefully acknowledge Faiga Alawad, currently Assistant Professor at Western Norway University of Applied Sciences, for the development of a custom Python script that enabled the spatial evaluation of XAI outputs.</p>
</ack>
<sec id="s9" sec-type="COI-statement"><title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s10" sec-type="ai-statement"><title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec id="s12" sec-type="disclaimer"><title>Publisher&#x0027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s11" sec-type="supplementary-material"><title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/frph.2026.1778326/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/frph.2026.1778326/full&#x0023;supplementary-material</ext-link></p>
<supplementary-material xlink:href="Table1.docx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document"/>
</sec>
<ref-list><title>References</title>
<ref id="B1"><label>1.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Coticchio</surname> <given-names>G</given-names></name> <name><surname>Ahlstr&#x00F6;m</surname> <given-names>A</given-names></name> <name><surname>Arroyo</surname> <given-names>G</given-names></name> <name><surname>Balaban</surname> <given-names>B</given-names></name> <name><surname>Campbell</surname> <given-names>A</given-names></name> <name><surname>De Los Santos</surname> <given-names>MJ</given-names></name><etal/></person-group> <article-title>The Istanbul consensus update: a revised ESHRE/ALPHA consensus on oocyte and embryo static and dynamic morphological assessment&#x2020;,&#x2021;</article-title>. <source>Hum Reprod</source>. (<year>2025</year>) <volume>40</volume>:<fpage>989</fpage>&#x2013;<lpage>1035</lpage>. <pub-id pub-id-type="doi">10.1093/humrep/deaf097.107</pub-id><pub-id pub-id-type="pmid">40288770</pub-id></mixed-citation></ref>
<ref id="B2"><label>2.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Majumdar</surname> <given-names>G</given-names></name> <name><surname>Majumdar</surname> <given-names>A</given-names></name> <name><surname>Verma</surname> <given-names>IC</given-names></name> <name><surname>Upadhyaya</surname> <given-names>KC</given-names></name></person-group>. <article-title>Relationship between morphology, euploidy and implantation potential of cleavage and blastocyst stage embryos</article-title>. <source>J Hum Reprod Sci</source>. (<year>2017</year>) <volume>10</volume>:<fpage>49</fpage>&#x2013;<lpage>57</lpage>. <pub-id pub-id-type="doi">10.4103/jhrs.JHRS_98_17</pub-id><pub-id pub-id-type="pmid">28479756</pub-id></mixed-citation></ref>
<ref id="B3"><label>3.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>M</given-names></name> <name><surname>Singh</surname> <given-names>B</given-names></name> <name><surname>Baker</surname> <given-names>VL</given-names></name></person-group>. <article-title>Association between embryo morphological quality and birth weight for singletons conceived via autologous fresh embryo transfer: an analysis using society for assisted reproductive technology clinical outcomes reporting system</article-title>. <source>Fertil Steril</source>. (<year>2022</year>) <volume>118</volume>:<fpage>715</fpage>&#x2013;<lpage>23</lpage>. <pub-id pub-id-type="doi">10.1016/j.fertnstert.2022.06.017</pub-id><pub-id pub-id-type="pmid">35934541</pub-id></mixed-citation></ref>
<ref id="B4"><label>4.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kirkegaard</surname> <given-names>K</given-names></name> <name><surname>Agerholm</surname> <given-names>IE</given-names></name> <name><surname>Ingerslev</surname> <given-names>HJ</given-names></name></person-group>. <article-title>Time-lapse monitoring as a tool for clinical embryo assessment</article-title>. <source>Hum Reprod</source>. (<year>2012</year>) <volume>27</volume>:<fpage>1277</fpage>&#x2013;<lpage>85</lpage>. <pub-id pub-id-type="doi">10.1093/humrep/des079</pub-id><pub-id pub-id-type="pmid">22419744</pub-id></mixed-citation></ref>
<ref id="B5"><label>5.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lee</surname> <given-names>T</given-names></name> <name><surname>Natalwala</surname> <given-names>J</given-names></name> <name><surname>Chapple</surname> <given-names>V</given-names></name> <name><surname>Liu</surname> <given-names>Y</given-names></name></person-group>. <article-title>A brief history of artificial intelligence embryo selection: from black-box to glass-box</article-title>. <source>Hum Reprod</source>. (<year>2024</year>) <volume>39</volume>:<fpage>285</fpage>&#x2013;<lpage>92</lpage>. <pub-id pub-id-type="doi">10.1093/humrep/dead254</pub-id><pub-id pub-id-type="pmid">38061074</pub-id></mixed-citation></ref>
<ref id="B6"><label>6.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dimitriadis</surname> <given-names>I</given-names></name> <name><surname>Zaninovic</surname> <given-names>N</given-names></name> <name><surname>Badiola</surname> <given-names>AC</given-names></name> <name><surname>Bormann</surname> <given-names>CL</given-names></name></person-group>. <article-title>Artificial intelligence in the embryology laboratory: a review</article-title>. <source>Reprod Biomed Online</source>. (<year>2022</year>) <volume>44</volume>:<fpage>435</fpage>&#x2013;<lpage>48</lpage>. <pub-id pub-id-type="doi">10.1016/j.rbmo.2021.11.003</pub-id><pub-id pub-id-type="pmid">35027326</pub-id></mixed-citation></ref>
<ref id="B7"><label>7.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Veiga</surname> <given-names>E</given-names></name> <name><surname>Olmedo</surname> <given-names>C</given-names></name> <name><surname>S&#x00E1;nchez</surname> <given-names>L</given-names></name> <name><surname>Fern&#x00E1;ndez</surname> <given-names>M</given-names></name> <name><surname>Mauri</surname> <given-names>A</given-names></name> <name><surname>Ferrer</surname> <given-names>E</given-names></name><etal/></person-group> <article-title>Recalculating the staff required to run a modern assisted reproductive technology laboratory</article-title>. <source>Hum Reprod</source>. (<year>2022</year>) <volume>37</volume>:<fpage>1774</fpage>&#x2013;<lpage>85</lpage>. <pub-id pub-id-type="doi">10.1093/humrep/deac121</pub-id><pub-id pub-id-type="pmid">35652237</pub-id></mixed-citation></ref>
<ref id="B8"><label>8.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cimadomo</surname> <given-names>D</given-names></name> <name><surname>Sosa Fernandez</surname> <given-names>L</given-names></name> <name><surname>Soscia</surname> <given-names>D</given-names></name> <name><surname>Fabozzi</surname> <given-names>G</given-names></name> <name><surname>Benini</surname> <given-names>F</given-names></name> <name><surname>Cesana</surname> <given-names>A</given-names></name><etal/></person-group> <article-title>Inter-centre reliability in embryo grading across several IVF clinics is limited: implications for embryo selection</article-title>. <source>Reprod Biomed Online</source>. (<year>2022</year>) <volume>44</volume>:<fpage>39</fpage>&#x2013;<lpage>48</lpage>. <pub-id pub-id-type="doi">10.1016/j.rbmo.2021.09.022</pub-id><pub-id pub-id-type="pmid">34819249</pub-id></mixed-citation></ref>
<ref id="B9"><label>9.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lin</surname> <given-names>P-Y</given-names></name> <name><surname>Lee</surname> <given-names>C-I</given-names></name> <name><surname>Chen</surname> <given-names>H-H</given-names></name> <name><surname>Huang</surname> <given-names>C-C</given-names></name> <name><surname>Chen</surname> <given-names>M-J</given-names></name> <name><surname>Yu</surname> <given-names>T-N</given-names></name><etal/></person-group> <article-title>Factors influencing oocyte yield and embryo quality in donor IVF cycles: a retrospective cohort study</article-title>. <source>Front Endocrinol (Lausanne)</source>. (<year>2025</year>) <volume>16</volume>:<fpage>1649523</fpage>. <pub-id pub-id-type="doi">10.3389/fendo.2025.1649523</pub-id><pub-id pub-id-type="pmid">41244056</pub-id></mixed-citation></ref>
<ref id="B10"><label>10.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nemerovsky</surname> <given-names>L</given-names></name> <name><surname>Ghetler</surname> <given-names>Y</given-names></name> <name><surname>Bakhshi</surname> <given-names>DI</given-names></name> <name><surname>Rom</surname> <given-names>T</given-names></name> <name><surname>Itskovich</surname> <given-names>A</given-names></name> <name><surname>Yeres</surname> <given-names>N</given-names></name><etal/></person-group> <article-title>Short insemination during conventional <italic>in vitro</italic> fertilization increases embryo quality</article-title>. <source>Andrology</source>. (<year>2025</year>) <volume>13</volume>:<fpage>1402</fpage>&#x2013;<lpage>7</lpage>. <pub-id pub-id-type="doi">10.1111/andr.13781</pub-id><pub-id pub-id-type="pmid">39415620</pub-id></mixed-citation></ref>
<ref id="B11"><label>11.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sundvall</surname> <given-names>L</given-names></name> <name><surname>Ingerslev</surname> <given-names>HJ</given-names></name> <name><surname>Breth Knudsen</surname> <given-names>U</given-names></name> <name><surname>Kirkegaard</surname> <given-names>K</given-names></name></person-group>. <article-title>Inter- and intra-observer variability of time-lapse annotations</article-title>. <source>Hum Reprod</source>. (<year>2013</year>) <volume>28</volume>:<fpage>3215</fpage>&#x2013;<lpage>21</lpage>. <pub-id pub-id-type="doi">10.1093/humrep/det366</pub-id><pub-id pub-id-type="pmid">24070998</pub-id></mixed-citation></ref>
<ref id="B12"><label>12.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Salih</surname> <given-names>M</given-names></name> <name><surname>Austin</surname> <given-names>C</given-names></name> <name><surname>Mantravadi</surname> <given-names>K</given-names></name> <name><surname>Seow</surname> <given-names>E</given-names></name> <name><surname>Jitanantawittaya</surname> <given-names>S</given-names></name> <name><surname>Reddy</surname> <given-names>S</given-names></name><etal/></person-group> <article-title>Deep learning classification integrating embryo images with associated clinical information from ART cycles</article-title>. <source>Sci Rep</source>. (<year>2025</year>) <volume>15</volume>:<fpage>17585</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-025-02076-x</pub-id><pub-id pub-id-type="pmid">40399312</pub-id></mixed-citation></ref>
<ref id="B13"><label>13.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Alsaad</surname> <given-names>R</given-names></name> <name><surname>Abusarhan</surname> <given-names>L</given-names></name> <name><surname>Odeh</surname> <given-names>N</given-names></name> <name><surname>Abd-Alrazaq</surname> <given-names>A</given-names></name> <name><surname>Choucair</surname> <given-names>F</given-names></name> <name><surname>Zegour</surname> <given-names>R</given-names></name><etal/></person-group> <article-title>Deep learning applications for human embryo assessment using time-lapse imaging: scoping review</article-title>. <source>Front Reprod Health</source>. (<year>2025</year>) <volume>7</volume>:<fpage>1549642</fpage>. <pub-id pub-id-type="doi">10.3389/frph.2025.1549642</pub-id><pub-id pub-id-type="pmid">40264925</pub-id></mixed-citation></ref>
<ref id="B14"><label>14.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Moysis</surname> <given-names>L</given-names></name> <name><surname>Iliadis</surname> <given-names>LA</given-names></name> <name><surname>Vergos</surname> <given-names>G</given-names></name> <name><surname>Sotiroudis</surname> <given-names>SP</given-names></name> <name><surname>Boursianis</surname> <given-names>AD</given-names></name> <name><surname>Papatheodorou</surname> <given-names>A</given-names></name><etal/></person-group> <article-title>Artificial intelligence-empowered embryo selection for IVF applications: a methodological review</article-title>. <source>Mach Learn Knowl Extract</source>. (<year>2025</year>) <volume>7</volume>:<fpage>56</fpage>. <pub-id pub-id-type="doi">10.3390/make7020056</pub-id></mixed-citation></ref>
<ref id="B15"><label>15.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Illingworth</surname> <given-names>PJ</given-names></name> <name><surname>Venetis</surname> <given-names>C</given-names></name> <name><surname>Gardner</surname> <given-names>DK</given-names></name> <name><surname>Nelson</surname> <given-names>SM</given-names></name> <name><surname>Berntsen</surname> <given-names>J</given-names></name> <name><surname>Larman</surname> <given-names>MG</given-names></name><etal/></person-group> <article-title>Deep learning versus manual morphology-based embryo selection in IVF: a randomized, double-blind noninferiority trial</article-title>. <source>Nat Med</source>. (<year>2024</year>) <volume>30</volume>:<fpage>3114</fpage>&#x2013;<lpage>20</lpage>. <pub-id pub-id-type="doi">10.1038/s41591-024-03166-5</pub-id><pub-id pub-id-type="pmid">39122964</pub-id></mixed-citation></ref>
<ref id="B16"><label>16.</label><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Rad</surname> <given-names>RM</given-names></name> <name><surname>Saeedi</surname> <given-names>P</given-names></name> <name><surname>Au</surname> <given-names>J</given-names></name> <name><surname>Havelock</surname> <given-names>J</given-names></name></person-group>. <article-title>Blastomere cell counting and centroid localization in microscopic images of human embryo</article-title>. <conf-name>2018 IEEE 20th International Workshop on Multimedia Signal Processing (MMSP)</conf-name>; <conf-date>29-31 Aug. 2018</conf-date> (<year>2018</year>). p. <fpage>1</fpage>&#x2013;<lpage>6</lpage></mixed-citation></ref>
<ref id="B17"><label>17.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Raudonis</surname> <given-names>V</given-names></name> <name><surname>Paulauskaite-Taraseviciene</surname> <given-names>A</given-names></name> <name><surname>Sutiene</surname> <given-names>K</given-names></name> <name><surname>Jonaitis</surname> <given-names>D</given-names></name></person-group>. <article-title>Towards the automation of early-stage human embryo development detection</article-title>. <source>Biomed Eng Online</source>. (<year>2019</year>) <volume>18</volume>:<fpage>120</fpage>. <pub-id pub-id-type="doi">10.1186/s12938-019-0738-y</pub-id><pub-id pub-id-type="pmid">31830988</pub-id></mixed-citation></ref>
<ref id="B18"><label>18.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dirvanauskas</surname> <given-names>D</given-names></name> <name><surname>Maskeliunas</surname> <given-names>R</given-names></name> <name><surname>Raudonis</surname> <given-names>V</given-names></name> <name><surname>Damasevicius</surname> <given-names>R</given-names></name></person-group>. <article-title>Embryo development stage prediction algorithm for automated time lapse incubators</article-title>. <source>Comput Methods Programs Biomed</source>. (<year>2019</year>) <volume>177</volume>:<fpage>161</fpage>&#x2013;<lpage>74</lpage>. <pub-id pub-id-type="doi">10.1016/j.cmpb.2019.05.027</pub-id><pub-id pub-id-type="pmid">31319944</pub-id></mixed-citation></ref>
<ref id="B19"><label>19.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rotem</surname> <given-names>O</given-names></name> <name><surname>Schwartz</surname> <given-names>T</given-names></name> <name><surname>Maor</surname> <given-names>R</given-names></name> <name><surname>Tauber</surname> <given-names>Y</given-names></name> <name><surname>Shapiro</surname> <given-names>MT</given-names></name> <name><surname>Meseguer</surname> <given-names>M</given-names></name><etal/></person-group> <article-title>Visual interpretability of image-based classification models by generative latent space disentanglement applied to <italic>in vitro</italic> fertilization</article-title>. <source>Nat Commun</source>. (<year>2024</year>) <volume>15</volume>:<fpage>7390</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-024-51136-9</pub-id><pub-id pub-id-type="pmid">39191720</pub-id></mixed-citation></ref>
<ref id="B20"><label>20.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Barhoun</surname> <given-names>A</given-names></name> <name><surname>Balafar</surname> <given-names>MA</given-names></name> <name><surname>Golzari Oskouei</surname> <given-names>A</given-names></name> <name><surname>Sadeghi</surname> <given-names>L</given-names></name></person-group>. <article-title>Human embryo stage classification using an enhanced R(2&#x002B;1)D model and dynamic programming with optimized datasets</article-title>. <source>Biomed Signal Process Control</source>. (<year>2025</year>) <volume>107</volume>:<fpage>107841</fpage>. <pub-id pub-id-type="doi">10.1016/j.bspc.2025.107841</pub-id></mixed-citation></ref>
<ref id="B21"><label>21.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zaninovic</surname> <given-names>N</given-names></name> <name><surname>Sierra</surname> <given-names>JT</given-names></name> <name><surname>Malmsten</surname> <given-names>JE</given-names></name> <name><surname>Rosenwaks</surname> <given-names>Z</given-names></name></person-group>. <article-title>Embryo ranking agreement between embryologists and artificial intelligence algorithms</article-title>. <source>F S Sci</source>. (<year>2024</year>) <volume>5</volume>:<fpage>50</fpage>&#x2013;<lpage>7</lpage>. <pub-id pub-id-type="doi">10.1016/j.xfss.2023.10.002</pub-id><pub-id pub-id-type="pmid">37820865</pub-id></mixed-citation></ref>
<ref id="B22"><label>22.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Loewke</surname> <given-names>K</given-names></name> <name><surname>Cho</surname> <given-names>JH</given-names></name> <name><surname>Brumar</surname> <given-names>CD</given-names></name> <name><surname>Maeder-York</surname> <given-names>P</given-names></name> <name><surname>Barash</surname> <given-names>O</given-names></name> <name><surname>Malmsten</surname> <given-names>JE</given-names></name><etal/></person-group> <article-title>Characterization of an artificial intelligence model for ranking static images of blastocyst stage embryos</article-title>. <source>Fertil Steril</source>. (<year>2022</year>) <volume>117</volume>:<fpage>528</fpage>&#x2013;<lpage>35</lpage>. <pub-id pub-id-type="doi">10.1016/j.fertnstert.2021.11.022</pub-id><pub-id pub-id-type="pmid">34998577</pub-id></mixed-citation></ref>
<ref id="B23"><label>23.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Canat</surname> <given-names>G</given-names></name> <name><surname>Duval</surname> <given-names>A</given-names></name> <name><surname>Gidel-Dissler</surname> <given-names>N</given-names></name> <name><surname>Boussommier-Calleja</surname> <given-names>A</given-names></name></person-group>. <article-title>A novel deep learning approach to identify embryo morphokinetics in multiple time lapse systems</article-title>. <source>Sci Rep</source>. (<year>2024</year>) <volume>14</volume>:<fpage>29016</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-024-80565-1</pub-id><pub-id pub-id-type="pmid">39578525</pub-id></mixed-citation></ref>
<ref id="B24"><label>24.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Vermilyea</surname> <given-names>M</given-names></name> <name><surname>Hall</surname> <given-names>JMM</given-names></name> <name><surname>Diakiw</surname> <given-names>SM</given-names></name> <name><surname>Johnston</surname> <given-names>A</given-names></name> <name><surname>Nguyen</surname> <given-names>T</given-names></name> <name><surname>Perugini</surname> <given-names>D</given-names></name><etal/></person-group> <article-title>Development of an artificial intelligence-based assessment model for prediction of embryo viability using static images captured by optical light microscopy during IVF</article-title>. <source>Hum Reprod</source>. (<year>2020</year>) <volume>35</volume>:<fpage>770</fpage>&#x2013;<lpage>84</lpage>. <pub-id pub-id-type="doi">10.1093/humrep/deaa013</pub-id><pub-id pub-id-type="pmid">32240301</pub-id></mixed-citation></ref>
<ref id="B25"><label>25.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bormann</surname> <given-names>CL</given-names></name> <name><surname>Kanakasabapathy</surname> <given-names>MK</given-names></name> <name><surname>Thirumalaraju</surname> <given-names>P</given-names></name> <name><surname>Gupta</surname> <given-names>R</given-names></name> <name><surname>Pooniwala</surname> <given-names>R</given-names></name> <name><surname>Kandula</surname> <given-names>H</given-names></name><etal/></person-group> <article-title>Performance of a deep learning based neural network in the selection of human blastocysts for implantation</article-title>. <source>Elife</source>. (<year>2020</year>) <volume>9</volume>:<fpage>e55301</fpage>. <pub-id pub-id-type="doi">10.7554/eLife.55301</pub-id></mixed-citation></ref>
<ref id="B26"><label>26.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Fitz</surname> <given-names>VW</given-names></name> <name><surname>Kanakasabapathy</surname> <given-names>MK</given-names></name> <name><surname>Thirumalaraju</surname> <given-names>P</given-names></name> <name><surname>Kandula</surname> <given-names>H</given-names></name> <name><surname>Ramirez</surname> <given-names>LB</given-names></name> <name><surname>Boehnlein</surname> <given-names>L</given-names></name><etal/></person-group> <article-title>Should there be an &#x201C;AI&#x201D; in TEAM? Embryologists selection of high implantation potential embryos improves with the aid of an artificial intelligence algorithm</article-title>. <source>J Assist Reprod Genet</source>. (<year>2021</year>) <volume>38</volume>:<fpage>2663</fpage>&#x2013;<lpage>70</lpage>. <pub-id pub-id-type="doi">10.1007/s10815-021-02318-7</pub-id><pub-id pub-id-type="pmid">34535847</pub-id></mixed-citation></ref>
<ref id="B27"><label>27.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Fukunaga</surname> <given-names>N</given-names></name> <name><surname>Sanami</surname> <given-names>S</given-names></name> <name><surname>Kitasaka</surname> <given-names>H</given-names></name> <name><surname>Tsuzuki</surname> <given-names>Y</given-names></name> <name><surname>Watanabe</surname> <given-names>H</given-names></name> <name><surname>Kida</surname> <given-names>Y</given-names></name><etal/></person-group> <article-title>Development of an automated two pronuclei detection system on time-lapse embryo images using deep learning techniques</article-title>. <source>Reprod Med Biol</source>. (<year>2020</year>) <volume>19</volume>:<fpage>286</fpage>&#x2013;<lpage>94</lpage>. <pub-id pub-id-type="doi">10.1002/rmb2.12331</pub-id><pub-id pub-id-type="pmid">32684828</pub-id></mixed-citation></ref>
<ref id="B28"><label>28.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kragh</surname> <given-names>MF</given-names></name> <name><surname>Rimestad</surname> <given-names>J</given-names></name> <name><surname>Berntsen</surname> <given-names>J</given-names></name> <name><surname>Karstoft</surname> <given-names>H</given-names></name></person-group>. <article-title>Automatic grading of human blastocysts from time-lapse imaging</article-title>. <source>Comput Biol Med</source>. (<year>2019</year>) <volume>115</volume>:<fpage>103494</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2019.103494</pub-id><pub-id pub-id-type="pmid">31630027</pub-id></mixed-citation></ref>
<ref id="B29"><label>29.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Harir</surname> <given-names>Y</given-names></name> <name><surname>Halevy Amiran</surname> <given-names>R</given-names></name> <name><surname>Or</surname> <given-names>Y</given-names></name></person-group>. <article-title>Embryologist versus AI in embryo selection: agreement and impact on pregnancy rates</article-title>. <source>In Vitro Cell Dev Biol Anim</source>. (<year>2025</year>) <volume>61</volume>:<fpage>1107</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1007/s11626-025-01099-y</pub-id><pub-id pub-id-type="pmid">40751026</pub-id></mixed-citation></ref>
<ref id="B30"><label>30.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Duval</surname> <given-names>A</given-names></name> <name><surname>Nogueira</surname> <given-names>D</given-names></name> <name><surname>Dissler</surname> <given-names>N</given-names></name> <name><surname>Maskani Filali</surname> <given-names>M</given-names></name> <name><surname>Delestro Matos</surname> <given-names>F</given-names></name> <name><surname>Chansel-Debordeaux</surname> <given-names>L</given-names></name><etal/></person-group> <article-title>A hybrid artificial intelligence model leverages multi-centric clinical data to improve fetal heart rate pregnancy prediction across time-lapse systems</article-title>. <source>Hum Reprod</source>. (<year>2023</year>) <volume>38</volume>:<fpage>596</fpage>&#x2013;<lpage>608</lpage>. <pub-id pub-id-type="doi">10.1093/humrep/dead023</pub-id><pub-id pub-id-type="pmid">36763673</pub-id></mixed-citation></ref>
<ref id="B31"><label>31.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kragh</surname> <given-names>MF</given-names></name> <name><surname>Karstoft</surname> <given-names>H</given-names></name></person-group>. <article-title>Embryo selection with artificial intelligence: how to evaluate and compare methods?</article-title> <source>J Assist Reprod Genet</source>. (<year>2021</year>) <volume>38</volume>:<fpage>1675</fpage>&#x2013;<lpage>89</lpage>. <pub-id pub-id-type="doi">10.1007/s10815-021-02254-6</pub-id><pub-id pub-id-type="pmid">34173914</pub-id></mixed-citation></ref>
<ref id="B32"><label>32.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hicks</surname> <given-names>SA</given-names></name> <name><surname>Str&#x00FC;mke</surname> <given-names>I</given-names></name> <name><surname>Thambawita</surname> <given-names>V</given-names></name> <name><surname>Hammou</surname> <given-names>M</given-names></name> <name><surname>Riegler</surname> <given-names>MA</given-names></name> <name><surname>Halvorsen</surname> <given-names>P</given-names></name><etal/></person-group> <article-title>On evaluation metrics for medical applications of artificial intelligence</article-title>. <source>Sci Rep</source>. (<year>2022</year>) <volume>12</volume>:<fpage>5979</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-022-09954-8</pub-id><pub-id pub-id-type="pmid">35395867</pub-id></mixed-citation></ref>
<ref id="B33"><label>33.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>M&#x00FC;ller</surname> <given-names>D</given-names></name> <name><surname>Soto-Rey</surname> <given-names>I</given-names></name> <name><surname>Kramer</surname> <given-names>F</given-names></name></person-group>. <article-title>Towards a guideline for evaluation metrics in medical image segmentation</article-title>. <source>BMC Res Notes</source>. (<year>2022</year>) <volume>15</volume>:<fpage>210</fpage>. <pub-id pub-id-type="doi">10.1186/s13104-022-06096-y</pub-id></mixed-citation></ref>
<ref id="B34"><label>34.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Riegler</surname> <given-names>MA</given-names></name> <name><surname>Stensen</surname> <given-names>MH</given-names></name> <name><surname>Witczak</surname> <given-names>O</given-names></name> <name><surname>Andersen</surname> <given-names>JM</given-names></name> <name><surname>Hicks</surname> <given-names>SA</given-names></name> <name><surname>Hammer</surname> <given-names>HL</given-names></name><etal/></person-group> <article-title>Artificial intelligence in the fertility clinic: status, pitfalls and possibilities</article-title>. <source>Hum Reprod</source>. (<year>2021</year>) <volume>36</volume>:<fpage>2429</fpage>&#x2013;<lpage>42</lpage>. <pub-id pub-id-type="doi">10.1093/humrep/deab168</pub-id><pub-id pub-id-type="pmid">34324672</pub-id></mixed-citation></ref>
<ref id="B35"><label>35.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Thirumalaraju</surname> <given-names>P</given-names></name> <name><surname>Kanakasabapathy</surname> <given-names>MK</given-names></name> <name><surname>Kandula</surname> <given-names>H</given-names></name> <name><surname>Kandula</surname> <given-names>T</given-names></name> <name><surname>Reddy Katkuri</surname> <given-names>AV</given-names></name> <name><surname>Cipriano</surname> <given-names>C</given-names></name><etal/></person-group> <article-title>Stability and reliability of artificial intelligence models in embryo selection for <italic>in vitro</italic> fertilization</article-title>. <source>Fertil Steril</source>. (<year>2026</year>) <volume>125</volume>(<issue>2</issue>):<fpage>277</fpage>&#x2013;<lpage>86</lpage>. <pub-id pub-id-type="doi">10.1016/j.fertnstert.2025.08.021</pub-id><pub-id pub-id-type="pmid">40876725</pub-id></mixed-citation></ref>
<ref id="B36"><label>36.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bashir</surname> <given-names>Z</given-names></name> <name><surname>Lin</surname> <given-names>M</given-names></name> <name><surname>Feragen</surname> <given-names>A</given-names></name> <name><surname>Mikolaj</surname> <given-names>K</given-names></name> <name><surname>Taks&#x00F8;e-Vester</surname> <given-names>C</given-names></name> <name><surname>Christensen</surname> <given-names>AN</given-names></name><etal/></person-group> <article-title>Clinical validation of explainable AI for fetal growth scans through multi-level, cross-institutional prospective end-user evaluation</article-title>. <source>Sci Rep</source>. (<year>2025</year>) <volume>15</volume>:<fpage>2074</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-025-86536-4</pub-id><pub-id pub-id-type="pmid">39820804</pub-id></mixed-citation></ref>
<ref id="B37"><label>37.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kim</surname> <given-names>HM</given-names></name> <name><surname>Kang</surname> <given-names>H</given-names></name> <name><surname>Lee</surname> <given-names>C</given-names></name> <name><surname>Park</surname> <given-names>JH</given-names></name> <name><surname>Chung</surname> <given-names>MK</given-names></name> <name><surname>Kim</surname> <given-names>M</given-names></name><etal/></person-group> <article-title>Evaluation of the clinical efficacy and trust in AI-assisted embryo ranking: survey-based prospective study</article-title>. <source>J Med Internet Res</source>. (<year>2024</year>) <volume>26</volume>:<fpage>e52637</fpage>. <pub-id pub-id-type="doi">10.2196/52637</pub-id><pub-id pub-id-type="pmid">38830209</pub-id></mixed-citation></ref>
<ref id="B38"><label>38.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Curchoe</surname> <given-names>CL</given-names></name> <name><surname>Malmsten</surname> <given-names>J</given-names></name> <name><surname>Bormann</surname> <given-names>C</given-names></name> <name><surname>Shafiee</surname> <given-names>H</given-names></name> <name><surname>Flores-Saiffe Farias</surname> <given-names>A</given-names></name> <name><surname>Mendizabal</surname> <given-names>G</given-names></name><etal/></person-group> <article-title>Predictive modeling in reproductive medicine: where will the future of artificial intelligence research take us?</article-title> <source>Fertil Steril</source>. (<year>2020</year>) <volume>114</volume>:<fpage>934</fpage>&#x2013;<lpage>40</lpage>. <pub-id pub-id-type="doi">10.1016/j.fertnstert.2020.10.040</pub-id><pub-id pub-id-type="pmid">33160516</pub-id></mixed-citation></ref>
<ref id="B39"><label>39.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Van Royen</surname> <given-names>FS</given-names></name> <name><surname>Weerts</surname> <given-names>HJP</given-names></name> <name><surname>De Hond</surname> <given-names>AAH</given-names></name> <name><surname>Geersing</surname> <given-names>G-J</given-names></name> <name><surname>Rutten</surname> <given-names>FH</given-names></name> <name><surname>Moons</surname> <given-names>KGM</given-names></name><etal/></person-group> <article-title>In humble defense of unexplainable black box prediction models in healthcare</article-title>. <source>J Clin Epidemiol</source>. (<year>2026</year>) <volume>189</volume>:<fpage>112013</fpage>. <pub-id pub-id-type="doi">10.1016/j.jclinepi.2025.112013</pub-id><pub-id pub-id-type="pmid">41077324</pub-id></mixed-citation></ref>
<ref id="B40"><label>40.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hassija</surname> <given-names>V</given-names></name> <name><surname>Chamola</surname> <given-names>V</given-names></name> <name><surname>Mahapatra</surname> <given-names>A</given-names></name> <name><surname>Singal</surname> <given-names>A</given-names></name> <name><surname>Goel</surname> <given-names>D</given-names></name> <name><surname>Huang</surname> <given-names>K</given-names></name><etal/></person-group> <article-title>Interpreting black-box models: a review on explainable artificial intelligence</article-title>. <source>Cognit Comput</source>. (<year>2024</year>) <volume>16</volume>:<fpage>45</fpage>&#x2013;<lpage>74</lpage>. <pub-id pub-id-type="doi">10.1007/s12559-023-10179-8</pub-id></mixed-citation></ref>
<ref id="B41"><label>41.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jung</surname> <given-names>J</given-names></name> <name><surname>Lee</surname> <given-names>H</given-names></name> <name><surname>Jung</surname> <given-names>H</given-names></name> <name><surname>Kim</surname> <given-names>H</given-names></name></person-group>. <article-title>Essential properties and explanation effectiveness of explainable artificial intelligence in healthcare: a systematic review</article-title>. <source>Heliyon</source>. (<year>2023</year>) <volume>9</volume>:<fpage>e16110</fpage>. <pub-id pub-id-type="doi">10.1016/j.heliyon.2023.e16110</pub-id><pub-id pub-id-type="pmid">37234618</pub-id></mixed-citation></ref>
<ref id="B42"><label>42.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Pahud De Mortanges</surname> <given-names>A</given-names></name> <name><surname>Luo</surname> <given-names>H</given-names></name> <name><surname>Shu</surname> <given-names>SZ</given-names></name> <name><surname>Kamath</surname> <given-names>A</given-names></name> <name><surname>Suter</surname> <given-names>Y</given-names></name> <name><surname>Shelan</surname> <given-names>M</given-names></name><etal/></person-group> <article-title>Orchestrating explainable artificial intelligence for multimodal and longitudinal data in medical imaging</article-title>. <source>NPJ Digit Med</source>. (<year>2024</year>) <volume>7</volume>:<fpage>195</fpage>. <pub-id pub-id-type="doi">10.1038/s41746-024-01190-w</pub-id><pub-id pub-id-type="pmid">39039248</pub-id></mixed-citation></ref>
<ref id="B43"><label>43.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Barredo Arrieta</surname> <given-names>A</given-names></name> <name><surname>D&#x00ED;az-Rodr&#x00ED;guez</surname> <given-names>N</given-names></name> <name><surname>Del Ser</surname> <given-names>J</given-names></name> <name><surname>Bennetot</surname> <given-names>A</given-names></name> <name><surname>Tabik</surname> <given-names>S</given-names></name> <name><surname>Barbado</surname> <given-names>A</given-names></name><etal/></person-group> <article-title>Explainable artificial intelligence (XAI): concepts, taxonomies, opportunities and challenges toward responsible AI</article-title>. <source>Information Fusion</source>. (<year>2020</year>) <volume>58</volume>:<fpage>82</fpage>&#x2013;<lpage>115</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2019.12.012</pub-id></mixed-citation></ref>
<ref id="B44"><label>44.</label><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Urcelay</surname> <given-names>L</given-names></name> <name><surname>Hinjos</surname> <given-names>D</given-names></name> <name><surname>Martin-Torres</surname> <given-names>PA</given-names></name> <name><surname>Gonzalez</surname> <given-names>M</given-names></name> <name><surname>Mendez</surname> <given-names>M</given-names></name> <name><surname>Civico</surname> <given-names>S</given-names></name> <name><surname>Alvarez-Napagao</surname> <given-names>S</given-names></name><etal/></person-group> <comment>Exploring the Role of Explainability in AI-Assisted Embryo Selection. <italic>ArXiv</italic></comment> (<year>2023</year>).</mixed-citation></ref>
<ref id="B45"><label>45.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Guidotti</surname> <given-names>R</given-names></name> <name><surname>Monreale</surname> <given-names>A</given-names></name> <name><surname>Ruggieri</surname> <given-names>S</given-names></name> <name><surname>Turini</surname> <given-names>F</given-names></name> <name><surname>Giannotti</surname> <given-names>F</given-names></name> <name><surname>Pedreschi</surname> <given-names>D</given-names></name></person-group>. <article-title>A survey of methods for explaining black box models</article-title>. <source>ACM Comput Surv</source>. (<year>2018</year>) <volume>51</volume>:<fpage>93</fpage>. <pub-id pub-id-type="doi">10.1145/3236009</pub-id></mixed-citation></ref>
<ref id="B46"><label>46.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Selvaraju</surname> <given-names>RR</given-names></name> <name><surname>Cogswell</surname> <given-names>M</given-names></name> <name><surname>Das</surname> <given-names>A</given-names></name> <name><surname>Vedantam</surname> <given-names>R</given-names></name> <name><surname>Parikh</surname> <given-names>D</given-names></name> <name><surname>Batra</surname> <given-names>D</given-names></name></person-group>. <article-title>Grad-CAM: visual explanations from deep networks via gradient-based localization</article-title>. <source>Int J Comput Vis</source>. (<year>2020</year>) <volume>128</volume>:<fpage>336</fpage>&#x2013;<lpage>59</lpage>. <pub-id pub-id-type="doi">10.1007/s11263-019-01228-7</pub-id></mixed-citation></ref>
<ref id="B47"><label>47.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Pay&#x00E1;</surname> <given-names>E</given-names></name> <name><surname>Bori</surname> <given-names>L</given-names></name> <name><surname>Colomer</surname> <given-names>A</given-names></name> <name><surname>Meseguer</surname> <given-names>M</given-names></name> <name><surname>Naranjo</surname> <given-names>V</given-names></name></person-group>. <article-title>Automatic characterization of human embryos at day 4 post-insemination from time-lapse imaging using supervised contrastive learning and inductive transfer learning techniques</article-title>. <source>Comput Methods Programs Biomed</source>. (<year>2022</year>) <volume>221</volume>:<fpage>106895</fpage>. <pub-id pub-id-type="doi">10.1016/j.cmpb.2022.106895</pub-id></mixed-citation></ref>
<ref id="B48"><label>48.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Enatsu</surname> <given-names>N</given-names></name> <name><surname>Miyatsuka</surname> <given-names>I</given-names></name> <name><surname>An</surname> <given-names>LM</given-names></name> <name><surname>Inubushi</surname> <given-names>M</given-names></name> <name><surname>Enatsu</surname> <given-names>K</given-names></name> <name><surname>Otsuki</surname> <given-names>J</given-names></name><etal/></person-group> <article-title>A novel system based on artificial intelligence for predicting blastocyst viability and visualizing the explanation</article-title>. <source>Reprod Med Biol</source>. (<year>2022</year>) <volume>21</volume>:<fpage>e12443</fpage>. <pub-id pub-id-type="doi">10.1002/rmb2.12443</pub-id><pub-id pub-id-type="pmid">35386375</pub-id></mixed-citation></ref>
<ref id="B49"><label>49.</label><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Sharma</surname> <given-names>A</given-names></name> <name><surname>Stensen</surname> <given-names>MH</given-names></name> <name><surname>Delbarre</surname> <given-names>E</given-names></name> <name><surname>Haugen</surname> <given-names>TB</given-names></name> <name><surname>Hammer</surname> <given-names>HL</given-names></name></person-group>. <article-title>Explainable artificial intelligence for human embryo cell cleavage stages analysis</article-title>. In: <conf-name>Proceedings of the 3rd ACM Workshop on Intelligent Cross-Data Analysis and Retrieval</conf-name>. <publisher-loc>Newark, NJ, USA</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name> (<year>2022</year>).</mixed-citation></ref>
<ref id="B50"><label>50.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Auriemma Citarella</surname> <given-names>A</given-names></name> <name><surname>Battistoni</surname> <given-names>P</given-names></name> <name><surname>Coscarelli</surname> <given-names>C</given-names></name> <name><surname>De Marco</surname> <given-names>F</given-names></name> <name><surname>Di Biasi</surname> <given-names>L</given-names></name> <name><surname>Wang</surname> <given-names>M</given-names></name></person-group>. <article-title>Embryovision AI: an explainable deep learning framework for enhanced blastocyst selection in assisted reproductive technologies</article-title>. <source>Image Vis Comput</source>. (<year>2026</year>) <volume>165</volume>:<fpage>105795</fpage>. <pub-id pub-id-type="doi">10.1016/j.imavis.2025.105795</pub-id></mixed-citation></ref>
<ref id="B51"><label>51.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Stor&#x00E5;s</surname> <given-names>AM</given-names></name> <name><surname>Dreyer</surname> <given-names>M</given-names></name> <name><surname>Pahde</surname> <given-names>F</given-names></name> <name><surname>Lapuschkin</surname> <given-names>S</given-names></name> <name><surname>Samek</surname> <given-names>W</given-names></name> <name><surname>Halvorsen</surname> <given-names>P</given-names></name><etal/></person-group> <article-title>Exploring the clinical value of concept-based AI explanations in gastrointestinal disease detection</article-title>. <source>Sci Rep</source>. (<year>2025</year>) <volume>15</volume>:<fpage>28860</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-025-14408-y</pub-id></mixed-citation></ref>
<ref id="B52"><label>52.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>B</given-names></name> <name><surname>Chen</surname> <given-names>H</given-names></name> <name><surname>Duan</surname> <given-names>H</given-names></name></person-group>. <article-title>Visualized hysteroscopic artificial intelligence fertility assessment system for endometrial injury: an image-deep-learning study</article-title>. <source>Ann Med</source>. (<year>2025</year>) <volume>57</volume>:<fpage>2478473</fpage>. <pub-id pub-id-type="doi">10.1080/07853890.2025.2478473</pub-id><pub-id pub-id-type="pmid">40098308</pub-id></mixed-citation></ref>
<ref id="B53"><label>53.</label><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Kodali</surname> <given-names>R</given-names></name> <name><surname>Dhulipalla</surname> <given-names>VR</given-names></name> <name><surname>Tatavarty</surname> <given-names>VSK</given-names></name> <name><surname>Nadakuditi</surname> <given-names>M</given-names></name> <name><surname>Thiruveedhula</surname> <given-names>B</given-names></name> <name><surname>Gunnam</surname> <given-names>S</given-names></name><etal/></person-group> <comment>Interpretation of Deep Learning Model in Embryo Selection for <italic>in vitro</italic> Fertilization (IVF) Treatment. <italic>arXiv</italic></comment> (<year>2025</year>).</mixed-citation></ref>
<ref id="B54"><label>54.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Luong</surname> <given-names>T-M-T</given-names></name> <name><surname>Ho</surname> <given-names>N-T</given-names></name> <name><surname>Hwu</surname> <given-names>Y-M</given-names></name> <name><surname>Lin</surname> <given-names>S-Y</given-names></name> <name><surname>Ho</surname> <given-names>JY-P</given-names></name> <name><surname>Wang</surname> <given-names>R-S</given-names></name><etal/></person-group> <article-title>Beyond black-box models: explainable AI for embryo ploidy prediction and patient-centric consultation</article-title>. <source>J Assist Reprod Genet</source>. (<year>2024</year>) <volume>41</volume>:<fpage>2349</fpage>&#x2013;<lpage>58</lpage>. <pub-id pub-id-type="doi">10.1007/s10815-024-03178-7</pub-id><pub-id pub-id-type="pmid">38963605</pub-id></mixed-citation></ref>
<ref id="B55"><label>55.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Apter</surname> <given-names>S</given-names></name> <name><surname>Ebner</surname> <given-names>T</given-names></name> <name><surname>Freour</surname> <given-names>T</given-names></name> <name><surname>Guns</surname> <given-names>Y</given-names></name> <name><surname>Kovacic</surname> <given-names>B</given-names></name> <name><surname>Le Clef</surname> <given-names>N</given-names></name><etal/></person-group> <article-title>Good practice recommendations for the use of time-lapse technology&#x2020;</article-title>. <source>Hum Reprod Open</source>. (<year>2020</year>) <volume>2020</volume>:<fpage>1</fpage>&#x2013;<lpage>26</lpage>. <pub-id pub-id-type="doi">10.1093/hropen/hoaa008</pub-id></mixed-citation></ref>
<ref id="B56"><label>56.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mart&#x00ED;nez-Granados</surname> <given-names>L</given-names></name> <name><surname>Serrano</surname> <given-names>M</given-names></name> <name><surname>Gonz&#x00E1;lez-Utor</surname> <given-names>A</given-names></name> <name><surname>Ortiz</surname> <given-names>N</given-names></name> <name><surname>Badajoz</surname> <given-names>V</given-names></name> <name><surname>L&#x00F3;pez-Regalado</surname> <given-names>ML</given-names></name><etal/></person-group> <article-title>Reliability and agreement on embryo assessment: 5 years of an external quality control programme</article-title>. <source>Reprod Biomed Online</source>. (<year>2018</year>) <volume>36</volume>:<fpage>259</fpage>&#x2013;<lpage>68</lpage>. <pub-id pub-id-type="doi">10.1016/j.rbmo.2017.12.008</pub-id></mixed-citation></ref>
<ref id="B57"><label>57.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rajendran</surname> <given-names>S</given-names></name> <name><surname>Rehani</surname> <given-names>E</given-names></name> <name><surname>Phu</surname> <given-names>W</given-names></name> <name><surname>Zhan</surname> <given-names>Q</given-names></name> <name><surname>Malmsten</surname> <given-names>JE</given-names></name> <name><surname>Meseguer</surname> <given-names>M</given-names></name><etal/></person-group> <article-title>A foundational model for <italic>in vitro</italic> fertilization trained on 18 million time-lapse images</article-title>. <source>Nat Commun</source>. (<year>2025</year>) <volume>16</volume>:<fpage>6235</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-025-61116-2</pub-id><pub-id pub-id-type="pmid">40645954</pub-id></mixed-citation></ref>
<ref id="B58"><label>58.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Boucret</surname> <given-names>L</given-names></name> <name><surname>Chabrun</surname> <given-names>F</given-names></name> <name><surname>Boguenet</surname> <given-names>M</given-names></name> <name><surname>Reynier</surname> <given-names>P</given-names></name> <name><surname>Bouet</surname> <given-names>P-E</given-names></name> <name><surname>May-Panloup</surname> <given-names>P</given-names></name></person-group>. <article-title>Deep-learning model for embryo selection using time-lapse imaging of matched high-quality embryos</article-title>. <source>Sci Rep</source>. (<year>2025</year>) <volume>15</volume>:<fpage>28068</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-025-10531-y</pub-id><pub-id pub-id-type="pmid">40750959</pub-id></mixed-citation></ref>
<ref id="B59"><label>59.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Onthuam</surname> <given-names>K</given-names></name> <name><surname>Charnpinyo</surname> <given-names>N</given-names></name> <name><surname>Suthicharoenpanich</surname> <given-names>K</given-names></name> <name><surname>Engphaiboon</surname> <given-names>S</given-names></name> <name><surname>Siricharoen</surname> <given-names>P</given-names></name> <name><surname>Chaichaowarat</surname> <given-names>R</given-names></name><etal/></person-group> <article-title>Combined input deep learning pipeline for embryo selection for <italic>in vitro</italic> fertilization using light microscopic images and additional features</article-title>. <source>J Imaging</source>. (<year>2025</year>) <volume>11</volume>:<fpage>13</fpage>. <pub-id pub-id-type="doi">10.3390/jimaging11010013</pub-id><pub-id pub-id-type="pmid">39852326</pub-id></mixed-citation></ref>
<ref id="B60"><label>60.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Arun</surname> <given-names>N</given-names></name> <name><surname>Gaw</surname> <given-names>N</given-names></name> <name><surname>Singh</surname> <given-names>P</given-names></name> <name><surname>Chang</surname> <given-names>K</given-names></name> <name><surname>Aggarwal</surname> <given-names>M</given-names></name> <name><surname>Chen</surname> <given-names>B</given-names></name><etal/></person-group> <article-title>Assessing the trustworthiness of saliency maps for localizing abnormalities in medical imaging</article-title>. <source>Radiol Artif Intell</source>. (<year>2021</year>) <volume>3</volume>:<fpage>e200267</fpage>. <pub-id pub-id-type="doi">10.1148/ryai.2021200267</pub-id><pub-id pub-id-type="pmid">34870212</pub-id></mixed-citation></ref>
<ref id="B61"><label>61.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Venkatesh</surname> <given-names>K</given-names></name> <name><surname>Mutasa</surname> <given-names>S</given-names></name> <name><surname>Moore</surname> <given-names>F</given-names></name> <name><surname>Sulam</surname> <given-names>J</given-names></name> <name><surname>Yi</surname> <given-names>PH</given-names></name></person-group>. <article-title>Gradient-Based saliency maps are not trustworthy visual explanations of automated AI musculoskeletal diagnoses</article-title>. <source>J Imaging Inform Med</source>. (<year>2024</year>) <volume>37</volume>:<fpage>2490</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1007/s10278-024-01136-4</pub-id><pub-id pub-id-type="pmid">38710971</pub-id></mixed-citation></ref>
<ref id="B62"><label>62.</label><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Adebayo</surname> <given-names>J</given-names></name> <name><surname>Gilmer</surname> <given-names>J</given-names></name> <name><surname>Muelly</surname> <given-names>M</given-names></name> <name><surname>Goodfellow</surname> <given-names>I</given-names></name> <name><surname>Hardt</surname> <given-names>M</given-names></name> <name><surname>Kim</surname> <given-names>B</given-names></name></person-group>. <comment>Sanity Checks for Saliency Maps. <italic>arXiv</italic>, 1810.03292</comment> (<year>2020</year>).</mixed-citation></ref>
<ref id="B63"><label>63.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rudin</surname> <given-names>C</given-names></name></person-group>. <article-title>Stop explaining black box machine learning models for high stakes decisions and use interpretable models instead</article-title>. <source>Nat Mach Intell</source>. (<year>2019</year>) <volume>1</volume>:<fpage>206</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1038/s42256-019-0048-x</pub-id><pub-id pub-id-type="pmid">35603010</pub-id></mixed-citation></ref>
<ref id="B64"><label>64.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Saporta</surname> <given-names>A</given-names></name> <name><surname>Gui</surname> <given-names>X</given-names></name> <name><surname>Agrawal</surname> <given-names>A</given-names></name> <name><surname>Pareek</surname> <given-names>A</given-names></name> <name><surname>Truong</surname> <given-names>SQH</given-names></name> <name><surname>Nguyen</surname> <given-names>CDT</given-names></name><etal/></person-group> <article-title>Benchmarking saliency methods for chest x-ray interpretation</article-title>. <source>Nat Mach Intell</source>. (<year>2022</year>) <volume>4</volume>:<fpage>867</fpage>&#x2013;<lpage>78</lpage>. <pub-id pub-id-type="doi">10.1038/s42256-022-00536-x</pub-id></mixed-citation></ref>
<ref id="B65"><label>65.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yavtukhovskyi</surname> <given-names>V</given-names></name> <name><surname>Tretynyk</surname> <given-names>V</given-names></name></person-group>. <article-title>Evaluation of similarity of image explanations produced by SHAP, LIME and grad-CAM</article-title>. <source>Cybernetics Comput Technol</source>. (<year>2025</year>) <volume>2</volume>:<fpage>69</fpage>&#x2013;<lpage>76</lpage>. <pub-id pub-id-type="doi">10.34229/2707-451X.25.2.6</pub-id></mixed-citation></ref>
<ref id="B66"><label>66.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>K</given-names></name> <name><surname>Zuo</surname> <given-names>J</given-names></name> <name><surname>Han</surname> <given-names>W</given-names></name> <name><surname>Guo</surname> <given-names>J-H</given-names></name></person-group>. <article-title>Intelligent assisted reproduction: innovative applications of artificial intelligence in embryo health assessment</article-title>. <source>LabMed Discovery</source>. (<year>2025</year>) <volume>2</volume>:<fpage>100075</fpage>. <pub-id pub-id-type="doi">10.1016/j.lmd.2025.100075</pub-id></mixed-citation></ref>
<ref id="B67"><label>67.</label><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kaur</surname> <given-names>A</given-names></name> <name><surname>Dong</surname> <given-names>G</given-names></name> <name><surname>Basu</surname> <given-names>A</given-names></name></person-group>. <article-title>GradXcepUNet: explainable AI based medical image segmentation</article-title>. In: <person-group person-group-type="editor"><name><surname>Berretti</surname> <given-names>S</given-names></name> <name><surname>Su</surname> <given-names>G-M</given-names></name></person-group>, editors. <source>Smart Multimedia</source>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name> (<year>2022</year>). p. <fpage>174</fpage>&#x2013;<lpage>88</lpage>.</mixed-citation></ref>
<ref id="B68"><label>68.</label><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Merrick</surname> <given-names>L</given-names></name> <name><surname>Taly</surname> <given-names>A</given-names></name></person-group>. <article-title>The explanation game: explaining machine learning models using shapley values</article-title>. In: <person-group person-group-type="editor"><name><surname>Holzinger</surname> <given-names>A</given-names></name> <name><surname>Kieseberg</surname> <given-names>P</given-names></name> <name><surname>Tjoa</surname> <given-names>AM</given-names></name> <name><surname>Weippl</surname> <given-names>E</given-names></name></person-group>, editors. <source>Machine Learning and Knowledge Extraction</source>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name> (<year>2020</year>). p. <fpage>17</fpage>&#x2013;<lpage>38</lpage>.</mixed-citation></ref>
<ref id="B69"><label>69.</label><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Doshi-Velez</surname> <given-names>F</given-names></name> <name><surname>Kim</surname> <given-names>B</given-names></name></person-group>. <comment>Towards A Rigorous Science of Interpretable Machine Learning</comment>. <comment><italic>arXiv</italic> [Preprint]</comment> (<year>2017</year>).</mixed-citation></ref>
<ref id="B70"><label>70.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zarenia</surname> <given-names>E</given-names></name> <name><surname>Far</surname> <given-names>AA</given-names></name> <name><surname>Rezaee</surname> <given-names>K</given-names></name></person-group>. <article-title>Automated multi-class MRI brain tumor classification and segmentation using deformable attention and saliency mapping</article-title>. <source>Sci Rep</source>. (<year>2025</year>) <volume>15</volume>:<fpage>8114</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-025-92776-1</pub-id><pub-id pub-id-type="pmid">40057634</pub-id></mixed-citation></ref></ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by"><p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1076838/overview">Ioannis Sfontouris</ext-link>, Hygeia Hospital, Greece</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by"><p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1376651/overview">Sepide Goharitaban</ext-link>, Hamadan University of Medical Sciences, Iran</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3264541/overview">J&#x00F8;rgen Berntsen</ext-link>, Vitrolife, Sweden</p></fn>
</fn-group>
</back>
</article>