<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Behav. Neurosci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Behavioral Neuroscience</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Behav. Neurosci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1662-5153</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnbeh.2025.1735237</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>AI-enhanced adaptive testing with cognitive diagnostic feedback and its association with performance in undergraduate surgical education: a pilot study</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Silva Gon&#x00E7;alves</surname>
<given-names>Nuno</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/396410"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Collares</surname>
<given-names>Carlos</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/784878"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>P&#x00EA;go</surname>
<given-names>Jos&#x00E9; Miguel</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="aff" rid="aff7"><sup>7</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2487"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Life and Health Sciences Research Institute (ICVS), University of Minho</institution>, <city>Braga</city>, <country country="pt">Portugal</country></aff>
<aff id="aff2"><label>2</label><institution>ICVS/3B&#x2019;s, PT Government Associate Laboratory</institution>, <city>Braga</city>, <country country="pt">Portugal</country></aff>
<aff id="aff3"><label>3</label><institution>European Board of Medical Assessors</institution>, <city>Cardiff</city>, <country country="gb">United Kingdom</country></aff>
<aff id="aff4"><label>4</label><institution>Inspirali Educa&#x00E7;&#x00E3;o</institution>, <city>S&#x00E3;o Paulo</city>, <country country="br">Brazil</country></aff>
<aff id="aff5"><label>5</label><institution>Faculdades Pequeno Pr&#x00ED;ncipe</institution>, <city>Curitiba</city>, <country country="br">Brazil</country></aff>
<aff id="aff6"><label>6</label><institution>Medical Education Unit, Faculty of Medicine and Biomedical Sciences, University of Algarve</institution>, <city>Faro</city>, <country country="pt">Portugal</country></aff>
<aff id="aff7"><label>7</label><institution>iCognitus4ALL &#x2013; IT Solutions</institution>, <city>Porto</city>, <country country="pt">Portugal</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Nuno Silva Gon&#x00E7;alves, <email xlink:href="mailto:nunogoncalves@med.uminho.pt">nunogoncalves@med.uminho.pt</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-01-06">
<day>06</day>
<month>01</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>19</volume>
<elocation-id>1735237</elocation-id>
<history>
<date date-type="received">
<day>29</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>24</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>26</day>
<month>11</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Silva Gon&#x00E7;alves, Collares and P&#x00EA;go.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Silva Gon&#x00E7;alves, Collares and P&#x00EA;go</copyright-holder>
<license>
<ali:license_ref start_date="2026-01-06">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Background</title>
<p>Effective feedback in the cognitive domain is essential for surgical education but often limited by resource constraints and traditional assessment formats. Artificial Intelligence (AI) has emerged as a catalyst for innovation, enabling automated feedback, real-time cognitive diagnostics, and scalable item generation, thereby transforming how future surgeons learn and are assessed.</p>
</sec>
<sec>
<title>Methods</title>
<p>An item bank of 150 multiple-choice questions was developed using AI-assisted item generation and difficulty estimation. A formative Computerized Adaptive Testing (CAT), balanced across three cognitive domains (memory, analysis, and decision) and surgical topics, was delivered via QuizOne<sup>&#x00AE;</sup> 3&#x2013;5 days before the summative Progress Test. A total of 147 students participated, of whom 116 completed the formative CAT. Performance correlations, group comparisons, analysis of covariance (ANCOVA), and regression analyses were conducted.</p>
</sec>
<sec>
<title>Results</title>
<p>Students who voluntarily completed CAT showed higher Progress Test scores (<italic>p</italic>&#x202F;=&#x202F;0.021), with the effect persisting after adjusting for prior academic performance (ANCOVA <italic>p</italic>&#x202F;=&#x202F;0.041), though causality cannot be established due to self-selection bias. Memory skills were the strongest predictors of summative outcomes (<italic>R</italic><sup>2</sup>&#x202F;=&#x202F;0.180, <italic>&#x03B2;</italic>&#x202F;=&#x202F;0.425), followed by analysis (<italic>R</italic><sup>2</sup>&#x202F;=&#x202F;0.080, <italic>&#x03B2;</italic>&#x202F;=&#x202F;0.283); decision was not significant (<italic>R</italic><sup>2</sup>&#x202F;=&#x202F;0.029, <italic>&#x03B2;</italic>&#x202F;=&#x202F;0.170).</p>
</sec>
<sec>
<title>Conclusion</title>
<p>AI-enhanced CAT&#x2013;Cognitive Diagnostic Modeling (CDM) represents a promising formative approach in undergraduate surgical education, being associated with higher summative performance and providing individualized diagnostic feedback. Refining feedback presentation and enhancing decision-making assessment could further optimize its educational impact.</p>
</sec>
</abstract>
<kwd-group>
<kwd>artificial intelligence</kwd>
<kwd>computerized adaptive testing</kwd>
<kwd>cognitive diagnostic modeling</kwd>
<kwd>surgical education</kwd>
<kwd>feedback</kwd>
<kwd>cognitive skills</kwd>
<kwd>assessment innovation</kwd>
<kwd>educational technology</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declare that no financial support was received for the research and/or publication of this article.</funding-statement>
</funding-group>
<counts>
<fig-count count="5"/>
<table-count count="2"/>
<equation-count count="0"/>
<ref-count count="36"/>
<page-count count="10"/>
<word-count count="6738"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Learning and Memory</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<title>Introduction</title>
<p>Artificial Intelligence (AI) is transforming medical education by enabling automated item generation, difficulty estimation, and individualized feedback that was previously impractical (<xref ref-type="bibr" rid="ref26">Pohn et al., 2025</xref>; <xref ref-type="bibr" rid="ref32">Shaw et al., 2025</xref>; <xref ref-type="bibr" rid="ref17">Gordon et al., 2024</xref>; <xref ref-type="bibr" rid="ref23">Mir et al., 2023</xref>). When integrated with adaptive testing and cognitive diagnostic models, AI allows feedback to evolve from static score reporting into a continuous, formative process that guides learning (<xref ref-type="bibr" rid="ref33">Sunmboye et al., 2025</xref>). This approach is particularly valuable in surgical education, where cognitive, technical, and non-technical skills intersect and are required, allowing educators to identify specific strengths and weaknesses and promote deliberate practice toward surgical competence (<xref ref-type="bibr" rid="ref16">Gomez et al., 2025</xref>; <xref ref-type="bibr" rid="ref28">Rosendal et al., 2023</xref>; <xref ref-type="bibr" rid="ref24">Ounounou et al., 2019</xref>; <xref ref-type="bibr" rid="ref10">Dedy et al., 2013</xref>).</p>
<p>Although feedback is widely recognized as a key driver of learning, its application in the cognitive domain, particularly during undergraduate surgical education, remains underexplored (<xref ref-type="bibr" rid="ref14">Garner et al., 2014</xref>; <xref ref-type="bibr" rid="ref11">El Boghdady and Alijani, 2017</xref>). Traditional mechanisms often fail to provide timely, specific, and actionable insights into students&#x2019; cognitive performance, especially in complex domains such as reasoning and decision-making (<xref ref-type="bibr" rid="ref4">Burgess et al., 2020</xref>; <xref ref-type="bibr" rid="ref31">Shaughness et al., 2017</xref>). As medical curricula increasingly adopt competency-based models, adopting assessment strategies that deliver targeted, data-informed feedback has become essential to enhance learning and self-regulation (<xref ref-type="bibr" rid="ref29">Ross et al., 2022</xref>).</p>
<p>Cognitive competence involves integrating basic scientific knowledge with clinical information to interpret findings and make informed decisions under uncertainty, skills that are critical in surgical practice (<xref ref-type="bibr" rid="ref9">Crebbin et al., 2013</xref>; <xref ref-type="bibr" rid="ref22">Madani et al., 2017</xref>). However, traditional multiple-choice exams often emphasize factual recall, provide delayed feedback, and fail to capture deeper levels of reasoning (<xref ref-type="bibr" rid="ref6">Butler and Roediger, 2008</xref>).</p>
<p>Providing feedback in summative assessments remains a challenge in medical education. Concerns over item security, fairness, and resource limitations often lead institutions to restrict item-level feedback (<xref ref-type="bibr" rid="ref1">Appelhaus et al., 2023</xref>; <xref ref-type="bibr" rid="ref18">Harrison, 2017</xref>). Consequently, summative assessments frequently become &#x201C;black boxes,&#x201D; offering scores without meaningful guidance and reinforcing a culture of performance rather than development.</p>
<p>When integrated into assessment systems, AI can support the creation of high-quality questions, estimate item difficulty, and generate individualized diagnostic feedback with minimal faculty effort. These capabilities complement Computerized Adaptive Testing (CAT), a psychometric method proposed by <xref ref-type="bibr" rid="ref20">Lord (1971)</xref>, <xref ref-type="bibr" rid="ref25">Owen (1975)</xref>, and <xref ref-type="bibr" rid="ref7">Chang and Ying (1996)</xref> that dynamically adjusts item difficulty based on student responses (<xref ref-type="bibr" rid="ref7">Chang and Ying, 1996</xref>). By tailoring the test to the learner&#x2019;s ability level, CAT increases the efficiency and precision of assessment with fewer items, reducing test fatigue while maximizing information. It can also be useful in determining the true score and competence of an examinee (<xref ref-type="bibr" rid="ref8">Collares and Cecilio-Fernandes, 2019</xref>; <xref ref-type="bibr" rid="ref5">Burr et al., 2016</xref>). In the context of medical education, CAT has been successfully implemented in progress testing, licensing exams, and residency selection processes (<xref ref-type="bibr" rid="ref5">Burr et al., 2016</xref>; <xref ref-type="bibr" rid="ref36">Xu et al., 2023</xref>; <xref ref-type="bibr" rid="ref34">Van Wijk et al., n.d.</xref>; <xref ref-type="bibr" rid="ref30">Seo et al., 2024</xref>). Its potential as a formative tool for learning, however, remains underutilized.</p>
<p>CAT can also be coupled with Cognitive Diagnostic Modeling (CDM), a psychometric framework that analyzes student responses to infer the proficiency of specific cognitive attributes. Unlike classical test theory, which provides a single overall score, CDM enables a multidimensional understanding of performance by categorizing items and responses according to specific cognitive processes. For instance, an item might assess recall of factual knowledge, interpretation of clinical signs, or the application of pathophysiological reasoning. By classifying and analyzing items in this way, CDM supports the generation of individualized feedback, offering students a roadmap for targeted improvement (<xref ref-type="bibr" rid="ref21">Ma et al., 2023</xref>; <xref ref-type="bibr" rid="ref35">Williamson, n.d.</xref>; <xref ref-type="bibr" rid="ref19">Leighton and Gierl, 2007</xref>; <xref ref-type="bibr" rid="ref2">Barthakur et al., 2022</xref>). This granular diagnostic capability is particularly relevant in complex curricular areas like surgery, where different cognitive skills are needed to approach diverse clinical scenarios. In this study, items were categorized into three cognitive diagnostic models (memory, analysis, and decision) reflecting the cognitive tasks proposed by the National Board of Medical Examiners (NBME) for assessing medical knowledge application (<xref ref-type="bibr" rid="ref3">Billings et al., n.d.</xref>). These domains were selected because they align with the NBME&#x2019;s framework for evaluating progressively complex levels of cognitive processing, ranging from factual recall to clinical reasoning and decision-making, which are particularly relevant in the context of surgical education.</p>
<p>Despite their theoretical advantages, CAT and CDM have rarely been explored as learning tools in undergraduate surgical education. By evaluating an AI-supported CAT&#x2013;CDM intervention in a real educational setting, this study aims to advance data-informed feedback practices, improve alignment between formative and summative assessment, and support the development of self-regulated surgical learners.</p>
</sec>
<sec sec-type="materials|methods" id="sec2">
<title>Materials and methods</title>
<sec id="sec3">
<title>Item development and cognitive models</title>
<p>An item bank was developed using Automated Item Generation (AIGen) techniques, guided by cognitive models aligned with the exam blueprint (<xref ref-type="bibr" rid="ref19">Leighton and Gierl, 2007</xref>; <xref ref-type="bibr" rid="ref13">Falc&#x00E3;o et al., 2023</xref>; <xref ref-type="bibr" rid="ref15">Gierl et al., 2022</xref>; <xref ref-type="bibr" rid="ref12">Falc&#x00E3;o et al., 2022</xref>). The overall workflow of AI-assisted item generation, expert validation, and calibration is illustrated in <xref ref-type="fig" rid="fig1">Figure 1</xref>. Each item was also categorized according to three cognitive diagnostic models based on the NBME framework (<xref ref-type="bibr" rid="ref3">Billings et al., n.d.</xref>): memory (questions that require recall of factual information), analysis (questions that demand interpretation of clinical information), or decision (questions that require synthesis and critical clinical decision-making). To ensure consistency, two raters with experience in medical education and assessment independently classified all items according to these categories. Inter-rater reliability for the initial classification was substantial (Cohen&#x2019;s <italic>&#x03BA;</italic>&#x202F;=&#x202F;0.76), indicating good agreement. Discrepancies were resolved through discussion until consensus was achieved.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Workflow of AI-assisted item generation, validation, and calibration. Items were first generated by ChatGPT-4 guided by cognitive models and blueprint alignment, categorized by cognitive domain (memory, analysis, decision) using dual independent coding (<italic>&#x03BA;</italic>&#x202F;=&#x202F;0.76), validated by expert review, and integrated with pre-calibrated OAIPT items. Final items were assigned initial difficulty indices (&#x2212;3 to +3 IRT scale) and empirically validated during administration.</p>
</caption>
<graphic xlink:href="fnbeh-19-1735237-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart illustrating a process: AI-assisted item generation using AIGEN and ChatGPT-4, followed by expert validation and difficulty calibration. Next, computerized adaptive test delivery with forty-eight items occurs. This leads to cognitive diagnostic modeling, focusing on memory, analysis, and decision-making. The final step is an automated individual feedback report.</alt-text>
</graphic>
</fig>
<p>Additionally, a retrospective double-blind classification of a random sample of 25 items was performed by two external raters who were not involved in the original coding. Inter-rater reliability for this retrospective validation was Cohen&#x2019;s <italic>&#x03BA;</italic>&#x202F;=&#x202F;0.68, indicating substantial agreement and supporting the consistency of domain assignment. In addition to newly generated items, calibrated items from the existing bank of the Online Adaptive International Progress Test (OAIPT) were incorporated, taking advantage of their known difficulty parameters and item quality. For the newly created items, difficulty indices (ranging from &#x2212;3 to +3 on the IRT scale) were initially estimated using ChatGPT-4 (OpenAI, web interface) with the following structured prompt: &#x201C;Given the previous examples with their respective difficulty indices, estimate the expected difficulty of the following multiple-choice question for 5th-year medical students in a summative exam.&#x201D; Each AI-generated estimate and rationale were reviewed by two content experts for accuracy and consistency. Items were accepted only when both experts agreed that (a) the content was clinically correct, (b) the reasoning aligned with the intended cognitive domain, and (c) the predicted difficulty was plausible compared with similar calibrated items. Disagreements were resolved through discussion. This process produced an initial difficulty calibration for new items before empirical validation during test administration. To enhance transparency, representative examples of each cognitive domain are provided in <xref ref-type="supplementary-material" rid="SM1">Supplementary material 1</xref>, allowing readers to evaluate the appropriateness of domain assignment.</p>
</sec>
<sec id="sec4">
<title>Platform setup</title>
<p>The digital assessment platform QuizOne<sup>&#x00AE;</sup> was used to deliver CAT via its dedicated module (<xref ref-type="bibr" rid="ref27">Rice et al., 2022</xref>). This platform enabled dynamic test adaptation based on individual student responses. The Weighted Likelihood (WL) method was used as the theta estimator, providing robust estimates of student ability even with a small number of items. For the selection of subsequent items, the platform applied the Maximum Fisher Information (MFI) criterion, which chooses the next item that maximizes the expected information gain at the student&#x2019;s ability level (<xref ref-type="bibr" rid="ref7">Chang and Ying, 1996</xref>).</p>
</sec>
<sec id="sec5">
<title>Formative CAT exam design</title>
<p>A formative Computerized Adaptive Test (CAT) consisting of 48 multiple-choice questions (MCQs) was created for this study and made available to students 3&#x2013;5&#x202F;days before the summative Progress Test of the Surgical Curricular Unit. The CAT was delivered through the platform and integrated Cognitive Diagnostic Modeling (CDM) to classify each item by cognitive domain&#x2014;memory, analysis, or decision. The platform was programmed to present one item from each cognitive domain across 16 surgical topics defined in the exam blueprint: Urology, Trauma, Abdominal Wall, Orthopedics, Otorhinolaryngology, Ophthalmology, Neurosurgery, Hepatobiliopancreatic, Esophagogastric, Endocrine and Breast, Colorectal, and Vascular Surgery. Each student therefore completed exactly 48 items, as the adaptive algorithm was configured with a fixed-length stopping rule to ensure comparable duration and psychometric precision across participants. The formative CAT aimed to reinforce learning by identifying specific cognitive gaps and providing individualized feedback on students&#x2019; performance profiles. While there was curricular overlap in learning objectives, the specific items used in the formative CAT were distinct from those included in the summative exam, preventing direct content duplication.</p>
<p>Additionally, the authors collected the reports of the previous two exams of the surgical curricular unit (Exam 1 and Exam 2) taken by the same student sample.</p>
</sec>
<sec id="sec6">
<title>Participants</title>
<p>All the students who enrolled in the surgical curricular unit were invited to participate in the formative exam. A convenience sample of 116 volunteer students completed the formative CAT exam.</p>
</sec>
<sec id="sec7">
<title>Data collection and analysis</title>
<p>Item-level performance data and overall test scores were extracted from the platform. Statistical analysis was performed using JASP (version 0.18.1).</p>
<p>Before inferential testing, the distribution of all continuous variables was assessed using the Shapiro&#x2013;Wilk test to evaluate assumptions of normality. When assumptions of normality were violated, non-parametric tests were applied (Spearman&#x2019;s <italic>&#x03C1;</italic> for correlations, Mann&#x2013;Whitney <italic>U</italic> for between-group comparisons, Kruskal&#x2013;Wallis <italic>H</italic> for multiple groups). Linear regression assumptions were examined by inspecting residual plots and variance inflation factors. All analyses were independently verified by two authors to ensure consistency and accuracy of results. To examine relationships between cognitive performance indicators and summative outcomes, we conducted correlation analyses between final CAT scores, CDM sub-scores, and summative exam grades. Group comparisons were performed to assess differences between students who completed the formative CAT exam and those who did not. Additionally, ANCOVA models were used to evaluate the independent effect of CAT participation on summative performance, adjusting for prior academic performance. Finally, simple linear regressions were performed to identify predictors of summative exam scores, including the contribution of specific CDM sub-scores and overall test performance. Statistical significance was set at <italic>p</italic>&#x202F;&#x003C;&#x202F;0.05.</p>
</sec>
<sec id="sec8">
<title>Feedback delivery</title>
<p>Upon completion of the formative CAT, each student received an individualized automated feedback report generated by the platform. The report began with a brief explanation of the standardized scoring system (mean&#x202F;=&#x202F;500, SD&#x202F;=&#x202F;100) and an illustrative graph clarifying how to interpret scores (e.g., 600&#x202F;=&#x202F;1 SD above average). Subsequent pages presented tables and bar charts displaying the student&#x2019;s standardized scores across multiple dimensions: 1) System categories (e.g., respiratory, digestive, musculoskeletal); 2) Surgical disciplines (e.g., hepatobiliopancreatic, colorectal, trauma, vascular); 3) Medical competencies (diagnosis, treatment, scientific principles, management, pathophysiology); 4) Cognitive domains (memory, analysis, decision).</p>
</sec>
<sec id="sec9">
<title>Satisfaction questionnaire</title>
<p>At the end of the curricular unit, participating students completed a satisfaction questionnaire assessing the usability, perceived usefulness, and impact of CAT and CDM on their learning process. The questionnaire consisted of nine Likert-scale items (1&#x202F;=&#x202F;least positive; 5&#x202F;=&#x202F;most positive) and two open-ended questions addressing the most positive and least positive aspects of the test and feedback. However, the response rate was insufficient (13%, 15 of 116 participants) to allow meaningful analysis; therefore, these data were not included in the present report.</p>
</sec>
<sec id="sec10">
<title>Ethical commission approval</title>
<p>This study was conducted following the ethical principles outlined in the Declaration of Helsinki. Ethical approval was obtained from the Ethics Committee for Research in Life and Health Sciences, University of Minho. Participation was voluntary, and all students provided informed consent before inclusion in the study. No identifiable personal data were collected. Participants were informed that their academic evaluation would not be influenced by their decision to participate or by their performance in the formative assessments. Data were stored securely and used exclusively for research purposes.</p>
</sec>
</sec>
<sec sec-type="results" id="sec11">
<title>Results</title>
<p>The final item bank used in the CAT exam was composed of 150 MCQ items, 84 extracted from the previously calibrated database (OAIPT) and 66 created by faculty members experienced in item writing. From that item bank, the software developed an individual customized exam with 48 questions, tailored to the student&#x2019;s level of competence, with a matching blueprint of the same topics as the progress test and the previous surgical curricular unit. The 48 questions were equally divided among the three different diagnostic models: decision, memory, and analysis.</p>
<p>A total of 147 students were included in the analysis, of whom 116 completed the formative CAT. The participant inclusion process and attrition are illustrated in <xref ref-type="fig" rid="fig2">Figure 2</xref>. A post-hoc power analysis for the ANCOVA comparing CAT participants (<italic>n</italic>&#x202F;=&#x202F;116) and non-participants (<italic>n</italic>&#x202F;=&#x202F;31), with <italic>&#x03B1;</italic>&#x202F;=&#x202F;0.05, <italic>f</italic>&#x202F;=&#x202F;0.173 (derived from <italic>&#x03B7;</italic><sup>2</sup>&#x202F;=&#x202F;0.029) and two covariates, indicated approximately 0.49 power to detect this effect size. This suggests limited sensitivity for small effects, consistent with the exploratory nature of the study.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>CONSORT-style flowchart illustrating participant inclusion and attrition. Of the 147 students initially enrolled in the Surgical Curricular Unit, 116 voluntarily completed the formative Computerized Adaptive Test (CAT). Thirty-one students did not participate. Among the CAT participants, 15 completed the post-course satisfaction survey.</p>
</caption>
<graphic xlink:href="fnbeh-19-1735237-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart showing participant progression: 147 assessed for eligibility, 31 refuse participation. 116 complete the CAT exam, and 15 complete the satisfaction questionnaire.</alt-text>
</graphic>
</fig>
<p>Normality was evaluated using the Shapiro&#x2013;Wilk test. Most variables, including Progress Test, Exam 1, and Exam 2, and CDM-specific measures, deviated from normality in at least one group (<italic>p</italic>&#x202F;&#x003C;&#x202F;0.05). Therefore, non-parametric tests were applied when assumptions were not met. As expected, grades from the CAT exam followed a normal distribution. <xref ref-type="table" rid="tab1">Table 1</xref> summarizes the descriptive statistics for all key variables, including the overall CAT score, cognitive sub-scores, and summative exam results.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Descriptive statistics for key variables, including mean, standard deviation, and observed range values for the formative CAT total score, cognitive domain sub-scores, and summative assessments.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Variable</th>
<th align="center" valign="top">Mean</th>
<th align="center" valign="top">SD</th>
<th align="center" valign="top">Minimum</th>
<th align="center" valign="top">Maximum</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Final CAT score</td>
<td align="center" valign="top">589.95</td>
<td align="center" valign="top">46.44</td>
<td align="center" valign="top">447.00</td>
<td align="center" valign="top">698.00</td>
</tr>
<tr>
<td align="left" valign="top">Skill&#x2013;Memory</td>
<td align="center" valign="top">604.79</td>
<td align="center" valign="top">54.87</td>
<td align="center" valign="top">446.00</td>
<td align="center" valign="top">713.00</td>
</tr>
<tr>
<td align="left" valign="top">Skill&#x2013;Analysis</td>
<td align="center" valign="top">564.94</td>
<td align="center" valign="top">57.71</td>
<td align="center" valign="top">434.00</td>
<td align="center" valign="top">694.00</td>
</tr>
<tr>
<td align="left" valign="top">Skill&#x2013;Decision</td>
<td align="center" valign="top">551.11</td>
<td align="center" valign="top">33.05</td>
<td align="center" valign="top">436.00</td>
<td align="center" valign="top">671.00</td>
</tr>
<tr>
<td align="left" valign="top">Progress Test score</td>
<td align="center" valign="top">14.91</td>
<td align="center" valign="top">2.16</td>
<td align="center" valign="top">8.80</td>
<td align="center" valign="top">18.20</td>
</tr>
<tr>
<td align="left" valign="top">Exam 1 score</td>
<td align="center" valign="top">12.98</td>
<td align="center" valign="top">2.07</td>
<td align="center" valign="top">5.70</td>
<td align="center" valign="top">17.10</td>
</tr>
<tr>
<td align="left" valign="top">Exam 2 score</td>
<td align="center" valign="top">12.49</td>
<td align="center" valign="top">2.47</td>
<td align="center" valign="top">6.20</td>
<td align="center" valign="top">17.20</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Spearman&#x2019;s rank correlations were computed to examine the relationships between the final CAT score, cognitive diagnostic sub-scores (Skill&#x2013;Memory, Skill&#x2013;Analysis, Skill&#x2013;Decision), and academic outcomes (Exam 1, Exam 2, and Progress Test). Results are detailed in <xref ref-type="table" rid="tab2">Table 2</xref> and visually represented in <xref ref-type="fig" rid="fig3">Figure 3</xref>.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Multiple Linear regression analyses evaluating the predictive value of the final CAT score and cognitive diagnostic sub-scores (Skill&#x2013;Memory, Skill&#x2013;Analysis, Skill&#x2013;Decision) on Progress Test performance.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Predictor</th>
<th align="center" valign="top"><italic>&#x03B2;</italic> (standardized)</th>
<th align="center" valign="top">95%CI</th>
<th align="center" valign="top"><italic>t</italic></th>
<th align="center" valign="top"><italic>p</italic></th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Skill&#x2013;Memory</td>
<td align="center" valign="top">0.41</td>
<td align="center" valign="top">[0.002, 0.029]</td>
<td align="center" valign="top">2.22</td>
<td align="center" valign="top">0.029</td>
</tr>
<tr>
<td align="left" valign="top">Skill&#x2013;Analysis</td>
<td align="center" valign="top">0.25</td>
<td align="center" valign="top">[&#x2212;0.003, 0.021]</td>
<td align="center" valign="top">1.48</td>
<td align="center" valign="top">0.143</td>
</tr>
<tr>
<td align="left" valign="top">Skill&#x2013;Decision</td>
<td align="center" valign="top">0.05</td>
<td align="center" valign="top">[&#x2212;0.008, 0.015]</td>
<td align="center" valign="top">0.58</td>
<td align="center" valign="top">0.565</td>
</tr>
<tr>
<td align="left" valign="top">Final CAT score</td>
<td align="center" valign="top">&#x2212;0.05</td>
<td align="center" valign="top">[&#x2212;0.025, 0.020]</td>
<td align="center" valign="top">&#x2212;0.20</td>
<td align="center" valign="top">0.845</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Model: <italic>R</italic><sup>2</sup>&#x202F;=&#x202F;0.23, adjusted <italic>R</italic><sup>2</sup>&#x202F;=&#x202F;0.20, <italic>F</italic>(4,107)&#x202F;=&#x202F;7.98, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001.</p>
</table-wrap-foot>
</table-wrap>
<p>The final CAT score showed strong correlations with Skill&#x2013;Memory (<italic>&#x03C1;</italic>&#x202F;=&#x202F;0.73, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001) and Skill&#x2013;Analysis (<italic>&#x03C1;</italic>&#x202F;=&#x202F;0.63, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), and a moderate correlation with Skill&#x2013;Decision (<italic>&#x03C1;</italic>&#x202F;=&#x202F;0.42, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001).</p>
<p>Regarding academic outcomes, the Progress Test was moderately correlated with Skill&#x2013;Memory (<italic>&#x03C1;</italic>&#x202F;=&#x202F;0.38, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001) and showed weaker associations with Skill&#x2013;Analysis (<italic>&#x03C1;</italic>&#x202F;=&#x202F;0.26, <italic>p</italic>&#x202F;=&#x202F;0.006) and Skill&#x2013;Decision (<italic>&#x03C1;</italic>&#x202F;=&#x202F;0.28, <italic>p</italic>&#x202F;=&#x202F;0.003). Both Exam 1 and Exam 2 correlated more strongly with Skill&#x2013;Memory than with the other sub-scores. Additionally, Exam 1, Exam 2, and the Progress Test were highly intercorrelated (<italic>&#x03C1;</italic>&#x202F;&#x2248;&#x202F;0.8, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), indicating consistent performance across summative assessments.</p>
<p>A Mann&#x2013;Whitney test revealed that CAT participants scored significantly higher on the Progress Test than non-participants (<italic>U</italic>&#x202F;=&#x202F;1,311, <italic>p</italic>&#x202F;=&#x202F;0.021, rank-biserial correlation&#x202F;=&#x202F;&#x2212;0.271). Scores on Exam 1 (<italic>U</italic>&#x202F;=&#x202F;1,392.5, <italic>p</italic>&#x202F;=&#x202F;0.054) and Exam 2 (<italic>U</italic>&#x202F;=&#x202F;1,466, <italic>p</italic>&#x202F;=&#x202F;0.115) did not differ significantly between groups. <xref ref-type="fig" rid="fig4">Figure 4</xref> illustrates the group comparison of Progress Test scores between students who completed the formative CAT and those who did not.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Spearman&#x2019;s correlation matrix between adaptive test performance, cognitive diagnostic dimensions, and academic outcomes. The heatmap shows correlations between the Computerized Adaptive Test (CAT) total score, Cognitive Diagnostic sub-scores (Skill&#x2013;Memory, Skill&#x2013;Analysis, Skill&#x2013;Decision), and summative exam results (Exam 1, Exam 2, and Progress Test). Darker red shades indicate stronger positive correlations, while blue indicates negative associations. All displayed coefficients are significant at <italic>p</italic>&#x202F;&#x003C;&#x202F;0.05.</p>
</caption>
<graphic xlink:href="fnbeh-19-1735237-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Spearman's correlation matrix heatmap showing correlations among various skills and exams. Darker red indicates higher correlation. The scale ranges from zero to one. Key variables include final score, skill memory, skill analysis, skill decision, exam one, exam two, and progress test.</alt-text>
</graphic>
</fig>
<p>An ANCOVA was conducted to evaluate the impact of participation in the formative CAT exam on the summative score. After adjusting for prior performance (Exam 1 and Exam 2), participation in the formative CAT exam had a statistically significant effect on the Progress Test, <italic>F</italic>(1, 143)&#x202F;=&#x202F;4.239, <italic>p</italic>&#x202F;=&#x202F;0.041, <italic>&#x03B7;</italic><sup>2</sup>&#x202F;=&#x202F;0.029. Both Exam 1 (<italic>F</italic>&#x202F;=&#x202F;35.835, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001) and Exam 2 (<italic>F</italic>&#x202F;=&#x202F;48.388, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001) were also significant predictors of performance, indicating that earlier academic performance was strongly associated with the outcome. These findings suggest that the formative exam was associated with improved performance beyond what could be explained by prior academic achievement alone. <xref ref-type="fig" rid="fig4">Figure 4</xref> illustrates the difference in Progress Test scores between CAT participants and non-participants.</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Box plot comparing Progress Test scores between students who participated in the formative Computerized Adaptive Test (CAT) and those who did not. CAT participants achieved significantly higher scores on the summative Progress Test (<italic>U</italic>&#x202F;=&#x202F;1,311, <italic>p</italic>&#x202F;=&#x202F;0.021, rank-biserial correlation&#x202F;=&#x202F;&#x2212;0.27). Error bars represent standard deviations.</p>
</caption>
<graphic xlink:href="fnbeh-19-1735237-g004.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Scatter plot displaying progress test scores based on CAT participation. Two points with error bars compare scores for participants, labeled as "No" and "Yes". "Yes" shows a higher average score and a narrower error range.</alt-text>
</graphic>
</fig>
<p>A series of linear regressions were conducted to evaluate the predictive value of the final CAT score and each cognitive diagnostic sub-score (Skill&#x2013;Memory, Skill&#x2013;Analysis, and Skill&#x2013;Decision) on Progress Test performance.</p>
<p>The model using the final CAT score as a predictor was statistically significant, <italic>F</italic>(1, 114)&#x202F;=&#x202F;25.364, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001, explaining 18.2% of the variance in Progress Test scores (<italic>R</italic><sup>2</sup>&#x202F;=&#x202F;0.182). The final CAT score emerged as a significant positive predictor (<italic>&#x03B2;</italic>&#x202F;=&#x202F;0.427, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), indicating that higher adaptive test performance was associated with improved summative outcomes.</p>
<p>When each cognitive skill domain was analyzed separately, Skill&#x2013;Memory was the strongest predictor of Progress Test performance, <italic>F</italic>(1, 110)&#x202F;=&#x202F;24.196, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001, accounting for 18.0% of the variance (<italic>R</italic><sup>2</sup>&#x202F;=&#x202F;0.180; <italic>&#x03B2;</italic>&#x202F;=&#x202F;0.425, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), indicating a moderate positive association between memory skills and Progress Test performance. Skill&#x2013;Analysis also significantly predicted Progress Test scores, <italic>F</italic>(1, 110)&#x202F;=&#x202F;9.574, <italic>p</italic>&#x202F;=&#x202F;0.003, though with a smaller effect size (<italic>R</italic><sup>2</sup>&#x202F;=&#x202F;0.080; <italic>&#x03B2;</italic>&#x202F;=&#x202F;0.283, <italic>p</italic>&#x202F;=&#x202F;0.003). In contrast, Skill&#x2013;Decision explained only 2.9% of the variance and was not a significant predictor (<italic>R</italic><sup>2</sup>&#x202F;=&#x202F;0.029; <italic>&#x03B2;</italic>&#x202F;=&#x202F;0.170, <italic>p</italic>&#x202F;=&#x202F;0.072). However, when all domains were entered simultaneously into a multiple regression model, only <italic>Skill&#x2013;Memory</italic> remained a significant independent predictor (<italic>&#x03B2;</italic>&#x202F;=&#x202F;0.41, <italic>p</italic>&#x202F;=&#x202F;0.029), confirming its dominant role after controlling for intercorrelations among cognitive domains (<xref ref-type="table" rid="tab2">Table 2</xref>). <xref ref-type="fig" rid="fig5">Figure 5</xref> illustrates the regression relationships between CAT performance metrics and summative outcomes.</p>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>Scatterplots showing relationships between formative Computerized Adaptive Test (CAT) performance and summative Progress Test outcomes. Each panel displays standardized residuals from linear regression models with fitted regression lines and 95% confidence intervals. <bold>(A)</bold> Skill&#x2013;Memory, <bold>(B)</bold> Skill&#x2013;Analysis, <bold>(C)</bold> Final CAT score, and <bold>(D)</bold> Skill&#x2013;Decision. Among the predictors, Skill&#x2013;Memory showed the strongest positive association with Progress Test performance, while the other domains exhibited weaker or non-significant trends.</p>
</caption>
<graphic xlink:href="fnbeh-19-1735237-g005.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Scatter plot grid showing four panels: A) Residuals Progress Test Score vs. Residuals Skill-Memory, B) Residuals Progress Test Score vs. Residuals Skill-Analysis, C) Residuals Progress Test Score vs. Residuals Final Score, D) Residuals Progress Test Score vs. Residuals Skill-Decision. Each panel has data points scattered around a line indicating a positive correlation.</alt-text>
</graphic>
</fig>
</sec>
<sec sec-type="discussion" id="sec12">
<title>Discussion</title>
<p>This study explored the use of CAT combined with CDM as a formative assessment tool in undergraduate surgical education. A distinctive feature of the intervention was the integration of AI into both item development and feedback generation, allowing efficient test calibration, individualized difficulty estimation, and automated reporting with minimal faculty workload. These findings align with the growing evidence that AI-driven assessment systems can enhance precision, scalability, and personalization in medical education (<xref ref-type="bibr" rid="ref26">Pohn et al., 2025</xref>; <xref ref-type="bibr" rid="ref32">Shaw et al., 2025</xref>; <xref ref-type="bibr" rid="ref17">Gordon et al., 2024</xref>; <xref ref-type="bibr" rid="ref23">Mir et al., 2023</xref>; <xref ref-type="bibr" rid="ref33">Sunmboye et al., 2025</xref>; <xref ref-type="bibr" rid="ref16">Gomez et al., 2025</xref>).</p>
<p>The association between participation in the formative CAT&#x2013;CDM exam and higher performance in the subsequent summative Progress Test suggests that adaptive, feedback-oriented assessments may promote more effective study strategies. The multiple regression analysis, including all cognitive domains and the total CAT score, confirmed that only <italic>Skill&#x2013;Memory</italic> remained a significant independent predictor of summative performance, whereas <italic>Skill&#x2013;Analysis</italic> and <italic>Skill&#x2013;Decision</italic> did not contribute additional explanatory value once intercorrelations were controlled for. This finding indicates that the predictive relationship between analytical and decision-making domains and summative performance is largely shared with memory-based competence. It also reinforces the interpretation that current multiple-choice examination formats primarily reward factual recall rather than complex reasoning or integrative decision-making. These results highlight the need for assessment strategies capable of isolating higher-order cognitive processes from underlying knowledge recall. One possible explanation is that items categorized as decision may not have captured the full complexity of real-world clinical reasoning. As highlighted in previous work, problem-solving in surgery often depends on context, uncertainty, and prioritization rather than discrete knowledge application (<xref ref-type="bibr" rid="ref29">Ross et al., 2022</xref>; <xref ref-type="bibr" rid="ref9">Crebbin et al., 2013</xref>). Multiple-choice questions, even when well constructed, are limited in their ability to elicit such integrative reasoning. In contrast, memory items, more closely aligned with the structure of summative exams, may show stronger statistical relationships simply because both assessments rely on similar cognitive processes. 
This alignment may reflect systemic bias toward factual recall in traditional testing, emphasizing the need for new formats that authentically measure complex reasoning, such as rich clinical vignettes, branching scenarios, or virtual patients (<xref ref-type="bibr" rid="ref16">Gomez et al., 2025</xref>; <xref ref-type="bibr" rid="ref3">Billings et al., n.d.</xref>; <xref ref-type="bibr" rid="ref27">Rice et al., 2022</xref>). Beyond item design limitations, several alternative explanations merit consideration. First, decision-making competence may not yet be sufficiently developed in fifth-year medical students to exhibit measurable variance, given that authentic clinical decision-making typically matures during postgraduate training. Second, the near-zero correlation between analysis and decision domains (<italic>&#x03C1;</italic>&#x202F;=&#x202F;0.08) suggests potential overlap or misalignment within the cognitive classification framework itself, indicating that the current three-domain model may have limited discriminant validity. Third, the summative Progress Test used as the external criterion primarily measures factual and analytical reasoning, which may not fully capture the situational judgment or uncertainty management considered core features of surgical decision-making. Fourth, decision-making items may demand context integration and abstraction beyond what can be effectively assessed in a brief, text-based MCQ, causing students to rely on pattern recognition rather than deliberate reasoning. Fifth, the weak association might also reflect a misalignment between formative and summative constructs: whereas the CAT&#x2013;CDM aimed to assess applied reasoning, the Progress Test could be predominantly capturing factual recall, thereby reducing shared variance by design. 
These combined factors likely explain the absence of significant associations and underscore that current multiple-choice formats are inherently constrained in representing complex cognitive processes such as risk&#x2013;benefit reasoning, ethical trade-offs, and context-specific prioritization. As such, the null finding highlights an important boundary in the construct validity of decision-making assessment and identifies an area for future instrument development and validation.</p>
<p>From an educational perspective, these results have practical implications for curriculum design. Although statistically reliable, the effect sizes were modest, suggesting that the practical impact of short formative interventions may be incremental rather than transformative. If recall remains the main predictor of summative success, surgical educators risk over-emphasizing rote knowledge at the expense of reasoning and decision-making competence. Integrating adaptive cognitive diagnostics into teaching could help identify students who rely predominantly on memorization and guide them toward deliberate practice in interpretation and judgment. Educators should consider coupling CAT-CDM data with simulation or case-based discussions, transforming feedback into structured remediation plans. In parallel, AI-supported feedback dashboards could provide students with a dynamic map of their evolving cognitive profile, encouraging self-regulated learning and early correction of deficiencies. By integrating these tools, surgical curricula could progressively shift from knowledge reproduction to diagnostic reasoning and decision-making mastery.</p>
<p>Beyond its psychometric contribution, the present study provides several educational insights. The CAT&#x2013;CDM framework aligns closely with principles of adaptive learning, in which instructional content and assessment dynamically adjust to each learner&#x2019;s ability level, which we believe to be key to learning. By identifying specific cognitive domains requiring reinforcement, adaptive testing provides personalized diagnostic feedback that can guide self-regulated learning. Students can use the domain-specific results to direct their study strategies toward weaker areas, engage in deliberate practice, and monitor progress over time. From an instructional standpoint, such diagnostic information enables educators to allocate remediation resources more efficiently and to tailor teaching toward common cognitive gaps.</p>
<p>Furthermore, the integration of AI-assisted item generation and feedback represents a scalable model for formative assessment with minimal faculty workload, consistent with the growing emphasis on feedback as a continuous, learner-driven process rather than an episodic event. By converting performance data into interpretable cognitive profiles, this approach helps close the feedback loop and fosters reflection, autonomy, and iterative improvement.</p>
<p>Several limitations must be acknowledged. First, participation in the formative exam was voluntary, introducing potential selection bias, as more motivated students may have been more likely to participate. Although all students from the same curricular unit were invited to participate, and overall participation was high (79%), voluntary participation may have led to self-selection bias, with more motivated or academically stronger students being overrepresented among CAT participants. This possibility should be considered when interpreting the association between formative and summative performance since it restricts the ability to establish causal inferences; therefore, the observed differences should be interpreted as associations rather than direct effects of the intervention. This short interval of 3&#x2013;5&#x202F;days before the summative Progress Test mirrors real-world preparation patterns in which students consolidate learning shortly before assessment. The aim was not to measure long-term knowledge retention but to determine whether adaptive feedback could inform final revision strategies. Future studies will extend the interval between formative and summative assessments to investigate sustained learning effects and behavioral change over time. Second, this was a single-center study, potentially limiting applicability to other educational contexts. Finally, the study focused on short-term outcomes; future research should examine whether the observed benefits persist in long-term knowledge retention and clinical performance. Although initial inter-rater reliability for cognitive classification was substantial (<italic>&#x03BA;</italic>&#x202F;=&#x202F;0.76), and retrospective double-blind validation of a sample of items demonstrated similarly substantial agreement (<italic>&#x03BA;</italic>&#x202F;=&#x202F;0.68), some residual subjectivity in distinguishing analysis from decision items cannot be fully excluded. 
This issue is particularly relevant given the weaker associations observed in the Skill&#x2013;Decision domain. Future research should include independent expert classification of the entire item bank to further strengthen the construct validity of cognitive domain distinctions.</p>
<p>Future research should employ randomized or crossover designs to confirm efficacy, explore the durability of learning effects, and examine the longitudinal impact of adaptive feedback on self-regulated learning. Establishing causal links between adaptive testing, feedback quality, and performance improvement would provide stronger evidence for scaling this approach. Integrating the CAT&#x2013;CDM model with simulation-based or virtual-patient environments could enhance authenticity and allow assessment of higher-order decision-making under uncertainty&#x2014;an essential but often under-evaluated component of surgical competence. Expanding item banks with scenario-driven and branching questions may further strengthen construct validity for complex reasoning. Ultimately, such investigations could contribute to a more holistic and adaptive assessment ecosystem encompassing the cognitive, technical, and non-technical dimensions of surgical education.</p>
</sec>
<sec sec-type="conclusions" id="sec13">
<title>Conclusion</title>
<p>This study found that the use of Computerized Adaptive Testing (CAT) combined with Cognitive Diagnostic Modeling (CDM), supported by AI-based item generation and automated feedback, was associated with higher performance in a subsequent summative assessment. Although decision-making skills were underrepresented in predictive models, the results highlight the need for curricular strategies that better promote higher-order cognitive processing in surgical education. Within the acknowledged methodological limitations, AI-enhanced CAT&#x2013;CDM emerges as a promising approach for delivering meaningful cognitive feedback and fostering data-informed learning in medical training.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec14">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec sec-type="ethics-statement" id="sec15">
<title>Ethics statement</title>
<p>The studies involving humans were approved by the Ethics Committee for Research in Life and Health Sciences. The studies were conducted in accordance with the local legislation and institutional requirements. The participants provided their written informed consent to participate in this study.</p>
</sec>
<sec sec-type="author-contributions" id="sec16">
<title>Author contributions</title>
<p>NS: Validation, Resources, Investigation, Visualization, Data curation, Conceptualization, Formal analysis, Project administration, Writing &#x2013; review &#x0026; editing, Funding acquisition, Software, Supervision, Methodology, Writing &#x2013; original draft. CC: Data curation, Validation, Conceptualization, Supervision, Methodology, Project administration, Investigation, Resources, Writing &#x2013; review &#x0026; editing, Funding acquisition, Software, Formal analysis, Visualization. JP: Visualization, Resources, Funding acquisition, Validation, Formal analysis, Project administration, Supervision, Data curation, Writing &#x2013; review &#x0026; editing, Investigation, Conceptualization, Software, Methodology.</p>
</sec>
<sec sec-type="COI-statement" id="sec17">
<title>Conflict of interest</title>
<p>JP is a co-founder of iCognitus4ALL &#x2013; IT Solutions (Porto, Portugal), the company that developed QuizOne<sup>&#x00AE;</sup>, the assessment platform used in this study. This affiliation did not influence study design, data collection, analysis, or interpretation of results. No financial compensation, consultancy fees, or royalties were received by any author for conducting or publishing this research. QuizOne<sup>&#x00AE;</sup> was accessed through the University of Minho&#x2019;s standard institutional spin-off agreement at no additional cost to the research project, and the platform is routinely employed for both formative and summative assessments across the medical curriculum. iCognitus had no role in manuscript preparation, data analysis, interpretation of findings, or the decision to publish. There were no restrictions on the reporting of results, including null or negative findings.</p>
<p>The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
<p>The author(s) declared that they were an editorial board member of Frontiers at the time of submission. This had no impact on the peer review process and the final decision.</p>
</sec>
<sec sec-type="ai-statement" id="sec18">
<title>Generative AI statement</title>
<p>The authors declare that no Gen AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec19">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec20">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/fnbeh.2025.1735237/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/fnbeh.2025.1735237/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Table_1.DOCX" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document"/>
<supplementary-material xlink:href="Table_2.DOCX" id="SM2" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Appelhaus</surname><given-names>S.</given-names></name> <name><surname>Werner</surname><given-names>S.</given-names></name> <name><surname>Grosse</surname><given-names>P.</given-names></name> <name><surname>K&#x00E4;mmer</surname><given-names>J. E.</given-names></name></person-group> (<year>2023</year>). <article-title>Feedback, fairness, and validity: effects of disclosing and reusing multiple-choice questions in medical schools</article-title>. <source>Med. Educ. Online</source> <volume>28</volume>:<fpage>2143298</fpage>. doi: <pub-id pub-id-type="doi">10.1080/10872981.2022.2143298</pub-id>, <pub-id pub-id-type="pmid">36350605</pub-id></mixed-citation></ref>
<ref id="ref2"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Barthakur</surname><given-names>A.</given-names></name> <name><surname>Kovanovic</surname><given-names>V.</given-names></name> <name><surname>Joksimovic</surname><given-names>S.</given-names></name> <name><surname>Zhang</surname><given-names>Z.</given-names></name> <name><surname>Richey</surname><given-names>M.</given-names></name> <name><surname>Pardo</surname><given-names>A.</given-names></name></person-group> (<year>2022</year>). <article-title>Measuring leadership development in workplace learning using automated assessments: learning analytics and measurement theory approach</article-title>. <source>Br. J. Educ. Technol.</source> <volume>53</volume>, <fpage>1842</fpage>&#x2013;<lpage>1863</lpage>. doi: <pub-id pub-id-type="doi">10.1111/bjet.13218</pub-id></mixed-citation></ref>
<ref id="ref3"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Billings</surname><given-names>MS</given-names></name> <name><surname>DeRuchie</surname><given-names>K</given-names></name> <name><surname>Go</surname><given-names>S</given-names></name> <name><surname>Hussie</surname><given-names>K</given-names></name> <name><surname>Kulesher</surname><given-names>A</given-names></name> <name><surname>Merrell</surname><given-names>J</given-names></name></person-group> <source>Constructing written test questions for the health sciences</source> <year>n.d.</year></mixed-citation></ref>
<ref id="ref4"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Burgess</surname><given-names>A.</given-names></name> <name><surname>van Diggele</surname><given-names>C.</given-names></name> <name><surname>Roberts</surname><given-names>C.</given-names></name> <name><surname>Mellis</surname><given-names>C.</given-names></name></person-group> (<year>2020</year>). <article-title>Feedback in the clinical setting</article-title>. <source>BMC Med. Educ.</source> <volume>20</volume>:<fpage>460</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12909-020-02280-5</pub-id>, <pub-id pub-id-type="pmid">33272265</pub-id></mixed-citation></ref>
<ref id="ref5"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Burr</surname><given-names>S. A.</given-names></name> <name><surname>Gale</surname><given-names>T.</given-names></name> <name><surname>Kisielewska</surname><given-names>J.</given-names></name> <name><surname>Millin</surname><given-names>P.</given-names></name> <name><surname>P&#x00EA;go</surname><given-names>J. M.</given-names></name> <name><surname>Pinter</surname><given-names>G.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>A narrative review of adaptive testing and its application to medical education</article-title>. <source>MedEdPublish</source> <volume>13</volume>:<fpage>221</fpage>. doi: <pub-id pub-id-type="doi">10.12688/mep.19844.1</pub-id></mixed-citation></ref>
<ref id="ref6"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Butler</surname><given-names>A. C.</given-names></name> <name><surname>Roediger</surname><given-names>H. L.</given-names></name></person-group> (<year>2008</year>). <article-title>Feedback enhances the positive effects and reduces the negative effects of multiple-choice testing</article-title>. <source>Mem. Cogn.</source> <volume>36</volume>, <fpage>604</fpage>&#x2013;<lpage>616</lpage>. doi: <pub-id pub-id-type="doi">10.3758/MC.36.3.604</pub-id>, <pub-id pub-id-type="pmid">18491500</pub-id></mixed-citation></ref>
<ref id="ref7"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chang</surname><given-names>H.-H.</given-names></name> <name><surname>Ying</surname><given-names>Z.</given-names></name></person-group> (<year>1996</year>). <article-title>A global information approach to computerized adaptive testing</article-title>. <source>Appl. Psychol. Meas.</source> <volume>20</volume>, <fpage>213</fpage>&#x2013;<lpage>229</lpage>. doi: <pub-id pub-id-type="doi">10.1177/014662169602000303</pub-id></mixed-citation></ref>
<ref id="ref8"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Collares</surname><given-names>C. F.</given-names></name> <name><surname>Cecilio-Fernandes</surname><given-names>D.</given-names></name></person-group> (<year>2019</year>). <article-title>When I say &#x2026; computerised adaptive testing</article-title>. <source>Med. Educ.</source> <volume>53</volume>, <fpage>115</fpage>&#x2013;<lpage>116</lpage>. doi: <pub-id pub-id-type="doi">10.1111/medu.13648</pub-id>, <pub-id pub-id-type="pmid">30125393</pub-id></mixed-citation></ref>
<ref id="ref9"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Crebbin</surname><given-names>W.</given-names></name> <name><surname>Beasley</surname><given-names>S. W.</given-names></name> <name><surname>Watters</surname><given-names>D. A. K.</given-names></name></person-group> (<year>2013</year>). <article-title>Clinical decision making: how surgeons do it</article-title>. <source>ANZ J. Surg.</source> <volume>83</volume>, <fpage>422</fpage>&#x2013;<lpage>428</lpage>. doi: <pub-id pub-id-type="doi">10.1111/ans.12180</pub-id>, <pub-id pub-id-type="pmid">23638720</pub-id></mixed-citation></ref>
<ref id="ref10"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dedy</surname><given-names>N. J.</given-names></name> <name><surname>Bonrath</surname><given-names>E. M.</given-names></name> <name><surname>Zevin</surname><given-names>B.</given-names></name> <name><surname>Grantcharov</surname><given-names>T. P.</given-names></name></person-group> (<year>2013</year>). <article-title>Teaching nontechnical skills in surgical residency: a systematic review of current approaches and outcomes</article-title>. <source>Surgery</source> <volume>154</volume>, <fpage>1000</fpage>&#x2013;<lpage>1008</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.surg.2013.04.034</pub-id>, <pub-id pub-id-type="pmid">23777588</pub-id></mixed-citation></ref>
<ref id="ref11"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>El Boghdady</surname><given-names>M.</given-names></name> <name><surname>Alijani</surname><given-names>A.</given-names></name></person-group> (<year>2017</year>). <article-title>Feedback in surgical education</article-title>. <source>Surgeon</source> <volume>15</volume>, <fpage>98</fpage>&#x2013;<lpage>103</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.surge.2016.06.006</pub-id>, <pub-id pub-id-type="pmid">27426914</pub-id></mixed-citation></ref>
<ref id="ref12"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Falc&#x00E3;o</surname><given-names>F.</given-names></name> <name><surname>Costa</surname><given-names>P.</given-names></name> <name><surname>P&#x00EA;go</surname><given-names>J. M.</given-names></name></person-group> (<year>2022</year>). <article-title>Feasibility assurance: a review of automatic item generation in medical assessment</article-title>. <source>Adv. Health Sci. Educ. Theory Pract.</source> <volume>27</volume>, <fpage>405</fpage>&#x2013;<lpage>425</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10459-022-10092-z</pub-id>, <pub-id pub-id-type="pmid">35230589</pub-id></mixed-citation></ref>
<ref id="ref13"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Falc&#x00E3;o</surname><given-names>F.</given-names></name> <name><surname>Pereira</surname><given-names>D. M.</given-names></name> <name><surname>Gon&#x00E7;alves</surname><given-names>N.</given-names></name> <name><surname>De Champlain</surname><given-names>A.</given-names></name> <name><surname>Costa</surname><given-names>P.</given-names></name> <name><surname>P&#x00EA;go</surname><given-names>J. M.</given-names></name></person-group> (<year>2023</year>). <article-title>A suggestive approach for assessing item quality, usability and validity of automatic item generation</article-title>. <source>Adv. Health Sci. Educ. Theory Pract.</source> <volume>28</volume>, <fpage>1441</fpage>&#x2013;<lpage>1465</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10459-023-10225-y</pub-id>, <pub-id pub-id-type="pmid">37097483</pub-id></mixed-citation></ref>
<ref id="ref14"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Garner</surname><given-names>M. S.</given-names></name> <name><surname>Gusberg</surname><given-names>R. J.</given-names></name> <name><surname>Kim</surname><given-names>A. W.</given-names></name></person-group> (<year>2014</year>). <article-title>The positive effect of immediate feedback on medical student education during the surgical clerkship</article-title>. <source>J. Surg. Educ.</source> <volume>71</volume>, <fpage>391</fpage>&#x2013;<lpage>397</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jsurg.2013.10.009</pub-id>, <pub-id pub-id-type="pmid">24797856</pub-id></mixed-citation></ref>
<ref id="ref15"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gierl</surname><given-names>M. J.</given-names></name> <name><surname>Shin</surname><given-names>J.</given-names></name> <name><surname>Firoozi</surname><given-names>T.</given-names></name> <name><surname>Lai</surname><given-names>H.</given-names></name></person-group> (<year>2022</year>). <article-title>Using content coding and automatic item generation to improve test security</article-title>. <source>Front. Educ.</source> <volume>7</volume>. doi: <pub-id pub-id-type="doi">10.3389/feduc.2022.853578</pub-id></mixed-citation></ref>
<ref id="ref16"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Gomez</surname><given-names>C</given-names></name> <name><surname>Seenivasan</surname><given-names>L</given-names></name> <name><surname>Zou</surname><given-names>X</given-names></name> <name><surname>Yoon</surname><given-names>J</given-names></name> <name><surname>Chu</surname><given-names>S</given-names></name> <name><surname>Leong</surname><given-names>A</given-names></name> <etal/></person-group> <source>Explainable AI for automated user-specific feedback in surgical skill acquisition</source>. In <conf-name>Human-AI collaboration: first international workshop, HAIC 2025, held in conjunction with MICCAI 2025, Daejeon, South Korea</conf-name> <year>2025</year>.</mixed-citation></ref>
<ref id="ref17"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gordon</surname><given-names>M.</given-names></name> <name><surname>Daniel</surname><given-names>M.</given-names></name> <name><surname>Ajiboye</surname><given-names>A.</given-names></name> <name><surname>Uraiby</surname><given-names>H.</given-names></name> <name><surname>Xu</surname><given-names>N. Y.</given-names></name> <name><surname>Bartlett</surname><given-names>R.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>A scoping review of artificial intelligence in medical education: BEME guide no. 84</article-title>. <source>Med. Teach.</source> <volume>46</volume>, <fpage>446</fpage>&#x2013;<lpage>470</lpage>. doi: <pub-id pub-id-type="doi">10.1080/0142159X.2024.2314198</pub-id>, <pub-id pub-id-type="pmid">38423127</pub-id></mixed-citation></ref>
<ref id="ref18"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Harrison</surname></name></person-group> (<year>2017</year>). <source>Feedback in the context of high-stakes assessment: can summative be formative?</source> <publisher-loc>Maastricht</publisher-loc>: <publisher-name>Maastricht University</publisher-name>, doi: <pub-id pub-id-type="doi">10.26481/dis.20170920ch</pub-id>.</mixed-citation></ref>
<ref id="ref19"><mixed-citation publication-type="book"><person-group person-group-type="editor"><name><surname>Leighton</surname><given-names>J.</given-names></name> <name><surname>Gierl</surname><given-names>M.</given-names></name></person-group> (Eds.) (<year>2007</year>). <source>Cognitive diagnostic assessment for education: theory and applications</source>. <publisher-loc>Cambridge</publisher-loc>: <publisher-name>Cambridge University Press</publisher-name>, doi: <pub-id pub-id-type="doi">10.1017/CBO9780511611186</pub-id>.</mixed-citation></ref>
<ref id="ref20"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lord</surname><given-names>F. M.</given-names></name></person-group> (<year>1971</year>). <article-title>Robbins-Monro procedures for tailored testing</article-title>. <source>Educ. Psychol. Meas.</source> <volume>31</volume>, <fpage>3</fpage>&#x2013;<lpage>31</lpage>. doi: <pub-id pub-id-type="doi">10.1177/001316447103100101</pub-id></mixed-citation></ref>
<ref id="ref21"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ma</surname><given-names>C.</given-names></name> <name><surname>Ouyang</surname><given-names>J.</given-names></name> <name><surname>Xu</surname><given-names>G.</given-names></name></person-group> (<year>2023</year>). <article-title>Learning latent and hierarchical structures in cognitive diagnosis models</article-title>. <source>Psychometrika</source> <volume>88</volume>, <fpage>175</fpage>&#x2013;<lpage>207</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11336-022-09867-5</pub-id>, <pub-id pub-id-type="pmid">35596101</pub-id></mixed-citation></ref>
<ref id="ref22"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Madani</surname><given-names>A.</given-names></name> <name><surname>Vassiliou</surname><given-names>M. C.</given-names></name> <name><surname>Watanabe</surname><given-names>Y.</given-names></name> <name><surname>Al-Halabi</surname><given-names>B.</given-names></name> <name><surname>Al-Rowais</surname><given-names>M. S.</given-names></name> <name><surname>Deckelbaum</surname><given-names>D. L.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>What are the principles that guide behaviors in the operating room? Creating a framework to define and measure performance</article-title>. <source>Ann. Surg.</source> <volume>265</volume>, <fpage>255</fpage>&#x2013;<lpage>267</lpage>. doi: <pub-id pub-id-type="doi">10.1097/SLA.0000000000001962</pub-id>, <pub-id pub-id-type="pmid">27611618</pub-id></mixed-citation></ref>
<ref id="ref23"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mir</surname><given-names>M. M.</given-names></name> <name><surname>Mir</surname><given-names>G. M.</given-names></name> <name><surname>Raina</surname><given-names>N. T.</given-names></name> <name><surname>Mir</surname><given-names>S. M.</given-names></name> <name><surname>Mir</surname><given-names>S. M.</given-names></name> <name><surname>Miskeen</surname><given-names>E.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Application of artificial intelligence in medical education: current scenario and future perspectives</article-title>. <source>J. Adv. Med. Educ. Prof.</source> <volume>11</volume>, <fpage>133</fpage>&#x2013;<lpage>140</lpage>. doi: <pub-id pub-id-type="doi">10.30476/JAMP.2023.98655.1803</pub-id>, <pub-id pub-id-type="pmid">37469385</pub-id></mixed-citation></ref>
<ref id="ref24"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ounounou</surname><given-names>E.</given-names></name> <name><surname>Aydin</surname><given-names>A.</given-names></name> <name><surname>Brunckhorst</surname><given-names>O.</given-names></name> <name><surname>Khan</surname><given-names>M. S.</given-names></name> <name><surname>Dasgupta</surname><given-names>P.</given-names></name> <name><surname>Ahmed</surname><given-names>K.</given-names></name></person-group> (<year>2019</year>). <article-title>Nontechnical skills in surgery: a systematic review of current training modalities</article-title>. <source>J. Surg. Educ.</source> <volume>76</volume>, <fpage>14</fpage>&#x2013;<lpage>24</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jsurg.2018.05.017</pub-id>, <pub-id pub-id-type="pmid">30122636</pub-id></mixed-citation></ref>
<ref id="ref25"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Owen</surname><given-names>R. J.</given-names></name></person-group> (<year>1975</year>). <article-title>A Bayesian sequential procedure for quantal response in the context of adaptive mental testing</article-title>. <source>J. Am. Stat. Assoc.</source> <volume>70</volume>, <fpage>351</fpage>&#x2013;<lpage>356</lpage>. doi: <pub-id pub-id-type="doi">10.2307/2285821</pub-id></mixed-citation></ref>
<ref id="ref26"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Pohn</surname><given-names>B.</given-names></name> <name><surname>Mehnen</surname><given-names>L.</given-names></name> <name><surname>Fitzek</surname><given-names>S.</given-names></name> <name><surname>Choi</surname><given-names>K.-E.</given-names></name> <name><surname>Braun</surname><given-names>R. J.</given-names></name> <name><surname>Hatamikia</surname><given-names>S.</given-names></name></person-group> (<year>2025</year>). <article-title>Integrating artificial intelligence into pre-clinical medical education: challenges, opportunities, and recommendations</article-title>. <source>Front. Educ.</source> <volume>10</volume>. doi: <pub-id pub-id-type="doi">10.3389/feduc.2025.1570389</pub-id></mixed-citation></ref>
<ref id="ref27"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rice</surname><given-names>N.</given-names></name> <name><surname>P&#x00EA;go</surname><given-names>J. M.</given-names></name> <name><surname>Collares</surname><given-names>C. F.</given-names></name> <name><surname>Kisielewska</surname><given-names>J.</given-names></name> <name><surname>Gale</surname><given-names>T.</given-names></name></person-group> (<year>2022</year>). <article-title>The development and implementation of a computer adaptive progress test across European countries</article-title>. <source>Comput. Educ. Artif. Intell.</source> <volume>3</volume>:<fpage>100083</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.caeai.2022.100083</pub-id>, <pub-id pub-id-type="pmid">41295951</pub-id></mixed-citation></ref>
<ref id="ref28"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rosendal</surname><given-names>A. A.</given-names></name> <name><surname>Sloth</surname><given-names>S. B.</given-names></name> <name><surname>R&#x00F6;lfing</surname><given-names>J. D.</given-names></name> <name><surname>Bie</surname><given-names>M.</given-names></name> <name><surname>Jensen</surname><given-names>R. D.</given-names></name></person-group> (<year>2023</year>). <article-title>Technical, non-technical, or both? A scoping review of skills in simulation-based surgical training</article-title>. <source>J. Surg. Educ.</source> <volume>80</volume>, <fpage>731</fpage>&#x2013;<lpage>749</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jsurg.2023.02.011</pub-id>, <pub-id pub-id-type="pmid">36906398</pub-id></mixed-citation></ref>
<ref id="ref29"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ross</surname><given-names>S.</given-names></name> <name><surname>Pirraglia</surname><given-names>C.</given-names></name> <name><surname>Aquilina</surname><given-names>A. M.</given-names></name> <name><surname>Zulla</surname><given-names>R.</given-names></name></person-group> (<year>2022</year>). <article-title>Effective competency-based medical education requires learning environments that promote a mastery goal orientation: a narrative review</article-title>. <source>Med. Teach.</source> <volume>44</volume>, <fpage>527</fpage>&#x2013;<lpage>534</lpage>. doi: <pub-id pub-id-type="doi">10.1080/0142159X.2021.2004307</pub-id>, <pub-id pub-id-type="pmid">34807798</pub-id></mixed-citation></ref>
<ref id="ref30"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Seo</surname><given-names>D. G.</given-names></name> <name><surname>Choi</surname><given-names>J.</given-names></name> <name><surname>Kim</surname><given-names>J.</given-names></name></person-group> (<year>2024</year>). <article-title>Comparison of real data and simulated data analysis of a stopping rule based on the standard error of measurement in computerized adaptive testing for medical examinations in Korea: a psychometric study</article-title>. <source>J. Educ. Eval. Health Prof.</source> <volume>21</volume>:<fpage>18</fpage>. doi: <pub-id pub-id-type="doi">10.3352/jeehp.2024.21.18</pub-id>, <pub-id pub-id-type="pmid">38977033</pub-id></mixed-citation></ref>
<ref id="ref31"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shaughness</surname><given-names>G.</given-names></name> <name><surname>Georgoff</surname><given-names>P. E.</given-names></name> <name><surname>Sandhu</surname><given-names>G.</given-names></name> <name><surname>Leininger</surname><given-names>L.</given-names></name> <name><surname>Nikolian</surname><given-names>V. C.</given-names></name> <name><surname>Reddy</surname><given-names>R.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Assessment of clinical feedback given to medical students via an electronic feedback system</article-title>. <source>J. Surg. Res.</source> <volume>218</volume>, <fpage>174</fpage>&#x2013;<lpage>179</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jss.2017.05.055</pub-id>, <pub-id pub-id-type="pmid">28985846</pub-id></mixed-citation></ref>
<ref id="ref32"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shaw</surname><given-names>K.</given-names></name> <name><surname>Henning</surname><given-names>M. A.</given-names></name> <name><surname>Webster</surname><given-names>C. S.</given-names></name></person-group> (<year>2025</year>). <article-title>Artificial intelligence in medical education: a scoping review of the evidence for efficacy and future directions</article-title>. <source>Med. Sci. Educ.</source> <volume>35</volume>, <fpage>1803</fpage>&#x2013;<lpage>1816</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s40670-025-02373-0</pub-id>, <pub-id pub-id-type="pmid">40625971</pub-id></mixed-citation></ref>
<ref id="ref33"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sunmboye</surname><given-names>K.</given-names></name> <name><surname>Strafford</surname><given-names>H.</given-names></name> <name><surname>Noorestani</surname><given-names>S.</given-names></name> <name><surname>Wilison-Pirie</surname><given-names>M.</given-names></name></person-group> (<year>2025</year>). <article-title>Exploring the influence of artificial intelligence integration on personalized learning: a cross-sectional study of undergraduate medical students in the United Kingdom</article-title>. <source>BMC Med. Educ.</source> <volume>25</volume>:<fpage>570</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12909-025-07084-z</pub-id>, <pub-id pub-id-type="pmid">40247306</pub-id></mixed-citation></ref>
<ref id="ref34"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Van Wijk</surname><given-names>E. V.</given-names></name> <name><surname>Donkers</surname><given-names>J.</given-names></name> <name><surname>De Laat</surname><given-names>P. C. J.</given-names></name> <name><surname>Meiboom</surname><given-names>A. A.</given-names></name> <name><surname>Jacobs</surname><given-names>B.</given-names></name> <name><surname>Ravesloot</surname><given-names>J. H.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Computer adaptive vs. non-adaptive medical progress testing: feasibility, test performance, and student experiences</article-title>. <source>Perspect. Med. Educ.</source> <volume>13</volume>, <fpage>406</fpage>&#x2013;<lpage>416</lpage>. doi: <pub-id pub-id-type="doi">10.5334/pme.1345</pub-id>, <pub-id pub-id-type="pmid">39071727</pub-id></mixed-citation></ref>
<ref id="ref35"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Williamson</surname><given-names>J.</given-names></name></person-group> <source>Cognitive diagnostic models and how they can be useful</source> <year>n.d.</year></mixed-citation></ref>
<ref id="ref36"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname><given-names>L.</given-names></name> <name><surname>Jiang</surname><given-names>Z.</given-names></name> <name><surname>Han</surname><given-names>Y.</given-names></name> <name><surname>Liang</surname><given-names>H.</given-names></name> <name><surname>Ouyang</surname><given-names>J.</given-names></name></person-group> (<year>2023</year>). <article-title>Developing computerized adaptive testing for a national health professionals exam: an attempt from psychometric simulations</article-title>. <source>Perspect. Med. Educ.</source> <volume>12</volume>, <fpage>462</fpage>&#x2013;<lpage>471</lpage>. doi: <pub-id pub-id-type="doi">10.5334/pme.855</pub-id>, <pub-id pub-id-type="pmid">37929203</pub-id></mixed-citation></ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001"><p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/21417/overview">Nivaldo A. P. de Vasconcelos</ext-link>, Federal University of Pernambuco, Brazil</p></fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002"><p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2406042/overview">David Escobar-Castillejos</ext-link>, Panamerican University, Mexico</p><p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3268302/overview">Mert Ba&#x015F;arano&#x011F;lu</ext-link>, Mersin University, T&#x00FC;rkiye</p></fn>
</fn-group>
</back>
</article>