<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article article-type="research-article" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Digit. Health</journal-id><journal-title-group>
<journal-title>Frontiers in Digital Health</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Digit. Health</abbrev-journal-title></journal-title-group>
<issn pub-type="epub">2673-253X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fdgth.2025.1737882</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Evaluation of multiple generative large language models on neurology board-style questions</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author"><name><surname>Almomani</surname><given-names>Mohammad</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/3093438/overview"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role></contrib>
<contrib contrib-type="author"><name><surname>Valaparla</surname><given-names>Vijaya</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/2145763/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role></contrib>
<contrib contrib-type="author"><name><surname>Weatherhead</surname><given-names>James</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/3283891/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Fang</surname><given-names>Xiang</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/260690/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Dabi</surname><given-names>Alok</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Li</surname><given-names>Chih-Ying</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>McCaffrey</surname><given-names>Peter</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Hier</surname><given-names>Dan</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/1230804/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author" corresp="yes"><name><surname>Rodr&#x00ED;guez-Fern&#x00E1;ndez</surname><given-names>Jorge Mario</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="cor1">&#x002A;</xref>
<xref ref-type="author-notes" rid="fn001"><sup>&#x2020;</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/74576/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role><role vocab="credit" 
vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Neurology, University of Texas Medical Branch (UTMB)</institution>, <city>Galveston</city>, <state>TX</state>, <country country="US">United States</country></aff>
<aff id="aff2"><label>2</label><institution>School of Biomedical Sciences, University of Texas Medical Branch (UTMB)</institution>, <city>Galveston</city>, <state>TX</state>, <country country="US">United States</country></aff>
<aff id="aff3"><label>3</label><institution>Department of Occupational Therapy, University of Texas Medical Branch (UTMB)</institution>, <city>Galveston</city>, <state>TX</state>, <country country="US">United States</country></aff>
<aff id="aff4"><label>4</label><institution>Department of Pathology, University of Texas Medical Branch (UTMB)</institution>, <city>Galveston</city>, <state>TX</state>, <country country="US">United States</country></aff>
<aff id="aff5"><label>5</label><institution>Department of Neurology and Rehabilitation, University of Illinois at Chicago</institution>, <city>Chicago</city>, <state>IL</state>, <country country="US">United States</country></aff>
<aff id="aff6"><label>6</label><institution>Department of Electrical and Computer Engineering, Missouri University of Science and Technology</institution>, <city>Rolla</city>, <state>MO</state>, <country country="US">United States</country></aff>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label><bold>Correspondence:</bold> Jorge Mario Rodr&#x00ED;guez-Fern&#x00E1;ndez <email xlink:href="mailto:jormrodr@utmb.edu">jormrodr@utmb.edu</email></corresp>
<fn fn-type="other" id="fn001"><label>&#x2020;</label><p>ORCID Jorge Mario Rodr&#x00ED;guez-Fern&#x00E1;ndez <uri xlink:href="https://orcid.org/0000-0003-4888-2041">orcid.org/0000-0003-4888-2041</uri></p></fn>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-01-05"><day>05</day><month>01</month><year>2026</year></pub-date>
<pub-date publication-format="electronic" date-type="collection"><year>2025</year></pub-date>
<volume>7</volume><elocation-id>1737882</elocation-id>
<history>
<date date-type="received"><day>02</day><month>11</month><year>2025</year></date>
<date date-type="rev-recd"><day>25</day><month>11</month><year>2025</year></date>
<date date-type="accepted"><day>01</day><month>12</month><year>2025</year></date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2026 Almomani, Valaparla, Weatherhead, Fang, Dabi, Li, McCaffrey, Hier and Rodr&#x00ED;guez-Fern&#x00E1;ndez.</copyright-statement>
<copyright-year>2026</copyright-year><copyright-holder>Almomani, Valaparla, Weatherhead, Fang, Dabi, Li, McCaffrey, Hier and Rodr&#x00ED;guez-Fern&#x00E1;ndez</copyright-holder><license><ali:license_ref start_date="2026-01-05">https://creativecommons.org/licenses/by/4.0/</ali:license_ref><license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p></license>
</permissions>
<abstract><sec><title>Objective</title>
<p>To compare the performance of eight large language models (LLMs) with neurology residents on board-style multiple-choice questions across seven subspecialties and two cognitive levels.</p>
</sec><sec><title>Methods</title>
<p>In a cross-sectional benchmarking study, we evaluated Bard, Claude, Gemini v1, Gemini 2.5, ChatGPT-3.5, ChatGPT-4, ChatGPT-4o, and ChatGPT-5 using 107 text-only items spanning movement disorders, vascular neurology, neuroanatomy, neuroimmunology, epilepsy, neuromuscular disease, and neuro-infectious disease. Items were labeled as lower- or higher-order per Bloom&#x0027;s taxonomy by two neurologists. Models answered each item in a fresh session and reported confidence and Bloom classification. Residents completed the same set under exam-like conditions. Outcomes included overall and domain accuracies, guessing-adjusted accuracy, confidence&#x2013;accuracy calibration (Spearman <italic>&#x03C1;</italic>), agreement with expert Bloom labels (Cohen <italic>&#x03BA;</italic>), and inter-generation scaling (linear regression of topic-level accuracies). Group differences used Fisher exact or <italic>&#x03C7;</italic><sup>2</sup> tests with Bonferroni correction.</p>
</sec><sec><title>Results</title>
<p>Residents scored 64.9&#x0025;. ChatGPT-5 achieved 84.1&#x0025; and ChatGPT-4o 81.3&#x0025;, followed by Gemini 2.5 at 77.6&#x0025; and ChatGPT-4 at 68.2&#x0025;; Claude (56.1&#x0025;), Bard (54.2&#x0025;), ChatGPT-3.5 (53.3&#x0025;), and Gemini v1 (39.3&#x0025;) underperformed residents. On higher-order items, ChatGPT-5 (86&#x0025;) and ChatGPT-4o (82.5&#x0025;) maintained superiority; Gemini 2.5 matched 82.5&#x0025;. Guessing-adjusted accuracy preserved rank order (ChatGPT-5 78.8&#x0025;, ChatGPT-4o 75.1&#x0025;, Gemini 2.5 70.1&#x0025;). Confidence&#x2013;accuracy calibration was weak across models. Inter-generation scaling was strong within the ChatGPT lineage (ChatGPT-4 to 4o <italic>R</italic><sup>2</sup>&#x2009;&#x003D;&#x2009;0.765, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.010; 4o to 5 <italic>R</italic><sup>2</sup>&#x2009;&#x003D;&#x2009;0.908, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001) but absent for Gemini v1 to 2.5 (<italic>R</italic><sup>2</sup>&#x2009;&#x003D;&#x2009;0.002, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.918), suggesting discontinuous improvements.</p>
</sec><sec><title>Conclusions</title>
<p>LLMs&#x2014;particularly ChatGPT-5 and ChatGPT-4o&#x2014;exceeded resident performance on text-based neurology board-style questions across subspecialties and cognitive levels. Gemini 2.5 showed substantial gains over v1 but with domain-uneven scaling. Given weak confidence calibration, LLMs should be integrated as supervised educational adjuncts with ongoing validation, version governance, and transparent metadata to support safe use in neurology education.</p>
</sec>
</abstract>
<kwd-group>
<kwd>artificial intelligence</kwd>
<kwd>large language models</kwd>
<kwd>neurology education</kwd>
<kwd>board examinations</kwd>
<kwd>model performance analysis</kwd>
</kwd-group><funding-group><funding-statement>The author(s) declare that no financial support was received for the research and/or publication of this article.</funding-statement></funding-group><counts>
<fig-count count="2"/>
<table-count count="3"/><equation-count count="0"/><ref-count count="23"/><page-count count="8"/><word-count count="1110"/></counts><custom-meta-group><custom-meta><meta-name>section-at-acceptance</meta-name><meta-value>Health Informatics</meta-value></custom-meta></custom-meta-group>
</article-meta>
</front>
<body><sec id="s1" sec-type="intro"><title>Introduction</title>
<p>Rapid advancements in artificial intelligence (AI) have transformed many aspects of medicine, with generative AI emerging as a particularly promising innovation in neurology (<xref ref-type="bibr" rid="B1">1</xref>). Large language models (LLMs), a subset of AI, hold potential for enhancing diagnostic accuracy, advancing therapeutics, and contributing to patient and clinician education (<xref ref-type="bibr" rid="B2">2</xref>). Machine learning approaches have already demonstrated applicability across neurology subspecialties by improving the analysis of complex clinical data and supporting individualized outcome prediction (<xref ref-type="bibr" rid="B3">3</xref>).</p>
<p>LLMs are trained on vast corpora using deep learning and have exhibited strong performance in natural language processing tasks such as summarization, translation, and question answering. Recently, models such as OpenAI&#x0027;s ChatGPT and Microsoft&#x0027;s Bing Chat have been evaluated on standardized medical licensing examinations, including the United States Medical Licensing Examination (USMLE), where they achieved near-pass or passing scores (<xref ref-type="bibr" rid="B4">4</xref>, <xref ref-type="bibr" rid="B5">5</xref>). However, their application in neurology remains relatively limited. One study reported that ChatGPT-3.5 performed below the average passing threshold on neurosurgery board examinations, whereas ChatGPT-4 exceeded passing standards (<xref ref-type="bibr" rid="B6">6</xref>).</p>
<p>Neurology board examinations present unique challenges, requiring mastery of complex clinical narratives integrating neuroanatomy, neuropathology, and neurophysiology. These assessments demand higher-order reasoning, nuanced differential diagnosis, and synthesis of multifaceted case presentations (<xref ref-type="bibr" rid="B7">7</xref>). While prior research has examined LLMs on general medical board-style assessments, their performance in neurology-specific examinations has not been systematically characterized across multiple model generations (<xref ref-type="bibr" rid="B8">8</xref>&#x2013;<xref ref-type="bibr" rid="B10">10</xref>). This study addresses that gap by assessing the accuracy, reasoning capabilities, and limitations of LLMs in neurology board&#x2013;style examinations and by comparing their performance with neurology residents. By understanding both the strengths and weaknesses of LLMs in this specialized context, we aim to evaluate their role in medical education and clinical decision support, paving the way for future AI-assisted advancements in neurology.</p>
</sec>
<sec id="s2" sec-type="methods"><title>Methods</title>
<sec id="s2a"><title>Study design and ethical considerations</title>
<p>This was a cross-sectional exploratory study. Institutional review board (IRB) exemption was obtained because the project did not involve human participants or identifiable patient information and was conducted as an educational benchmarking activity using deidentified data.</p>
</sec>
<sec id="s2b"><title>Question development and classification</title>
<p>Multiple-choice questions were written and reviewed by board-certified physicians to resemble neurology board examinations. All questions were written <italic>de novo</italic> by board-certified neurologists and were not adapted or copied from commercial question banks or public online sources. All items were text-based, excluded radiologic or pathologic images, and followed a single-best-answer format. A total of 107 questions were developed and categorized by subspecialty: epilepsy (<italic>n</italic>&#x2009;&#x003D;&#x2009;8), movement disorders (<italic>n</italic>&#x2009;&#x003D;&#x2009;13), neuroanatomy (<italic>n</italic>&#x2009;&#x003D;&#x2009;25), neuroimmunology (<italic>n</italic>&#x2009;&#x003D;&#x2009;14), neuroinfectious disease (<italic>n</italic>&#x2009;&#x003D;&#x2009;17), neuromuscular disease (<italic>n</italic>&#x2009;&#x003D;&#x2009;17), and vascular neurology (<italic>n</italic>&#x2009;&#x003D;&#x2009;13).</p>
<p>Each question was further classified according to Bloom&#x0027;s taxonomy as lower-order (remembering or basic understanding) or higher-order (application, analysis, or evaluation). Classification was performed independently by two board-certified physicians (V.V., J.M.R.F.), with disagreements resolved by consensus (<xref ref-type="bibr" rid="B11">11</xref>). Each large language model (LLM) was then asked to classify each item as higher- or lower-order and to provide confidence ratings for both classification and answer selection. The following standardized prompt was used (verbatim):</p><disp-quote>
<p>You are a medical doctor and are taking the neurology board exam. The board exam consists of multiple choice questions. All output that you give must be in CSV format with the following six columns (1) Question number (2) Return the answer letter (3) Give an explanation (4) Rate your own confidence in your answer based on a Likert scale that has the following grades: 1&#x2009;&#x003D;&#x2009;no confidence [stating it does not know]; 2&#x2009;&#x003D;&#x2009;little confidence [i.e., maybe]; 3&#x2009;&#x003D;&#x2009;some confidence; 4&#x2009;&#x003D;&#x2009;confidence [i.e., likely]; 5&#x2009;&#x003D;&#x2009;high confidence [stating answer and explanation without doubt] (5) Classify the question into the following two categories: (1) lower order questions that probe remembering and basic understanding, and (2) higher order question where knowledge needs to be applied, analysis capabilities are examined, or evaluation is needed (return &#x201C;Higher&#x201D; or &#x201C;Lower&#x201D;) (6). Rate the confidence of your classification into these categories based on the Likert scale that has the following grades1&#x2009;&#x003D;&#x2009;no confidence [stating it does not know]; 2&#x2009;&#x003D;&#x2009;little confidence [i.e., maybe]; 3&#x2009;&#x003D;&#x2009;some confidence; 4&#x2009;&#x003D;&#x2009;confidence [ie, likely]; 5&#x2009;&#x003D;&#x2009;high confidence [stating answer and explanation without doubt]) Your output must look like the following row header: &#x007B;&#x201C;questionnumber&#x201D;:&#x2026;,&#x201C;answerletter&#x201D;:&#x2026;,&#x201C;reasoning&#x201D;:&#x2026;,&#x201C;confidence_answer_likert&#x201C;:&#x2026;,&#x201D;classification&#x201C;:&#x2026;,&#x201D; confidence_classification_likert&#x201C;:&#x2026;&#x201D;&#x007D;.</p></disp-quote>
<p>Although the prompt includes an exam-level framing, each question was entered into a separate, isolated chat session to maintain strict question-level independence and prevent conversational carry-over. This phrasing was retained because it reliably improved adherence to the required CSV output format without creating shared multi-question context.</p>
</sec>
<sec id="s2c"><title>Model evaluation procedures</title>
<p>All models were evaluated through their publicly available, web-based graphical user interfaces (GUIs) to simulate real-world clinician and trainee use. Each question was entered in a new, independent chat session to prevent conversational context from influencing subsequent responses. The interaction date for each model was recorded to document the version tested. To preserve real-world usability and reflect typical clinician interactions with public LLM interfaces, model responses were collected directly from the web platforms without automated post-processing or formatting scripts. When the model&#x0027;s output did not fully adhere to the required CSV structure, the prompt was re-submitted once in a new, isolated session to obtain a complete response. No question was ever regenerated after the model had visibility of the correct answer, and answer content was not manually modified.</p>
</sec>
<sec id="s2d"><title>Resident comparison group</title>
<p>Neurology residents completed the same 107 board-style multiple-choice questions covering all major subspecialties. All results were deidentified before analysis to maintain confidentiality. A total of 16 neurology residents participated (PGY-4 <italic>n</italic>&#x2009;&#x003D;&#x2009;4, PGY-3 <italic>n</italic>&#x2009;&#x003D;&#x2009;6, PGY-2 <italic>n</italic>&#x2009;&#x003D;&#x2009;6), and participation was voluntary as part of an educational benchmarking activity. The study adhered to institutional standards for ethical conduct and data privacy. Question content and difficulty were designed to reflect those of the American Academy of Neurology (AAN) Residency In-Service Training Examination and the American Board of Psychiatry and Neurology (ABPN) Certification Examination (<xref ref-type="bibr" rid="B12">12</xref>, <xref ref-type="bibr" rid="B13">13</xref>).</p>
<p>Eight LLMs were tested using their publicly available web-based graphical user interfaces to replicate real-world conditions. Each question was entered into a new, independent session to prevent conversational carry-over. The models included Bard, Claude-1, Gemini v1, Gemini 2.5, ChatGPT-3.5, ChatGPT-4, ChatGPT-4o, and ChatGPT-5. All models were given identical prompts instructing them to select the most appropriate answer, provide a short explanation, classify the item as higher- or lower-order according to Bloom&#x0027;s taxonomy, and rate their confidence for both the answer and classification on a five-point Likert scale (1&#x2009;&#x003D;&#x2009;no confidence to 5&#x2009;&#x003D;&#x2009;high confidence). Each prompt was formatted for CSV output to facilitate later analysis, and all sessions were time-stamped to record the model version tested. These models were selected because they were publicly accessible, widely used by clinicians and trainees, and representative of major contemporary LLM families. Other contemporary or domain-specific models that were not publicly accessible through stable consumer-facing interfaces at the time of testing were not included in this study.</p>
<p>For human comparison, neurology residents at the University of Texas Medical Branch completed the same 107 questions under controlled exam conditions. All scores were deidentified before analysis to ensure anonymity and compliance with institutional privacy standards.</p>
</sec>
<sec id="s2e"><title>Outcome measures and statistical analysis</title>
<p>Accuracy was calculated as the proportion of correctly answered questions for each model and for the resident cohort. Comparisons between model and resident performance were performed using Fisher&#x0027;s exact test with Bonferroni correction for multiple comparisons. Accuracy differences between higher- and lower-order questions were evaluated using <italic>&#x03C7;</italic><sup>2</sup> tests, and corrected accuracy was computed using the formula: number correct&#x2009;&#x2212;&#x2009;[number incorrect &#x00F7; (<italic>k</italic>&#x2009;&#x2212;&#x2009;1)], where <italic>k</italic> represents the number of answer options. Associations between model confidence and correctness were assessed with the Mann&#x2013;Whitney <italic>U</italic> test and Spearman rank correlation (<italic>&#x03C1;</italic>), while agreement between model classifications and expert labels was measured with Cohen&#x0027;s <italic>&#x03BA;</italic>. Longitudinal model-to-model improvement was examined through linear regression of subspecialty-specific accuracies, focusing on the transitions from ChatGPT-4 to ChatGPT-4o, from ChatGPT-4o to ChatGPT-5, and from Gemini v1 to Gemini 2.5, with regression coefficients (<italic>&#x03B2;</italic>&#x2081;), coefficients of determination (<italic>R</italic><sup>2</sup>), and <italic>p</italic> values reported. All statistical analyses were conducted using R (version 4.0.5; R Foundation for Statistical Computing, Vienna, Austria), and statistical significance was defined as two-tailed <italic>p</italic>&#x2009;&#x003C;&#x2009;.05.</p>
</sec>
</sec>
<sec id="s3" sec-type="results"><title>Results</title>
<sec id="s3a"><title>Overall model and resident performance</title>
<p>A total of 107 items were analyzed across seven subspecialties. Neurology residents achieved a mean overall accuracy of 64.9&#x0025; across all 107 questions. Among the eight large language models evaluated, ChatGPT-5 demonstrated the highest performance, achieving 84.1&#x0025; accuracy, followed by ChatGPT-4o at 81.3&#x0025; and Gemini 2.5 at 77.6&#x0025;. Each of these models significantly outperformed the resident cohort. ChatGPT-4 scored 68.2&#x0025;, only modestly above resident performance, whereas Claude (56.1&#x0025;), Bard (54.2&#x0025;), and ChatGPT-3.5 (53.3&#x0025;) clustered below the resident mean. Gemini v1 was the weakest model, with an overall accuracy of 39.3&#x0025;. Pairwise Fisher&#x0027;s exact testing confirmed clear separation between higher- and lower-performing model tiers: ChatGPT-5 and ChatGPT-4o differed significantly from nearly all other models (<italic>p</italic>&#x2009;&#x003C;&#x2009;0.001) but not from each other (<italic>p</italic>&#x2009;&#x003D;&#x2009;0.718). ChatGPT-4 was significantly stronger than Gemini v1 (<italic>p</italic>&#x2009;&#x003C;&#x2009;0.001) but statistically similar to Gemini 2.5 (<italic>p</italic>&#x2009;&#x003D;&#x2009;0.166). At the lower end, Bard and ChatGPT-3.5 were indistinguishable (<italic>p</italic>&#x2009;&#x003D;&#x2009;1.0), as were Bard and Claude (<italic>p</italic>&#x2009;&#x003D;&#x2009;0.891). Gemini v1 was decisively outperformed by Gemini 2.5 (<italic>p</italic>&#x2009;&#x003C;&#x2009;0.001), confirming a large inter-version improvement.</p>
<p>Detailed accuracy by subspecialty and question category is presented in <xref ref-type="table" rid="T1">Table 1</xref>, with corresponding pairwise statistical comparisons shown in <xref ref-type="table" rid="T2">Table 2</xref> and calibration metrics summarized in <xref ref-type="table" rid="T3">Table 3</xref>.</p>
<table-wrap id="T1" position="float"><label>Table&#x00A0;1</label>
<caption><p>Overall and domain-specific accuracy, question category, and corrected accuracy.</p></caption>
<table>
<colgroup>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Topic</th>
<th valign="top" align="center">Questions</th>
<th valign="top" align="center">Residents (&#x0025;)</th>
<th valign="top" align="center">Bard (&#x0025;)</th>
<th valign="top" align="center">Claude (&#x0025;)</th>
<th valign="top" align="center">Gemini v1 (&#x0025;)</th>
<th valign="top" align="center">Gemini 2.5 (&#x0025;)</th>
<th valign="top" align="center">ChatGPT-3.5 (&#x0025;)</th>
<th valign="top" align="center">ChatGPT-4 (&#x0025;)</th>
<th valign="top" align="center">ChatGPT-4o (&#x0025;)</th>
<th valign="top" align="center">ChatGPT-5 (&#x0025;)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Overall</td>
<td valign="top" align="center">107</td>
<td valign="top" align="center">64.9</td>
<td valign="top" align="center">54.2</td>
<td valign="top" align="center">56.1</td>
<td valign="top" align="center">39.3</td>
<td valign="top" align="center">77.6</td>
<td valign="top" align="center">53.3</td>
<td valign="top" align="center">68.2</td>
<td valign="top" align="center">81.3</td>
<td valign="top" align="center">84.1</td>
</tr>
<tr>
<td valign="top" align="left">Movement disorders</td>
<td valign="top" align="center">13</td>
<td valign="top" align="center">72.6</td>
<td valign="top" align="center">38.5</td>
<td valign="top" align="center">53.9</td>
<td valign="top" align="center">38.5</td>
<td valign="top" align="center">76.9</td>
<td valign="top" align="center">38.5</td>
<td valign="top" align="center">53.9</td>
<td valign="top" align="center">76.9</td>
<td valign="top" align="center">76.9</td>
</tr>
<tr>
<td valign="top" align="left">Neuroanatomy</td>
<td valign="top" align="center">25</td>
<td valign="top" align="center">70.3</td>
<td valign="top" align="center">44</td>
<td valign="top" align="center">48</td>
<td valign="top" align="center">40</td>
<td valign="top" align="center">96</td>
<td valign="top" align="center">44</td>
<td valign="top" align="center">80</td>
<td valign="top" align="center">88</td>
<td valign="top" align="center">96</td>
</tr>
<tr>
<td valign="top" align="left">Neuro-infections</td>
<td valign="top" align="center">17</td>
<td valign="top" align="center">69.3</td>
<td valign="top" align="center">47.1</td>
<td valign="top" align="center">52.9</td>
<td valign="top" align="center">41.2</td>
<td valign="top" align="center">76.5</td>
<td valign="top" align="center">64.7</td>
<td valign="top" align="center">64.7</td>
<td valign="top" align="center">70.6</td>
<td valign="top" align="center">70.6</td>
</tr>
<tr>
<td valign="top" align="left">Neuroimmunology</td>
<td valign="top" align="center">14</td>
<td valign="top" align="center">66.1</td>
<td valign="top" align="center">78.6</td>
<td valign="top" align="center">78.6</td>
<td valign="top" align="center">35.7</td>
<td valign="top" align="center">64.3</td>
<td valign="top" align="center">78.6</td>
<td valign="top" align="center">100</td>
<td valign="top" align="center">100</td>
<td valign="top" align="center">100</td>
</tr>
<tr>
<td valign="top" align="left">Epilepsy</td>
<td valign="top" align="center">8</td>
<td valign="top" align="center">65.6</td>
<td valign="top" align="center">62.5</td>
<td valign="top" align="center">62.5</td>
<td valign="top" align="center">50</td>
<td valign="top" align="center">62.5</td>
<td valign="top" align="center">62.5</td>
<td valign="top" align="center">62.5</td>
<td valign="top" align="center">75</td>
<td valign="top" align="center">75</td>
</tr>
<tr>
<td valign="top" align="left">Vascular neurology</td>
<td valign="top" align="center">13</td>
<td valign="top" align="center">58.1</td>
<td valign="top" align="center">84.6</td>
<td valign="top" align="center">61.5</td>
<td valign="top" align="center">46.2</td>
<td valign="top" align="center">76.9</td>
<td valign="top" align="center">53.9</td>
<td valign="top" align="center">61.5</td>
<td valign="top" align="center">76.9</td>
<td valign="top" align="center">76.9</td>
</tr>
<tr>
<td valign="top" align="left">Neuromuscular</td>
<td valign="top" align="center">17</td>
<td valign="top" align="center">50.4</td>
<td valign="top" align="center">41.2</td>
<td valign="top" align="center">47.1</td>
<td valign="top" align="center">29.4</td>
<td valign="top" align="center">70.6</td>
<td valign="top" align="center">41.2</td>
<td valign="top" align="center">47.1</td>
<td valign="top" align="center">76.5</td>
<td valign="top" align="center">82.4</td>
</tr>
<tr>
<td valign="top" align="left">Lower-order</td>
<td valign="top" align="center">50</td>
<td valign="top" align="center">66.4</td>
<td valign="top" align="center">56</td>
<td valign="top" align="center">66</td>
<td valign="top" align="center">42</td>
<td valign="top" align="center">72</td>
<td valign="top" align="center">60</td>
<td valign="top" align="center">68</td>
<td valign="top" align="center">80</td>
<td valign="top" align="center">82</td>
</tr>
<tr>
<td valign="top" align="left">Higher-order</td>
<td valign="top" align="center">57</td>
<td valign="top" align="center">63.5</td>
<td valign="top" align="center">52.6</td>
<td valign="top" align="center">47.4</td>
<td valign="top" align="center">36.8</td>
<td valign="top" align="center">82.5</td>
<td valign="top" align="center">47.4</td>
<td valign="top" align="center">68.4</td>
<td valign="top" align="center">82.5</td>
<td valign="top" align="center">86</td>
</tr>
<tr>
<td valign="top" align="left">Corrected accuracy</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">38.9</td>
<td valign="top" align="center">41.4</td>
<td valign="top" align="center">19</td>
<td valign="top" align="center">70.1</td>
<td valign="top" align="center">37.7</td>
<td valign="top" align="center">57.6</td>
<td valign="top" align="center">75.1</td>
<td valign="top" align="center">78.8</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T2" position="float"><label>Table&#x00A0;2</label>
<caption><p>Pairwise Fisher&#x0027;s exact test <italic>p</italic>-values.</p></caption>
<table>
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Model 1</th>
<th valign="top" align="center">Model 2</th>
<th valign="top" align="center"><italic>p</italic>-value</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">ChatGPT-3.5</td>
<td valign="top" align="left">ChatGPT-4</td>
<td valign="top" align="center">0.036</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-3.5</td>
<td valign="top" align="left">ChatGPT-4o</td>
<td valign="top" align="center">1.95&#x2009;&#x00D7;&#x2009;10<sup>&#x2212;5</sup></td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-3.5</td>
<td valign="top" align="left">ChatGPT-5</td>
<td valign="top" align="center">1.69&#x2009;&#x00D7;&#x2009;10<sup>&#x2212;6</sup></td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-3.5</td>
<td valign="top" align="left">Bard</td>
<td valign="top" align="center">1</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-3.5</td>
<td valign="top" align="left">Claude</td>
<td valign="top" align="center">0.784</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-3.5</td>
<td valign="top" align="left">Gemini v1</td>
<td valign="top" align="center">0.055</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-3.5</td>
<td valign="top" align="left">Gemini 2.5</td>
<td valign="top" align="center">2.97&#x2009;&#x00D7;&#x2009;10<sup>&#x2212;4</sup></td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-4</td>
<td valign="top" align="left">ChatGPT-4o</td>
<td valign="top" align="center">0.04</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-4</td>
<td valign="top" align="left">ChatGPT-5</td>
<td valign="top" align="center">0.0098</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-4</td>
<td valign="top" align="left">Bard</td>
<td valign="top" align="center">0.049</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-4</td>
<td valign="top" align="left">Claude</td>
<td valign="top" align="center">0.091</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-4</td>
<td valign="top" align="left">Gemini v1</td>
<td valign="top" align="center">3.48&#x2009;&#x00D7;&#x2009;10<sup>&#x2212;5</sup></td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-4</td>
<td valign="top" align="left">Gemini 2.5</td>
<td valign="top" align="center">0.166</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-4o</td>
<td valign="top" align="left">ChatGPT-5</td>
<td valign="top" align="center">0.718</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-4o</td>
<td valign="top" align="left">Bard</td>
<td valign="top" align="center">3.52&#x2009;&#x00D7;&#x2009;10<sup>&#x2212;5</sup></td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-4o</td>
<td valign="top" align="left">Claude</td>
<td valign="top" align="center">1.10&#x2009;&#x00D7;&#x2009;10<sup>&#x2212;4</sup></td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-4o</td>
<td valign="top" align="left">Gemini v1</td>
<td valign="top" align="center">3.83&#x2009;&#x00D7;&#x2009;10<sup>&#x2212;10</sup></td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-4o</td>
<td valign="top" align="left">Gemini 2.5</td>
<td valign="top" align="center">0.612</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-5</td>
<td valign="top" align="left">Bard</td>
<td valign="top" align="center">3.25&#x2009;&#x00D7;&#x2009;10<sup>&#x2212;6</sup></td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-5</td>
<td valign="top" align="left">Claude</td>
<td valign="top" align="center">1.15&#x2009;&#x00D7;&#x2009;10<sup>&#x2212;5</sup></td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-5</td>
<td valign="top" align="left">Gemini v1</td>
<td valign="top" align="center">1.37&#x2009;&#x00D7;&#x2009;10<sup>&#x2212;11</sup></td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-5</td>
<td valign="top" align="left">Gemini 2.5</td>
<td valign="top" align="center">0.297</td>
</tr>
<tr>
<td valign="top" align="left">Bard</td>
<td valign="top" align="left">Claude</td>
<td valign="top" align="center">0.891</td>
</tr>
<tr>
<td valign="top" align="left">Bard</td>
<td valign="top" align="left">Gemini v1</td>
<td valign="top" align="center">0.0396</td>
</tr>
<tr>
<td valign="top" align="left">Bard</td>
<td valign="top" align="left">Gemini 2.5</td>
<td valign="top" align="center">4.96&#x2009;&#x00D7;&#x2009;10<sup>&#x2212;4</sup></td>
</tr>
<tr>
<td valign="top" align="left">Claude</td>
<td valign="top" align="left">Gemini v1</td>
<td valign="top" align="center">0.0198</td>
</tr>
<tr>
<td valign="top" align="left">Claude</td>
<td valign="top" align="left">Gemini 2.5</td>
<td valign="top" align="center">1.32&#x2009;&#x00D7;&#x2009;10<sup>&#x2212;3</sup></td>
</tr>
<tr>
<td valign="top" align="left">Gemini v1</td>
<td valign="top" align="left">Gemini 2.5</td>
<td valign="top" align="center">1.86&#x2009;&#x00D7;&#x2009;10<sup>&#x2212;8</sup></td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T3" position="float"><label>Table&#x00A0;3</label>
<caption><p>Calibration and agreement per model.</p></caption>
<table>
<colgroup>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Model</th>
<th valign="top" align="center">Cohen&#x0027;s <italic>&#x03BA;</italic></th>
<th valign="top" align="center">Spearman <italic>&#x03C1;</italic></th>
<th valign="top" align="center"><italic>p</italic>-value</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Bard</td>
<td valign="top" align="center">0.078</td>
<td valign="top" align="center">0.053</td>
<td valign="top" align="center">0.588</td>
</tr>
<tr>
<td valign="top" align="left">Claude</td>
<td valign="top" align="center">0.03</td>
<td valign="top" align="center">&#x2212;0.033</td>
<td valign="top" align="center">0.739</td>
</tr>
<tr>
<td valign="top" align="left">Gemini v1</td>
<td valign="top" align="center">0.009</td>
<td valign="top" align="center">0.184</td>
<td valign="top" align="center">0.058</td>
</tr>
<tr>
<td valign="top" align="left">Gemini 2.5</td>
<td valign="top" align="center">0.237</td>
<td valign="top" align="center">&#x2212;0.074</td>
<td valign="top" align="center">0.447</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-3.5</td>
<td valign="top" align="center">0.021</td>
<td valign="top" align="center">&#x2212;0.068</td>
<td valign="top" align="center">0.487</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-4</td>
<td valign="top" align="center">0.085</td>
<td valign="top" align="center">0.176</td>
<td valign="top" align="center">0.07</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-4o</td>
<td valign="top" align="center">0.218</td>
<td valign="top" align="center">&#x2212;0.081</td>
<td valign="top" align="center">0.404</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT-5</td>
<td valign="top" align="center">0.217</td>
<td valign="top" align="center">&#x2212;0.090</td>
<td valign="top" align="center">0.4</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3b"><title>Performance by cognitive level</title>
<p>When questions were stratified by cognitive complexity, performance gradients became evident. On lower-order items, residents achieved 66.4&#x0025;, while ChatGPT-5 (82&#x0025;), ChatGPT-4o (80&#x0025;), and Gemini 2.5 (72&#x0025;) maintained clear advantages. ChatGPT-4 (68&#x0025;) performed comparably to residents, whereas Claude (66&#x0025;), Bard (56&#x0025;), and ChatGPT-3.5 (60&#x0025;) offered minimal improvement. Gemini v1 again performed worst (42&#x0025;). For higher-order questions, resident accuracy declined slightly to 63.5&#x0025;, while the gap between models widened: ChatGPT-5 (86&#x0025;) and ChatGPT-4o (82.5&#x0025;) led all models, Gemini 2.5 matched them at 82.5&#x0025;, and ChatGPT-4 held steady at 68.4&#x0025;. Claude (47.4&#x0025;), Bard (52.6&#x0025;), ChatGPT-3.5 (47.4&#x0025;), and Gemini v1 (36.8&#x0025;) demonstrated marked deterioration on higher-order reasoning tasks.</p>
</sec>
<sec id="s3c"><title>Subspecialty-level accuracy</title>
<p>Subspecialty analysis revealed similar hierarchical trends. In movement disorders, residents attained 72.6&#x0025; accuracy, outperforming lower-tier models but slightly surpassed by ChatGPT-5, ChatGPT-4o, and Gemini 2.5 (all 76.9&#x0025;). In neuroanatomy, advanced models excelled, with ChatGPT-5 and Gemini 2.5 each reaching 96&#x0025;, ChatGPT-4o at 88&#x0025;, and ChatGPT-4 at 80&#x0025;, all surpassing residents (70.3&#x0025;). In neuroimmunology, ChatGPT-5, ChatGPT-4o, and ChatGPT-4 achieved perfect scores (100&#x0025;), far above residents (66.1&#x0025;), while Claude and Bard performed similarly (78.6&#x0025;), and Gemini v1 remained lowest (35.7&#x0025;). Vascular neurology produced the most unexpected finding: Bard achieved the highest single-domain score (84.6&#x0025;), exceeding residents (58.1&#x0025;) and even the strongest models (ChatGPT-4o, ChatGPT-5, and Gemini 2.5, all 76.9&#x0025;). In neuromuscular disease&#x2014;the most challenging section for residents (50.4&#x0025;)&#x2014;the advanced models again separated clearly, with ChatGPT-5 at 82.4&#x0025;, ChatGPT-4o at 76.5&#x0025;, and Gemini 2.5 at 70.6&#x0025;, all outperforming other models by wide margins. In epilepsy and neuro-infectious disease, model and human accuracies converged more closely, with residents at 65&#x0025;&#x2013;69&#x0025; and top models modestly higher (70&#x0025;&#x2013;76&#x0025;).</p>
</sec>
<sec id="s3d"><title>Guessing-adjusted accuracy and calibration</title>
<p>Adjusting for random guessing, corrected accuracy preserved the same performance hierarchy. ChatGPT-5 (78.8&#x0025;) and ChatGPT-4o (75.1&#x0025;) remained the strongest models, followed by Gemini 2.5 (70.1&#x0025;) and ChatGPT-4 (57.6&#x0025;). The lower-performing group&#x2014;Claude (41.4&#x0025;), Bard (38.9&#x0025;), ChatGPT-3.5 (37.7&#x0025;), and Gemini v1 (19.0&#x0025;)&#x2014;clustered near chance level, indicating limited comprehension.</p>
<p>Confidence-accuracy calibration analyses revealed generally weak and nonsignificant correlations. ChatGPT-4 (<italic>&#x03C1;</italic>&#x2009;&#x003D;&#x2009;0.176, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.070) and Gemini v1 (<italic>&#x03C1;</italic>&#x2009;&#x003D;&#x2009;0.184, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.058) showed mild positive trends, suggesting that higher reported confidence modestly aligned with accuracy. By contrast, ChatGPT-5 (<italic>&#x03C1;</italic>&#x2009;&#x003D;&#x2009;&#x2212;0.090, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.400), ChatGPT-4o (<italic>&#x03C1;</italic>&#x2009;&#x003D;&#x2009;&#x2212;0.081, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.404), and Gemini 2.5 (<italic>&#x03C1;</italic>&#x2009;&#x003D;&#x2009;&#x2212;0.074, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.447) demonstrated weak negative associations, indicating poor self-calibration. Agreement with expert classifications by Bloom&#x0027;s taxonomy also varied: Gemini 2.5 (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.237), ChatGPT-4o (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.218), and ChatGPT-5 (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.217) exhibited the highest concordance with expert labels, whereas ChatGPT-4 (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.085) and Bard (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.078) showed only slight agreement, and Claude (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.030), ChatGPT-3.5 (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.021), and Gemini v1 (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.009) approached random alignment.</p>
</sec>
<sec id="s3e"><title>Inter-model improvement across generations</title>
<p>Regression analyses demonstrated consistent incremental improvement across ChatGPT model generations. Topic-level accuracies for ChatGPT-4 and ChatGPT-4o were strongly correlated (R<sup>2</sup>&#x2009;&#x003D;&#x2009;0.765, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.0099), with ChatGPT-4o gaining roughly half a percentage point for each one-point increase in ChatGPT-4 performance (<italic>&#x03B2;</italic>&#x2009;&#x003D;&#x2009;0.497, 95&#x0025; CI 0.18&#x2013;0.81). The transition from ChatGPT-4o to ChatGPT-5 showed even greater linearity (R<sup>2</sup>&#x2009;&#x003D;&#x2009;0.908, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001), with near-uniform accuracy improvements across subspecialties (<italic>&#x03B2;</italic>&#x2009;&#x003D;&#x2009;1.06, 95&#x0025; CI 0.67&#x2013;1.45). By contrast, Gemini v1 and Gemini 2.5 demonstrated virtually no linear relationship (<italic>R</italic><sup>2</sup>&#x2009;&#x003D;&#x2009;0.002, <italic>p</italic>&#x2009;&#x003D;&#x2009;0.918), reflecting discontinuous and domain-inconsistent progress. <xref ref-type="fig" rid="F1">Figure 1</xref> illustrates inter-model regression relationships, while <xref ref-type="fig" rid="F2">Figure 2</xref> depicts per-topic change in accuracy between large language model generations. These findings indicate that ChatGPT iterations improved predictably and systematically across subspecialties, while Gemini&#x0027;s evolution was abrupt but uneven.</p>
<fig id="F1" position="float"><label>Figure&#x00A0;1</label>
<caption><p>Linear regression analyses comparing accuracy between consecutive large language model generations. Panels <bold>(A&#x2013;C</bold>) display regression analyses evaluating the relationship between the accuracies of successive large language model versions across neurology question domains. Panel <bold>(A)</bold> compares GPT-4 and GPT-4o; panel <bold>(B)</bold> compares GPT-4o and GPT-5; and panel <bold>(C)</bold> compares Gemini version 1 and Gemini 2.5. Each plot includes the fitted regression line (dashed blue), 95&#x0025; confidence interval (gray shading), and corresponding model metrics (<italic>R</italic><sup>2</sup>, <italic>&#x03B2;</italic>, <italic>p</italic>) shown in the lower right corner of each panel. Both axes represent model accuracy expressed as a percentage.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-07-1737882-g001.tif"><alt-text content-type="machine-generated">Three scatter plots labeled A, B, and C show relationships between accuracy percentages on both axes. Plot A demonstrates a positive correlation with R&#x00B2; = 0.765, &#x03B2;&#x02081; = 0.497, p = 0.00994. Plot B also shows a strong positive correlation with R&#x00B2; = 0.908, &#x03B2;&#x02081; = 1.058, p = 0.00091. Plot C indicates no significant correlation with R&#x00B2; = 0.002, &#x03B2;&#x02081; = -0.079, p = 0.91844. Each plot has a dotted regression line and a shaded confidence interval.</alt-text>
</graphic>
</fig>
<fig id="F2" position="float"><label>Figure&#x00A0;2</label>
<caption><p>Change in per-topic accuracy between consecutive large language model generations. Panels <bold>(A&#x2013;C)</bold> depict the per-topic change (&#x0394;) in accuracy, expressed in percentage points, between successive large language model versions. Panel <bold>(A)</bold> compares GPT-4 and GPT-4o, <bold>(B)</bold> compares GPT-4o and GPT-5, and <bold>(C)</bold> compares Gemini version 1 and Gemini 2.5. Each point represents a neurology topic, with vertical lines extending from zero to the observed change in accuracy. Positive values indicate improvement relative to the preceding model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-07-1737882-g002.tif"><alt-text content-type="machine-generated">Dot plot showing changes in accuracy (percentage points) across different neurological topics: epilepsy, movement disorders, neuro-infections, neuroanatomy, neuromuscular, vascular neurology, and neuroimmunology. Three sections labeled A, B, and C highlight variations, with A showing most changes and B and C remaining stable.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="s4" sec-type="discussion"><title>Discussion</title>
<p>LLMs are beginning to supplement traditional teaching methods in medical education, but their application in neurology requires careful evaluation (<xref ref-type="bibr" rid="B4">4</xref>, <xref ref-type="bibr" rid="B5">5</xref>). While their application in neurology remains underexplored, the specialty&#x0027;s reliance on nuanced clinical reasoning, pattern recognition, and hypothesis generation presents both opportunities and challenges for AI integration (<xref ref-type="bibr" rid="B1">1</xref>, <xref ref-type="bibr" rid="B2">2</xref>). Evaluating LLMs in neurology-specific examinations is therefore critical to guide their optimal use in training, curriculum design, and board preparation (<xref ref-type="bibr" rid="B7">7</xref>, <xref ref-type="bibr" rid="B14">14</xref>).</p>
<p>In this study, eight LLMs were compared against neurology residents across a board-style multiple-choice examination spanning seven subspecialties. Consistent with prior evaluations in neurosurgery (<xref ref-type="bibr" rid="B6">6</xref>), radiology (<xref ref-type="bibr" rid="B8">8</xref>), and ophthalmology (<xref ref-type="bibr" rid="B10">10</xref>), newer models clearly outperformed earlier versions. ChatGPT-5 achieved the highest overall accuracy, followed closely by ChatGPT-4o, both exceeding resident performance across most subspecialties. Gemini 2.5 also demonstrated substantial improvement over Gemini v1, achieving accuracy closer to ChatGPT-4, though with more variability across topics. Bard, Claude, and ChatGPT-3.5 performed below resident levels, aligning with reports of uneven LLM performance across medical disciplines (<xref ref-type="bibr" rid="B9">9</xref>). Our findings complement international evidence. In a recent study of practicing neurologists in Spain, Ros-Arlanz&#x00F3;n and P&#x00E9;rez-Sempere found that ChatGPT-4 outperformed clinicians on a high-stakes neurology certification exam administered in Spanish, whereas ChatGPT-3.5 performed below the clinician cohort. Their results reinforce the generational performance gradient observed in our analysis (<xref ref-type="bibr" rid="B15">15</xref>).</p>
<p>A notable finding was the systematic, domain-consistent improvement observed across ChatGPT generations, contrasting with the discontinuous leap between Gemini v1 and Gemini 2.5. Model improvement analyses demonstrated strong positive correlations between ChatGPT-4 and ChatGPT-4o, and between ChatGPT-4o and ChatGPT-5, indicating steady, distributed gains across neurology subspecialties&#x2014;results consistent with prior evidence of iterative refinement across successive LLM generations (<xref ref-type="bibr" rid="B16">16</xref>, <xref ref-type="bibr" rid="B17">17</xref>). In contrast, Gemini&#x0027;s performance gains were abrupt and uneven; although Gemini 2.5 achieved substantially higher accuracy than v1, these changes were not significantly correlated across domains, raising concerns about the reproducibility and stability of its progress. This difference underscores that evaluating LLMs for neurology education requires attention not only to absolute accuracy but also to the <italic>trajectory</italic> and <italic>predictability</italic> of model advancement. Furthermore, this finding highlights broader implications for version governance. Because up-versioning or down-versioning may not yield predictable changes in performance, maintaining validated, version-specific performance data for each deployed model becomes essential. Transitions between model generations should therefore be deliberate, transparent, and monitored for downstream impact. As model routing and multimodal agent frameworks proliferate&#x2014;where automatic switching between models or versions may occur dynamically&#x2014;such oversight grows increasingly complex. Future work and emerging standards must address not only model performance but also the systematic recording of version metadata, its inclusion in audit trails, and the development of mechanisms enabling agentic tools and routers to make model-selection decisions aligned with safe, optimal medical reasoning.</p>
<p>Confidence&#x2013;accuracy calibration emerged as a persistent limitation. ChatGPT-4o and ChatGPT-5 both showed weak negative correlations between confidence and correctness, while Gemini v1 and Gemini 2.5 demonstrated inconsistent calibration. This pattern mirrors prior evidence that LLMs frequently overestimate correctness by 20&#x0025;&#x2013;60&#x0025;, raising concern for educational or clinical contexts where confidently incorrect outputs may mislead learners (<xref ref-type="bibr" rid="B18">18</xref>). In our analysis, corrected accuracy further magnified performance differences: Gemini v1 dropped to near-chance levels, underscoring the risk of relying on raw outputs when confidence misalignment persists.</p>
<p>From an educational standpoint, high-performing models such as ChatGPT-4o, ChatGPT-5, and Gemini 2.5 may serve as effective adjuncts for board preparation, question generation, and literature review (<xref ref-type="bibr" rid="B2">2</xref>, <xref ref-type="bibr" rid="B7">7</xref>). Their strengths in factual recall and content summarization make them promising tools for structured learning, yet their limitations in higher-order reasoning highlight the continued need for expert oversight (<xref ref-type="bibr" rid="B1">1</xref>). These findings echo broader concerns regarding reproducibility and reliability across prompts (<xref ref-type="bibr" rid="B14">14</xref>).</p>
<p>This study has several limitations. The relatively small number of questions within certain subspecialties may have reduced statistical power. We did not assess intra-model variability, hallucination frequency, or multimodal reasoning, each of which constrains the applicability of LLMs in clinical contexts (<xref ref-type="bibr" rid="B19">19</xref>, <xref ref-type="bibr" rid="B20">20</xref>). Because LLMs evolve rapidly, the performance reported here may not reflect future iterations (<xref ref-type="bibr" rid="B14">14</xref>, <xref ref-type="bibr" rid="B16">16</xref>). Models were tested through publicly available graphical user interfaces, preventing adjustment of inference parameters (e.g., temperature) and limiting reproducibility, though this approach mirrors real-world use (<xref ref-type="bibr" rid="B21">21</xref>). These findings apply only to text-based, single-best-answer questions without imaging, waveform, or multimedia elements, which limits generalizability to the multimodal assessments used in neurology (<xref ref-type="bibr" rid="B22">22</xref>). Furthermore, the &#x201C;best-answer&#x201D; structure lacked a &#x201C;none of the above&#x201D; option, which may obscure key failure modes recently identified in LLM testing (<xref ref-type="bibr" rid="B23">23</xref>). The prompts themselves combined multiple requests&#x2014;question response, confidence rating, explanation, Bloom&#x0027;s classification, and CSV formatting&#x2014;creating a one-shot configuration that could increase variability compared with explicit chain-of-thought prompting. Finally, some board-style questions may overlap with publicly available educational material incorporated into model training data; paradoxically, even seemingly esoteric items can appear online when they reference pathognomonic findings or &#x201C;classic&#x201D; examination vignettes.</p>
</sec>
<sec id="s5" sec-type="conclusions"><title>Conclusions</title>
<p>LLMs&#x2014;particularly ChatGPT-4o, ChatGPT-5, and Gemini 2.5&#x2014;demonstrate strong potential as educational tools in neurology, often matching or exceeding resident performance in subspecialty assessments. However, inconsistent calibration of confidence and limitations in higher-order reasoning restrict their readiness for unsupervised educational or clinical use. Deployment in medical training should be cautious, with robust oversight, accuracy validation, and transparency. As models evolve and agentic AI capabilities mature, their role in neurology education is likely to expand, but careful integration with traditional teaching and expert guidance will remain essential.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability"><title>Data availability statement</title>
<p>Deidentified model outputs and aggregated question-level accuracy data are available from the corresponding author upon reasonable request for educational replication. No protected health information is included.</p>
</sec>
<sec id="s7" sec-type="ethics-statement"><title>Ethics statement</title>
<p>The studies involving humans were approved by the Institutional Review Board (IRB) of The University of Texas Medical Branch (UTMB). The studies were conducted in accordance with the local legislation and institutional requirements. Written informed consent for participation was not required from the participants or the participants&#x0027; legal guardians/next of kin in accordance with the national legislation and institutional requirements.</p>
</sec>
<sec id="s8" sec-type="author-contributions"><title>Author contributions</title>
<p>MA: Formal analysis, Investigation, Project administration, Validation, Writing &#x2013; original draft. VV: Conceptualization, Methodology, Project administration, Writing &#x2013; original draft. JW: Methodology, Writing &#x2013; review &#x0026; editing. XF: Investigation, Methodology, Writing &#x2013; review &#x0026; editing. AD: Methodology, Writing &#x2013; review &#x0026; editing. CL: Conceptualization, Investigation, Methodology, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. PM: Investigation, Methodology, Writing &#x2013; review &#x0026; editing. DH: Conceptualization, Investigation, Methodology, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. JR-F: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Project administration, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. </p>
</sec>
<sec id="s10" sec-type="COI-statement"><title>Conflict of interest</title>
<p>The authors declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s11" sec-type="ai-statement"><title>Generative AI statement</title>
<p>The author(s) declared that generative AI was used in the creation of this manuscript. Generative AI was used for proofreading and to enhance clarity. The authors take full responsibility for all content, analysis, and conclusions presented in this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec id="s12" sec-type="disclaimer"><title>Publisher&#x0027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list><title>References</title>
<ref id="B1"><label>1.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hillis</surname> <given-names>JM</given-names></name> <name><surname>Bizzo</surname> <given-names>BC</given-names></name></person-group>. <article-title>Use of artificial intelligence in clinical neurology</article-title>. <source>Semin Neurol</source>. (<year>2022</year>) <volume>42</volume>(<issue>1</issue>):<fpage>39</fpage>&#x2013;<lpage>47</lpage>. <pub-id pub-id-type="doi">10.1055/s-0041-1742180</pub-id><pub-id pub-id-type="pmid">35576929</pub-id></mixed-citation></ref>
<ref id="B2"><label>2.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Romano</surname> <given-names>MF</given-names></name> <name><surname>Shih</surname> <given-names>LC</given-names></name> <name><surname>Paschalidis</surname> <given-names>IC</given-names></name> <name><surname>Au</surname> <given-names>R</given-names></name> <name><surname>Kolachalama</surname> <given-names>VB</given-names></name></person-group>. <article-title>Large language models in neurology research and future practice</article-title>. <source>Neurology</source>. (<year>2023</year>) <volume>101</volume>(<issue>23</issue>):<fpage>1058</fpage>&#x2013;<lpage>67</lpage>. <pub-id pub-id-type="doi">10.1212/WNL.0000000000207967</pub-id><pub-id pub-id-type="pmid">37816646</pub-id></mixed-citation></ref>
<ref id="B3"><label>3.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dabi</surname> <given-names>A</given-names></name> <name><surname>Banerjee</surname> <given-names>P</given-names></name> <name><surname>Narvaez Caicedo</surname> <given-names>C</given-names></name> <name><surname>Rodr&#x00ED;guez Fern&#x00E1;ndez</surname> <given-names>JM</given-names></name></person-group>. <article-title>Machine learning in neurocritical care: overview, pitfalls, and potential solutions</article-title>. <source>J Neurol Neurol Disord</source>. (<year>2024</year>) <volume>10</volume>(<issue>1</issue>):<fpage>105</fpage>.</mixed-citation></ref>
<ref id="B4"><label>4.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kung</surname> <given-names>TH</given-names></name> <name><surname>Cheatham</surname> <given-names>M</given-names></name> <name><surname>Medenilla</surname> <given-names>A</given-names></name> <name><surname>Sillos</surname><given-names>C</given-names></name> <name><surname>De Leon</surname><given-names>L</given-names></name> <name><surname>Elepa&#x00F1;o</surname><given-names>C</given-names></name><etal/></person-group> <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>. <source>PLoS Digit Health</source>. (<year>2023</year>) <volume>2</volume>(<issue>2</issue>):<fpage>e0000198</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="pmid">36812645</pub-id></mixed-citation></ref>
<ref id="B5"><label>5.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Vinny</surname> <given-names>PW</given-names></name> <name><surname>Vishnu</surname> <given-names>VY</given-names></name> <name><surname>Srivastava</surname> <given-names>MVP</given-names></name></person-group>. <article-title>Artificial intelligence shaping the future of neurology practice</article-title>. <source>Med J Armed Forces India</source>. (<year>2021</year>) <volume>77</volume>(<issue>3</issue>):<fpage>276</fpage>&#x2013;<lpage>82</lpage>. <pub-id pub-id-type="doi">10.1016/j.mjafi.2021.06.003</pub-id><pub-id pub-id-type="pmid">34305279</pub-id></mixed-citation></ref>
<ref id="B6"><label>6.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ali</surname> <given-names>R</given-names></name> <name><surname>Tang</surname> <given-names>OY</given-names></name> <name><surname>Connolly</surname> <given-names>ID</given-names></name> <name><surname>Zadnik Sullivan</surname><given-names>PL</given-names></name> <name><surname>Shin</surname><given-names>JH</given-names></name> <name><surname>Fridley</surname><given-names>JS</given-names></name><etal/></person-group> <article-title>Performance of ChatGPT and GPT-4 on neurosurgery written board examinations</article-title>. <source>Neurosurgery</source>. (<year>2023</year>) <volume>93</volume>(<issue>6</issue>):<fpage>1353</fpage>&#x2013;<lpage>65</lpage>. <pub-id pub-id-type="doi">10.1227/neu.0000000000002632</pub-id><pub-id pub-id-type="pmid">37581444</pub-id></mixed-citation></ref>
<ref id="B7"><label>7.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Figari Jordan</surname> <given-names>R</given-names></name> <name><surname>Sandrone</surname> <given-names>S</given-names></name> <name><surname>Southerland</surname> <given-names>AM</given-names></name></person-group>. <article-title>Opportunities and challenges for incorporating artificial intelligence and natural language processing in neurology education</article-title>. <source>Neurology</source>. (<year>2024</year>) <volume>3</volume>(<issue>1</issue>):<fpage>e200116</fpage>. <pub-id pub-id-type="doi">10.1212/NE9.0000000000200116</pub-id><pub-id pub-id-type="pmid">39360153</pub-id></mixed-citation></ref>
<ref id="B8"><label>8.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bhayana</surname> <given-names>R</given-names></name> <name><surname>Krishna</surname> <given-names>S</given-names></name> <name><surname>Bleakney</surname> <given-names>RR</given-names></name></person-group>. <article-title>Performance of ChatGPT on radiology board-style examination: insights into current strengths and limitations</article-title>. <source>Radiology</source>. (<year>2023</year>) <volume>307</volume>(<issue>5</issue>):<fpage>e230582</fpage>. <pub-id pub-id-type="doi">10.1148/radiol.230582</pub-id><pub-id pub-id-type="pmid">37191485</pub-id></mixed-citation></ref>
<ref id="B9"><label>9.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gilson</surname> <given-names>A</given-names></name> <name><surname>Safranek</surname> <given-names>CW</given-names></name> <name><surname>Huang</surname> <given-names>T</given-names></name> <name><surname>Socrates</surname><given-names>V</given-names></name> <name><surname>Chi</surname><given-names>L</given-names></name> <name><surname>Taylor</surname><given-names>RA</given-names></name><etal/></person-group> <article-title>How does ChatGPT perform on the United States medical licensing examination (USMLE)? The implications of large language models for medical education and knowledge assessment</article-title>. <source>JMIR Med Educ</source>. (<year>2023</year>) <volume>9</volume>:<fpage>e45312</fpage>. <pub-id pub-id-type="doi">10.2196/45312</pub-id><pub-id pub-id-type="pmid">36753318</pub-id></mixed-citation></ref>
<ref id="B10"><label>10.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mihalache</surname> <given-names>A</given-names></name> <name><surname>Popovic</surname> <given-names>MM</given-names></name> <name><surname>Muni</surname> <given-names>RH</given-names></name></person-group>. <article-title>Performance of an artificial intelligence chatbot in ophthalmic knowledge assessment</article-title>. <source>JAMA Ophthalmol</source>. (<year>2023</year>) <volume>141</volume>(<issue>6</issue>):<fpage>589</fpage>&#x2013;<lpage>97</lpage>. <pub-id pub-id-type="doi">10.1001/jamaophthalmol.2023.1144</pub-id><pub-id pub-id-type="pmid">37103928</pub-id></mixed-citation></ref>
<ref id="B11"><label>11.</label><mixed-citation publication-type="book"><collab>American Academy of Neurology</collab>. <source>Residency in-Service Training Examination (RITE) Content Outline</source>. <publisher-loc>Minneapolis, MN</publisher-loc>: <publisher-name>American Academy of Neurology</publisher-name> (<year>2024</year>). <comment>Available online at:</comment> <ext-link ext-link-type="uri" xlink:href="https://www.aan.com/tools-resources/residency-in-service-training-examination">https://www.aan.com/tools-resources/residency-in-service-training-examination</ext-link> <comment>(Accessed October 21, 2025)</comment>.</mixed-citation></ref>
<ref id="B12"><label>12.</label><mixed-citation publication-type="book"><collab>American Board of Psychiatry and Neurology</collab>. <source>Instructions for the Neurology Certification Examination</source>. <publisher-loc>Deerfield, IL</publisher-loc>: <publisher-name>American Board of Psychiatry and Neurology</publisher-name> (<year>2024</year>). <comment>Available online at:</comment> <ext-link ext-link-type="uri" xlink:href="https://www.abpn.com/wp-content/uploads/2020/11/2021_Neurology_CERT_Format_and_Scoring.pdf">https://www.abpn.com/wp-content/uploads/2020/11/2021_Neurology_CERT_Format_and_Scoring.pdf</ext-link> <comment>(Accessed October 21, 2025)</comment>.</mixed-citation></ref>
<ref id="B13"><label>13.</label><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Bloom</surname> <given-names>BS</given-names></name> <name><surname>Engelhart</surname> <given-names>MD</given-names></name> <name><surname>Furst</surname> <given-names>EJ</given-names></name> <name><surname>Hill</surname> <given-names>WH</given-names></name> <name><surname>Krathwohl</surname> <given-names>DR</given-names></name></person-group>. <source>Taxonomy of Educational Objectives: The Classification of Educational Goals. Handbook I: Cognitive Domain</source>. <publisher-loc>New York</publisher-loc>: <publisher-name>David McKay Company</publisher-name> (<year>1956</year>).</mixed-citation></ref>
<ref id="B14"><label>14.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Schubert</surname> <given-names>MC</given-names></name> <name><surname>Wick</surname> <given-names>W</given-names></name> <name><surname>Venkataramani</surname> <given-names>V</given-names></name></person-group>. <article-title>Performance of large language models on a neurology board-style examination</article-title>. <source>JAMA Netw Open</source>. (<year>2023</year>) <volume>6</volume>(<issue>12</issue>):<fpage>e2346721</fpage>. <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.46721</pub-id><pub-id pub-id-type="pmid">38060223</pub-id></mixed-citation></ref>
<ref id="B15"><label>15.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ros-Arlanz&#x00F3;n</surname> <given-names>P</given-names></name> <name><surname>P&#x00E9;rez-Sempere</surname> <given-names>A</given-names></name></person-group>. <article-title>Evaluating AI competence in specialized medicine: comparative analysis of ChatGPT and neurologists in a neurology specialist examination in Spain</article-title>. <source>JMIR Med Educ</source>. (<year>2024</year>) <volume>10</volume>:<fpage>e56762</fpage>. <pub-id pub-id-type="doi">10.2196/56762</pub-id></mixed-citation></ref>
<ref id="B16"><label>16.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Inojosa</surname><given-names>H</given-names></name> <name><surname>Ramezanzadeh</surname><given-names>A</given-names></name> <name><surname>Gasparovic-Curtini</surname><given-names>I</given-names></name> <name><surname>Wiest</surname><given-names>I</given-names></name> <name><surname>Kather</surname><given-names>JN</given-names></name> <name><surname>Gilbert</surname><given-names>S</given-names></name><etal/></person-group> <article-title>Integrating large language models in care, research, and education in multiple sclerosis management</article-title>. <source>Mult Scler</source>. (<year>2024</year>) <volume>30</volume>(<issue>11&#x2013;12</issue>):<fpage>1392</fpage>&#x2013;<lpage>401</lpage>. <pub-id pub-id-type="doi">10.1177/13524585241277376</pub-id><pub-id pub-id-type="pmid">39308156</pub-id></mixed-citation></ref>
<ref id="B17"><label>17.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Barrit</surname> <given-names>S</given-names></name> <name><surname>Torcida</surname> <given-names>N</given-names></name> <name><surname>Mazeraud</surname> <given-names>A</given-names></name> <name><surname>Boulogne</surname><given-names>S</given-names></name> <name><surname>Benoit</surname><given-names>J</given-names></name> <name><surname>Carette</surname><given-names>T</given-names></name><etal/></person-group> <article-title>Specialized large language model outperforms neurologists at complex diagnosis in blinded case-based evaluation</article-title>. <source>Brain Sci</source>. (<year>2025</year>) <volume>15</volume>(<issue>4</issue>):<fpage>347</fpage>. <pub-id pub-id-type="doi">10.3390/brainsci15040347</pub-id><pub-id pub-id-type="pmid">40309809</pub-id></mixed-citation></ref>
<ref id="B18"><label>18.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Moura</surname> <given-names>L</given-names></name> <name><surname>Jones</surname> <given-names>DT</given-names></name> <name><surname>Sheikh</surname> <given-names>IS</given-names></name> <name><surname>Murphy</surname><given-names>S</given-names></name> <name><surname>Kalfin</surname><given-names>M</given-names></name> <name><surname>Kummer</surname><given-names>BR</given-names></name><etal/></person-group> <article-title>Implications of large language models for quality and efficiency of neurologic care: emerging issues in neurology</article-title>. <source>Neurology</source>. (<year>2024</year>) <volume>102</volume>(<issue>11</issue>):<fpage>e209497</fpage>. <pub-id pub-id-type="doi">10.1212/WNL.0000000000209497</pub-id><pub-id pub-id-type="pmid">38759131</pub-id></mixed-citation></ref>
<ref id="B19"><label>19.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Matsoukas</surname> <given-names>S</given-names></name> <name><surname>Morey</surname> <given-names>J</given-names></name> <name><surname>Lock</surname> <given-names>G</given-names></name> <name><surname>Chada</surname><given-names>D</given-names></name> <name><surname>Shigematsu</surname><given-names>T</given-names></name> <name><surname>Marayati</surname><given-names>NF</given-names></name><etal/></person-group> <article-title>AI software detection of large vessel occlusion stroke on CT angiography: a real-world prospective diagnostic accuracy study</article-title>. <source>J Neurointerv Surg</source>. (<year>2023</year>) <volume>15</volume>(<issue>1</issue>):<fpage>52</fpage>&#x2013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1136/neurintsurg-2021-018391</pub-id><pub-id pub-id-type="pmid">35086962</pub-id></mixed-citation></ref>
<ref id="B20"><label>20.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Grzybowski</surname> <given-names>A</given-names></name> <name><surname>Brona</surname> <given-names>P</given-names></name> <name><surname>Lim</surname> <given-names>G</given-names></name> <name><surname>Ruamviboonsuk</surname><given-names>P</given-names></name> <name><surname>Tan</surname><given-names>GSW</given-names></name> <name><surname>Abramoff</surname><given-names>M</given-names></name><etal/></person-group> <article-title>Artificial intelligence for diabetic retinopathy screening: a review</article-title>. <source>Eye</source>. (<year>2020</year>) <volume>34</volume>(<issue>3</issue>):<fpage>451</fpage>&#x2013;<lpage>60</lpage>. <pub-id pub-id-type="doi">10.1038/s41433-019-0566-0</pub-id><pub-id pub-id-type="pmid">31488886</pub-id></mixed-citation></ref>
<ref id="B21"><label>21.</label><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Yuan</surname> <given-names>J</given-names></name> <name><surname>Li</surname> <given-names>H</given-names></name> <name><surname>Ding</surname> <given-names>X</given-names></name> <name><surname>Xie</surname><given-names>W</given-names></name> <name><surname>Li</surname><given-names>YJ</given-names></name> <name><surname>Zhao</surname><given-names>W</given-names></name><etal/></person-group> <comment>Give me FP32 or give me death? Challenges and solutions for reproducible reasoning. <italic>arXiv</italic> [Preprint]. <italic>arXiv:2506.09501</italic> (2025)</comment>. <comment>Available online at:</comment> <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2506.09501">https://arxiv.org/abs/2506.09501</ext-link> (<comment>Accessed October 21, 2025</comment>).</mixed-citation></ref>
<ref id="B22"><label>22.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wei</surname> <given-names>B</given-names></name> <name><surname>Boxiong</surname> <given-names>B</given-names></name></person-group>. <article-title>Performance evaluation and implications of large language models in radiology board exams: prospective comparative analysis</article-title>. <source>JMIR Med Educ</source>. (<year>2025</year>) <volume>11</volume>:<fpage>e64284</fpage>. <pub-id pub-id-type="doi">10.2196/64284</pub-id><pub-id pub-id-type="pmid">39819381</pub-id></mixed-citation></ref>
<ref id="B23"><label>23.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tam</surname> <given-names>ZR</given-names></name> <name><surname>Wu</surname> <given-names>CK</given-names></name> <name><surname>Lin</surname> <given-names>CY</given-names></name> <name><surname>Chen</surname> <given-names>YN</given-names></name></person-group>. <article-title>None of the above, less of the right: parallel patterns in human and LLM performance on multiple-choice question answering</article-title>. In: <person-group person-group-type="author"><name><surname>Che</surname><given-names>W</given-names></name> <name><surname>Nabende</surname><given-names>J</given-names></name> <name><surname>Shutova</surname><given-names>E</given-names></name> <name><surname>Pilehvar</surname><given-names>MT</given-names></name></person-group>, editors. <source>Findings of the Association for Computational Linguistics: ACL 2025</source>. <publisher-loc>Vienna</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name> (<year>2025</year>). p. <fpage>20112</fpage>&#x2013;<lpage>34</lpage>. <comment>Available online at:</comment> <ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2025.findings-acl.1031">https://aclanthology.org/2025.findings-acl.1031</ext-link> <comment>(Accessed October 21, 2025)</comment>.</mixed-citation></ref></ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by"><p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1184048/overview">Kirti Sundar Sahu</ext-link>, Canadian Red Cross, Ottawa, Canada</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by"><p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/386952/overview">Angel Perez Sempere</ext-link>, Hospital General Universitario de Alicante, Spain</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1933685/overview">Sina Shool</ext-link>, Iran University of Medical Sciences, Iran</p></fn>
</fn-group>
</back>
</article>