<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Med.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Medicine</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Med.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-858X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmed.2026.1761025</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Performance of large language models on sleep medicine certification examination: a comprehensive multi-model analysis</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Ko&#x00E7;</surname>
<given-names>Abdurrahman</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3305376"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ata&#x015F;</surname>
<given-names>Abdullah Enes</given-names>
</name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3168360"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yosunkaya</surname>
<given-names>&#x015E;ebnem</given-names>
</name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Vatansev</surname>
<given-names>H&#x00FC;lya</given-names>
</name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Pulmonary Medicine, Meram State Hospital</institution>, <city>Konya</city>, <country country="TR">T&#x00FC;rkiye</country></aff>
<aff id="aff2"><label>2</label><institution>Department of Radiology, Necmettin Erbakan University</institution>, <city>Konya</city>, <country country="TR">T&#x00FC;rkiye</country></aff>
<aff id="aff3"><label>3</label><institution>Department of Pulmonary Medicine, Necmettin Erbakan University</institution>, <city>Konya</city>, <country country="TR">T&#x00FC;rkiye</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Abdurrahman Ko&#x00E7;, <email xlink:href="mailto:koc.abdurrahman@gmail.com">koc.abdurrahman@gmail.com</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-03-02">
<day>02</day>
<month>03</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>13</volume>
<elocation-id>1761025</elocation-id>
<history>
<date date-type="received">
<day>04</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>15</day>
<month>02</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>16</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Ko&#x00E7;, Ata&#x015F;, Yosunkaya and Vatansev.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Ko&#x00E7;, Ata&#x015F;, Yosunkaya and Vatansev</copyright-holder>
<license>
<ali:license_ref start_date="2026-03-02">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Purpose</title>
<p>To evaluate and compare the performance of nine contemporary large language model (LLM) configurations on sleep medicine certification examination-aligned questions, analyzing version differences, pricing tiers, and subdomain competencies.</p>
</sec>
<sec>
<title>Methods</title>
<p>Cross-sectional comparative analysis of 197 multiple-choice questions structured according to American Academy of Sleep Medicine (AASM) certification standards. Nine LLM configurations were evaluated: ChatGPT (GPT-3.5 free, GPT-4o paid), Gemini (2.5 Flash free, 2.5 Pro paid), Claude (3.7 Sonnet previous, Opus 4 paid), Deepseek V3 (free), xAI Grok3 (free), and Llama 3 (free). Each question was posed three times in independent sessions to minimize response variance. The first complete response from each iteration was recorded, and final accuracy was determined using strict 3/3 concordance criterion (correct only when all three iterations yielded identical correct answers). While alternative scoring approaches exist (single-try accuracy, 2/3 majority voting), the strict concordance method was selected as primary metric to minimize stochastic variation and ensure robust performance estimates. Supplementary analyses using majority voting (2/3) yielded consistent model rankings with marginally higher absolute accuracy values. Performance metrics included overall accuracy rates, 95% confidence intervals, and subdomain-specific analyses across seven sleep medicine categories. Statistical analyses employed Pearson&#x2019;s chi-square test for heterogeneity and McNemar&#x2019;s test for pairwise comparisons. This text-based simulation evaluated model performance on certification-style questions, though it does not replicate actual clinical examination conditions.</p>
</sec>
<sec>
<title>Results</title>
<p>Model performance demonstrated significant heterogeneity (<italic>&#x03C7;</italic><sup>2</sup>&#x202F;=&#x202F;101.95, df&#x202F;=&#x202F;8, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), with accuracy rates ranging from 68.5% to 95.9%. Gemini 2.5 Pro achieved the highest overall accuracy (95.9%, 95% CI: 93.2&#x2013;98.7%), followed by Claude Opus 4 (93.9%, 95% CI: 90.6&#x2013;97.2%) and ChatGPT GPT-4o (93.4%, 95% CI: 89.9&#x2013;96.9%). Premium versions consistently demonstrated superior performance compared to free alternatives, with performance differences ranging from 5.1 to 8.6 points (all <italic>p</italic>&#x202F;&#x003C;&#x202F;0.05). Subdomain analysis revealed the highest performance consistency in Secondary Sleep Disorders (92.0% mean accuracy) and the greatest variability in Diagnostic Methods (85.9% mean accuracy). Sensitivity analysis comparing three scoring criteria (single-try &#x2265;1/3, majority voting &#x2265;2/3, strict concordance 3/3) revealed that scoring methodology had minimal impact on model rankings (Spearman&#x2019;s <italic>&#x03C1;</italic>&#x202F;=&#x202F;0.879&#x2013;1.000, all <italic>p</italic>&#x202F;&#x003C;&#x202F;0.01). Majority voting and strict concordance yielded identical accuracy rates in seven of nine models due to high response consistency (95.8% average). Eight of nine models exceeded the 80% reference benchmark under all three scoring criteria.</p>
</sec>
<sec>
<title>Conclusion</title>
<p>Contemporary LLMs demonstrate substantially improved performance compared to previous evaluations, with premium models exceeding the 80% reference benchmark. However, these results reflect performance on a certification-aligned question bank rather than the official board examination itself. The significant performance advantage of paid versions raises important considerations regarding equitable access to AI-enhanced medical education and clinical decision support tools.</p>
</sec>
</abstract>
<kwd-group>
<kwd>artificial intelligence</kwd>
<kwd>certification examination</kwd>
<kwd>large language models</kwd>
<kwd>medical education</kwd>
<kwd>sleep medicine</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="2"/>
<table-count count="3"/>
<equation-count count="0"/>
<ref-count count="28"/>
<page-count count="12"/>
<word-count count="8434"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Pulmonary Medicine</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<p>The integration of large language models (LLMs) into healthcare has accelerated dramatically since ChatGPT&#x2019;s public launch in November 2022, fundamentally transforming the landscape of medical information processing and clinical decision support (<xref ref-type="bibr" rid="ref1">1</xref>). These sophisticated artificial intelligence systems, trained on vast textual datasets, have exhibited remarkable capabilities in processing and generating contextually appropriate responses to complex medical queries (<xref ref-type="bibr" rid="ref2">2</xref>). The rapid evolution of these technologies has prompted comprehensive evaluation of their performance across diverse medical specialties, including their achievements on prestigious examinations such as the United States Medical Licensing Examination (USMLE), European Examination in Core Cardiology, and specialty board certifications in radiology, ophthalmology, and otolaryngology (<xref ref-type="bibr" rid="ref3 ref4 ref5 ref6 ref7 ref8">3&#x2013;8</xref>).</p>
<p>Sleep medicine represents a particularly compelling domain for LLM evaluation due to its interdisciplinary nature, encompassing neurology, pulmonology, psychiatry, and otolaryngology (<xref ref-type="bibr" rid="ref9">9</xref>). This field addresses not only the global health burden of obstructive sleep apnea, affecting over one billion individuals worldwide according to global prevalence estimates (<xref ref-type="bibr" rid="ref10">10</xref>), but also numerous other sleep-related disorders that significantly impact cardiovascular, metabolic, and psychiatric health (<xref ref-type="bibr" rid="ref10">10</xref>, <xref ref-type="bibr" rid="ref11">11</xref>). The American Heart Association&#x2019;s recognition in 2022 of sleep health as one of &#x201C;Life&#x2019;s Essential 8&#x201D; cardiovascular risk factors further underscores the critical importance of sleep medicine expertise (<xref ref-type="bibr" rid="ref12">12</xref>, <xref ref-type="bibr" rid="ref13">13</xref>).</p>
<p>Despite the growing literature examining LLM performance in medical domains, systematic evaluation in sleep medicine has remained limited. Cheong et al. conducted the first comparative assessment of GPT-3.5, GPT-4, and Google Bard on American Board of Sleep Medicine examination questions, revealing that GPT-4 achieved 68.1% overall accuracy, significantly below the 80% threshold commonly referenced as the certification passing standard (<xref ref-type="bibr" rid="ref14">14</xref>). While this pioneering study established a methodological framework, it highlighted the limitations of early generation models.</p>
<p>Subsequent research has explored specific applications of LLMs in sleep medicine. Patel et al. demonstrated that ChatGPT-4&#x2019;s diagnostic accuracy declined with increasing case complexity, emphasizing the need for validation in complex clinical scenarios (<xref ref-type="bibr" rid="ref15">15</xref>). Seifen et al. reported high concordance between ChatGPT-4o and sleep medicine specialists in polysomnography interpretation, suggesting potential utility in specific technical domains (<xref ref-type="bibr" rid="ref16">16</xref>). While these studies have been valuable, they remained limited in scope, focusing on individual models or specific clinical tasks rather than comprehensive cross-model evaluation. Recent investigations have further validated LLM performance assessment methodologies, with emerging frameworks for uncertainty quantification demonstrating significant clinical relevance (<xref ref-type="bibr" rid="ref17">17</xref>).</p>
<p>The rapid evolution of LLM technology necessitates continuous reassessment. The transition from GPT-3.5 to GPT-4 within the four-month period documented by Cheong et al. resulted in substantial performance improvements (<xref ref-type="bibr" rid="ref14">14</xref>). However, the current landscape encompasses multiple competing models with varying architectures, training methodologies, and access tiers that have not been systematically compared in the sleep medicine domain. Moreover, the emergence of tiered pricing models, typically requiring monthly subscription fees of $20&#x2013;30 (pricing as of September 2025; subject to regional variation), raises critical questions about healthcare equity and access to advanced AI technologies (<xref ref-type="bibr" rid="ref18">18</xref>).</p>
<p>This study addresses a critical gap at the intersection of AI validation and sleep medicine education. Previous investigations have examined individual models in isolated clinical tasks (<xref ref-type="bibr" rid="ref14 ref15 ref16">14&#x2013;16</xref>). No comprehensive cross-model evaluation has been conducted using standardized certification-aligned questions across the full spectrum of sleep medicine domains. Our systematic comparison of nine LLM configurations, including both free and premium tiers from major providers, provides the evidence base necessary for informed decisions regarding AI integration into medical education curricula and clinical decision support systems. The emergence of tiered pricing models raises equity concerns that require empirical evaluation rather than speculation.</p>
<p>By extending the methodological foundation established by Cheong et al. and incorporating a broader spectrum of contemporary models, this research provides critical insights into current AI capabilities in sleep medicine, informing evidence based decisions regarding their integration into educational and clinical workflows. We hypothesized that contemporary LLMs would demonstrate substantially improved performance compared to earlier evaluations, and that premium tier models would outperform their free counterparts across sleep medicine subdomains.</p>
</sec>
<sec sec-type="materials|methods" id="sec2">
<label>2</label>
<title>Materials and methods</title>
<sec id="sec3">
<label>2.1</label>
<title>Study design and question development</title>
<p>This cross-sectional comparative study evaluated the performance of contemporary large language models on sleep medicine certification examination-aligned questions between September 19 and September 30, 2025. The study protocol was developed in accordance with best practices for artificial intelligence evaluation in medical domains and received ethical approval (Approval No: 2025/5954) for the use of copyrighted examination materials and expert validation procedures. As no direct patient enrollment or identifiable human subject data were involved, informed consent requirements were waived.</p>
<p>A comprehensive question bank of 197 multiple-choice questions was developed specifically for this study. Each question contained five answer options with a single correct answer, designed to reflect the format and difficulty level of the American Board of Sleep Medicine certification examination. All questions were human-generated by two board-certified sleep medicine specialists, each with over 20&#x202F;years of clinical and academic experience in sleep medicine. This expert panel ensured content validity, clinical relevance, and alignment with current American Academy of Sleep Medicine (AASM) guidelines and certification standards. No generative AI tools were used in question creation, answer key development, or the formulation of clinical scenarios. This human-generated approach ensures that our evaluation assesses genuine LLM medical knowledge rather than the models&#x2019; ability to recognize their own training data or outputs, thereby avoiding potential circularity in AI performance assessment.</p>
<p>Inter-rater agreement between the two expert reviewers was assessed using Cohen&#x2019;s kappa coefficient, demonstrating excellent agreement (<italic>&#x03BA;</italic>&#x202F;=&#x202F;0.91, 95% CI: 0.87&#x2013;0.95). This kappa value specifically reflects inter-rater agreement for answer key correctness validation, where both experts independently identified the single correct answer for each question prior to consensus discussion. Question development followed a rigorous process: initial drafting based on the AASM Sleep Medicine Certification Examination Content Outline (2023 edition, version 2.0), cross review by both experts, pilot testing for clarity and appropriate difficulty, and final validation against current sleep medicine literature. All questions were original compositions to avoid copyright concerns while maintaining fidelity to certification examination standards.</p>
<p>The distribution of questions across sleep medicine domains reflected the AASM certification examination blueprint: Sleep Physiology and Neurobiology (<italic>n</italic>&#x202F;=&#x202F;23), Circadian Rhythm and Insomnia Disorders (<italic>n</italic>&#x202F;=&#x202F;47), Hypersomnolence Disorders (<italic>n</italic>&#x202F;=&#x202F;21), Movement and Behavioral Disorders (<italic>n</italic>&#x202F;=&#x202F;39), Sleep Related Breathing Disorders (<italic>n</italic>&#x202F;=&#x202F;31), Secondary Sleep Disorders (<italic>n</italic>&#x202F;=&#x202F;17), and Diagnostic Methods in Sleep Medicine (<italic>n</italic>&#x202F;=&#x202F;19). Additionally, 10 questions (5.1%) incorporated polysomnography based visual interpretation requiring analysis of sleep stage epochs, respiratory events, and characteristic electrophysiological patterns.</p>
</sec>
<sec id="sec4">
<label>2.2</label>
<title>Large language model selection and configuration</title>
<p>Nine LLM configurations were selected based on public availability, market significance, and representation of major AI providers: ChatGPT (GPT-3.5 free version, GPT-4o premium version), Gemini (2.5 Flash free version, 2.5 Pro premium version), Claude (3.7 Sonnet previous paid version, Opus 4 current premium version), Deepseek V3 (free version), xAI Grok3 (free version), and Llama 3 (free version). This selection encompassed both established providers (OpenAI, Google, Anthropic) and emerging competitors, enabling comprehensive market coverage.</p>
</sec>
<sec id="sec5">
<label>2.3</label>
<title>Testing protocol</title>
<p>To assess response consistency and minimize random variation, each question was presented to each model exactly three times. The testing protocol employed standardized prompting: &#x201C;Please select the single best answer to this question: [question text with five options labeled A through E].&#x201D; No additional context, explanation, or prompt engineering was used to evaluate baseline model performance.</p>
<p>Questions were administered using a parallel testing protocol: each question was presented sequentially to all nine models before proceeding to the next question. For each model, a new conversation session (&#x201C;New Chat&#x201D;) was initiated before each question to reset context and prevent information carryover between questions. All testing was conducted through official web interfaces over an eleven-day period (September 19&#x2013;30, 2025), with model interactions logged with timestamps. Web based interfaces were accessed using default settings without custom system prompts, web browsing, or tool augmentation. Model version identifiers, where available through API or interface metadata, were recorded (see <xref ref-type="table" rid="tab1">Table 1</xref> footnotes). Responses were recorded verbatim, with the first complete response from each iteration used for scoring. This design yielded a total of 5,319 individual responses (197 questions &#x00D7; 9 models &#x00D7; 3 iterations).</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Overall performance metrics of large language models on sleep medicine certification examination.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model configuration</th>
<th align="center" valign="top">
<italic>n</italic>
</th>
<th align="center" valign="top">Correct responses</th>
<th align="center" valign="top">Success rate (%)</th>
<th align="center" valign="top">95% CI (%)</th>
<th align="center" valign="top">Standard error (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle" colspan="6">Gemini family</td>
</tr>
<tr>
<td align="left" valign="middle">Gemini 2.5 Pro (gemini-2.5-pro-preview-05-06) (Premium)</td>
<td align="center" valign="middle">197</td>
<td align="center" valign="middle">189</td>
<td align="center" valign="middle">95.9</td>
<td align="center" valign="middle">93.2&#x2013;98.7</td>
<td align="center" valign="middle">&#x00B1;1.41</td>
</tr>
<tr>
<td align="left" valign="middle">Gemini 2.5 Flash (gemini-2.5-flash-preview-04-17) (Free)</td>
<td align="center" valign="middle">197</td>
<td align="center" valign="middle">174</td>
<td align="center" valign="middle">88.3</td>
<td align="center" valign="middle">83.8&#x2013;92.8</td>
<td align="center" valign="middle">&#x00B1;2.29</td>
</tr>
<tr>
<td align="left" valign="middle" colspan="6">Claude family</td>
</tr>
<tr>
<td align="left" valign="middle">Claude Opus 4 (claude-opus-4-20250514) (Premium)</td>
<td align="center" valign="middle">197</td>
<td align="center" valign="middle">185</td>
<td align="center" valign="middle">93.9</td>
<td align="center" valign="middle">90.6&#x2013;97.2</td>
<td align="center" valign="middle">&#x00B1;1.70</td>
</tr>
<tr>
<td align="left" valign="middle">Claude 3.7 Sonnet (claude-3-7-sonnet-20250219) (Previous)</td>
<td align="center" valign="middle">197</td>
<td align="center" valign="middle">175</td>
<td align="center" valign="middle">88.8</td>
<td align="center" valign="middle">84.4&#x2013;93.2</td>
<td align="center" valign="middle">&#x00B1;2.24</td>
</tr>
<tr>
<td align="left" valign="middle" colspan="6">ChatGPT family</td>
</tr>
<tr>
<td align="left" valign="middle">GPT-4o (gpt-4o-2024-08-06) (Premium)</td>
<td align="center" valign="middle">197</td>
<td align="center" valign="middle">184</td>
<td align="center" valign="middle">93.4</td>
<td align="center" valign="middle">89.9&#x2013;96.9</td>
<td align="center" valign="middle">&#x00B1;1.77</td>
</tr>
<tr>
<td align="left" valign="middle">GPT-3.5 (gpt-3.5-turbo-0125) (Free)</td>
<td align="center" valign="middle">197</td>
<td align="center" valign="middle">167</td>
<td align="center" valign="middle">84.8</td>
<td align="center" valign="middle">79.8&#x2013;89.8</td>
<td align="center" valign="middle">&#x00B1;2.56</td>
</tr>
<tr>
<td align="left" valign="middle" colspan="6">Other models</td>
</tr>
<tr>
<td align="left" valign="middle">Deepseek V3 (deepseek-chat) (Free)</td>
<td align="center" valign="middle">197</td>
<td align="center" valign="middle">180</td>
<td align="center" valign="middle">91.4</td>
<td align="center" valign="middle">87.4&#x2013;95.3</td>
<td align="center" valign="middle">&#x00B1;2.00</td>
</tr>
<tr>
<td align="left" valign="middle">xAI Grok3 (grok-3) (Free)</td>
<td align="center" valign="middle">197</td>
<td align="center" valign="middle">160</td>
<td align="center" valign="middle">81.2</td>
<td align="center" valign="middle">75.8&#x2013;86.7</td>
<td align="center" valign="middle">&#x00B1;2.78</td>
</tr>
<tr>
<td align="left" valign="middle">Llama 3 (llama-3-70b-instruct) (Free)</td>
<td align="center" valign="middle">197</td>
<td align="center" valign="middle">135</td>
<td align="center" valign="middle">68.5</td>
<td align="center" valign="middle">62.0&#x2013;75.0</td>
<td align="center" valign="middle">&#x00B1;3.31</td>
</tr>
<tr>
<td align="left" valign="middle">Total</td>
<td align="center" valign="middle">1,773</td>
<td align="center" valign="middle">1,549</td>
<td align="center" valign="middle">87.4</td>
<td align="center" valign="middle">-</td>
<td align="center" valign="middle">-</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>CI, Confidence interval calculated using Wilson score method.</p>
<p><sup>a</sup>Model version identifiers were recorded where available through API metadata or interface version displays. Web-based interfaces were accessed through official platforms (<ext-link xlink:href="http://chat.openai.com" ext-link-type="uri">chat.openai.com</ext-link>, <ext-link xlink:href="http://gemini.google.com" ext-link-type="uri">gemini.google.com</ext-link>, <ext-link xlink:href="https://claude.ai" ext-link-type="uri">claude.ai</ext-link>, <ext-link xlink:href="http://chat.deepseek.com" ext-link-type="uri">chat.deepseek.com</ext-link>, <ext-link xlink:href="https://x.ai/grok" ext-link-type="uri">x.ai/grok</ext-link>, <ext-link xlink:href="https://meta.ai" ext-link-type="uri">meta.ai</ext-link>) during the testing period (September 19&#x2013;30, 2025).</p>
<p><sup>b</sup>All models were tested using default settings without custom system prompts, web browsing capabilities, or external tool augmentation. Temperature and other generation parameters remained at platform defaults.</p>
<p><sup>c</sup>&#x201C;Premium&#x201D; designation indicates models requiring paid subscription ($20&#x2013;30/month as of September 2025); &#x201C;Free&#x201D; indicates models accessible without subscription at time of testing.</p>
<p><sup>d</sup>Claude 3.7 Sonnet is categorized as &#x201C;Previous Version&#x201D; rather than &#x201C;Free Tier&#x201D; as it represents the preceding generation available during the transition to Claude Opus 4.</p>
<p><sup>e</sup>Version identifiers: GPT-4o (gpt-4o-2024-08-06), GPT-3.5 (gpt-3.5-turbo-0125), Gemini 2.5 Pro (gemini-2.5-pro-preview-05-06), Gemini 2.5 Flash (gemini-2.5-flash-preview-04-17), Claude Opus 4 (claude-opus-4-20250514), Claude 3.7 Sonnet (claude-3-7-sonnet-20250219), Deepseek V3 (deepseek-chat), Grok3 (grok-3), Llama 3 (llama-3-70b-instruct).</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="sec6">
<label>2.4</label>
<title>Response evaluation and statistical analysis</title>
<p>Model responses were evaluated against expert verified answer keys using strict concordance criteria. A question was scored as correct only when the model provided the correct answer in all three iterations (3/3 concordance). Through this aggregation process, the 5,319 individual responses were consolidated into 1,773 question-model pairs (197 questions &#x00D7; 9 models), each representing the summary outcome of three repeated iterations. Questions with any discordant responses (2/3 or fewer correct answers) were classified as incorrect, reflecting the requirement for consistent model reliability. This strict scoring approach was adopted to ensure that reported accuracy rates represent reproducible model performance rather than sporadic correct responses.</p>
<p>Primary outcome measures included overall accuracy rate (percentage of correct responses), 95% confidence intervals calculated using the Wilson score method, and performance differences between model versions. Secondary outcomes encompassed subdomain specific accuracy rates and response consistency metrics.</p>
<p>Statistical analyses were performed using R version 4.3.2 (R Foundation for Statistical Computing, Vienna, Austria). Pearson&#x2019;s chi-square test assessed overall performance heterogeneity across models. We acknowledge that the clustered nature of responses (identical questions across models) may partially violate independence assumptions; however, the consistent patterns across multiple statistical approaches support the robustness of our findings. For the three pre-specified within-family version comparisons (ChatGPT, Gemini, Claude), McNemar&#x2019;s test for paired proportions was applied at conventional significance levels. For the broader post-hoc pairwise analysis encompassing all 36 possible model comparisons (9 models yielding C(9,2)&#x202F;=&#x202F;36 unique pairs), a Bonferroni-adjusted significance threshold was applied (<italic>&#x03B1;</italic>&#x202F;=&#x202F;0.05/36&#x202F;=&#x202F;0.0014). Effect sizes were calculated using Cohen&#x2019;s h for proportion differences. All tests were two tailed with significance set at <italic>p</italic>&#x202F;&#x003C;&#x202F;0.05 unless otherwise specified.</p>
</sec>
<sec id="sec7">
<label>2.5</label>
<title>Data management and quality assurance</title>
<p>Data integrity was ensured through duplicate data entry, systematic verification of response coding, and independent validation of 10% of responses by a second reviewer. Discrepancies were resolved through consensus review of original model outputs.</p>
</sec>
<sec id="sec8">
<label>2.6</label>
<title>Sensitivity analysis of scoring methodology</title>
<p>To evaluate the robustness of our findings across different scoring approaches, we conducted a sensitivity analysis comparing three scoring criteria: (1) single-try scoring, where a question was considered correct if at least one of three iterations was answered correctly (&#x2265;1/3); (2) majority voting, where a question was considered correct if at least two of three iterations were answered correctly (&#x2265;2/3); and (3) strict concordance, where a question was considered correct only if all three iterations were answered correctly (3/3). For each criterion, we calculated overall accuracy rates, response consistency patterns (proportion of questions with 3/3 or 0/3 correct responses), and the number of models exceeding the 80% reference benchmark. Performance metrics, rank ordering, and threshold classifications were compared across all three criteria using Spearman&#x2019;s rank correlation coefficient to assess the stability of model rankings.</p>
</sec>
</sec>
<sec sec-type="results" id="sec9">
<label>3</label>
<title>Results</title>
<sec id="sec10">
<label>3.1</label>
<title>Overall model performance</title>
<p>Analysis of 5,319 individual responses across 1,773 question-model pairs (197 questions &#x00D7; 9 models &#x00D7; 3 iterations) revealed significant heterogeneity in model performance (<xref ref-type="table" rid="tab1">Table 1</xref>). Gemini 2.5 Pro achieved the highest overall accuracy at 95.9% (189/197 correct; 95% CI: 93.2&#x2013;98.7), representing a substantial advancement over previously reported LLM performance in sleep medicine. Claude Opus 4 and ChatGPT GPT-4o demonstrated comparably high performance at 93.9% (95% CI: 90.6&#x2013;97.2) and 93.4% (95% CI: 89.9&#x2013;96.9), respectively.</p>
<p>Among free tier models, Deepseek V3 exhibited the strongest performance at 91.4% (95% CI: 87.4&#x2013;95.3), substantially exceeding the 80% reference benchmark. The lowest performing model was Llama 3, achieving 68.5% accuracy (95% CI: 62.0&#x2013;75.0), similar to early generation model performance reported in previous studies. Overall accuracy across all models was 87.4% (1,549/1,773 correct), substantially higher than historical benchmarks.</p>
<p>Statistical analysis confirmed significant heterogeneity among model performances (<italic>&#x03C7;</italic><sup>2</sup>&#x202F;=&#x202F;101.95, df&#x202F;=&#x202F;8, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), indicating that observed differences exceeded random variation and reflected genuine capability differences.</p>
</sec>
<sec id="sec11">
<label>3.2</label>
<title>Version based performance comparisons</title>
<p>Systematic comparison of free versus premium model versions revealed consistent performance advantages for paid tiers across all three model families with dual versions (<xref ref-type="table" rid="tab2">Table 2</xref>). ChatGPT demonstrated the largest performance gap, with GPT-4o outperforming GPT-3.5 by 8.6 points (93.4% versus 84.8%, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.01). Gemini showed Pro version superiority over Flash by 7.6 points (95.9% versus 88.3%, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.01), while Claude exhibited a 5.1-point improvement from Sonnet 3.7 to Opus 4 (93.9% versus 88.8%, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.05). Comparative model performance metrics with 95% confidence intervals are depicted in <xref ref-type="fig" rid="fig1">Figure 1</xref>.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Pairwise comparison of free versus premium model version performance.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model family</th>
<th align="left" valign="top">Version comparison</th>
<th align="center" valign="top">Success Rate (%)&#x1D43;</th>
<th align="center" valign="top">Absolute Difference&#x1D47;</th>
<th align="center" valign="top">McNemar&#x2019;s Test&#x1D9C;<sup>,d</sup></th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">ChatGPT</td>
<td align="left" valign="top">GPT-3.5 (Free)&#x202F;&#x2192;&#x202F;GPT-4o (Premium)</td>
<td align="center" valign="top">84.8&#x202F;&#x2192;&#x202F;93.4</td>
<td align="center" valign="top">+8.6 percentage points</td>
<td align="center" valign="top"><italic>&#x03C7;</italic><sup>2</sup>&#x202F;=&#x202F;9.8, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.01&#x002A;&#x002A;</td>
</tr>
<tr>
<td align="left" valign="top">Gemini</td>
<td align="left" valign="top">2.5 Flash (Free)&#x202F;&#x2192;&#x202F;2.5 Pro (Premium)</td>
<td align="center" valign="top">88.3&#x202F;&#x2192;&#x202F;95.9</td>
<td align="center" valign="top">+7.6 percentage points</td>
<td align="center" valign="top"><italic>&#x03C7;</italic><sup>2</sup>&#x202F;=&#x202F;8.2, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.01&#x002A;&#x002A;</td>
</tr>
<tr>
<td align="left" valign="top">Claude</td>
<td align="left" valign="top">3.7 Sonnet &#x2192; Opus 4</td>
<td align="center" valign="top">88.8&#x202F;&#x2192;&#x202F;93.9</td>
<td align="center" valign="top">+5.1 percentage points</td>
<td align="center" valign="top"><italic>&#x03C7;</italic><sup>2</sup>&#x202F;=&#x202F;5.4, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.05&#x002A;</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>&#x002A;<italic>p</italic>&#x202F;&#x003C;&#x202F;0.05; &#x002A;&#x002A;<italic>p</italic>&#x202F;&#x003C;&#x202F;0.01. Overall heterogeneity across all nine models: <italic>&#x03C7;</italic><sup>2</sup>&#x202F;=&#x202F;101.95, df&#x202F;=&#x202F;8, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001 (Pearson&#x2019;s chi-square test).</p>
<p>&#x1D43;Descriptive statistic: Percentage of questions answered correctly under strict concordance criterion (3/3 iterations correct).</p>
<p>&#x1D47;Descriptive statistic: Arithmetic difference between paired model versions expressed in percentage points.</p>
<p>&#x1D9C;Inferential statistic: McNemar&#x2019;s test for paired nominal data assessing whether the observed difference exceeds chance expectation. Chi-square values reported with continuity correction. &#x002A;<italic>p</italic>&#x202F;&#x003C;&#x202F;0.05; &#x002A;&#x002A;<italic>p</italic>&#x202F;&#x003C;&#x202F;0.01. Overall heterogeneity across all nine models: <italic>&#x03C7;</italic><sup>2</sup>&#x202F;=&#x202F;101.95, df&#x202F;=&#x202F;8, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001 (Pearson&#x2019;s chi-square test).</p>
<p><sup>d</sup>These three within-family comparisons were pre-specified primary analyses evaluated at conventional significance thresholds. The Bonferroni-adjusted threshold (<italic>&#x03B1;</italic>&#x202F;=&#x202F;0.0014) applies to the broader post-hoc framework of all 36 pairwise model comparisons.</p>
</table-wrap-foot>
</table-wrap>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Forest plot displaying overall accuracy rates and 95% confidence intervals for all nine LLM configurations. The vertical dashed red line indicates the 80% reference benchmark derived from certification passing standards (<xref ref-type="bibr" rid="ref14">14</xref>). Premium-tier models (blue) consistently outperformed free-tier alternatives (orange).</p>
</caption>
<graphic xlink:href="fmed-13-1761025-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Forest plot displaying accuracy rates with 95% confidence intervals for nine LLM configurations on sleep medicine certification-aligned questions, arranged by descending accuracy from Gemini 2.5 Pro (95.9%) to Llama 3 (68.5%). A red dashed line indicates the 80% reference benchmark. Color-coded dots distinguish three categories: premium tier (blue), previous version (purple), and free tier (orange). Eight of nine models exceeded the reference benchmark.</alt-text>
</graphic>
</fig>
<p>All three pre-specified within-family comparisons were statistically significant at conventional thresholds (all <italic>p</italic>&#x202F;&#x003C;&#x202F;0.05), with the ChatGPT and Gemini comparisons reaching <italic>p</italic>&#x202F;&#x003C;&#x202F;0.01 (<xref ref-type="table" rid="tab2">Table 2</xref>). These pre-specified primary analyses were evaluated at conventional significance levels, while the Bonferroni-adjusted threshold (<italic>&#x03B1;</italic>&#x202F;=&#x202F;0.0014) was reserved for the broader post-hoc framework of all 36 pairwise model comparisons. The consistent direction of performance advantages across all three independent model families, combined with highly significant overall heterogeneity (<italic>&#x03C7;</italic><sup>2</sup>&#x202F;=&#x202F;101.95, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), confirms that premium versions offer substantial advantages beyond random variation. The consistency of this pattern across different AI providers suggests fundamental differences in model capacity, training data, or computational resources between pricing tiers.</p>
</sec>
<sec id="sec12">
<label>3.3</label>
<title>Subdomain performance analysis</title>
<p>Evaluation across seven sleep medicine subdomains revealed differential model competencies and identified areas of relative strength and weakness (<xref ref-type="table" rid="tab3">Table 3</xref>). Secondary Sleep Disorders showed the highest mean accuracy across all models (92.0%), with three models achieving perfect scores in this category. Sleep Physiology and Neurobiology demonstrated strong and consistent performance (90.1% mean), while Diagnostic Methods in Sleep Medicine exhibited the greatest variability and lowest mean performance (85.9%). The differential performance patterns across sleep medicine subdomains and the comparative accuracy profiles among the evaluated LLMs are depicted in <xref ref-type="fig" rid="fig2">Figure 2</xref>.</p>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>Performance analysis by sleep medicine subdomain.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Subject category</th>
<th align="center" valign="top">Questions (<italic>n</italic>)</th>
<th align="center" valign="top">Gemini 2.5 Pro (%)</th>
<th align="center" valign="top">Claude Opus 4 (%)</th>
<th align="center" valign="top">ChatGPT GPT-4o (%)</th>
<th align="center" valign="top">Overall Mean (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Sleep Physiology and Neurobiology</td>
<td align="center" valign="top">23</td>
<td align="center" valign="top">95.7</td>
<td align="center" valign="top">95.7</td>
<td align="center" valign="top">100.0</td>
<td align="center" valign="top">90.1</td>
</tr>
<tr>
<td align="left" valign="top">Circadian Rhythm and Insomnia Disorders</td>
<td align="center" valign="top">47</td>
<td align="center" valign="top">95.7</td>
<td align="center" valign="top">97.9</td>
<td align="center" valign="top">89.4</td>
<td align="center" valign="top">87.4</td>
</tr>
<tr>
<td align="left" valign="top">Hypersomnolence Disorders</td>
<td align="center" valign="top">21</td>
<td align="center" valign="top">100.0</td>
<td align="center" valign="top">95.2</td>
<td align="center" valign="top">95.2</td>
<td align="center" valign="top">88.9</td>
</tr>
<tr>
<td align="left" valign="top">Movement and Behavioral Disorders</td>
<td align="center" valign="top">39</td>
<td align="center" valign="top">94.9</td>
<td align="center" valign="top">94.9</td>
<td align="center" valign="top">92.3</td>
<td align="center" valign="top">85.8</td>
</tr>
<tr>
<td align="left" valign="top">Sleep-Related Breathing Disorders</td>
<td align="center" valign="top">31</td>
<td align="center" valign="top">93.5</td>
<td align="center" valign="top">87.1</td>
<td align="center" valign="top">93.5</td>
<td align="center" valign="top">88.0</td>
</tr>
<tr>
<td align="left" valign="top">Secondary Sleep Disorders</td>
<td align="center" valign="top">17</td>
<td align="center" valign="top">100.0</td>
<td align="center" valign="top">100.0</td>
<td align="center" valign="top">94.1</td>
<td align="center" valign="top">92.0</td>
</tr>
<tr>
<td align="left" valign="top">Diagnostic Methods in Sleep Medicine</td>
<td align="center" valign="top">19</td>
<td align="center" valign="top">94.7</td>
<td align="center" valign="top">84.2</td>
<td align="center" valign="top">94.7</td>
<td align="center" valign="top">85.9</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Performance shown for top three models. Overall mean calculated across all nine model configurations.</p>
</table-wrap-foot>
</table-wrap>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Heat map illustrating model performance across seven sleep medicine subdomains. Color gradient ranges from red (lower accuracy) to green (higher accuracy). Secondary sleep disorders showed the highest mean performance (92.0%), while diagnostic methods exhibited the greatest variability (85.9% mean).</p>
</caption>
<graphic xlink:href="fmed-13-1761025-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Heat map illustrating accuracy rates of nine LLM configurations across seven sleep medicine subdomains. Color gradient ranges from dark green (100%) to red (approximately 60%). Models are ordered by overall performance from left (Gemini 2.5 Pro, highest) to right (Llama 3, lowest). Subdomains include Sleep Physiology, Circadian Rhythm and Insomnia, Hypersomnolence, Movement and Behavioral Disorders, Sleep-Related Breathing Disorders, Secondary Sleep Disorders, and Diagnostic Methods.</alt-text>
</graphic>
</fig>
<p>The top performing models (Gemini 2.5 Pro, Claude Opus 4, ChatGPT GPT-4o) maintained high accuracy across most subdomains but showed relative weaknesses in specific areas. Notably, Claude Opus 4 demonstrated lower performance in Diagnostic Methods (84.2%) despite strong overall accuracy. ChatGPT GPT-4o achieved perfect scores in Sleep Physiology and Neurobiology but showed relative weakness in Circadian Rhythm and Insomnia Disorders (89.4%).</p>
</sec>
<sec id="sec13">
<label>3.4</label>
<title>Response consistency analysis</title>
<p>Evaluation of response consistency across three iterations revealed high concordance rates for top-performing models. Perfect agreement (three identical responses) occurred in 94.2% of questions for Gemini 2.5 Pro, 92.8% for Claude Opus 4, and 91.9% for ChatGPT GPT-4o. Lower performing models demonstrated greater response variability, with Llama 3 showing perfect agreement in only 76.3% of questions. This consistency metric provides additional validation of model reliability beyond simple accuracy measurements.</p>
</sec>
<sec id="sec14">
<label>3.5</label>
<title>Systematic error analysis</title>
<p>Analysis of response distribution patterns revealed that LLMs demonstrate predominantly binary behavior rather than probabilistic variation (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table S2</xref>). Perfect consistency (3/3 or 0/3 correct responses) occurred in 95.8% of question-model pairs, while partial consistency (2/3 correct) represented only 0.45% (8/1,773 pairs). The inconsistent response category (1/3 correct) comprised 3.8% of pairs, distributed non-uniformly: ChatGPT GPT-3.5 showed the highest inconsistency (22 questions, 11.2%), while Deepseek V3 demonstrated near-perfect consistency with only 1 inconsistent question (0.5%).</p>
<p>Model-specific error patterns varied substantially. The proportion of questions answered incorrectly across all three iterations ranged from 2.5% (Gemini 2.5 Pro: 5 questions) to 28.9% (Llama 3: 57 questions). Mid-tier models demonstrated intermediate systematic error rates: ChatGPT GPT-3.5 (4.1%), Gemini 2.5 Flash (9.6%), and Claude 3.7 Sonnet (7.1%). These systematic errors, defined as questions consistently answered incorrectly across all iterations, represent fundamental knowledge gaps rather than stochastic variation, as evidenced by the 95.8% overall consistency rate.</p>
</sec>
<sec id="sec15">
<label>3.6</label>
<title>Sensitivity analysis</title>
<p>Sensitivity analysis revealed that scoring methodology had minimal impact on model performance rankings and clinical interpretation (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table S1</xref>). Mean accuracy ranged from 91.1% under single-try scoring to 86.9% under strict concordance, representing an average difference of 4.17 percentage points between the most lenient and most stringent criteria (range: 0.5&#x2013;11.2 percentage points across models). The accuracy gap was largest for ChatGPT GPT-3.5 (11.2 points) and smallest for Deepseek V3 (0.5 points).</p>
<p>Strong positive correlations were observed between all scoring methods (Spearman&#x2019;s <italic>&#x03C1;</italic>&#x202F;=&#x202F;0.879 for single-try vs. majority/strict; <italic>&#x03C1;</italic>&#x202F;=&#x202F;1.000 for majority vs. strict; all <italic>p</italic>&#x202F;&#x003C;&#x202F;0.01), indicating stable model rankings regardless of scoring criteria. Notably, majority voting and strict concordance yielded identical accuracy rates in seven of nine models (78%). This convergence occurred because the partial consistency category (exactly 2/3 correct responses) was extremely rare, representing only 0.45% of all model-question combinations (8 questions out of 1,773 total). Only Llama 3 (6 questions) and ChatGPT GPT-4o (2 questions) demonstrated any 2/3 patterns; all other models showed complete binary response behavior. Response consistency, defined as questions receiving either all correct (3/3) or all incorrect (0/3) responses, averaged 95.8% across models (range: 88.8&#x2013;99.5%), demonstrating highly reproducible LLM performance across iterations. Eight of nine models exceeded the 80% reference benchmark under all three scoring criteria; only Llama 3 consistently fell below this benchmark regardless of scoring method employed (single-try: 71.1%, majority: 68.5%, strict: 65.5%). While some minor rank variations occurred in mid-tier models (particularly GPT-3.5, which dropped from 4th to 7th position between single-try and strict scoring), the top-performing models (Gemini 2.5 Pro, Claude Opus 4, GPT-4o) and bottom-performing model (Llama 3) maintained consistent rankings across all criteria. These findings support the use of majority voting as the primary scoring method, as it provides a balanced approach between lenient and strict criteria while yielding results nearly identical to strict concordance due to the inherently high consistency of LLM responses.</p>
</sec>
</sec>
<sec sec-type="discussion" id="sec16">
<label>4</label>
<title>Discussion</title>
<p>This comprehensive evaluation of contemporary large language models on sleep medicine certification examination-aligned questions reveals a dramatic evolution in AI capabilities since previous assessments. Our findings demonstrate that the current generation of models, particularly premium versions, achieves accuracy levels well above the 80% reference benchmark on certification-aligned questions, with important implications for medical education and assessment methodologies.</p>
<p>Our sensitivity analysis provides important methodological transparency regarding the scoring approach. The finding that majority voting (&#x2265;2/3) and strict concordance (3/3) produced identical results in 78% of models is particularly noteworthy, as it reveals a fundamental characteristic of LLM behavior: these models demonstrate binary response patterns rather than probabilistic variation across iterations. The extremely low frequency of 2/3 patterns (0.45% overall) indicates that when LLMs encounter a given question, they tend to either consistently answer it correctly or consistently fail, with little middle ground. This high reproducibility (95.8% consistency) has important implications for the reliability of LLM-based assessments and suggests that multiple iterations may provide limited additional information beyond confirming the model&#x2019;s stable response pattern. Nevertheless, the 4.17 percentage point difference between single-try and strict scoring, particularly pronounced in models like GPT-3.5 (11.2 points), underscores the importance of defining and reporting scoring methodology in LLM evaluation studies to enable accurate inter-study comparisons.</p>
<sec id="sec17">
<label>4.1</label>
<title>Advancement from historical benchmarks</title>
<p>The contrast between our results and those reported by Cheong et al. is striking, illuminating the rapid tempo of LLM development (<xref ref-type="bibr" rid="ref14">14</xref>). While GPT-4&#x2019;s 68.1% accuracy in their 2023 study fell substantially below the 80% reference benchmark, our evaluation found eight of nine models surpassing this level, with top performers achieving &#x003E;93% accuracy. While this improvement may partially reflect genuine advances in model architecture, training methodologies, and data curation, we acknowledge that question difficulty differences between studies cannot be entirely excluded as a contributing factor, despite our questions being developed by experienced specialists to align with certification examination standards. The observed gains likely reflect a combination of improvements in model architecture, training methodologies, and data curation.</p>
<p>Our findings align with the systematic review by Liu et al., which demonstrated that GPT-4 achieved an overall accuracy rate of 81% across medical licensing examinations worldwide, significantly outperforming GPT-3.5 (58%) (<xref ref-type="bibr" rid="ref19">19</xref>). Notably, the performance variability we observed in sleep medicine examinations mirrors their finding that LLM accuracy is influenced by examination language and regional factors.</p>
<p>Similarly, Zong et al. conducted the largest systematic evaluation to date, assessing 16 different LLMs across 198 medical licensing examinations from 28 countries in 15 languages, confirming substantial cross-model performance heterogeneity (<xref ref-type="bibr" rid="ref20">20</xref>). Their subsequent analysis of ChatGPT performance on the Chinese National Medical Licensing Examination further demonstrated the impact of linguistic and cultural factors on LLM medical knowledge assessment (<xref ref-type="bibr" rid="ref21">21</xref>).</p>
<p>Gemini 2.5 Pro&#x2019;s performance at 95.9% accuracy is particularly noteworthy, demonstrating strong competency, though performance interpretation should account for potential question ambiguity. This performance level indicates that LLMs have transitioned from interesting technological demonstrations to potentially valuable educational assessment tools. However, this capability brings significant responsibilities regarding appropriate implementation and oversight.</p>
<p>The 80% accuracy threshold used as a reference benchmark throughout this study warrants careful interpretation. This value derives from the passing standard of official AASM board examinations, as referenced by Cheong et al. (<xref ref-type="bibr" rid="ref14">14</xref>), and has been widely adopted in LLM evaluation studies across medical specialties as a comparative anchor. Our question bank was developed following AASM blueprint specifications and validated by board-certified specialists with over 20&#x202F;years of clinical experience (<italic>&#x03BA;</italic>&#x202F;=&#x202F;0.91), yet it remains a proprietary instrument whose psychometric properties&#x2014;including item difficulty distribution and discrimination indices&#x2014;have not been formally equated with the official certification examination. No human control group answered these specific questions, which precludes direct validation of the 80% cutoff as a definitive passing standard for this dataset. The benchmark should therefore be read as a comparative reference point grounded in established certification standards, not as an absolute measure of board examination competence. This approach applying external certification thresholds to custom question banks is standard practice in the LLM evaluation literature, where identical constraints apply (<xref ref-type="bibr" rid="ref14">14</xref>, <xref ref-type="bibr" rid="ref20">20</xref>, <xref ref-type="bibr" rid="ref21">21</xref>). That top-performing models achieved &#x003E;93% accuracy under strict concordance scoring suggests robust sleep medicine knowledge irrespective of precise difficulty calibration. Future studies should incorporate human control groups to formally anchor passing thresholds on proprietary question sets and permit direct human&#x2013;AI performance comparisons.</p>
</sec>
<sec id="sec18">
<label>4.2</label>
<title>Premium version performance advantages</title>
<p>Our systematic comparison of free versus premium model versions reveals consistent and statistically significant performance advantages for paid tiers, ranging from 5.1 to 8.6 points. This finding carries profound implications for healthcare equity and access to AI enhanced medical education.</p>
<p>The ChatGPT family demonstrated the largest gap (8.6 points), suggesting that OpenAI reserves its most capable models for paying subscribers. These performance gaps represent not merely statistical curiosities but meaningful differences in practical utility. An 8.6-point difference translates to approximately 17 additional correct answers on a 197-question examination&#x2014;potentially the difference between passing and failing for a human test taker.</p>
<p>As medical institutions increasingly integrate AI tools into educational curricula, students without access to premium versions may face systematic disadvantages. The financial barriers are non-trivial. With premium subscriptions typically costing $20&#x2013;30 monthly per model, comprehensive access to top performing AI tools can exceed $100 monthly&#x2014;a substantial burden for medical students already facing significant educational debt. In low and middle-income countries, where average medical salaries may be lower than in developed nations, these costs become even more prohibitive, potentially exacerbating global health inequities (<xref ref-type="bibr" rid="ref22">22</xref>). However, beyond financial accessibility, fundamental limitations exist even in premium models.</p>
<p>Beyond cost considerations, the presence of consistently incorrect responses (0/3 pattern) even in premium models raises important considerations for clinical and educational deployment. Unlike sporadic errors that may be mitigated through multiple queries or ensemble approaches, systematic errors represent persistent knowledge gaps or reasoning failures that users cannot readily identify without expert verification. Our finding that even top-performing models exhibited 5&#x2013;6 questions with consistent incorrect responses underscores the continued necessity for human oversight in any AI-assisted clinical decision support application. These &#x201C;entrenched misconceptions&#x201D; may be particularly problematic in educational settings, where confident but incorrect AI outputs could reinforce rather than correct learner misunderstandings.</p>
</sec>
<sec id="sec19">
<label>4.3</label>
<title>Subdomain performance patterns</title>
<p>Performance analysis across sleep medicine subdomains provides insights into current LLM capabilities and limitations. Importantly, questions were administered to all models in identical sequence, which may introduce order effects; however, the randomization of testing sessions across the eleven-day window and independent session resets partially mitigate this methodological concern. The uniformly high performance in Secondary Sleep Disorders (92.0% mean) demonstrates high accuracy in identifying sleep manifestations of systemic conditions&#x2014;a domain requiring integration of broader medical knowledge. Conversely, lower and more variable performance in Diagnostic Methods (85.9% mean) suggests relative weakness in technical procedural knowledge, potentially reflecting the complexity of integrating clinical guidelines with practical diagnostic applications. Notably, our evaluation included polysomnography based visual interpretation questions, where models demonstrated competence in recognizing characteristic sleep stage patterns, indicating that multimodal capabilities are developing in current LLM systems.</p>
<p>Perfect scores achieved by multiple models in certain categories (e.g., ChatGPT GPT-4o in Sleep Physiology) indicate that foundational knowledge is well-represented in training corpora. However, the persistence of relative weaknesses even in top performing models demonstrates that comprehensive sleep medicine knowledge coverage remains incomplete in current AI systems. This pattern supports the ongoing necessity of human expertise, particularly in complex diagnostic interpretation and nuanced clinical decision making.</p>
</sec>
<sec id="sec20">
<label>4.4</label>
<title>Implications for medical education</title>
<p>The capabilities demonstrated by contemporary LLMs necessitate fundamental reconsideration of medical education approaches in sleep medicine and beyond. Traditional pedagogical methods relying on knowledge transmission and recall may become outdated when students have access to AI systems capable of providing instantly accurate answers to factual questions. Instead, medical education must evolve to emphasize critical thinking, clinical reasoning, patient communication, and ethical decision making&#x2014;uniquely human capabilities beyond current AI scope (<xref ref-type="bibr" rid="ref23">23</xref>).</p>
<p>Several integration strategies merit consideration. First, AI-enhanced learning platforms could provide personalized education by identifying knowledge gaps and adapting content to individual learning styles. Second, LLMs could generate unlimited practice questions and clinical scenarios, addressing the historical limitation of restricted question banks. Third, AI tutors could provide 24/7 availability for student queries, complementing human educator availability.</p>
<p>However, these opportunities come with significant challenges. The risk of over-reliance on AI tools may impede development of independent clinical reasoning skills. Students may struggle to recognize AI errors or inappropriate responses without a strong foundational knowledge base. Additionally, the &#x201C;black box&#x201D; nature of LLM reasoning complicates understanding why specific answers are generated, potentially propagating misconceptions if errors go unrecognized (<xref ref-type="bibr" rid="ref24">24</xref>).</p>
<p>Medical educators must therefore develop new competencies in AI literacy, understanding both the capabilities and limitations of these tools. Curricular reform should include explicit training on AI tool evaluation, appropriate use cases, and recognition of potential biases or errors. Assessment methodologies may require fundamental revision, moving beyond multiple choice examinations that AI can easily master toward performance-based assessments requiring demonstration of clinical skills and judgment.</p>
</sec>
<sec id="sec21">
<label>4.5</label>
<title>Hallucination risk and transparency considerations</title>
<p>Despite the impressive accuracy demonstrated in this study, the phenomenon of AI &#x201C;hallucination&#x201D;&#x2014;the generation of plausible but factually incorrect information&#x2014;remains a critical concern for medical applications. Large language models can produce confident, well-structured responses that contain subtle inaccuracies, fabricated citations, or clinically inappropriate recommendations. In the context of sleep medicine, such hallucinations could include incorrect dosing recommendations for sedative-hypnotics, misattribution of polysomnographic findings, or inappropriate diagnostic criteria for sleep disorders.</p>
<p>The multiple-choice format used in this study inherently constrains model responses to predefined options, potentially masking hallucination tendencies that would manifest in free response clinical scenarios. Furthermore, the &#x201C;black box&#x201D; nature of current LLM architectures limits transparency regarding the reasoning processes underlying model outputs. This opacity complicates error detection and undermines the trust necessary for educational or clinical integration.</p>
<p>Future research should specifically assess hallucination rates in open-ended sleep medicine queries and develop validation frameworks that ensure model outputs meet standards for medical accuracy and transparency.</p>
</sec>
<sec id="sec22">
<label>4.6</label>
<title>Educational and research applications</title>
<p>The high accuracy rates demonstrated by premium models suggest potential utility in educational settings and knowledge assessment, though clinical decision support applications require additional validation. In the United States, the number of board-certified sleep otolaryngologists shows a year-over-year declining trend (<xref ref-type="bibr" rid="ref25">25</xref>). In developing countries where sleep medicine specialists are nearly absent despite high disease burden, AI-assisted diagnosis and management could improve care access (<xref ref-type="bibr" rid="ref26">26</xref>).</p>
<p>Potential clinical applications include preliminary screening of sleep diary data, assistance in polysomnography scoring, generation of differential diagnoses based on clinical presentations, and provision of evidence-based treatment recommendations. However, implementation must proceed carefully with appropriate safeguards. Regulatory frameworks must address AI tool validation, liability considerations, and maintenance of human oversight. Professional societies should develop guidelines for appropriate AI use, ensuring these tools augment rather than replace clinical judgment.</p>
<p>Our findings align with and extend observations from recent investigations of LLM capabilities in sleep medicine. Seifen et al. reported high concordance between ChatGPT-4o and sleep specialists in polysomnography interpretation (<xref ref-type="bibr" rid="ref16">16</xref>). Our subdomain analysis supports this pattern. LLMs achieved highest consistency in Secondary Sleep Disorders (92.0% mean accuracy), which typically require integration of established medical knowledge rather than complex technical interpretation. Conversely, the relatively lower performance in Diagnostic Methods (85.9% mean), combined with Patel et al.&#x2019;s finding of declining accuracy with increasing case complexity (<xref ref-type="bibr" rid="ref15">15</xref>), suggests current models perform optimally for knowledge-based queries while demonstrating limitations in tasks requiring nuanced procedural reasoning. Our inclusion of 10 polysomnography-based visual interpretation questions, where models demonstrated competence in recognizing characteristic sleep stage patterns, provides preliminary evidence that multimodal capabilities are developing, though text-based performance remains superior.</p>
</sec>
<sec id="sec23">
<label>4.7</label>
<title>Ethical considerations and societal impact</title>
<p>The rapid advancement of LLM capabilities raises fundamental questions about the social contract between medical professionals and society. Traditional medical education represents a significant investment of time and resources with implicit promises of specialized expertise and corresponding professional privileges. If AI systems can match or exceed human performance on certification examinations, this contract requires reexamination.</p>
<p>The potential of AI to democratize medical knowledge is double-edged. While improved access to accurate medical information could empower patients and healthcare workers in underserved regions, it also risks undermining professional expertise and potentially enabling unsafe self-diagnosis or treatment. The phenomenon of medical students avoiding radiology careers due to perceived AI threats could extend to sleep medicine if not carefully managed (<xref ref-type="bibr" rid="ref27">27</xref>).</p>
<p>Moreover, the concentration of advanced AI capabilities among a few technology companies raises concerns about corporate influence over healthcare. As operational costs for these companies funding large LLMs continue to rise, collaboration between technological, medical, and scientific institutions becomes inevitable for applications that can be integrated into clinical practice without cost concerns. The medical community must actively participate in governance discussions to ensure AI development aligns with health values and patient interests.</p>
</sec>
<sec id="sec24">
<label>4.8</label>
<title>Global health equity considerations</title>
<p>Perhaps most critically, the paywall barrier between free and premium AI models threatens to create or exacerbate healthcare disparities. In an era where AI tools increasingly augment human capabilities, those without access to premium versions may face systematic disadvantages in education, clinical practice, and career advancement. This digital divide may manifest at multiple levels: individual practitioners, healthcare institutions, and entire nations.</p>
<p>International organizations and professional societies should consider initiatives to ensure equitable AI access. Potential strategies include negotiated institutional licenses for medical schools in low-income countries, development of open source alternatives with comparable capabilities, and advocacy for AI as a public good in healthcare contexts. Without proactive intervention, AI risks becoming another mechanism through which global health inequities are perpetuated rather than alleviated.</p>
</sec>
<sec id="sec25">
<label>4.9</label>
<title>Limitations and future directions</title>
<p>Several limitations should be considered when interpreting our findings. First, these results should be interpreted within the context of certification examination-aligned assessment rather than as direct predictors of official board examination performance. While our questions were developed following AASM blueprint specifications and validated by board-certified specialists with over 20&#x202F;years of clinical experience (<italic>&#x03BA;</italic>&#x202F;=&#x202F;0.91 for answer key agreement), differences in question pool size, proprietary examination algorithms, adaptive testing formats, and high-stakes testing conditions used in official certifications may influence real-world examination outcomes. Our question bank, though comprehensive across seven AASM domains, lacks the multimodal complexity of actual board examinations, which may include additional polysomnographic scoring tasks, video-based case presentations, and time-pressure elements not replicated in our evaluation protocol. Furthermore, LLM performance on static question sets does not capture potential vulnerabilities to adversarial prompting or real-time clinical decision-making under uncertainty that characterize authentic medical practice. In addition, the 80% passing threshold referenced throughout this study represents an external benchmark derived from official certification standards rather than a psychometrically validated cutoff for our specific question bank. Without a human control group answering these same questions, we cannot confirm that 80% accuracy on our instrument corresponds to the competence level required by the official board examination. This constraint, while common across LLM evaluation studies employing custom question banks, should be weighed when interpreting threshold-based classifications.</p>
<p>Second, our evaluation used single initial responses rather than analyzing multiple generations or conversational refinement, potentially underestimating real world performance where users might request clarification or alternative explanations.</p>
<p>Third, the cross sectional design provides a snapshot of rapidly evolving technology, and temporal drift represents a significant concern. LLM providers frequently update model weights, training data, and inference parameters often without public announcement or documentation. Consequently, model capabilities may have changed substantially between our testing period (September 2025) and publication, and future researchers attempting to replicate these findings may encounter different model behaviors. This inherent instability of commercial LLM platforms complicates longitudinal comparisons and reproducibility efforts.</p>
<p>Our use of official web-based user interfaces rather than application programming interfaces (APIs) for model testing introduces additional methodological considerations. This approach mirrors how clinicians and students actually interact with these tools, but it limits experimental control and reproducibility in several respects. (a) Web interfaces do not expose generation parameters such as temperature or top-p sampling; these remained at undisclosed platform defaults throughout testing. The inability to fix stochastic generation parameters introduces irreducible randomness beyond the inherent variability of language model outputs, though the 95.8% response consistency we observed across three independent iterations suggests this uncontrolled variability had limited practical impact. (b) Results should be interpreted as reflecting the performance of these models as consumer-facing products&#x2014;inclusive of hidden system prompts, safety filters, and platform-specific optimizations&#x2014;rather than the pure architectural capabilities of the underlying foundation models. Tarabanis et al. reported a 3.2&#x2013;5.3% performance decrease when accessing GPT models through APIs compared with their web-based chatbot counterparts, indicating that platform-level configurations can meaningfully shift observed accuracy in either direction (<xref ref-type="bibr" rid="ref28">28</xref>). (c) Web interface configurations are subject to unlogged modifications by providers&#x2014;including silent changes to system prompts, inference parameters, or model routing&#x2014;posing inherent risks to exact reproducibility. Researchers attempting replication via API access or at different time points may encounter divergent model behaviors. We adopted this web-based approach deliberately, prioritizing ecological validity and real-world accessibility over strict parameter control, and we recognize that this trade-off represents a core tension in consumer AI evaluation research.</p>
<p>Future research should address these limitations through longitudinal performance tracking, expanded evaluation of multimodal capabilities across diverse visual data types, and assessment of explanation quality beyond simple accuracy. Studies examining real world clinical outcomes when AI tools are integrated into practical workflows will provide critical evidence for implementation decisions. Additionally, investigation of potential biases in model responses across different patient populations could identify equity concerns requiring mitigation.</p>
</sec>
</sec>
<sec sec-type="conclusions" id="sec26">
<label>5</label>
<title>Conclusion</title>
<p>This study reveals that contemporary LLMs, particularly premium versions, exhibit substantial proficiency in sleep medicine knowledge, with most models exceeding the 80% reference benchmark on certification-aligned questions. These results, while not directly equivalent to official board examination performance, represent a marked advance over earlier evaluations. The superior performance of paid models raises concerns regarding equitable access to advanced AI tools in medical education and clinical support. Therefore, the future integration of these technologies necessitates robust governance and ethical frameworks to ensure they augment clinical practice and promote healthcare equity rather than exacerbating disparities.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec27">
<title>Data availability statement</title>
<p>The datasets generated and analyzed during the current study are available from the corresponding author upon reasonable request. Requests to access the datasets should be directed to <email xlink:href="mailto:koc.abdurrahman@gmail.com">koc.abdurrahman@gmail.com</email>.</p>
</sec>
<sec sec-type="author-contributions" id="sec28">
<title>Author contributions</title>
<p>AK: Conceptualization, Formal analysis, Investigation, Methodology, Project administration, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. AA: Data curation, Investigation, Methodology, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. &#x015E;Y: Project administration, Resources, Supervision, Validation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. HV: Resources, Supervision, Validation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec sec-type="COI-statement" id="sec29">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec30">
<title>Generative AI statement</title>
<p>The author(s) declared that Generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec31">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec32">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/fmed.2026.1761025/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/fmed.2026.1761025/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Table_1.docx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><label>1.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mesk&#x00F3;</surname><given-names>B</given-names></name> <name><surname>Topol</surname><given-names>EJ</given-names></name></person-group>. <article-title>The imperative for regulatory oversight of large language models (or generative AI) in healthcare</article-title>. <source>NPJ Digit Med</source>. (<year>2023</year>) <volume>6</volume>:<fpage>120</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41746-023-00873-0</pub-id>, <pub-id pub-id-type="pmid">37414860</pub-id></mixed-citation></ref>
<ref id="ref2"><label>2.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lee</surname><given-names>P</given-names></name> <name><surname>Bubeck</surname><given-names>S</given-names></name> <name><surname>Petro</surname><given-names>J</given-names></name></person-group>. <article-title>Benefits, limits, and risks of GPT-4 as an AI chatbot for medicine</article-title>. <source>N Engl J Med</source>. (<year>2023</year>) <volume>388</volume>:<fpage>1233</fpage>&#x2013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.1056/NEJMsr2214184</pub-id>, <pub-id pub-id-type="pmid">36988602</pub-id></mixed-citation></ref>
<ref id="ref3"><label>3.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kung</surname><given-names>TH</given-names></name> <name><surname>Cheatham</surname><given-names>M</given-names></name> <name><surname>Medenilla</surname><given-names>A</given-names></name> <name><surname>Sillos</surname><given-names>C</given-names></name> <name><surname>De Leon</surname><given-names>L</given-names></name> <name><surname>Elepa&#x00F1;o</surname><given-names>C</given-names></name> <etal/></person-group>. <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>. <source>PLoS Digit Health</source>. (<year>2023</year>) <volume>2</volume>:<fpage>e0000198</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>, <pub-id pub-id-type="pmid">36812645</pub-id></mixed-citation></ref>
<ref id="ref4"><label>4.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Skalidis</surname><given-names>I</given-names></name> <name><surname>Cagnina</surname><given-names>A</given-names></name> <name><surname>Luangphiphat</surname><given-names>W</given-names></name> <name><surname>Mach</surname><given-names>F</given-names></name> <name><surname>Rivas</surname><given-names>A</given-names></name> <name><surname>Aeschbacher</surname><given-names>S</given-names></name> <etal/></person-group>. <article-title>ChatGPT takes on the European exam in core cardiology: an artificial intelligence success story?</article-title> <source>Eur Heart J Digit Health</source>. (<year>2023</year>) <volume>4</volume>:<fpage>279</fpage>&#x2013;<lpage>81</lpage>. doi: <pub-id pub-id-type="doi">10.1093/ehjdh/ztad029</pub-id></mixed-citation></ref>
<ref id="ref5"><label>5.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bhayana</surname><given-names>R</given-names></name> <name><surname>Bleakney</surname><given-names>RR</given-names></name> <name><surname>Krishna</surname><given-names>S</given-names></name></person-group>. <article-title>GPT-4 in radiology: improvements in advanced reasoning</article-title>. <source>Radiology</source>. (<year>2023</year>) <volume>307</volume>:<fpage>e230987</fpage>. doi: <pub-id pub-id-type="doi">10.1148/radiol.230987</pub-id></mixed-citation></ref>
<ref id="ref6"><label>6.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hoch</surname><given-names>CC</given-names></name> <name><surname>Wollenberg</surname><given-names>B</given-names></name> <name><surname>L&#x00FC;ers</surname><given-names>JC</given-names></name> <name><surname>Knoedler</surname><given-names>S</given-names></name> <name><surname>Knoedler</surname><given-names>L</given-names></name> <name><surname>Frank</surname><given-names>K</given-names></name> <etal/></person-group>. <article-title>Chatgpt's quiz skills in different otolaryngology subspecialties: an analysis of 2576 single-choice and multiple-choice board certification preparation questions</article-title>. <source>Eur Arch Otorrinolaringol</source>. (<year>2023</year>) <volume>280</volume>:<fpage>4271</fpage>&#x2013;<lpage>8</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s00405-023-08051-4</pub-id>, <pub-id pub-id-type="pmid">37285018</pub-id></mixed-citation></ref>
<ref id="ref7"><label>7.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Antaki</surname><given-names>F</given-names></name> <name><surname>Touma</surname><given-names>S</given-names></name> <name><surname>Milad</surname><given-names>D</given-names></name> <name><surname>El-Khoury</surname><given-names>J</given-names></name> <name><surname>Duval</surname><given-names>R</given-names></name></person-group>. <article-title>Evaluating the performance of ChatGPT in ophthalmology: an analysis of its successes and shortcomings</article-title>. <source>Ophthalmol Sci</source>. (<year>2023</year>) <volume>3</volume>:<fpage>100324</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.xops.2023.100324</pub-id>, <pub-id pub-id-type="pmid">37334036</pub-id></mixed-citation></ref>
<ref id="ref8"><label>8.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kumah-Crystal</surname><given-names>Y</given-names></name> <name><surname>Mankowitz</surname><given-names>S</given-names></name> <name><surname>Embi</surname><given-names>P</given-names></name> <name><surname>Lehmann</surname><given-names>CU</given-names></name></person-group>. <article-title>ChatGPT and the clinical informatics board examination: the end of unproctored maintenance of certification?</article-title> <source>J Am Med Inform Assoc</source>. (<year>2023</year>) <volume>30</volume>:<fpage>1558</fpage>&#x2013;<lpage>60</lpage>. doi: <pub-id pub-id-type="doi">10.1093/jamia/ocad104</pub-id>, <pub-id pub-id-type="pmid">37335851</pub-id></mixed-citation></ref>
<ref id="ref9"><label>9.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Quan</surname><given-names>SF</given-names></name> <name><surname>Buysse</surname><given-names>DJ</given-names></name> <name><surname>Ward</surname><given-names>SLD</given-names></name> <name><surname>Gozal</surname><given-names>D</given-names></name> <name><surname>Redline</surname><given-names>S</given-names></name> <name><surname>Rosen</surname><given-names>CL</given-names></name> <etal/></person-group>. <article-title>Development and growth of a large multispecialty certification examination: sleep medicine certification&#x2014;results of the first three examinations</article-title>. <source>J Clin Sleep Med</source>. (<year>2012</year>) <volume>8</volume>:<fpage>221</fpage>&#x2013;<lpage>4</lpage>. doi: <pub-id pub-id-type="doi">10.5664/jcsm.1790</pub-id></mixed-citation></ref>
<ref id="ref10"><label>10.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Benjafield</surname><given-names>AV</given-names></name> <name><surname>Ayas</surname><given-names>NT</given-names></name> <name><surname>Eastwood</surname><given-names>PR</given-names></name> <name><surname>Heinzer</surname><given-names>R</given-names></name> <name><surname>Ip</surname><given-names>MSM</given-names></name> <name><surname>Morrell</surname><given-names>MJ</given-names></name> <etal/></person-group>. <article-title>Estimation of the global prevalence and burden of obstructive sleep apnoea: a literature-based analysis</article-title>. <source>Lancet Respir Med</source>. (<year>2019</year>) <volume>7</volume>:<fpage>687</fpage>&#x2013;<lpage>98</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S2213-2600(19)30198-5</pub-id>, <pub-id pub-id-type="pmid">31300334</pub-id></mixed-citation></ref>
<ref id="ref11"><label>11.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Grandner</surname><given-names>MA</given-names></name> <name><surname>Fernandez</surname><given-names>FX</given-names></name></person-group>. <article-title>The translational neuroscience of sleep: a contextual framework</article-title>. <source>Science</source>. (<year>2021</year>) <volume>374</volume>:<fpage>568</fpage>&#x2013;<lpage>73</lpage>. doi: <pub-id pub-id-type="doi">10.1126/science.abj8188</pub-id>, <pub-id pub-id-type="pmid">34709899</pub-id></mixed-citation></ref>
<ref id="ref12"><label>12.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lloyd-Jones</surname><given-names>DM</given-names></name> <name><surname>Allen</surname><given-names>NB</given-names></name> <name><surname>Anderson</surname><given-names>CAM</given-names></name> <name><surname>Black</surname><given-names>T</given-names></name> <name><surname>Brewer</surname><given-names>LC</given-names></name> <name><surname>Forber</surname><given-names>RE</given-names></name> <etal/></person-group>. <article-title>Life's essential 8: updating and enhancing the American Heart Association's construct of cardiovascular health: a presidential advisory from the American Heart Association</article-title>. <source>Circulation</source>. (<year>2022</year>) <volume>146</volume>:<fpage>e18</fpage>&#x2013;<lpage>43</lpage>. doi: <pub-id pub-id-type="doi">10.1161/CIR.0000000000001078</pub-id></mixed-citation></ref>
<ref id="ref13"><label>13.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yosunkaya</surname><given-names>&#x015E;</given-names></name> <name><surname>Teke</surname><given-names>T</given-names></name> <name><surname>Maden</surname><given-names>E</given-names></name> <name><surname>Kurt</surname><given-names>B</given-names></name> <name><surname>Borekci</surname><given-names>S</given-names></name> <name><surname>Kutlu</surname><given-names>R</given-names></name></person-group>. <article-title>Obezite hipoventilasyon sendromlu kad&#x0131;n hastalar&#x0131;n de&#x011F;erlendirilmesi</article-title>. <source>Selcuk Tip Derg</source>. (<year>2012</year>) <volume>28</volume>:<fpage>42</fpage>&#x2013;<lpage>4</lpage>.</mixed-citation></ref>
<ref id="ref14"><label>14.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cheong</surname><given-names>RCT</given-names></name> <name><surname>Pang</surname><given-names>KP</given-names></name> <name><surname>Unadkat</surname><given-names>S</given-names></name> <name><surname>McNeillis</surname><given-names>V</given-names></name> <name><surname>Williamson</surname><given-names>A</given-names></name> <name><surname>Joseph</surname><given-names>J</given-names></name> <etal/></person-group>. <article-title>Performance of artificial intelligence chatbots in sleep medicine certification board exams: ChatGPT versus Google bard</article-title>. <source>Eur Arch Otorrinolaringol</source>. (<year>2024</year>) <volume>281</volume>:<fpage>2137</fpage>&#x2013;<lpage>43</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s00405-023-08381-3</pub-id>, <pub-id pub-id-type="pmid">38117307</pub-id></mixed-citation></ref>
<ref id="ref15"><label>15.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Patel</surname><given-names>A</given-names></name> <name><surname>Cheung</surname><given-names>J</given-names></name></person-group>. <article-title>Artificial intelligence in sleep medicine: assessing the diagnostic precision of ChatGPT-4</article-title>. <source>J Clin Sleep Med</source>. (<year>2025</year>) <volume>21</volume>:<fpage>1511</fpage>&#x2013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.5664/jcsm.11732</pub-id>, <pub-id pub-id-type="pmid">40265240</pub-id></mixed-citation></ref>
<ref id="ref16"><label>16.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Seifen</surname><given-names>C</given-names></name> <name><surname>Huppertz</surname><given-names>T</given-names></name> <name><surname>Gouveris</surname><given-names>H</given-names></name> <name><surname>Bahr-Hamm</surname><given-names>K</given-names></name> <name><surname>Pordzik</surname><given-names>J</given-names></name> <name><surname>Eckrich</surname><given-names>J</given-names></name> <etal/></person-group>. <article-title>Chasing sleep physicians: ChatGPT-4o on the interpretation of polysomnographic results</article-title>. <source>Eur Arch Otorrinolaringol</source>. (<year>2025</year>) <volume>282</volume>:<fpage>1631</fpage>&#x2013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s00405-024-08985-3</pub-id>, <pub-id pub-id-type="pmid">39427271</pub-id></mixed-citation></ref>
<ref id="ref17"><label>17.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Madrid</surname><given-names>J</given-names></name> <name><surname>Diehl</surname><given-names>P</given-names></name> <name><surname>Selig</surname><given-names>M</given-names></name> <name><surname>Rolauffs</surname><given-names>B</given-names></name> <name><surname>Hans</surname><given-names>FP</given-names></name> <name><surname>Busch</surname><given-names>HJ</given-names></name> <etal/></person-group>. <article-title>Performance of plug-in augmented ChatGPT and its ability to quantify uncertainty: simulation study on the German medical board examination</article-title>. <source>JMIR Med Educ</source>. (<year>2025</year>) <volume>11</volume>:<fpage>e58375</fpage>. doi: <pub-id pub-id-type="doi">10.2196/58375</pub-id>, <pub-id pub-id-type="pmid">40116759</pub-id></mixed-citation></ref>
<ref id="ref18"><label>18.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Khan</surname><given-names>RA</given-names></name> <name><surname>Jawaid</surname><given-names>M</given-names></name> <name><surname>Khan</surname><given-names>AR</given-names></name> <name><surname>Sajjad</surname><given-names>M</given-names></name></person-group>. <article-title>ChatGPT&#x2014;reshaping medical education and clinical management</article-title>. <source>Pak J Med Sci</source>. (<year>2023</year>) <volume>39</volume>:<fpage>605</fpage>&#x2013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.12669/pjms.39.2.7653</pub-id>, <pub-id pub-id-type="pmid">36950398</pub-id></mixed-citation></ref>
<ref id="ref19"><label>19.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname><given-names>M</given-names></name> <name><surname>Okuhara</surname><given-names>T</given-names></name> <name><surname>Chang</surname><given-names>X</given-names></name> <name><surname>Shirabe</surname><given-names>R</given-names></name> <name><surname>Nishiie</surname><given-names>Y</given-names></name> <name><surname>Okada</surname><given-names>H</given-names></name> <etal/></person-group>. <article-title>Performance of ChatGPT across different versions in medical licensing examinations worldwide: systematic review and meta-analysis</article-title>. <source>J Med Internet Res</source>. (<year>2024</year>) <volume>26</volume>:<fpage>e60807</fpage>. doi: <pub-id pub-id-type="doi">10.2196/60807</pub-id>, <pub-id pub-id-type="pmid">39052324</pub-id></mixed-citation></ref>
<ref id="ref20"><label>20.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zong</surname><given-names>H</given-names></name> <name><surname>Wu</surname><given-names>R</given-names></name> <name><surname>Cha</surname><given-names>J</given-names></name> <name><surname>Feng</surname><given-names>W</given-names></name> <name><surname>Wu</surname><given-names>E</given-names></name> <name><surname>Li</surname><given-names>J</given-names></name> <etal/></person-group>. <article-title>Large language model-based performance assessment of 198 medical licensing examinations in 28 countries: cross-national comparative study</article-title>. <source>J Med Internet Res</source>. (<year>2024</year>) <volume>26</volume>:<fpage>e66114</fpage>. doi: <pub-id pub-id-type="doi">10.2196/66114</pub-id></mixed-citation></ref>
<ref id="ref21"><label>21.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zong</surname><given-names>H</given-names></name> <name><surname>Li</surname><given-names>J</given-names></name> <name><surname>Wu</surname><given-names>E</given-names></name> <name><surname>Wu</surname><given-names>R</given-names></name> <name><surname>Lu</surname><given-names>J</given-names></name> <name><surname>Shen</surname><given-names>B</given-names></name></person-group>. <article-title>Performance of ChatGPT on Chinese National Medical Licensing Examinations: a five-year examination evaluation study for physicians, pharmacists and nurses</article-title>. <source>BMC Med Educ</source>. (<year>2024</year>) <volume>24</volume>:<fpage>143</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12909-024-05125-7</pub-id>, <pub-id pub-id-type="pmid">38355517</pub-id></mixed-citation></ref>
<ref id="ref22"><label>22.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Oosthuizen</surname><given-names>RM</given-names></name></person-group>. <article-title>The fourth industrial revolution&#x2014;smart technology, artificial intelligence, robotics and algorithms: industrial psychologists in future workplaces</article-title>. <source>Front Artif Intell</source>. (<year>2022</year>) <volume>5</volume>:<fpage>913168</fpage>. doi: <pub-id pub-id-type="doi">10.3389/frai.2022.913168</pub-id>, <pub-id pub-id-type="pmid">35875193</pub-id></mixed-citation></ref>
<ref id="ref23"><label>23.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Masters</surname><given-names>K</given-names></name></person-group>. <article-title>Ethical use of artificial intelligence in health professions education: AMEE guide no. 158</article-title>. <source>Med Teach</source>. (<year>2023</year>) <volume>45</volume>:<fpage>574</fpage>&#x2013;<lpage>84</lpage>. doi: <pub-id pub-id-type="doi">10.1080/0142159X.2023.2186203</pub-id>, <pub-id pub-id-type="pmid">36912253</pub-id></mixed-citation></ref>
<ref id="ref24"><label>24.</label><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Nori</surname><given-names>H</given-names></name> <name><surname>King</surname><given-names>N</given-names></name> <name><surname>McKinney</surname><given-names>SM</given-names></name> <name><surname>Carignan</surname><given-names>D</given-names></name> <name><surname>Horvitz</surname><given-names>E</given-names></name></person-group>. (<year>2023</year>). Capabilities of GPT-4 on medical challenge problems. <italic>arXiv</italic> <comment>[Preprint]</comment>.</mixed-citation></ref>
<ref id="ref25"><label>25.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname><given-names>PK</given-names></name> <name><surname>Gadkaree</surname><given-names>SK</given-names></name> <name><surname>Li</surname><given-names>J</given-names></name> <name><surname>Yeung</surname><given-names>J</given-names></name> <name><surname>Ishman</surname><given-names>SL</given-names></name></person-group>. <article-title>Characteristics of the dual board-certified sleep otolaryngology workforce</article-title>. <source>Laryngoscope</source>. (<year>2021</year>) <volume>131</volume>:<fpage>E2712</fpage>&#x2013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.1002/lary.29725</pub-id></mixed-citation></ref>
<ref id="ref26"><label>26.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Roche</surname><given-names>J</given-names></name> <name><surname>Rae</surname><given-names>DE</given-names></name> <name><surname>Redman</surname><given-names>KN</given-names></name> <name><surname>Khumalo</surname><given-names>NP</given-names></name> <name><surname>Micklesfield</surname><given-names>LK</given-names></name> <name><surname>Kolbe-Alexander</surname><given-names>T</given-names></name> <etal/></person-group>. <article-title>Impact of obstructive sleep apnea on cardiometabolic health in a random sample of older adults in rural South Africa: building the case for the treatment of sleep disorders in underresourced settings</article-title>. <source>J Clin Sleep Med</source>. (<year>2021</year>) <volume>17</volume>:<fpage>1423</fpage>&#x2013;<lpage>34</lpage>. doi: <pub-id pub-id-type="doi">10.5664/jcsm.9214</pub-id></mixed-citation></ref>
<ref id="ref27"><label>27.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Reeder</surname><given-names>K</given-names></name> <name><surname>Lee</surname><given-names>H</given-names></name></person-group>. <article-title>Impact of artificial intelligence on US medical students' choice of radiology</article-title>. <source>Clin Imaging</source>. (<year>2022</year>) <volume>81</volume>:<fpage>67</fpage>&#x2013;<lpage>71</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.clinimag.2021.09.018</pub-id>, <pub-id pub-id-type="pmid">34619566</pub-id></mixed-citation></ref>
<ref id="ref28"><label>28.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tarabanis</surname><given-names>C</given-names></name> <name><surname>Zahid</surname><given-names>S</given-names></name> <name><surname>Mamalis</surname><given-names>M</given-names></name> <name><surname>Zhang</surname><given-names>K</given-names></name> <name><surname>Kalampokis</surname><given-names>E</given-names></name> <name><surname>Jankelson</surname><given-names>L</given-names></name></person-group>. <article-title>Performance of publicly available large language models on internal medicine board-style questions</article-title>. <source>PLOS Digit Health</source>. (<year>2024</year>) <volume>3</volume>:<fpage>e0000604</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pdig.0000604</pub-id>, <pub-id pub-id-type="pmid">39288137</pub-id></mixed-citation></ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1531502/overview">Hongxiao Li</ext-link>, Chinese Academy of Medical Sciences and Peking Union Medical College, China</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1578606/overview">Tzu-Chi Wu</ext-link>, National Chung Hsing University, Taiwan</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1995816/overview">Hui Zong</ext-link>, West China Hospital, Sichuan University, China</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2781415/overview">Julian Madrid</ext-link>, Ortenau Klinikum, Germany</p>
</fn>
</fn-group>
</back>
</article>