<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="brief-report" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Med.</journal-id>
<journal-title>Frontiers in Medicine</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Med.</abbrev-journal-title>
<issn pub-type="epub">2296-858X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmed.2024.1380148</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Medicine</subject>
<subj-group>
<subject>Brief Research Report</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Evaluation of large language models as a diagnostic aid for complex medical cases</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>R&#x00ED;os-Hoyo</surname> <given-names>Alejandro</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn0001"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2213732/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Shan</surname> <given-names>Naing Lin</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn0001"><sup>&#x2020;</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Li</surname> <given-names>Anran</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Pearson</surname> <given-names>Alexander T.</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Pusztai</surname> <given-names>Lajos</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/836970/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Howard</surname> <given-names>Frederick M.</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x002A;</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Yale Cancer Center, Yale School of Medicine</institution>, <addr-line>New Haven, CT</addr-line>, <country>United States</country></aff>
<aff id="aff2"><sup>2</sup><institution>Department of Medicine, University of Chicago</institution>, <addr-line>Chicago, IL</addr-line>, <country>United States</country></aff>
<author-notes>
<fn fn-type="edited-by" id="fn0002">
<p>Edited by: Thomas F. Heston, University of Washington, United States</p>
</fn>
<fn fn-type="edited-by" id="fn0003">
<p>Reviewed by: Abdallah Al-Ani, King Hussein Cancer Center, Jordan</p>
<p>Xia Jing, Clemson University, United States</p>
</fn>
<corresp id="c001">&#x002A;Correspondence: Lajos Pusztai, <email>lajos.pusztai@yale.edu</email></corresp>
<corresp id="c002">Frederick M. Howard, <email>frederick.howard@uchospitals.edu</email></corresp>
<fn fn-type="equal" id="fn0001">
<p><sup>&#x2020;</sup>These authors share first authorship</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>20</day>
<month>06</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>11</volume>
<elocation-id>1380148</elocation-id>
<history>
<date date-type="received">
<day>01</day>
<month>02</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>10</day>
<month>06</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2024 R&#x00ED;os-Hoyo, Shan, Li, Pearson, Pusztai and Howard.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>R&#x00ED;os-Hoyo, Shan, Li, Pearson, Pusztai and Howard</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec id="sec1">
<title>Background</title>
<p>The use of large language models (LLM) has recently gained popularity in diverse areas, including answering questions posted by patients as well as medical professionals.</p>
</sec>
<sec id="sec2">
<title>Objective</title>
<p>To evaluate the performance and limitations of LLMs in providing the correct diagnosis for a complex clinical case.</p>
</sec>
<sec id="sec3">
<title>Design</title>
<p>Seventy-five consecutive clinical cases were selected from the Massachusetts General Hospital Case Records, and differential diagnoses were generated by OpenAI&#x2019;s GPT3.5 and 4 models.</p>
</sec>
<sec id="sec4">
<title>Results</title>
<p>The mean number of diagnoses provided by the Massachusetts General Hospital case discussants was 16.77, by GPT3.5 was 30, and by GPT4 was 15.45 (<italic>p</italic>&#x2009;&#x003C;&#x2009;0.0001). GPT4 was more frequently able to list the correct diagnosis as first (22% versus 20% with GPT3.5, <italic>p</italic>&#x2009;=&#x2009;0.86) and to provide the correct diagnosis among the top three generated diagnoses (42% versus 24%, <italic>p</italic>&#x2009;=&#x2009;0.075). GPT4 was better at providing the correct diagnosis when the different diagnoses were classified into groups according to the medical specialty, and at including the correct diagnosis at any point in the differential list (68% versus 48%, <italic>p</italic>&#x2009;=&#x2009;0.0063). GPT4 provided a differential list that was more similar to the list provided by the case discussants than GPT3.5 (Jaccard Similarity Index 0.22 versus 0.12, <italic>p</italic>&#x2009;=&#x2009;0.001). Inclusion of the correct diagnosis in the generated differential was correlated with PubMed articles matching the diagnosis (OR 1.40, 95% CI 1.25&#x2013;1.56 for GPT3.5, OR 1.25, 95% CI 1.13&#x2013;1.40 for GPT4), but not with disease incidence.</p>
</sec>
<sec id="sec5">
<title>Conclusions and relevance</title>
<p>The GPT4 model was able to generate a differential diagnosis list with the correct diagnosis in approximately two thirds of cases, but the most likely diagnosis was often incorrect for both models. In its current state, this tool can at most be used as an aid to expand on potential diagnostic considerations for a case, and future LLMs should be trained to account for the discrepancy between disease incidence and availability in the literature.</p>
</sec>
</abstract>
<kwd-group>
<kwd>large language model (LLM)</kwd>
<kwd>ChatGPT</kwd>
<kwd>complex clinical cases</kwd>
<kwd>diagnosis</kwd>
<kwd>clinical case solving</kwd>
</kwd-group>
<counts>
<fig-count count="2"/>
<table-count count="2"/>
<equation-count count="0"/>
<ref-count count="20"/>
<page-count count="6"/>
<word-count count="4137"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Precision Medicine</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec6">
<label>1</label>
<title>Introduction</title>
<p>Large language models (LLMs) are complex, neural network-based models trained on vast amounts of text to accurately interpret human language. LLMs have been applied to a wide range of tasks within medical science, including simplifying radiology reports, accurately responding to questions posted by patients on an internet forum, generating realistic medical abstracts, and predicting in-hospital mortality (<xref ref-type="bibr" rid="ref1 ref2 ref3 ref4">1&#x2013;4</xref>). Although LLMs have shown passable accuracy in answering medical licensing exam questions in numerous studies (<xref ref-type="bibr" rid="ref1 ref2 ref3 ref4 ref5">1&#x2013;5</xref>), it is unclear if this performance can be leveraged to serve as a decision aid in real clinical practice, where cases have nuance beyond that of standardized testing. Given the widespread uptake of LLMs, they have been proposed as a diagnostic decision aid for students, and are likely in use despite the limited knowledge about specific model performance (<xref ref-type="bibr" rid="ref6">6</xref>). ChatGPT (Generative Pre-trained Transformer) is a natural language processing model that became publicly available in November 2022; it provides outputs in response to inputs or prompts, learning its skills from internet data.</p>
<p>Different versions of GPT are currently available: GPT3.5 is a chatbot based on the GPT3.5 model, whereas the GPT4 foundation features an approximately 1,000-fold increase in model parameters and an expanded context window length, resulting in an enhanced capability of solving complex tasks (<xref ref-type="bibr" rid="ref7 ref8 ref9">7&#x2013;9</xref>). GPT can be used to write computer code, analyze text, draft documents, and create conversational agents, and has been shown to proficiently answer different standardized tests (<xref ref-type="bibr" rid="ref7">7</xref>, <xref ref-type="bibr" rid="ref10">10</xref>); it has considerable semantic medical knowledge and has been shown to be capable of medical reasoning (<xref ref-type="bibr" rid="ref10">10</xref>). This has been reflected by its capabilities in answering medical questions (<xref ref-type="bibr" rid="ref11">11</xref>), simplifying radiology reports, and performing well at medical licensing exams, among others (<xref ref-type="bibr" rid="ref1 ref2 ref3 ref4">1&#x2013;4</xref>). It is currently considered an attractive tool in diverse settings of medicine; however, these LLMs could potentially contribute to misinformation and exacerbate scientific misconduct in the setting of a lack of accountability and transparency.</p>
<p>This study aimed to characterize the performance and consistency of LLMs in diagnosing a series of challenging case records published from a single institution. In this study, we evaluated OpenAI&#x2019;s GPT-3.5 and GPT-4 models to establish a baseline for models trained on general (as opposed to medical-specific) literature, as well as to identify patterns in misdiagnosis to inform fine-tuning of diagnostic decision aids. We used cases from the Massachusetts General Hospital Case Records, which have been published since 1923 in the New England Journal of Medicine. These cases have been used as teaching tools illustrating different clinical cases, and the workup of the differential diagnosis of frequently uncommon diseases or uncommon disease presentations (<xref ref-type="bibr" rid="ref12">12</xref>). We introduced the case presentation of these clinical cases and asked GPT to provide a list of the most likely differential diagnoses.</p>
</sec>
<sec sec-type="methods" id="sec7">
<label>2</label>
<title>Methods</title>
<p>Seventy-five sequential clinical cases were retrieved from the case records of the Massachusetts General Hospital, published in the New England Journal of Medicine, from January 2022 to November 2023 (<xref ref-type="bibr" rid="ref12">12</xref>). This period was selected to ensure cases did not overlap with the training data for the LLMs. The case presentation was truncated prior to the discussant&#x2019;s review of the differential diagnosis, and text referencing figures or tables was removed. A uniform prompt requesting a differential diagnosis for the case presentation text was provided to OpenAI&#x2019;s GPT-3.5 (gpt-3.5-turbo) and GPT-4 (gpt-4) models. First, three prompts were tested on a subset of 10 cases for four replicates each. The prompts included (1) &#x2018;<italic>please read the following case, and provide a differential diagnosis for the underlying cause of this presentation</italic>&#x2019;; (2) as per (1) with the modification &#x2018;&#x2026;<italic>provide a thorough and specific list of differential diagnosis&#x2026;</italic>&#x2019;; and (3) as per (2) with the additional sentence <italic>&#x2018;please list the diagnosis that most explains all the features of the presentation first, and include rare diagnoses if they are the best explanation for the presentation</italic>.&#x2019; All prompts yielded similar lists, but the prompt (3) yielded diagnosis lists that most frequently listed the correct diagnosis first, and was chosen for all subsequent analysis. All clinical cases were queried with this prompt, with four replicates performed for each model (<xref rid="SM1" ref-type="supplementary-material">Supplementary Table 1</xref>).</p>
<p>The rank order of the correct diagnosis within the differential diagnosis list was established by consensus of study authors. The overlap between the full list of differential diagnoses provided by GPT and by the case discussant was similarly compared. Finally, accuracy of LLMs was correlated with disease incidence (estimated from literature review of PubMed as well as <ext-link xlink:href="http://cdc.gov" ext-link-type="uri">cdc.gov</ext-link> with references listed in <xref rid="SM1" ref-type="supplementary-material">Supplementary Table 1</xref>, as indexed by Google both with the search term &#x2018;diagnosis&#x2019; incidence), with rare diseases without estimable incidence such as those only described in case reports assigned an incidence of 0.1 per 100,000, as well as representation of the diagnosis in medical literature as assessed by article count returned when searching for the diagnosis (or simplified surrogate term, as listed in <xref rid="SM1" ref-type="supplementary-material">Supplementary Table 1</xref>) in PubMed (conducted with an article cutoff of April 21st, 2023).</p>
<sec id="sec8">
<label>2.1</label>
<title>Statistical analysis</title>
<p>A Mann&#x2013;Whitney U test was used to compare the number of diagnoses provided by case discussants and GPT models. A Fisher&#x2019;s exact test was used to compare whether the first diagnosis was the correct diagnosis, whether the correct diagnosis was among the top three diagnoses, and whether the correct diagnosis was in the list of differential diagnoses from GPT3.5 and 4. To assess whether GPT was able to provide the correct diagnosis among different medical specialties, five groups were designated [Group 1: neurology and psychiatry; group 2: oncology and hematology; group 3: infectious diseases, internal medicine, endocrinology and toxicology; group 4: rheumatology, allergy and autoimmune diseases; group 5: others (cardiology, gastroenterology, genetic diseases, dermatology, nephrology and pediatrics)]. A Fisher&#x2019;s exact test was used to compare results between GPT 3.5 and 4. A multivariable logistic regression model was used to determine the association between disease incidence and PubMed article count with these same three performance metrics. To assess the similarity between the differential diagnosis lists, the Jaccard similarity index was used (ranging from 0 to 1, 0 reflects no similarity, whereas 1 reflects a complete similarity between the analyzed sets), utilizing each case entry repeat; to test differences between GPT 3.5 and 4, a Mann&#x2013;Whitney test was performed. To assess reproducibility across iterations of each model, intraclass correlation coefficients (ICC) were calculated using the two-way mixed effects, absolute agreement, multiple raters/measurements formulation (<xref ref-type="bibr" rid="ref13">13</xref>); values of &#x003C;&#x2009;0.5 and &#x003E;&#x2009;0.9 reflect poor and excellent reliability, respectively. Statistical analyses and graphs were performed using GraphPad Prism 9.0 (GraphPad Software, Inc., San Diego, CA) and Python version 3.7.5 (Python Software Foundation) using statsmodels 0.13.2.</p>
</sec>
</sec>
<sec sec-type="results" id="sec9">
<label>3</label>
<title>Results</title>
<sec id="sec10">
<label>3.1</label>
<title>Accuracy of GPT models in complex diagnostic challenges</title>
<p>Seventy-five cases from the Massachusetts General Hospital Case Records were introduced to the two GPT models. Compared to the case discussants, who provided a mean of 16.77 [interquartile range (IQR) (representing the distance between the first and the third quartile) 12] diagnoses, GPT4 produced a similar number (mean 15.45, IQR 11, <italic>p</italic>&#x2009;=&#x2009;0.302) of unique diagnoses over four replicates, whereas GPT3.5 listed significantly more diagnoses (mean 30, IQR 10, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.0001). GPT4 included the correct diagnosis in its differential list in two thirds (68%) of cases, with the correct diagnosis included in the top 3 items in the differential in 42% of cases. In contrast, GPT3.5 included the correct diagnosis in its differential list in half (48%, <italic>p</italic>&#x2009;=&#x2009;0.006) of the cases, and the correct diagnosis was included in the top three differential diagnoses in 29% (<italic>p</italic>&#x2009;=&#x2009;0.075) of the cases; thus, GPT4 outperformed GPT3.5 in both metrics (<xref ref-type="fig" rid="fig1">Figure 1</xref>). GPT4 was able to formulate more specific answers that better depicted the true diagnosis in many cases. For example, in Case 6&#x2013;2022 (Immune checkpoint inhibitor-induced diabetes), GPT3.5 was only able to vaguely link the presentation to immunotherapy - &#x201C;Side effects of cancer treatment: The patient&#x2019;s symptoms could be side effects of cancer treatment such as pembrolizumab&#x2026;&#x201D; - whereas GPT4 concisely answered &#x201C;Pembrolizumab-induced diabetes mellitus.&#x201D;</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Performance of GPT3.5 and GPT4 in providing <bold>(A)</bold> the first diagnosis as the correct diagnosis, <bold>(B)</bold> the correct diagnosis among the top three diagnoses, and <bold>(C)</bold> the correct diagnosis among the entire list of diagnoses.</p>
</caption>
<graphic xlink:href="fmed-11-1380148-g001.tif"/>
</fig>
</sec>
<sec id="sec11">
<label>3.2</label>
<title>Consistency of GPT model diagnostic lists</title>
<p>As the results of GPT models may differ across repetitions, it is important to understand how the prioritization of diagnoses might change if these tools are clinically implemented. Ranking of the correct diagnosis within a differential was more consistent across repetitions for GPT4 (ICC 0.65, 95% CI 0.42&#x2013;0.80) than with GPT3.5 (ICC 0.37, 95% CI &#x2212;0.25 to 0.71). The differential diagnosis list generated by GPT4 also had greater overlap with the discussant&#x2019;s list (Jaccard Similarity Index 0.22, IQR 0.12) than GPT3.5 (0.13, IQR 0.076, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.0001, <xref ref-type="fig" rid="fig2">Figure 2</xref>) &#x2013; although overlap was fair at best.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Jaccard Similarity Index indicating the overlap between GPT3.5/GPT4 and the differential provided by the case discussant.</p>
</caption>
<graphic xlink:href="fmed-11-1380148-g002.tif"/>
</fig>
</sec>
<sec id="sec12">
<label>3.3</label>
<title>Associations of model accuracy with medical specialty and disease incidence</title>
<p>Each case was classified into medical specialty groups (<italic>n</italic>&#x2009;=&#x2009;5); among these groups, GPT4 was numerically and statistically superior to GPT3.5 in all categories except in the Rheumatology, Allergy, and Autoimmune Diseases category (<xref ref-type="table" rid="tab1">Table 1</xref>). We also assessed whether model accuracy was dependent on disease incidence or representation in the literature. PubMed article count for the correct diagnosis was associated with a greater likelihood that the diagnosis would be included in the differential generated by GPT3.5 (Odds Ratio (OR) 1.40, 95% CI 1.25&#x2013;1.56, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001) and GPT4 (OR 1.25, 95% CI 1.13&#x2013;1.40, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001). Similar trends were seen for likelihood of a diagnosis being listed first or within the top 3 generated diagnoses (<xref ref-type="table" rid="tab2">Table 2</xref>). Conversely, disease incidence had either a neutral or negative effect on the likelihood of a diagnosis being listed by either model.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Performance of GPT 3.5 and 4 in providing the correct diagnosis, according to medical specialty.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th/>
<th align="center" valign="top">GPT 3.5 (%)</th>
<th align="center" valign="top">GPT 4 (%)</th>
<th align="center" valign="top">OR (95% CI)</th>
<th align="center" valign="top"><italic>p</italic>-value</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Group 1 (<italic>n</italic>&#x2009;=&#x2009;9)</td>
<td align="center" valign="top">41</td>
<td align="center" valign="top">72</td>
<td align="center" valign="top">5.2 (1.94&#x2013;14.23)</td>
<td align="center" valign="top">0.0019</td>
</tr>
<tr>
<td align="left" valign="top">Group 2 (<italic>n</italic>&#x2009;=&#x2009;24)</td>
<td align="center" valign="top">60</td>
<td align="center" valign="top">83</td>
<td align="center" valign="top">5.6 (2.95&#x2013;10.73)</td>
<td align="center" valign="top">&#x003C;0.0001</td>
</tr>
<tr>
<td align="left" valign="top">Group 3 (<italic>n</italic>&#x2009;=&#x2009;19)</td>
<td align="center" valign="top">23</td>
<td align="center" valign="top">53</td>
<td align="center" valign="top">4.92 (2.39&#x2013;9.77)</td>
<td align="center" valign="top">&#x003C;0.0001</td>
</tr>
<tr>
<td align="left" valign="top">Group 4 (<italic>n</italic>&#x2009;=&#x2009;13)</td>
<td align="center" valign="top">64</td>
<td align="center" valign="top">60</td>
<td align="center" valign="top">1.36 (0.62&#x2013;3.04)</td>
<td align="center" valign="top">0.55</td>
</tr>
<tr>
<td align="left" valign="top">Group 5 (<italic>n</italic>&#x2009;=&#x2009;10)</td>
<td align="center" valign="top">50</td>
<td align="center" valign="top">65</td>
<td align="center" valign="top">2.78 (1.10&#x2013;6.86)</td>
<td align="center" valign="top">0.043</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Odds ratios [OR] comparing GPT 4 vs. 3.5. Group 1: Neurology and Psychiatry, Group 2: Oncology and Hematology, Group 3: Infectious Diseases, Internal Medicine, Endocrinology and Toxicology, Group 4: Rheumatology, Allergy and Autoimmune Diseases, Group 5: Others (Cardiology, Genetic Diseases, Gastroenterology, Dermatology, Nephrology and Pediatrics).</p>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Performance of GPT 3.5 and 4 in providing the correct diagnosis, according to disease incidence and PubMed articles covering the disease.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th rowspan="2"/>
<th align="center" valign="top" colspan="2">Top diagnosis correct</th>
<th align="center" valign="top" colspan="2">Correct diagnosis in top 3</th>
<th align="center" valign="top" colspan="2">Correct diagnosis in differential</th>
</tr>
<tr>
<th align="center" valign="top">OR (95% CI)</th>
<th align="center" valign="top"><italic>p</italic>-value</th>
<th align="center" valign="top">OR (95% CI)</th>
<th align="center" valign="top"><italic>p</italic>-value</th>
<th align="center" valign="top">OR (95% CI)</th>
<th align="center" valign="top"><italic>p</italic>-value</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top" colspan="7">GPT 3.5</td>
</tr>
<tr>
<td align="left" valign="top">Incidence<break/>(per 10-fold increase)</td>
<td align="center" valign="top">0.80 (0.67&#x2013;0.95)</td>
<td align="center" valign="top">0.01</td>
<td align="center" valign="top">0.74 (0.64&#x2013;0.87)</td>
<td align="center" valign="top">&#x003C; 0.001</td>
<td align="center" valign="top">0.82 (0.74&#x2013;0.92)</td>
<td align="center" valign="top">&#x003C; 0.001</td>
</tr>
<tr>
<td align="left" valign="top">PubMed Articles<break/>(per 10-fold increase)</td>
<td align="center" valign="top">1.32 (1.12&#x2013;1.56)</td>
<td align="center" valign="top">0.001</td>
<td align="center" valign="top">1.42 (1.23&#x2013;1.64)</td>
<td align="center" valign="top">&#x003C; 0.001</td>
<td align="center" valign="top">1.40 (1.25&#x2013;1.56)</td>
<td align="center" valign="top">&#x003C; 0.001</td>
</tr>
<tr>
<td align="left" valign="top" colspan="7">GPT 4</td>
</tr>
<tr>
<td align="left" valign="top">Incidence<break/>(per 10-fold increase)</td>
<td align="center" valign="top">0.90 (0.80&#x2013;1.02)</td>
<td align="center" valign="top">0.108</td>
<td align="center" valign="top">0.90 (0.81&#x2013;0.99)</td>
<td align="center" valign="top">0.036</td>
<td align="center" valign="top">0.90 (0.82&#x2013;0.99)</td>
<td align="center" valign="top">0.033</td>
</tr>
<tr>
<td align="left" valign="top">PubMed Articles<break/>(per 10-fold increase)</td>
<td align="center" valign="top">1.15 (1.01&#x2013;1.30)</td>
<td align="center" valign="top">0.03</td>
<td align="center" valign="top">1.16 (1.04&#x2013;1.28)</td>
<td align="center" valign="top">0.005</td>
<td align="center" valign="top">1.26 (1.13&#x2013;1.40)</td>
<td align="center" valign="top">&#x003C; 0.001</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Odds ratios [OR] listed for a multivariable logistic regression including both incidence and article count.</p>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec sec-type="discussion" id="sec13">
<label>4</label>
<title>Discussion</title>
<p>We have demonstrated here a comprehensive characterization of the accuracy and reproducibility of two GPT models in solving complex clinical case scenarios. Whereas high accuracy was seen when evaluating GPT-3 in diagnosing common presentations such as upper respiratory tract infections (<xref ref-type="bibr" rid="ref14">14</xref>), we found that in approximately one third of cases the best model failed to identify the correct diagnosis in complex cases. Thus, although current GPT models are insufficient to replace physician expertise, they may have some clinical utility as a diagnostic checklist (<xref ref-type="bibr" rid="ref15">15</xref>) to reduce error when physicians are presented with a puzzling clinical scenario.</p>
<p>It is worth noting that although GPT3.5 was able to provide a longer list of differential diagnoses, these did not present a better concordance with the Massachusetts General Hospital case discussants&#x2019; diagnoses. Furthermore, GPT4 was not only better at providing the first diagnosis as the correct diagnosis, but it also outperformed GPT3.5 in providing the correct diagnosis among the differential diagnosis lists.</p>
<p>A similar study by Kanjee and colleagues (<xref ref-type="bibr" rid="ref16">16</xref>) used GPT and cases from the Massachusetts General Hospital case records to assess whether the model&#x2019;s diagnoses matched the final case diagnosis. Their results found an agreement between GPT4&#x2019;s top diagnosis and the final diagnosis in 39% of the cases, and in 64% of the cases the final diagnosis was included in the differential diagnosis list. These results contrast with ours, since we found that GPT4 was able to provide the correct diagnosis as the first answer in 22% of the cases, whereas it provided the correct diagnosis within the differential diagnosis list in 68% of the cases. In addition, Kanjee&#x2019;s study found that GPT4 provided a mean of 9 differential diagnoses; similarly, our study found a mean of 9.23 diagnoses.</p>
<p>Another study, using a different, medicine-specific large language model called Med-PaLM, found that the model was able to provide accurate answers to different questions posed in multiple-choice and long-form settings. Med-PaLM was superior in solving medical questions when compared to MultiMedQA (6 sets of open data that include questions similar to those of the United States Medical Licensing Examination (USMLE)), and HealthSearchQA (related to common consumer health-related questions). Med-PaLM was able to accurately answer different formats of questions, such as multiple choice and long form. In a second part of the study, clinicians from different countries were asked to solve 140 medical questions in long-form answers; the same task was performed by Med-PaLM. The answers were assessed by clinicians with specialties in different medical fields. The answers provided by the LLM overall presented outstanding results; however, Med-PaLM&#x2019;s answers presented a higher amount of incorrect information, which most of the time was clinically significant (<xref ref-type="bibr" rid="ref11">11</xref>).</p>
<p>When formulating a differential diagnosis, disease incidence as well as the severity/consequences of missed diagnosis are often considered (<xref ref-type="bibr" rid="ref17">17</xref>). However, some common diseases are underrepresented in the literature, whereas some rare conditions are given particular emphasis in medical literature and educational materials. In an attempt to refine medical-domain performance, several models have been trained specifically on PubMed, which may be subject to this same bias (<xref ref-type="bibr" rid="ref18">18</xref>). As LLMs are refined as diagnostic decision aids, strategies to align output with true disease prevalence are needed.</p>
</sec>
<sec id="sec14">
<label>5</label>
<title>Limitations</title>
<p>One of the limitations of this study was the lack of publicly available diagnostic challenges with curated differential diagnosis lists, resulting in our use of a single source of cases which was only modest in size. The small sample size may lead to lower accuracy in precisely quantifying the difference in performance between the GPT models tested. Additionally, the Massachusetts General Hospital Case Records present complex cases that may not represent the most frequent case presentations &#x2013; which may be more straightforward with higher diagnostic accuracy from AI models.</p>
<p>The GPT models evaluated were trained on data collected on or before September 2021, and thus performance for certain diagnoses with changing epidemiology [such as monkeypox (<xref ref-type="bibr" rid="ref19">19</xref>)] may be underestimated. We chose to evaluate OpenAI&#x2019;s GPT models in this study rather than other LLMs due to their widespread uptake (<xref ref-type="bibr" rid="ref20">20</xref>), as they are most likely to be in current use by physicians and trainees, and as such characterization of their performance is most urgent. Furthermore, we used a single prompt to evaluate model performance in our primary analysis. Although preliminary analysis suggested that performance was similar across prompts, it is possible that modifications of the prompt may change the relative accuracy of GPT3.5 and 4 models.</p>
<p>Finally, although we found that disease incidence was either not associated or negatively associated with model accuracy, incidence is difficult to establish and these estimates represent our best efforts to define incidence through literature review. Incidence can vary widely depending on the population studied and across geographic regions, and these results may differ with alternate approaches to estimate incidence.</p>
</sec>
<sec sec-type="conclusions" id="sec15">
<label>6</label>
<title>Conclusion</title>
<p>In this study we demonstrated that OpenAI&#x2019;s GPT-4 model outperformed GPT-3.5 in correctly diagnosing challenging clinical cases, but misdiagnosis was common, and at best such models might be used as decision aids in their current state. In training LLMs specifically as diagnostic aids, steps should be taken to account for the overrepresentation of some diagnoses in the medical literature. It is important to take into consideration certain aspects of using LLMs in medicine, such as a negative impact on critical thinking, ethical considerations, as well as potentially detrimental consequences for the patient; thus, the use of LLMs in clinical medicine might not be ready for a global integration into clinical workflows.</p>
</sec>
<sec sec-type="data-availability" id="sec16">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/<xref rid="SM1" ref-type="supplementary-material">Supplementary material</xref>; further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec sec-type="author-contributions" id="sec17">
<title>Author contributions</title>
<p>AR-H: Writing &#x2013; review &#x0026; editing, Writing &#x2013; original draft, Visualization, Validation, Supervision, Software, Project administration, Methodology, Investigation, Formal analysis, Data curation, Conceptualization. NS: Writing &#x2013; review &#x0026; editing, Methodology, Investigation, Formal analysis, Data curation. AL: Writing &#x2013; review &#x0026; editing, Software, Methodology, Investigation, Formal analysis. AP: Writing &#x2013; review &#x0026; editing, Software, Methodology, Investigation, Formal analysis, Data curation. LP: Writing &#x2013; review &#x0026; editing, Visualization, Validation, Supervision, Software, Resources, Project administration, Methodology, Investigation, Funding acquisition, Formal analysis, Data curation, Conceptualization. FH: Writing &#x2013; review &#x0026; editing, Writing &#x2013; original draft, Visualization, Validation, Supervision, Software, Resources, Project administration, Methodology, Investigation, Funding acquisition, Formal analysis, Data curation, Conceptualization.</p>
</sec>
</body>
<back>
<sec sec-type="funding-information" id="sec18">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. AR-H is supported by a grant from the Spanish Society of Medical Oncology (SEOM), LP is supported by the Susan Komen Leadership Award, and FH is supported by the NIH/NCI grant K08CA283261 and the Cancer Research Foundation.</p>
</sec>
<sec sec-type="COI-statement" id="sec19">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="sec20">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec21">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/fmed.2024.1380148/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/fmed.2024.1380148/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Table_1.XLSX" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><label>1.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kung</surname> <given-names>TH</given-names></name> <name><surname>Cheatham</surname> <given-names>M</given-names></name> <name><surname>Medenilla</surname> <given-names>A</given-names></name> <name><surname>Sillos</surname> <given-names>C</given-names></name> <name><surname>De Leon</surname> <given-names>L</given-names></name> <name><surname>Elepa&#x00F1;o</surname> <given-names>C</given-names></name> <etal/></person-group>. <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>. <source>PLOS Digital Health</source>. (<year>2023</year>) <volume>2</volume>:<fpage>e0000198</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>, PMID: <pub-id pub-id-type="pmid">36812645</pub-id></citation></ref>
<ref id="ref2"><label>2.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ayers</surname> <given-names>JW</given-names></name> <name><surname>Poliak</surname> <given-names>A</given-names></name> <name><surname>Dredze</surname> <given-names>M</given-names></name> <name><surname>Leas</surname> <given-names>EC</given-names></name> <name><surname>Zhu</surname> <given-names>Z</given-names></name> <name><surname>Kelley</surname> <given-names>JB</given-names></name> <etal/></person-group>. <article-title>Comparing physician and artificial intelligence Chatbot responses to patient questions posted to a public social media forum</article-title>. <source>JAMA Intern Med</source>. (<year>2023</year>) <volume>183</volume>:<fpage>589</fpage>&#x2013;<lpage>96</lpage>. doi: <pub-id pub-id-type="doi">10.1001/jamainternmed.2023.1838</pub-id>, PMID: <pub-id pub-id-type="pmid">37115527</pub-id></citation></ref>
<ref id="ref3"><label>3.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yeo</surname> <given-names>YH</given-names></name> <name><surname>Samaan</surname> <given-names>JS</given-names></name> <name><surname>Ng</surname> <given-names>WH</given-names></name> <name><surname>Ting</surname> <given-names>P-S</given-names></name> <name><surname>Trivedi</surname> <given-names>H</given-names></name> <name><surname>Vipani</surname> <given-names>A</given-names></name> <etal/></person-group>. <article-title>Assessing the performance of ChatGPT in answering questions regarding cirrhosis and hepatocellular carcinoma</article-title>. <source>Clin Mol Hepatol</source>. (<year>2023</year>) <volume>29</volume>:<fpage>721</fpage>&#x2013;<lpage>32</lpage>. doi: <pub-id pub-id-type="doi">10.3350/cmh.2023.0089</pub-id></citation></ref>
<ref id="ref4"><label>4.</label><citation citation-type="book"><person-group person-group-type="author"><name><surname>Zheleiko</surname> <given-names>I.</given-names></name></person-group> <source>Natural language processing in lifelong learning choices: a case of Finland</source>. <publisher-loc>Lappeenranta</publisher-loc>. <publisher-name>Lahti University of Technology LUT</publisher-name> (<year>2023</year>), <fpage>12</fpage>&#x2013;<lpage>26</lpage>.</citation></ref>
<ref id="ref5"><label>5.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gao</surname> <given-names>CA</given-names></name> <name><surname>Howard</surname> <given-names>FM</given-names></name> <name><surname>Markov</surname> <given-names>NS</given-names></name> <name><surname>Dyer</surname> <given-names>EC</given-names></name> <name><surname>Ramesh</surname> <given-names>S</given-names></name> <name><surname>Luo</surname> <given-names>Y</given-names></name> <etal/></person-group>. <article-title>Comparing scientific abstracts generated by ChatGPT to real abstracts with detectors and blinded human reviewers</article-title>. <source>NPJ Digit Med</source>. (<year>2023</year>) <volume>6</volume>:<fpage>75</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41746-023-00819-6</pub-id>, PMID: <pub-id pub-id-type="pmid">37100871</pub-id></citation></ref>
<ref id="ref6"><label>6.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tsang</surname> <given-names>R</given-names></name></person-group>. <article-title>Practical applications of ChatGPT in undergraduate medical education</article-title>. <source>J Med Educat Curri Develop</source>. (<year>2023</year>) <volume>10</volume>:<fpage>238212052311784</fpage>. doi: <pub-id pub-id-type="doi">10.1177/23821205231178449</pub-id>, PMID: <pub-id pub-id-type="pmid">37255525</pub-id></citation></ref>
<ref id="ref7"><label>7.</label><citation citation-type="other"><article-title>Open AI chat GPT</article-title>. Accessed June 11, 2023.</citation></ref>
<ref id="ref8"><label>8.</label><citation citation-type="book"><person-group person-group-type="author"><name><surname>Metz</surname> <given-names>C</given-names></name></person-group>. <source>Open AI Plans to Up the Ante in Tech&#x2019;s A.I. Race</source> <publisher-name>The New York Times</publisher-name>.</citation></ref>
<ref id="ref9"><label>9.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Koubaa</surname> <given-names>A</given-names></name></person-group>. <article-title>A concise showdown</article-title>. <source>TechRxiv</source>. (<year>2023</year>). doi: <pub-id pub-id-type="doi">10.36227/techrxiv.22312330.v1</pub-id></citation></ref>
<ref id="ref10"><label>10.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Clusmann</surname> <given-names>J</given-names></name> <name><surname>Kolbinger</surname> <given-names>FR</given-names></name> <name><surname>Muti</surname> <given-names>HS</given-names></name> <name><surname>Carrero</surname> <given-names>ZI</given-names></name> <name><surname>Eckardt</surname> <given-names>J</given-names></name> <name><surname>Laleh</surname> <given-names>NG</given-names></name> <etal/></person-group>. <article-title>The future landscape of large language models in medicine</article-title>. <source>Commun Med</source>. (<year>2023</year>) <volume>3</volume>:<fpage>141</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s43856-023-00370-1</pub-id>, PMID: <pub-id pub-id-type="pmid">37816837</pub-id></citation></ref>
<ref id="ref11"><label>11.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Singhal</surname> <given-names>K</given-names></name> <name><surname>Azizi</surname> <given-names>S</given-names></name> <name><surname>Tu</surname> <given-names>T</given-names></name> <name><surname>Mahdavi</surname> <given-names>SS</given-names></name> <name><surname>Wei</surname> <given-names>J</given-names></name> <name><surname>Chung</surname> <given-names>HW</given-names></name> <etal/></person-group>. <article-title>Large language models encode clinical knowledge</article-title>. <source>Nature</source>. (<year>2022</year>) <volume>620</volume>:<fpage>1</fpage>&#x2013;<lpage>44</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id></citation></ref>
<ref id="ref12"><label>12.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Harris</surname> <given-names>NL</given-names></name></person-group>. <article-title>Case Records of the Massachusetts General Hospital &#x2014; continuing to learn from the patient</article-title>. <source>N Engl J Med</source>. (<year>2003</year>) <volume>348</volume>:<fpage>2252</fpage>&#x2013;<lpage>4</lpage>. doi: <pub-id pub-id-type="doi">10.1056/NEJMe030079</pub-id>, PMID: <pub-id pub-id-type="pmid">12773655</pub-id></citation></ref>
<ref id="ref13"><label>13.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>McGraw</surname> <given-names>KO</given-names></name> <name><surname>Wong</surname> <given-names>SP</given-names></name></person-group>. <article-title>&#x201C;Forming inferences about some Intraclass correlations coefficients&#x201D;: correction</article-title>. <source>Psychol Methods</source>. (<year>1996</year>) <volume>1</volume>:<fpage>390</fpage>&#x2013;<lpage>06</lpage>. doi: <pub-id pub-id-type="doi">10.1037/1082-989X.1.4.390</pub-id></citation></ref>
<ref id="ref14"><label>14.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hirosawa</surname> <given-names>T</given-names></name> <name><surname>Harada</surname> <given-names>Y</given-names></name> <name><surname>Yokose</surname> <given-names>M</given-names></name> <name><surname>Sakamoto</surname> <given-names>T</given-names></name> <name><surname>Kawamura</surname> <given-names>R</given-names></name> <name><surname>Shimizu</surname> <given-names>T</given-names></name></person-group>. <article-title>Diagnostic accuracy of differential-diagnosis lists generated by generative Pretrained transformer 3 Chatbot for clinical vignettes with common chief complaints: a pilot study</article-title>. <source>Int J Environ Res Public Health</source>. (<year>2023</year>) <volume>20</volume>:<fpage>3378</fpage>. doi: <pub-id pub-id-type="doi">10.3390/ijerph20043378</pub-id>, PMID: <pub-id pub-id-type="pmid">36834073</pub-id></citation></ref>
<ref id="ref15"><label>15.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>K&#x00E4;mmer</surname> <given-names>JE</given-names></name> <name><surname>Schauber</surname> <given-names>SK</given-names></name> <name><surname>Hautz</surname> <given-names>SC</given-names></name> <name><surname>Stroben</surname> <given-names>F</given-names></name> <name><surname>Hautz</surname> <given-names>WE</given-names></name></person-group>. <article-title>Differential diagnosis checklists reduce diagnostic error differentially: a randomised experiment</article-title>. <source>Med Educ</source>. (<year>2021</year>) <volume>55</volume>:<fpage>1172</fpage>&#x2013;<lpage>82</lpage>. doi: <pub-id pub-id-type="doi">10.1111/medu.14596</pub-id>, PMID: <pub-id pub-id-type="pmid">34291481</pub-id></citation></ref>
<ref id="ref16"><label>16.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kanjee</surname> <given-names>Z</given-names></name> <name><surname>Crowe</surname> <given-names>B</given-names></name> <name><surname>Rodman</surname> <given-names>A</given-names></name></person-group>. <article-title>Accuracy of a generative artificial intelligence model in a complex diagnostic challenge</article-title>. <source>JAMA</source>. (<year>2023</year>) <volume>330</volume>:<fpage>78</fpage>&#x2013;<lpage>80</lpage>. doi: <pub-id pub-id-type="doi">10.1001/jama.2023.8288</pub-id>, PMID: <pub-id pub-id-type="pmid">37318797</pub-id></citation></ref>
<ref id="ref17"><label>17.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bond</surname> <given-names>WF</given-names></name> <name><surname>Schwartz</surname> <given-names>LM</given-names></name> <name><surname>Weaver</surname> <given-names>KR</given-names></name> <name><surname>Levick</surname> <given-names>D</given-names></name> <name><surname>Giuliano</surname> <given-names>M</given-names></name> <name><surname>Graber</surname> <given-names>ML</given-names></name></person-group>. <article-title>Differential diagnosis generators: an evaluation of currently available computer programs</article-title>. <source>J Gen Intern Med</source>. (<year>2012</year>) <volume>27</volume>:<fpage>213</fpage>&#x2013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11606-011-1804-8</pub-id>, PMID: <pub-id pub-id-type="pmid">21789717</pub-id></citation></ref>
<ref id="ref18"><label>18.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Luo</surname> <given-names>R</given-names></name> <name><surname>Sun</surname> <given-names>L</given-names></name> <name><surname>Xia</surname> <given-names>Y</given-names></name> <name><surname>Qin</surname> <given-names>T</given-names></name> <name><surname>Zhang</surname> <given-names>S</given-names></name> <name><surname>Poon</surname> <given-names>H</given-names></name> <etal/></person-group>. <article-title>BioGPT: generative pre-trained transformer for biomedical text generation and mining</article-title>. <source>Brief Bioinform</source>. (<year>2022</year>) <volume>23</volume>:<fpage>1</fpage>&#x2013;<lpage>11</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bib/bbac409</pub-id>, PMID: <pub-id pub-id-type="pmid">36156661</pub-id></citation></ref>
<ref id="ref19"><label>19.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Basgoz</surname> <given-names>N</given-names></name> <name><surname>Brown</surname> <given-names>CM</given-names></name> <name><surname>Smole</surname> <given-names>SC</given-names></name> <name><surname>Madoff</surname> <given-names>LC</given-names></name> <name><surname>Biddinger</surname> <given-names>PD</given-names></name> <name><surname>Baugh</surname> <given-names>JJ</given-names></name> <etal/></person-group>. <article-title>Case 24-2022: a 31-year-old man with perianal and penile ulcers, rectal pain, and rash</article-title>. <source>N Engl J Med</source>. (<year>2022</year>) <volume>387</volume>:<fpage>547</fpage>&#x2013;<lpage>56</lpage>. doi: <pub-id pub-id-type="doi">10.1056/NEJMcpc2201244</pub-id>, PMID: <pub-id pub-id-type="pmid">35704401</pub-id></citation></ref>
<ref id="ref20"><label>20.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>Bhaimiya</surname> <given-names>S.</given-names></name></person-group> <source>OpenAI cofounder Elon Musk said the non-profit he helped create is now focused on &#x2018;maximum-profit,&#x2019; which is &#x2018;not what I intended at all&#x2019;. Business Insider</source>. (<year>2023</year>).</citation></ref>
</ref-list>
</back>
</article>