<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="brief-report" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2025.1644093</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Artificial Intelligence</subject>
<subj-group>
<subject>Brief Research Report</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Limitations of broadly trained LLMs in interpreting orthopedic Walch glenoid classifications</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>ElSayed</surname> <given-names>Adam</given-names></name>
<uri xlink:href="https://loop.frontiersin.org/people/2706155/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Updegrove</surname> <given-names>Gary F.</given-names></name>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3164534/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
</contrib>
</contrib-group>
<aff><institution>Penn State Health Milton S. Hershey Medical Center</institution>, <addr-line>Hershey, PA</addr-line>, <country>United States</country></aff>
<author-notes>
<fn fn-type="edited-by" id="fn0001">
<p>Edited by: Herwig Unger, University of Hagen, Germany</p>
</fn>
<fn fn-type="edited-by" id="fn0002">
<p>Reviewed by: Eric Chun-Pu Chu, EC Healthcare, Hong Kong SAR, China</p>
<p>Mingzheng Zhang, Jishou University, China</p>
</fn>
<corresp id="c001">&#x002A;Correspondence: Gary F. Updegrove, <email>gupdegrove@pennstatehealth.psu.edu</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>28</day>
<month>08</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>8</volume>
<elocation-id>1644093</elocation-id>
<history>
<date date-type="received">
<day>09</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>24</day>
<month>07</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2025 ElSayed and Updegrove.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>ElSayed and Updegrove</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Artificial intelligence (AI) integration in medical practice has grown substantially, with physician use nearly doubling from 38% in 2023 to 68% in 2024. Recent advances in large language models (LLMs) include multimodal inputs, showing potential for medical image interpretation and clinical software integrations. This study evaluated the accuracy of two popular LLMs, Claude 3.5 Sonnet and DeepSeek R1, in interpreting glenoid diagrams using Walch glenoid classification in preoperative shoulder reconstruction applications. Test images included seven black-and-white Walch glenoid diagrams from Radiopaedia. LLMs were accessed via Perplexity.ai without specialized medical training. LLMs were tested across multiple conversation threads with prompt instructions of varying length, ranging from 22 to 864 words for DeepSeek and 127 to 840 words for Claude. Performance differed significantly between models. DeepSeek achieved 44% accuracy (7/16), while Claude had 0% accuracy (0/16). DeepSeek showed a mild positive correlation between instruction length and response accuracy. Common errors across both LLMs included misclassifying A2 as either A1 (32%) or B2 (20%). Results highlight limitations in broadly trained LLMs&#x2019; ability to interpret even simplified medical diagrams. DeepSeek&#x2019;s continuous learning feature and open-source dataset integration exhibited superior accuracy, although it was still insufficient for clinical applications. These limitations stem from LLM training data containing primarily text instead of medical images, creating pattern recognition deficiencies when interpreting visual medical information. Despite AI&#x2019;s growing adoption in healthcare, this study concludes that as of February 2025, publicly available broadly trained LLMs lack the consistency and accuracy necessary for reliable medical image interpretation, emphasizing the need for specialized training before clinical implementation.</p>
</abstract>
<kwd-group>
<kwd>Claude 3.5 Sonnet</kwd>
<kwd>orthopaedic surgery</kwd>
<kwd>DeepSeek R1</kwd>
<kwd>Walch glenoid morphology</kwd>
<kwd>large language model</kwd>
<kwd>shoulder osteoarthritis</kwd>
<kwd>Walch glenoid type</kwd>
</kwd-group>
<counts>
<fig-count count="4"/>
<table-count count="0"/>
<equation-count count="0"/>
<ref-count count="21"/>
<page-count count="6"/>
<word-count count="3881"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Medicine and Public Health</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<title>Introduction</title>
<p>Artificial intelligence (AI) is an inevitable evolution to digital workflows, with online large language model (LLM) chatbots, such as ChatGPT, significantly increasing AI&#x2019;s accessibility and usage across fields such as computer science, customer service, and even medicine. AMA&#x2019;s 2025 Augmented Intelligence Research Survey reveals that physicians&#x2019; use of AI in clinical settings has nearly doubled from 38% in 2023 to 68% in 2024 (<xref ref-type="bibr" rid="ref2">AMA, 2025</xref>), which highlights AI&#x2019;s rapidly growing impact on physician workflows. This growth is likely to continue given the rapid evolution of LLMs; March 2023 marked the release of GPT 4.0 and its innovations to multimodal input, which allows LLMs to interpret image and text inputs simultaneously (<xref ref-type="bibr" rid="ref17">Thirunavukarasu et al., 2023</xref>). Since then, ChatGPT competitors, such as Claude 3.5 Sonnet, are proving themselves more accurate than ChatGPT 4.0 at diagnosing acute ischemic stroke from medical imaging (<xref ref-type="bibr" rid="ref7">Koyun and Taskent, 2025</xref>), and most recently, DeepSeek&#x2019;s groundbreaking development of a free-to-use and open-source reasoning model, R1, has opened new possibilities for AI integrations in clinical software (<xref ref-type="bibr" rid="ref16">Temsah et al., 2025</xref>). Newer studies similarly recognize superior diagnostic capabilities from Claude 3.5 Sonnet and DeepSeek R1 compared to ChatGPT 4.0, identifying them both as enhancements to disease classification and clinical decision-making (<xref ref-type="bibr" rid="ref8">Kurokawa et al., 2024</xref>; <xref ref-type="bibr" rid="ref5">Gupta and Pande, 2025</xref>).</p>
<p>Orthopedic surgery and radiology have been particularly invested in LLMs, with ChatGPT 4.0 being regarded as a powerful tool with the potential to improve accuracy, efficiency, cost of care, and patient outcomes due to decreased delays in diagnosis (<xref ref-type="bibr" rid="ref14">Srivastav et al., 2023</xref>). In shoulder surgery, Walch glenoid classification is the most widely used assessment of glenoid morphology and wear in preparation for shoulder reconstruction procedures. This classification was based on axillary shoulder radiographs and axial computed tomography (CT) images to evaluate glenoid erosion patterns in primary glenohumeral osteoarthritis (<xref ref-type="bibr" rid="ref21">Zimmer et al., 2020</xref>).</p>
<p>The Walch glenoid classification has evolved over time, with recent updates, including glenoid types A, B, C, and D. Type A features centered humeral heads with concentric wear and no subluxation, classified as A1 if a line from anterior to posterior glenoid rim does not transect the humeral head, and A2 if it does (<xref ref-type="bibr" rid="ref4">Bercik et al., 2016</xref>; <xref ref-type="bibr" rid="ref3">Barnsley et al., 2025</xref>). Type B involves asymmetric wear with posterior subluxation: B1 has only posterior joint space narrowing and B2 shows biconcave humeral head with posterior rim-erosion and retroversion &#x003C;15&#x00B0;, while B3 exhibits monoconcave humeral head with posterior wear and retroversion &#x003E;15&#x00B0; and/or &#x003E;70% posterior subluxation (<xref ref-type="bibr" rid="ref4">Bercik et al., 2016</xref>; <xref ref-type="bibr" rid="ref3">Barnsley et al., 2025</xref>). Type C presents dysplastic humeral heads with retroversion &#x003E;25&#x00B0; not from erosion (<xref ref-type="bibr" rid="ref4">Bercik et al., 2016</xref>; <xref ref-type="bibr" rid="ref3">Barnsley et al., 2025</xref>). Type D displays anteversion and/or anterior subluxation &#x003C;40% regardless of concavity (<xref ref-type="bibr" rid="ref4">Bercik et al., 2016</xref>; <xref ref-type="bibr" rid="ref3">Barnsley et al., 2025</xref>).</p>
<p>Despite the widespread adoption of the Walch classification system, studies have repeatedly reported inconsistent reliability among orthopedic surgeons (<xref ref-type="bibr" rid="ref13">Schaefer et al., 2024</xref>). While recent research has demonstrated the potential of deep learning models for Samilson&#x2013;Prieto glenohumeral osteoarthritis classification on radiographs (<xref ref-type="bibr" rid="ref9">Magn&#x00E9;li et al., 2024</xref>), there remains a significant gap in evaluating LLMs specifically for glenoid morphology classification. Given the importance of classifying glenoid wear, combined with the lack of existing literature surrounding potential AI use cases, this initial study aims to explore the applications of publicly accessible and broadly trained LLMs in utilizing Walch glenoid classification to distinguish images of glenoid diagrams. Considering the disproportionately low amounts of research on non-ChatGPT LLMs, this study focuses on comparing the accuracy of Claude 3.5 Sonnet and DeepSeek R1 in Walch glenoid classifications, in addition to analyzing common mistakes and correlations between accuracy and prompt wordcount.</p>
</sec>
<sec sec-type="methods" id="sec2">
<title>Methods</title>
<p>The seven images used for this study were obtained from Radiopaedia (<xref ref-type="bibr" rid="ref6">Knipe, 2025</xref>), with each image illustrating a black-and-white diagram corresponding with each Walch type (<xref ref-type="fig" rid="fig1">Figure 1A</xref>). Additional figures were not deemed necessary due to the seven images&#x2019; coverage of all Walch glenoid types, including A1, A2, B1, B2, B3, C, and D. A total of 87.5% of queries (28/32) analyzed accuracy in interpreting glenoid types A1, A2, B1, and B2, reflecting their combined 91% prevalence among primary glenohumeral arthritis cases (<xref ref-type="bibr" rid="ref21">Zimmer et al., 2020</xref>). The AI models utilized in this study, Claude 3.5 Sonnet and DeepSeek R1, were accessed via Perplexity&#x2014;a publicly accessible website where &#x201C;Pro&#x201D; subscription users can switch between popular LLMs such as ChatGPT and Claude and, more recently, DeepSeek R1. LLMs used were default models without specialized medical training.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p><bold>(A)</bold> Radiopaedia&#x2019;s Walch glenoid classification reference image (reproduced with permission from <xref ref-type="bibr" rid="ref6">Knipe (2025)</xref> following approval from <ext-link xlink:href="https://radiopaedia.org/" ext-link-type="uri">https://radiopaedia.org/</ext-link> (ID:202513-3867), licensed under CC-BY-SA). <bold>(B)</bold> Annotated version of Radiopaedia&#x2019;s Walch glenoid classification diagram for AI use.</p>
</caption>
<graphic xlink:href="frai-08-1644093-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Diagram illustrating the Walch classification of shoulder joint morphology. Part A shows different shapes of glenoid wear patterns labeled A1, A2, B1, B2, B3, C, and D. Part B provides detailed labeled diagrams comparing eccentric and concentric glenoid wear patterns, explaining mono-concave and bi-concave formations with annotations on anterior, posterior, and humeral head positions.</alt-text>
</graphic>
</fig>
<p>Various conversation threads were opened with both Claude 3.5 Sonnet and DeepSeek R1, with the initial prompt containing instructions of varying length detailing how to use the Walch glenoid classification to interpret future diagrams. All prompts were uploaded alongside Radiopaedia&#x2019;s Walch glenoid classification with illustrative annotations distinguishing each glenoid&#x2019;s visual characteristics (<xref ref-type="fig" rid="fig1">Figure 1B</xref>), although prompts varied in word count. No additional context was provided for queries aside from the instructions. Examples of prompts can be found in the supplementary materials.</p>
<p>After receiving acknowledgment of the initial instructions, most threads were asked follow-up queries to test their use of prior instructions in classifying an attached screenshot of one of the seven Radiopaedia Walch glenoid diagrams. To better analyze the accuracy of the initial prompt, two threads were prompted to classify a Walch glenoid diagram in the same initial query that contained instructions.</p>
<p>After each LLM provided its response, metrics of each response were collected in a spreadsheet, including each prompt, prompt wordcount, AI responses, LLM name, correct Walch glenoid classification, and AI&#x2019;s Walch glenoid classification. LLM performance was evaluated by comparing the accuracy of the AI&#x2019;s classification with the correct Walch glenoid classification for a given prompt. Prompt wordcount was collected to analyze correlations between the length of prompts and the accuracy of AI outputs.</p>
</sec>
<sec sec-type="results" id="sec3">
<title>Results</title>
<p>A total of seven conversation threads with 16 queries running DeepSeek R1 were compared to a total of 10 conversation threads with 16 queries running Claude 3.5 Sonnet. These totals were adjusted to remove any AI responses that were neither correct nor incorrect (5 for DeepSeek R1 and 18 for Claude 3.5 Sonnet) because they were acknowledgments of instructions.</p>
<p>In total, DeepSeek R1 saw 44% total accuracy (7/16) compared to Claude Sonnet&#x2019;s 0% (0/16) total accuracy (<xref ref-type="fig" rid="fig2">Figure 2</xref>). For DeepSeek R1, the number of follow-up queries ranged from 0 to 6, while Claude 3.5 Sonnet ranged from 1 to 3. Threads with 0 follow-ups represent two conversation threads where initial instructions were combined with a diagram to be interpreted. No other queries combined initial instructions with a Walch glenoid classification test.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Total accurate responses/total responses per LLM.</p>
</caption>
<graphic xlink:href="frai-08-1644093-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Bar chart titled "Total Query Accuracy per LLM" compares accuracy of Claude 3.5 Sonnet and DeepSeek R1. Claude 3.5 Sonnet shows 0% correct and 100% incorrect for 16 queries. DeepSeek R1 shows 44% correct and 56% incorrect for 16 queries.</alt-text>
</graphic>
</fig>
<p>Additionally, a relationship was observed between the word count of a conversation thread&#x2019;s initial prompt and the accuracy of the AI&#x2019;s future responses within that thread (<xref ref-type="fig" rid="fig3">Figure 3</xref>). Initial instruction word counts ranged from 22 to 864 for DeepSeek R1 and from 127 to 840 for Claude 3.5 Sonnet. DeepSeek R1 demonstrated a mildly positive correlation, with the most accurate thread (50%) having the second-highest word count (840). Claude 3.5 Sonnet yielded no correct responses regardless of word count.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Percentage of accurate responses/wordcount of initial prompt per LLM.</p>
</caption>
<graphic xlink:href="frai-08-1644093-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Scatter plot showing thread accuracy percentage against instruction word count. DeepSeek R1 is represented by blue dots, and Claude 3.5 Sonnet by orange dots. A dotted line indicates a linear trend line for DeepSeek R1, with accuracy showing an upward trend as word count increases from 0 to 900.</alt-text>
</graphic>
</fig>
<p>Finally, incorrect answers were analyzed to observe the most frequent mistakes made by each LLM (<xref ref-type="fig" rid="fig4">Figure 4</xref>). In total, the most common mistake across both LLMs (32%) was classifying A2 as A1, with 20% of mistakes being from classifying A2 as B2. DeepSeek R1&#x2019;s most common mistake (22%) was classifying A2 as B2, with all other mistakes tied at 11% frequency. Claude 3.5 Sonnet&#x2019;s most common mistake (63%) was classifying A2 as A1, followed by classifying A2 as B2 (19%) or B3 (13%), respectively.</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Percentage of specific incorrect input&#x2013;output combinations/total incorrect responses, per LLM.</p>
</caption>
<graphic xlink:href="frai-08-1644093-g004.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Bar chart titled "Frequency of Mistakes per LLM." It shows mistake frequencies across various classification pairs. Mistakes are categorized into blue for Total, orange for DeepSeek R1, and green for Claude 3.5 Sonnet. A2-A1 has the highest frequency, especially in green. Other pairs show varying lower frequencies, with consistent appearances in the blue and orange categories.</alt-text>
</graphic>
</fig>
</sec>
<sec sec-type="discussion" id="sec4">
<title>Discussion</title>
<p>This study was originally supposed to observe the accuracy of publicly available LLMs in using Walch glenoid classification to classify deidentified computed tomography (CT) glenoid images. However, the AI&#x2019;s inconsistencies and inaccuracies with Radiopaedia&#x2019;s straightforward and high-contrast classification diagrams (<xref ref-type="bibr" rid="ref6">Knipe, 2025</xref>) proved that moderate image interpretation skills, such as consistent and accurate identification of curves and spaces, are beyond the capabilities of today&#x2019;s publicly accessible, broadly trained LLMs. Thus, given their shortcomings in these simple tasks, we concluded neither Claude 3.5 Sonnet nor DeepSeek R1 can reliably reproduce accurate analysis of diagrams, let alone real clinical imaging such as CT scans, in the context of Walch glenoid classifications.</p>
<p>Mid-January 2025 marked the release of DeepSeek R1, which, in this study, demonstrated significantly increased performance compared to Claude 3.5 Sonnet. Analysis of mistake frequencies shown in <xref ref-type="fig" rid="fig4">Figure 4</xref> highlights DeepSeek R1&#x2019;s superior attention to detail in multimodal medical image interpretation, including wear patterns and humeral head positioning, compared to Claude 3.5 Sonnet. While Claude 3.5 Sonnet exhibits a pronounced vulnerability to A2-A1 misclassification (63%), representing the critical distinction between centered and decentered glenoids, DeepSeek R1 maintains relatively uniform error rates (0&#x2013;22%) across all classification pairs. Recent comparative studies reveal that while LLMs, such as Claude 3.5 Sonnet, may excel in language-driven or broad-context tasks, their overconfidence despite incorrect outputs for certain complicated tasks, including multimodal clinical diagnoses, causes disproportionately clustered error patterns (<xref ref-type="bibr" rid="ref15">Suh et al., 2024</xref>) similar to Claude 3.5&#x2019;s error distribution pattern in this study. These findings underscore the importance of evaluating both overall accuracy and the distribution of errors, as models with more consistent performance profiles, such as DeepSeek R1, may offer greater clinical utility and safety (<xref ref-type="bibr" rid="ref1">Ahmed et al., 2025</xref>).</p>
<p>This innovative &#x201C;reasoning&#x201D; model outperforms other LLMs due to its continuous learning feature, entailing ongoing automatic integration of publicly available open-source datasets in its training data (<xref ref-type="bibr" rid="ref16">Temsah et al., 2025</xref>), with potential inclusion of medical illustration data from sources such as Radiopaedia or other Creative Commons licensed repositories. DeepSeek R1&#x2019;s open-source code also uniquely empowers a global community of researchers and developers to collaboratively improve and customize its capabilities for specific use cases, including clinical image interpretation (<xref ref-type="bibr" rid="ref16">Temsah et al., 2025</xref>). Moreover, another study identified DeepSeek R1 as more comprehensive and readable when discussing orthopedic surgical procedures compared to other LLMs (<xref ref-type="bibr" rid="ref19">Zhou et al., 2025</xref>). Although speculative, these characteristics offer plausible explanations for DeepSeek R1&#x2019;s superior results. Nonetheless, DeepSeek&#x2019;s performance is only impressive when compared to other LLMs, with the study concluding that DeepSeek R1&#x2019;s outputs are &#x201C;fair&#x201D; under the DISCERN criteria despite being the best, highlighting the need for improvement and personalization (<xref ref-type="bibr" rid="ref19">Zhou et al., 2025</xref>).</p>
<p>For all LLMs, including Claude 3.5 Sonnet and DeepSeek R1, AI performance is significantly limited by the type of data it is trained on, which directly influences an LLM&#x2019;s ability to recognize patterns and synthesize multimodal information (<xref ref-type="bibr" rid="ref14">Srivastav et al., 2023</xref>). Traditionally, publicly accessible LLMs are broadly trained using text data, such as websites, social media, and books, to predict word sequence patterns to create responses to prompts (<xref ref-type="bibr" rid="ref11">Parillo et al., 2024</xref>); out of these LLMs, none of them have had exposure to medical data, such as patient records, lab data, or medical imaging (<xref ref-type="bibr" rid="ref10">Mesk&#x00F3; and Topol, 2023</xref>). This lack of medical data has been directly correlated with decreased performance in ChatGPT, particularly when exposed to novel image types, conditions, or patient populations (<xref ref-type="bibr" rid="ref14">Srivastav et al., 2023</xref>).</p>
<p>Other studies analyzing broadly trained LLMs also report similar findings. One analysis of ChatGPT-4V&#x2019;s accuracy in answering electrocardiogram multiple-choice questions found the AI particularly weak at reading visual parameters, such as PR intervals (<xref ref-type="bibr" rid="ref20">Zhu et al., 2024</xref>). Another study using a broadly trained GPT-4 model for mammographic interpretation reported a high frequency of hallucinations and concluded that future clinical applications of LLMs require rigorous training and validation to be considered reliable (<xref ref-type="bibr" rid="ref12">Pesapane et al., 2025</xref>).</p>
<p>Risks of untrained AI in medical applications include &#x201C;hallucination&#x201D; responses that are confident despite being incorrect, if not made-up, which can have drastic effects if used to influence patient care (<xref ref-type="bibr" rid="ref10">Mesk&#x00F3; and Topol, 2023</xref>). Broadly trained LLMs are at particular risk of hallucinations due to training data often including misinformation and biases (<xref ref-type="bibr" rid="ref11">Parillo et al., 2024</xref>). Moreover, accuracy is further impaired by AI&#x2019;s lack of access to electronic medical records, which prevents the formulation of case-specific answers (<xref ref-type="bibr" rid="ref11">Parillo et al., 2024</xref>). Considering Walch glenoid classification&#x2019;s role in surgical planning and implant selection, AI misclassifications carry significant risk for impacting patient-survival rates by increasing revision rates, decreasing prosthetic longevity, and decreasing functional outcomes (<xref ref-type="bibr" rid="ref18">Vo et al., 2017</xref>).</p>
<sec id="sec5">
<title>Limitations</title>
<p>This study assessed a limited cohort (<italic>n</italic>&#x202F;=&#x202F;16), which restricts the generalizability of the findings, including the mildly positive correlation between prompt wordcount and DeepSeek&#x2019;s performance. Additionally, there was disproportionate analysis across various glenoid types, with 75% (24/32) of queries testing analysis of type A1 or A2 glenoids. AI training utilized only seven Radiopaedia diagrams and selected peer-reviewed sources, which were all provided simultaneously, potentially increasing hallucination frequency compared to longitudinal exposure to a larger quantity of literature and reference images. Variability in prompt wording and length between trials introduced confounding output differences. Additionally, differences in conversation thread length complicate longitudinal comparisons. Finally, LLM access through Perplexity.ai, instead of their native platforms, risks potential platform-specific performance biases.</p>
<p>Future investigations on orthopedic applications of multimodal AI LLMs should include larger sample sizes, utilization of consistent prompts between conversation threads, and emphasis on clinical applicability by analyzing different LLMs, including models pre-trained on clinical datasets that include annotated radiographs and CT scans.</p>
</sec>
</sec>
<sec sec-type="conclusions" id="sec6">
<title>Conclusion</title>
<p>Although AI integrations show promise to benefit both patient care and provider workflows, especially with exponentially evolving capabilities of newer reasoning models, maximizing the potential of LLMs requires extensive database training to provide outputs that are case-specific and medically accurate (<xref ref-type="bibr" rid="ref11">Parillo et al., 2024</xref>). In conclusion, distinguishing the potential of AI in improving healthcare workflows from its current capabilities is extremely important, particularly given its increasing use among providers. In the context of image interpretation, our study demonstrated that publicly available and broadly trained LLMs as of February 2025 did not have the ability to consistently and accurately recognize and interpret Walch glenoid classification diagrams, let alone radiographic images.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec7">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec sec-type="author-contributions" id="sec8">
<title>Author contributions</title>
<p>AE: Conceptualization, Investigation, Validation, Project administration, Writing &#x2013; review &#x0026; editing, Supervision, Funding acquisition, Resources, Software, Formal analysis, Writing &#x2013; original draft, Data curation, Visualization, Methodology. GU: Resources, Validation, Conceptualization, Project administration, Visualization, Investigation, Writing &#x2013; review &#x0026; editing, Funding acquisition, Methodology, Data curation, Software, Formal analysis, Writing &#x2013; original draft, Supervision.</p>
</sec>
<sec sec-type="funding-information" id="sec9">
<title>Funding</title>
<p>The author(s) declare that no financial support was received for the research and/or publication of this article.</p>
</sec>
<ack>
<p>Permission was sought and obtained from <ext-link xlink:href="https://www.Radiopaedia.org" ext-link-type="uri">Radiopaedia.org</ext-link> (ID:202513-3867) during the research process.</p>
</ack>
<sec sec-type="COI-statement" id="sec10">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec11">
<title>Generative AI statement</title>
<p>The authors declare that no Gen AI was used in the creation of this manuscript.</p>
</sec>
<sec sec-type="disclaimer" id="sec12">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec13">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/frai.2025.1644093/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/frai.2025.1644093/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.docx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ahmed</surname> <given-names>S.</given-names></name> <name><surname>Sakib</surname> <given-names>S. K.</given-names></name> <name><surname>Das</surname> <given-names>A. B.</given-names></name></person-group> (<year>2025</year>). <article-title>Can large language models challenge CNNs in medical image analysis?</article-title> <source>arXiv</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2505.23503</pub-id></citation></ref>
<ref id="ref2"><citation citation-type="other"><person-group person-group-type="author"><collab id="coll1">AMA</collab></person-group>. Physician enthusiasm grows for health care AI. American Medical Association. (<year>2025</year>). Available online at: <ext-link xlink:href="https://www.ama-assn.org/press-center/press-releases/ama-physician-enthusiasm-grows-health-care-ai" ext-link-type="uri">https://www.ama-assn.org/press-center/press-releases/ama-physician-enthusiasm-grows-health-care-ai</ext-link>.</citation></ref>
<ref id="ref3"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Barnsley</surname> <given-names>L.</given-names></name> <name><surname>Knipe</surname> <given-names>H.</given-names></name> <name><surname>Rasuli</surname> <given-names>B.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Walch classification of glenoid morphology</article-title>. doi: <pub-id pub-id-type="doi">10.53347/rID-74419</pub-id></citation></ref>
<ref id="ref4"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bercik</surname> <given-names>M. J.</given-names></name> <name><surname>Kruse</surname> <given-names>K.</given-names> <suffix>2nd</suffix></name> <name><surname>Yalizis</surname> <given-names>M.</given-names></name> <name><surname>Gauci</surname> <given-names>M. O.</given-names></name> <name><surname>Chaoui</surname> <given-names>J.</given-names></name> <name><surname>Walch</surname> <given-names>G.</given-names></name></person-group> (<year>2016</year>). <article-title>A modification to the Walch classification of the glenoid in primary glenohumeral osteoarthritis using three-dimensional imaging</article-title>. <source>J. Shoulder Elb. Surg.</source> <volume>25</volume>, <fpage>1601</fpage>&#x2013;<lpage>1606</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jse.2016.03.010</pub-id>, PMID: <pub-id pub-id-type="pmid">27282738</pub-id></citation></ref>
<ref id="ref5"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gupta</surname> <given-names>G. K.</given-names></name> <name><surname>Pande</surname> <given-names>P.</given-names></name></person-group> (<year>2025</year>). <article-title>LLMs in disease diagnosis: a comparative study of DeepSeek-R1 and O3 mini across chronic health conditions</article-title>. <source>arXiv</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2503.10486</pub-id></citation></ref>
<ref id="ref6"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Knipe</surname> <given-names>H.</given-names></name></person-group> (<year>2025</year>). <article-title>Walch classification of glenoid morphology in primary glenohumeral osteoarthritis (illustration)</article-title>. <source>Case study</source>. doi: <pub-id pub-id-type="doi">10.53347/rID-95800</pub-id></citation></ref>
<ref id="ref7"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Koyun</surname> <given-names>M.</given-names></name> <name><surname>Taskent</surname> <given-names>I.</given-names></name></person-group> (<year>2025</year>). <article-title>Evaluation of advanced artificial intelligence algorithms' diagnostic efficacy in acute ischemic stroke: a comparative analysis of ChatGPT-4o and Claude 3.5 Sonnet models</article-title>. <source>J. Clin. Med.</source> <volume>14</volume>:<fpage>571</fpage>. doi: <pub-id pub-id-type="doi">10.3390/jcm14020571</pub-id>, PMID: <pub-id pub-id-type="pmid">39860577</pub-id></citation></ref>
<ref id="ref8"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kurokawa</surname> <given-names>R.</given-names></name> <name><surname>Ohizumi</surname> <given-names>Y.</given-names></name> <name><surname>Kanzawa</surname> <given-names>J.</given-names></name> <name><surname>Kurokawa</surname> <given-names>M.</given-names></name> <name><surname>Sonoda</surname> <given-names>Y.</given-names></name> <name><surname>Nakamura</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Diagnostic performances of Claude 3 Opus and Claude 3.5 Sonnet from patient history and key images in Radiology's "Diagnosis Please" cases</article-title>. <source>Jpn. J. Radiol.</source> <volume>42</volume>, <fpage>1399</fpage>&#x2013;<lpage>1402</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11604-024-01634-z</pub-id>, PMID: <pub-id pub-id-type="pmid">39096483</pub-id></citation></ref>
<ref id="ref9"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Magn&#x00E9;li</surname> <given-names>M.</given-names></name> <name><surname>Axenhus</surname> <given-names>M.</given-names></name> <name><surname>Fagrell</surname> <given-names>J.</given-names></name> <name><surname>Ling</surname> <given-names>P.</given-names></name> <name><surname>Gisl&#x00E9;n</surname> <given-names>J.</given-names></name> <name><surname>Demir</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Artificial intelligence can be used in the identification and classification of shoulder osteoarthritis and avascular necrosis on plain radiographs: a training study of 7,139 radiograph sets</article-title>. <source>Acta Orthop.</source> <volume>95</volume>, <fpage>319</fpage>&#x2013;<lpage>324</lpage>. doi: <pub-id pub-id-type="doi">10.2340/17453674.2024.40905</pub-id>, PMID: <pub-id pub-id-type="pmid">38884536</pub-id></citation></ref>
<ref id="ref10"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mesk&#x00F3;</surname> <given-names>B.</given-names></name> <name><surname>Topol</surname> <given-names>E. J.</given-names></name></person-group> (<year>2023</year>). <article-title>The imperative for regulatory oversight of large language models (or generative AI) in healthcare</article-title>. <source>NPJ Digit. Med.</source> <volume>6</volume>:<fpage>120</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41746-023-00873-0</pub-id>, PMID: <pub-id pub-id-type="pmid">37414860</pub-id></citation></ref>
<ref id="ref11"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Parillo</surname> <given-names>M.</given-names></name> <name><surname>Vaccarino</surname> <given-names>F.</given-names></name> <name><surname>Beomonte Zobel</surname> <given-names>B.</given-names></name> <name><surname>Mallio</surname> <given-names>C. A.</given-names></name></person-group> (<year>2024</year>). <article-title>ChatGPT and radiology report: potential applications and limitations</article-title>. <source>Radiol. Med.</source> <volume>129</volume>, <fpage>1849</fpage>&#x2013;<lpage>1863</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11547-024-01915-7</pub-id>, PMID: <pub-id pub-id-type="pmid">39508933</pub-id></citation></ref>
<ref id="ref12"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pesapane</surname> <given-names>F.</given-names></name> <name><surname>Nicosia</surname> <given-names>L.</given-names></name> <name><surname>Rotili</surname> <given-names>A.</given-names></name> <name><surname>Penco</surname> <given-names>S.</given-names></name> <name><surname>Dominelli</surname> <given-names>V.</given-names></name> <name><surname>Trentin</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>A preliminary investigation into the potential, pitfalls, and limitations of large language models for mammography interpretation</article-title>. <source>Discov. Oncol.</source> <volume>16</volume>:<fpage>233</fpage>. doi: <pub-id pub-id-type="doi">10.1007/s12672-025-02005-4</pub-id>, PMID: <pub-id pub-id-type="pmid">39992569</pub-id></citation></ref>
<ref id="ref13"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Schaefer</surname> <given-names>E. J.</given-names></name> <name><surname>Haislup</surname> <given-names>B.</given-names></name> <name><surname>Trent</surname> <given-names>S.</given-names></name> <name><surname>Sequeira</surname> <given-names>S.</given-names></name> <name><surname>Tarapore</surname> <given-names>R.</given-names></name> <name><surname>Lindsey</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Reliability of the Walch classification for characterization of primary glenohumeral arthritis: a systematic review</article-title>. <source>J. Am. Acad. Orthop. Surg.</source> <volume>32</volume>, <fpage>e861</fpage>&#x2013;<lpage>e868</lpage>. doi: <pub-id pub-id-type="doi">10.5435/JAAOS-D-22-01086</pub-id>, PMID: <pub-id pub-id-type="pmid">38748901</pub-id></citation></ref>
<ref id="ref14"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Srivastav</surname> <given-names>S.</given-names></name> <name><surname>Chandrakar</surname> <given-names>R.</given-names></name> <name><surname>Gupta</surname> <given-names>S.</given-names></name> <name><surname>Babhulkar</surname> <given-names>V.</given-names></name> <name><surname>Agrawal</surname> <given-names>S.</given-names></name> <name><surname>Jaiswal</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>ChatGPT in radiology: the advantages and limitations of artificial intelligence for medical imaging diagnosis</article-title>. <source>Cureus</source> <volume>15</volume>:<fpage>e41435</fpage>. doi: <pub-id pub-id-type="doi">10.7759/cureus.41435</pub-id>, PMID: <pub-id pub-id-type="pmid">37546142</pub-id></citation></ref>
<ref id="ref15"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Suh</surname> <given-names>P. S.</given-names></name> <name><surname>Shim</surname> <given-names>W. H.</given-names></name> <name><surname>Suh</surname> <given-names>C. H.</given-names></name> <name><surname>Heo</surname> <given-names>H.</given-names></name> <name><surname>Park</surname> <given-names>K. J.</given-names></name> <name><surname>Kim</surname> <given-names>P. H.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Comparing large language model and human reader accuracy with New England Journal of Medicine Image Challenge case image inputs</article-title>. <source>Radiology</source> <volume>313</volume>:<fpage>e241668</fpage>. doi: <pub-id pub-id-type="doi">10.1148/radiol.241668</pub-id>, PMID: <pub-id pub-id-type="pmid">39656125</pub-id></citation></ref>
<ref id="ref16"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Temsah</surname> <given-names>A.</given-names></name> <name><surname>Alhasan</surname> <given-names>K.</given-names></name> <name><surname>Altamimi</surname> <given-names>I.</given-names></name> <name><surname>Jamal</surname> <given-names>A.</given-names></name> <name><surname>Al-Eyadhy</surname> <given-names>A.</given-names></name> <name><surname>Malki</surname> <given-names>K. H.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>DeepSeek in healthcare: revealing opportunities and steering challenges of a new open-source artificial intelligence frontier</article-title>. <source>Cureus</source> <volume>17</volume>:<fpage>e79221</fpage>. doi: <pub-id pub-id-type="doi">10.7759/cureus.79221</pub-id>, PMID: <pub-id pub-id-type="pmid">39974299</pub-id></citation></ref>
<ref id="ref17"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Thirunavukarasu</surname> <given-names>A. J.</given-names></name> <name><surname>Ting</surname> <given-names>D. S. J.</given-names></name> <name><surname>Elangovan</surname> <given-names>K.</given-names></name> <name><surname>Gutierrez</surname> <given-names>L.</given-names></name> <name><surname>Tan</surname> <given-names>T. F.</given-names></name> <name><surname>Ting</surname> <given-names>D. S. W.</given-names></name></person-group> (<year>2023</year>). <article-title>Large language models in medicine</article-title>. <source>Nat. Med.</source> <volume>29</volume>, <fpage>1930</fpage>&#x2013;<lpage>1940</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id>, PMID: <pub-id pub-id-type="pmid">37460753</pub-id></citation></ref>
<ref id="ref18"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Vo</surname> <given-names>K. V.</given-names></name> <name><surname>Hackett</surname> <given-names>D. J.</given-names></name> <name><surname>Gee</surname> <given-names>A. O.</given-names></name> <name><surname>Hsu</surname> <given-names>J. E.</given-names></name></person-group> (<year>2017</year>). <article-title>Classifications in brief: Walch classification of primary glenohumeral osteoarthritis</article-title>. <source>Clin. Orthop. Relat. Res.</source> <volume>475</volume>, <fpage>2335</fpage>&#x2013;<lpage>2340</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11999-017-5317-6</pub-id>, PMID: <pub-id pub-id-type="pmid">28315182</pub-id></citation></ref>
<ref id="ref19"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>M.</given-names></name> <name><surname>Pan</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Song</surname> <given-names>X.</given-names></name> <name><surname>Zhou</surname> <given-names>Y.</given-names></name></person-group> (<year>2025</year>). <article-title>Evaluating AI-generated patient education materials for spinal surgeries: comparative analysis of readability and DISCERN quality across ChatGPT and DeepSeek models</article-title>. <source>Int. J. Med. Inform.</source> <volume>198</volume>:<fpage>105871</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2025.105871</pub-id></citation></ref>
<ref id="ref20"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>L.</given-names></name> <name><surname>Mou</surname> <given-names>W.</given-names></name> <name><surname>Wu</surname> <given-names>K.</given-names></name> <name><surname>Lai</surname> <given-names>Y.</given-names></name> <name><surname>Lin</surname> <given-names>A.</given-names></name> <name><surname>Yang</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Multimodal ChatGPT-4V for electrocardiogram interpretation: promise and limitations</article-title>. <source>J. Med. Internet Res.</source> <volume>26</volume>:<fpage>e54607</fpage>. doi: <pub-id pub-id-type="doi">10.2196/54607</pub-id>, PMID: <pub-id pub-id-type="pmid">38764297</pub-id></citation></ref>
<ref id="ref21"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zimmer</surname> <given-names>Z. R.</given-names></name> <name><surname>Carducci</surname> <given-names>M. P.</given-names></name> <name><surname>Mahendraraj</surname> <given-names>K. A.</given-names></name> <name><surname>Jawa</surname> <given-names>A.</given-names></name></person-group> (<year>2020</year>). <article-title>Evolution of the Walch classification and its importance on the B2 glenoid</article-title>. <source>J. Shoulder Elb. Arthroplast.</source> <volume>4</volume>:<fpage>4</fpage>. doi: <pub-id pub-id-type="doi">10.1177/2471549220903815</pub-id>, PMID: <pub-id pub-id-type="pmid">40735054</pub-id></citation></ref>
</ref-list>
</back>
</article>