<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Endocrinol.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Endocrinology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Endocrinol.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-2392</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fendo.2026.1667159</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Comparative assessment of large language models in diabetic foot infection management: alignment with IWGDF/IDSA guidelines</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Wu</surname><given-names>Hongxia</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn003"><sup>&#x2020;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Deng</surname><given-names>Jiayi</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="author-notes" rid="fn003"><sup>&#x2020;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Qiu</surname><given-names>Xu</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="author-notes" rid="fn003"><sup>&#x2020;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Xu</surname><given-names>Li</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Lu</surname><given-names>Lumeng</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Fan</surname><given-names>Mingna</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Yu</surname><given-names>Danni</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Liu</surname><given-names>Chuanbo</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2145247/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Chen</surname><given-names>Zhaohuan</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname><given-names>Kai</given-names></name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Wang</surname><given-names>Yuyan</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Zhou</surname><given-names>Haifang</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Chang</surname><given-names>Liyang</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3209521/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Wang</surname><given-names>Hanbin</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3136487/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Emergency Department, Hangzhou Traditional Chinese Medicine Hospital Affiliated to Zhejiang Chinese Medical University</institution>, <city>Hangzhou</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff2"><label>2</label><institution>Department of Pain, The Affiliated Hangzhou First People&#x2019;s Hospital, Westlake University School of Medicine</institution>, <city>Hangzhou</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff3"><label>3</label><institution>The Fourth Clinical School of Medicine, Zhejiang Chinese Medical University</institution>, <city>Hangzhou</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff4"><label>4</label><institution>Nursing Department, Hangzhou Traditional Chinese Medicine Hospital Affiliated to Zhejiang Chinese Medical University</institution>, <city>Hangzhou</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff5"><label>5</label><institution>Department of Plastic and Cosmetic Surgery, The Affiliated Hangzhou First People&#x2019;s Hospital, Westlake University School of Medicine</institution>, <city>Hangzhou</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff6"><label>6</label><institution>Department of Vascular and Hernia Surgery, The First People&#x2019;s Hospital of Hangzhou Lining District</institution>, <city>Hangzhou</city>,&#xa0;<country country="cn">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Hanbin Wang, <email xlink:href="mailto:wanghanbin@hospital.westlake.edu.cn">wanghanbin@hospital.westlake.edu.cn</email>; Liyang Chang, <email xlink:href="mailto:changliyang@163.com">changliyang@163.com</email>; Haifang Zhou, <email xlink:href="mailto:783206521@qq.com">783206521@qq.com</email>; Yuyan Wang, <email xlink:href="mailto:fish251010@hotmail.com">fish251010@hotmail.com</email></corresp>
<fn fn-type="equal" id="fn003">
<label>&#x2020;</label>
<p>These authors have contributed equally to this work</p></fn>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-24">
<day>24</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>17</volume>
<elocation-id>1667159</elocation-id>
<history>
<date date-type="received">
<day>17</day>
<month>07</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>09</day>
<month>02</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>07</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Wu, Deng, Qiu, Xu, Lu, Fan, Yu, Liu, Chen, Wang, Wang, Zhou, Chang and Wang.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Wu, Deng, Qiu, Xu, Lu, Fan, Yu, Liu, Chen, Wang, Wang, Zhou, Chang and Wang</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-24">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Objective</title>
<p>To assess the clinical utility of artificial intelligence (AI) models (ChatGPT-4o, DeepSeek-R1, Grok-3 and Claude-3.7) in aligning with international guidelines for diabetic foot infection (DFI) management.</p>
</sec>
<sec>
<title>Background</title>
<p>AI systems have demonstrated their potential application value in numerous fields. However, the specific effects of these technologies in the medical and health sector still require in-depth exploration. DFI is a relatively common and serious complication among diabetic patients, and the accurate transmission of relevant information is of great significance. Therefore, it is particularly important to evaluate whether artificial intelligence can serve as an effective clinical auxiliary tool.</p>
</sec>
<sec>
<title>Methods</title>
<p>Responses from ChatGPT-4o, DeepSeek-R1, Grok-3 and Claude-3.7 were evaluated against DFI guidelines across four clinical dimensions (Accuracy, Overconclusiveness, Supplementary Value, and Completeness) on a 5-point Likert scale, and assessed for readability using Flesch Reading Ease (FRE) and Flesch&#x2013;Kincaid Grade Level (FKGL). Statistical analyses included ANOVA and <italic>post hoc</italic> comparisons.</p>
</sec>
<sec>
<title>Results</title>
<p>No significant differences were found across models for Accuracy and Overconclusiveness (<italic>p</italic> &gt; 0.05). However, Supplementary Value differed significantly (<italic>p</italic> &lt; 0.001): Grok-3 outperformed ChatGPT-4o (<italic>p</italic> &lt; 0.0001), DeepSeek-R1 (<italic>p</italic> = 0.003), and Claude-3.7 (<italic>p</italic> &lt; 0.0001). Completeness also differed significantly (<italic>p</italic> = 0.005), with Grok-3 significantly outperforming ChatGPT-4o (<italic>p</italic> = 0.016) and Claude-3.7 (<italic>p</italic> = 0.010). Readability also varied: DeepSeek-R1 responses were more complex than those of ChatGPT-4o (<italic>p</italic> = 0.046).</p>
</sec>
<sec>
<title>Conclusion</title>
<p>All models performed comparably in terms of accuracy and in avoiding over-conclusions. Grok-3 outperformed the other models in the dimensions of supplementary value and completeness. DeepSeek-R1 generated the most complex text. These findings validate the feasibility of AI in the standardized management of DFI, but the models still need to be further verified through clinical trials to determine their value in the real-world decision-making process.</p>
</sec>
</abstract>
<kwd-group>
<kwd>adherence</kwd>
<kwd>artificial intelligence</kwd>
<kwd>diabetic foot infection</kwd>
<kwd>guideline</kwd>
<kwd>large language models</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This research was supported by the Medical and Health Science and Technology Project of Zhejiang Provincial (Grant: 2025KY1168), Natural Science Foundation of Hangzhou (Grant: 2024SZRYBH180006), Medical Health Science and Technology Project of Zhejiang Provincial Health Commission (Grant: 2022KY948), The Construction Fund of Key Medical Disciplines of Hangzhou (Grant: 2025HZPY06; 0020200484; 2025HZZD04), and Zhejiang Province Key Discipline Construction Fund for Traditional Chinese Medicine (Grant: 2024-XK-55).</funding-statement>
</funding-group>
<counts>
<fig-count count="3"/>
<table-count count="0"/>
<equation-count count="0"/>
<ref-count count="38"/>
<page-count count="8"/>
<word-count count="3653"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Clinical Diabetes</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<title>Introduction</title>
<p>Diabetic foot infection (DFI) has become a major challenge in the global medical field (<xref ref-type="bibr" rid="B1">1</xref>, <xref ref-type="bibr" rid="B2">2</xref>), affecting approximately 19% to 34% of diabetic patients. Within five years of diagnosis, 17% of patients may progress to lower limb amputation (<xref ref-type="bibr" rid="B3">3</xref>). Despite the complexity of the treatment process, the guidelines jointly released by the International Working Group on the Diabetic Foot (IWGDF) and the Infectious Diseases Society of America (IDSA) in 2023 (<xref ref-type="bibr" rid="B4">4</xref>) provide a gold standard approach for assessing the severity of infection (IDSA/IWGDF 2023 classification), selecting antibiotics based on local antimicrobial susceptibility patterns, and establishing surgical referral criteria. With the rapid development of artificial intelligence technology, large language models (LLMs) have gradually become important tools for patients, medical students, and clinicians to obtain relevant information. However, due to the fact that these models have not been strictly validated to ensure the medical accuracy of their output information, concerns have been raised about their reliability in providing medical advice (<xref ref-type="bibr" rid="B5">5</xref>, <xref ref-type="bibr" rid="B6">6</xref>).</p>
<p>The advent of generative artificial intelligence (AI) models, exemplified by systems like ChatGPT, represents a significant milestone in the evolution of healthcare technologies. These advanced AI systems leverage deep learning architectures, particularly transformer-based neural networks, to process and generate human-like text (<xref ref-type="bibr" rid="B7">7</xref>, <xref ref-type="bibr" rid="B8">8</xref>). A key feature of these models is their ability to employ multi-step &#x201c;Chain-of-Thought&#x201d; (CoT) reasoning based on probabilistic inference, which enables them to excel in structured reasoning tasks and generate logically coherent suggestions to support clinical decision-making (<xref ref-type="bibr" rid="B9">9</xref>, <xref ref-type="bibr" rid="B10">10</xref>). By breaking down complex medical queries into sequential reasoning steps, generative AI can mimic the cognitive processes of healthcare professionals, offering potential benefits in diagnostic accuracy, treatment planning, and patient education. However, the rapid integration of AI technologies into healthcare also raises critical concerns regarding their alignment with evidence-based clinical guidelines. LLMs such as ChatGPT-4o, DeepSeek-R1, Grok-3 and Claude-3.7, while increasingly accessible to both patients and clinicians, face unresolved questions about their reliability in delivering medically accurate recommendations without rigorous validation (<xref ref-type="bibr" rid="B11">11</xref>, <xref ref-type="bibr" rid="B12">12</xref>). Although prior studies have evaluated LLMs in specialties such as orthopedic pathology and stroke severity (<xref ref-type="bibr" rid="B13">13</xref>, <xref ref-type="bibr" rid="B14">14</xref>), the existing review and bibliometric studies indicate that LLM applications in DFI remain unexplored (<xref ref-type="bibr" rid="B15">15</xref>, <xref ref-type="bibr" rid="B16">16</xref>). 
DFI is a complex condition requiring integrated management of infection severity grading, antimicrobial stewardship, and surgical timing decisions.</p>
<p>Our study conducts a systematic evaluation of LLMs&#x2019; adherence to the 2023 IWGDF/IDSA guidelines, which are widely regarded as the benchmark for managing DFI. By analyzing model-generated responses to standardized clinical scenarios spanning infection classification, antibiotic selection, and surgical referral criteria, we quantify fidelity to evidence-based protocols and identify critical discrepancies in clinical reasoning. These findings will facilitate the implementation of LLMs that ensure patient safety by following the recommendations of the guidelines and that reduce the workload of doctors through automated, context-based decision support, thereby promoting the safe integration of artificial intelligence into the diabetic foot care pathway.</p>
</sec>
<sec id="s2">
<title>Methods</title>
<p>This study utilized a suite of publicly accessible LLMs, specifically ChatGPT-4o, DeepSeek-R1 with Deep Think functionality, Grok-3, and Claude-3.7, to conduct its analyses. Given the non-human subject nature of the research, institutional review board (IRB) approval was not required, in accordance with ethical research guidelines.</p>
<p>To ensure the impartiality and reliability of the responses generated by these models, a rigorous methodology was employed. Each question was submitted independently to the respective versions of ChatGPT-4o, DeepSeek-R1, Grok-3, and Claude-3.7 without any prior prompting or contextual carryover from previous interactions. To further enhance the integrity of the results, a fresh chat session was initiated for every individual question, thereby minimizing any potential residual influence from prior queries. In the application of these AI models, the phenomenon of the priming effect is a critical consideration. The priming effect refers to the subtle influence that prior input data or contextual cues may exert on a model&#x2019;s output, potentially leading to biased tendencies, defined response patterns, or skewed interpretations. To mitigate the risk of such priming effects, strict temporal separation was maintained between interactions with different model versions. This was achieved by establishing a new session window for each question, ensuring that no residual context from previous queries could inadvertently shape the responses. This methodological rigor was implemented to uphold the objectivity and reproducibility of the findings. Furthermore, models including ChatGPT-4o, DeepSeek-R1, Grok-3, and Claude-3.7 are openly available. Their documented significance in contemporary medical literature indicates a strong potential for enhancing clinical workflows (<xref ref-type="bibr" rid="B17">17</xref>&#x2013;<xref ref-type="bibr" rid="B20">20</xref>).</p>
<p>A 5-point Likert scale was used to assess the four models&#x2019; responses across four dimensions: Accuracy, Overconclusiveness, Supplementary Value and Completeness (<xref ref-type="bibr" rid="B13">13</xref>, <xref ref-type="bibr" rid="B21">21</xref>).</p>
<p>Accuracy:</p>
<list list-type="order">
<list-item>
<p>Completely incorrect</p></list-item>
<list-item>
<p>More incorrect than correct [&gt; 75% incorrect]</p></list-item>
<list-item>
<p>Approximately equal correct and incorrect</p></list-item>
<list-item>
<p>More correct than incorrect [&gt; 75% correct]</p></list-item>
<list-item>
<p>Completely correct</p></list-item>
</list>
<p>Overconclusiveness:</p>
<list list-type="order">
<list-item>
<p>Non-overconclusive [0% conflicting]</p></list-item>
<list-item>
<p>Minimally overconclusive [&lt;25% conflicting]</p></list-item>
<list-item>
<p>Partially overconclusive [50% conflicting]</p></list-item>
<list-item>
<p>Mostly overconclusive [&gt;75% conflicting]</p></list-item>
<list-item>
<p>Fully overconclusive [100% conflicting]</p></list-item>
</list>
<p>Supplementary Value:</p>
<list list-type="order">
<list-item>
<p>No supplementary value [0% added]</p></list-item>
<list-item>
<p>Low supplementary value [25% added]</p></list-item>
<list-item>
<p>Moderate supplementary value [50% added]</p></list-item>
<list-item>
<p>High supplementary value [&gt;75% added]</p></list-item>
<list-item>
<p>Exceptional supplementary value [100% novel]</p></list-item>
</list>
<p>Completeness:</p>
<list list-type="order">
<list-item>
<p>Very incomplete [0&#x2013;25%]</p></list-item>
<list-item>
<p>Incomplete [25&#x2013;50%]</p></list-item>
<list-item>
<p>Moderate [50&#x2013;75%]</p></list-item>
<list-item>
<p>Complete [&gt; 75%]</p></list-item>
<list-item>
<p>Very complete [100%]</p></list-item>
</list>
<p>Additionally, to assess readability, we calculated the Flesch Reading Ease (FRE) and Flesch&#x2013;Kincaid Grade Level (FKGL) scores for each model&#x2019;s responses (<xref ref-type="bibr" rid="B21">21</xref>). A higher FRE score indicates easier readability, while a lower FKGL suggests the text is suitable for readers at a lower grade level.</p>
<p>The evaluation of the responses of the four models, namely ChatGPT-4o, DeepSeek-R1, Grok-3 and Claude-3.7, was carried out by three independent reviewers to ensure the reliability of the scoring process. In cases of disagreement, a fourth author was consulted for resolution. SPSS 26.0 was used for statistical analysis. One-way analysis of variance (ANOVA) was applied to compare performance differences across models under identical dimensions. Fleiss&#x2019; kappa, a generalization of the kappa statistic to more than two raters, was used in SPSS to evaluate the consistency among the three raters in scoring the quality of the ChatGPT-4o, DeepSeek-R1, Grok-3 and Claude-3.7 responses. A <italic>p</italic> value &lt; 0.05 was considered statistically significant.</p>
</sec>
<sec id="s3" sec-type="results">
<title>Results</title>
<p>In this study, we evaluated the performance of four LLMs&#x2014;ChatGPT-4o, DeepSeek-R1, Grok-3 and Claude-3.7. There were a total of 7 clinical scenarios that were included in the consensus practice guidelines on DFI. The outputs were evaluated using a 5-point Likert scale across four dimensions: Accuracy, Overconclusiveness, Supplementary Value, and Completeness. This approach allowed for a comprehensive comparison of each model&#x2019;s alignment with established clinical guidelines. The responses generated by all evaluated models were systematically documented in <xref ref-type="supplementary-material" rid="SM1"><bold>Supplementary Table&#xa0;1</bold></xref>.</p>
<p>Using a 5-point Likert scale, no significant differences were observed among the four models in terms of Accuracy (<italic>p</italic> = 1) or Overconclusiveness (<italic>p</italic> = 0.410). Significant differences in Supplementary Value were observed among ChatGPT-4o, DeepSeek-R1, Grok-3 and Claude-3.7 (one-way ANOVA: F(3, 24) = 17.67, <italic>p</italic> &lt; 0.001, R&#xb2; = 0.688). <italic>Post hoc</italic> Tukey&#x2019;s HSD tests revealed that Grok-3 had significantly higher Supplementary Value than ChatGPT-4o (<italic>p</italic> &lt; 0.0001), DeepSeek-R1 (<italic>p</italic> = 0.003), and Claude-3.7 (<italic>p</italic> &lt; 0.0001). No significant differences were detected among ChatGPT-4o, DeepSeek-R1 and Claude-3.7 (all <italic>p</italic> &gt; 0.05). Despite the ordinal pattern Grok-3 (3.049 &#xb1; 0.300) &gt; DeepSeek-R1 (2.047 &#xb1; 0.489) &gt; ChatGPT-4o (1.476 &#xb1; 0.504) &gt; Claude-3.7 (1.429 &#xb1; 0.535), statistical significance was confined to the contrasts between Grok-3 and the remaining groups. Among the four AI models, namely ChatGPT-4o, DeepSeek-R1, Grok-3 and Claude-3.7, there were significant differences in terms of Completeness (one-way ANOVA: F(3, 24) = 5.622, <italic>p</italic> = 0.005, R&#xb2; = 0.413). <italic>Post hoc</italic> Tukey&#x2019;s HSD tests revealed that Grok-3 had significantly higher Completeness than ChatGPT-4o (<italic>p</italic> = 0.016) and Claude-3.7 (<italic>p</italic> = 0.010). No other pairwise comparisons reached statistical significance (adjusted <italic>p</italic> &gt; 0.05 for all other model combinations) (<xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>).</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Comparison among models. <bold>(A)</bold> Comparison of accuracy value scores across models. <bold>(B)</bold> Comparison of overconclusiveness value scores across models. <bold>(C)</bold> Comparison of supplementary value scores across models. <bold>(D)</bold> Comparison of completeness value scores across models. *<italic>p</italic> &lt; 0.05, **<italic>p</italic> &lt; 0.01, ****<italic>p</italic> &lt; 0.0001; ns, not significant.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fendo-17-1667159-g001.tif">
<alt-text content-type="machine-generated">Four-panel grouped bar chart shows evaluation results for ChatGPT-4o, DeepSeek-R1, Grok-3, and Claude-3.7. Panel A measures accuracy, with all models scoring similarly. Panel B assesses overconclusiveness, with no significant differences. Panel C, supplementary value, highlights Grok-3 scoring significantly higher than others. Panel D compares completeness, with Grok-3 and DeepSeek-R1 rated above Claude-3.7 and ChatGPT-4o. Asterisks indicate statistically significant differences.</alt-text>
</graphic></fig>
<p>Significant differences in FRE scores were observed among the four models. One-way ANOVA showed notable group differences (F(3, 24) = 3.993, <italic>p</italic> = 0.019, R&#xb2; = 0.3329). <italic>Post hoc</italic> tests revealed that DeepSeek-R1 had significantly lower readability than ChatGPT-4o (adjusted <italic>p</italic> = 0.046, 95% CI: 0.1296 to 17.37). No other pairwise comparisons reached statistical significance (adjusted <italic>p</italic> &gt; 0.05 for all other model combinations) (<xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>).</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Comparison of FRE scores across models. *<italic>p</italic> &lt; 0.05.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fendo-17-1667159-g002.tif">
<alt-text content-type="machine-generated">Bar chart comparing Flesch Reading Ease scores for ChatGPT-4o, DeepSeek-R1, Grok-3, and Claude-3.7. ChatGPT-4o has the highest mean score, marked statistically significant versus DeepSeek-R1, with error bars representing variation.</alt-text>
</graphic></fig>
<p>No significant differences were observed among the four models in terms of FKGL (<italic>p</italic> = 0.128) (<xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>).</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Comparison of FKGL across models. ns, not significant.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fendo-17-1667159-g003.tif">
<alt-text content-type="machine-generated">Bar chart titled &#x201c;Flesch-Kincaid Grade Level&#x201d; compares ChatGPT-4o, DeepSeek-R1, Grok-3, and Claude-3.7. All models score around grade level fifteen, with error bars and no significant difference indicated.</alt-text>
</graphic></fig>
<p>The Kappa consistency analysis revealed that, according to the Landis &amp; Koch criteria, the consistency among the evaluators reached an almost perfect level (&#x3ba; = 0.890, <italic>p</italic> = 0.001). This result was statistically significant, indicating that the agreement among the evaluators was unlikely to be caused by random factors.</p>
</sec>
<sec id="s4" sec-type="discussion">
<title>Discussion</title>
<p>DFI is one of the most serious complications of diabetes. Approximately 6.3% of diabetic patients worldwide are at risk of foot ulcers, and half of them will subsequently develop an infection. Such infections account for 85% of all diabetes-related amputations each year (<xref ref-type="bibr" rid="B22">22</xref>&#x2013;<xref ref-type="bibr" rid="B24">24</xref>). Its pathogenesis is driven by neuropathy, ischemia, and immune disorders, leading to difficult-to-heal multi-microbial infections (<xref ref-type="bibr" rid="B25">25</xref>, <xref ref-type="bibr" rid="B26">26</xref>). Dickson et&#xa0;al. (<xref ref-type="bibr" rid="B27">27</xref>) analyzed 2.4 million emergency cases involving DFI in the United States from 2012 to 2021. The results showed that patients with diabetic foot infection had a threefold higher chance of hospitalization compared to those without DFI (OR = 3.002, <italic>p</italic> &lt; 0.001), and their hospital stay was prolonged by 55%, significantly increasing the social and economic burden (<xref ref-type="bibr" rid="B1">1</xref>). As various forms of AI are integrated into medical settings, the reliability of AI-generated content needs to be fully assessed. This study evaluates the concordance of ChatGPT-4o, DeepSeek-R1, Grok-3 and Claude-3.7 with the 2023 international consensus guidelines for DFI. The findings highlight both the strengths and limitations of AI models in supporting clinical decision-making, emphasizing the need for careful integration and human oversight to ensure optimal patient care.</p>
<p>Across the dimensions of Accuracy and Overconclusiveness, no statistically significant differences were observed among ChatGPT-4o, DeepSeek-R1, Grok-3 and Claude-3.7. This aligns with recent advancements in AI models&#x2019; capacity to process structured clinical guidelines and generate evidence-based recommendations (<xref ref-type="bibr" rid="B28">28</xref>). For instance, ChatGPT-4o leverages its robust training framework to maintain guideline compliance despite architectural constraints; DeepSeek-R1&#x2019;s open-source architecture allows for rapid adaptation to domain-specific protocols; Grok-3 minimizes speculative outputs through its deep search and structured validation mechanisms; and Claude-3.7 balances efficiency with deep, stepwise analysis in complex clinical scenarios while strictly avoiding speculative outputs (<xref ref-type="bibr" rid="B29">29</xref>, <xref ref-type="bibr" rid="B30">30</xref>). Notably, the absence of overconclusiveness in all four models suggests robust alignment with guideline principles. This restraint helps avoid speculative recommendations in areas where evidence is insufficient, thereby providing a critical safeguard against iatrogenic risks in DFI management.</p>
<p>In contrast, the models demonstrated statistically significant differences in terms of Supplementary Value, with Grok-3 generally outperforming ChatGPT-4o, DeepSeek-R1 and Claude-3.7. For instance, when addressing pathogen identification in diabetic foot infections to guide antibiotic therapy, Grok-3 introduced novel testing methods like polymerase chain reaction (PCR) and next-generation sequencing (NGS), along with operational details such as debridement techniques. This finding indicates that model architecture substantially influences the depth, contextuality, and educational value of responses (<xref ref-type="bibr" rid="B20">20</xref>). At the same time, this supplementary information can further enhance the quality of clinical decision-making by providing broader background knowledge or the latest research findings that have not yet been included in the guidelines. For instance, AI can deeply analyze the pathophysiological mechanisms of DFI and its related risk factors, which is of great reference value for patient education and for clinicians who wish to acquire the latest knowledge. However, such supplementary information needs to undergo strict evaluation and validation. As pointed out by Giuffr&#xe8; et&#xa0;al., the accuracy and relevance of these details may vary, and there is a risk of deviating from the guideline-oriented nursing process due to information overload (<xref ref-type="bibr" rid="B11">11</xref>, <xref ref-type="bibr" rid="B31">31</xref>).</p>
<p>Although the models demonstrated generally high accuracy, their outputs exhibited deficiencies in completeness. These omissions frequently involved critical details such as pathogenesis mechanisms, including biofilm dynamics in chronic ulcers, and comprehensive multimodal treatment protocols that combine offloading with antibiotic stewardship. This limitation may originate from the reliance of LLMs on textual data rather than integrated multimodal inputs such as imaging or biomarkers, which are essential for holistic DFI care. For instance, while the IWGDF guidelines emphasize the importance of the Wound, Ischemia, and foot Infection (WIfI) classification system for ischemia assessment, AI models often fail to explicitly contextualize perfusion scores within broader treatment algorithms. A similar limitation has been observed in studies on AI-driven diabetic retinopathy tools (<xref ref-type="bibr" rid="B32">32</xref>). It is worth noting that Grok-3 performed better in terms of completeness than ChatGPT-4o and Claude-3.7, which may be related to its Chain of Thought reasoning mechanism and DeepSearch capabilities, enabling stepwise decomposition of complex clinical scenarios and real-time integration of multimodal evidence.</p>
<p>The Fleiss&#x2019; Kappa value of 0.890, indicating almost perfect agreement (<xref ref-type="bibr" rid="B33">33</xref>), highlights the high level of consistency among human evaluators in assessing the quality and accuracy of AI-generated outputs. This robust inter-rater reliability suggests that evaluators share a common understanding of the evaluation criteria, reinforcing the reliability of the assessment process.</p>
<p>However, notable discrepancies in grading the &#x201c;completeness&#x201d; dimension reveal the influence of subjective biases inherent in guideline-based evaluations. These inconsistencies likely stem from differences in individual interpretation of guidelines, varying levels of domain expertise, or ambiguity in defining &#x201c;completeness&#x201d; within the context of AI outputs. Such subjectivity underscores the limitations of relying solely on human judgment for evaluating complex AI-generated content. To address these challenges, future studies could enhance the objectivity and reproducibility of evaluations by integrating automated adherence metrics. For instance, employing established semantic similarity indices, such as cosine similarity or BERT-based embeddings, to measure the alignment between AI outputs and guideline excerpts could provide a standardized, quantitative approach to assessing completeness (<xref ref-type="bibr" rid="B34">34</xref>, <xref ref-type="bibr" rid="B35">35</xref>). These automated metrics would reduce reliance on subjective interpretation by offering a data-driven evaluation of how closely AI responses adhere to predefined guidelines. Additionally, incorporating natural language processing (NLP) techniques, such as topic modeling or keyword extraction, could further refine the evaluation process by identifying key thematic elements in both AI outputs and guidelines (<xref ref-type="bibr" rid="B36">36</xref>). Combining these automated tools with human oversight could create a hybrid evaluation framework that balances objectivity with the nuanced understanding that human evaluators bring.</p>
<p>This study conducted a comparative evaluation of text generated by four advanced large language models, ChatGPT-4o, DeepSeek-R1, Grok-3 and Claude-3.7, with a focus on their readability and applicability in healthcare communication. The assessment employed two well-established readability metrics: the FRE score, which measures text comprehensibility on a scale from 0 (extremely difficult) to 100 (very easy), and the FKGL, which estimates the U.S. school grade level required to understand the text (<xref ref-type="bibr" rid="B37">37</xref>, <xref ref-type="bibr" rid="B38">38</xref>). The findings indicated that ChatGPT-4o-generated text was marginally more readable than DeepSeek-R1, based on both FRE scores and FKGL. Nonetheless, when evaluated against the standard FRE benchmark, all evaluated texts from the four models received a classification of &#x201c;extremely difficult&#x201d;, defined by scores below 30. This suggests that, irrespective of the model used, the texts posed significant comprehension challenges for general readers, particularly patients or members of the public seeking accessible health information.</p>
<p>The FKGL results reinforced this conclusion, indicating that the texts required a reading proficiency equivalent to college-level education or higher, far exceeding the capabilities of typical lay audiences. The elevated reading difficulty was primarily attributed to the extensive incorporation of specialized medical terminology within the background prompts guiding the language models. These prompts were designed to ensure clinical accuracy and relevance, embedding domain-specific jargon that is standard in medical discourse but often unfamiliar to non-experts. While this technical language enhances the perceived precision and specificity of the content for clinical professionals such as physicians, nurses, or medical researchers, it creates a substantial cognitive barrier for lay readers. For example, patients or caregivers attempting to access health information may find such terminology confusing or intimidating, hindering their ability to make informed decisions about their care. In contrast, in clinical contexts where precision is critical, including the documentation of diagnostic rationales, treatment protocols, or research findings, the use of technical language is often essential to avoid ambiguity and ensure alignment with standardized medical guidelines.</p>
<p>This trade-off highlights a key tension: while technical rigor supports professional accuracy, it compromises accessibility for broader audiences, limiting the practical utility of AI-generated health content for non-specialist users. Based on this, a &#x201c;clinical language dynamic adaptation module&#x201d; can be developed, which can dynamically adjust the density of terms and the complexity of sentence structures by real-time identification of the user&#x2019;s identity, such as patients, grassroots medical staff, or specialists, thereby achieving a balance between professional rigor and cognitive accessibility.</p>
<p>The study has several limitations. Firstly, the inherent stochasticity of generative AI outputs persists despite protocol-mandated dialogue resetting between queries. This variability stems from probabilistic decoding mechanisms inherent in transformer-based architectures and may affect response reproducibility across sessions.</p>
<p>Secondly, the exclusive use of English for prompts and data collection limits the generalizability of our results in multilingual settings and non-English-speaking populations. To address this constraint, subsequent research should adopt a tiered multilingual evaluation framework. We recommend beginning with languages prevalent among diabetic populations such as Spanish, Mandarin, and Arabic, with subsequent expansion to regional languages. This approach should include professional translation and back-translation of standardized diabetic foot infection scenarios, cultural adaptation of clinical contexts by native-speaking healthcare professionals, and comprehensive assessment of both linguistic accuracy and cultural appropriateness in model responses. Such a framework would incorporate metrics for medical terminology consistency, conceptual equivalence, and practical applicability across diverse healthcare environments.</p>
<p>Furthermore, the study does not evaluate the impact of model fine-tuning or domain-specific adaptation on performance. General-purpose LLMs may struggle with specialized medical terminology or nuanced patient-provider interactions without targeted optimization. The absence of such adaptations could potentially skew readability metrics including FRE score and FKGL, leading to inaccurate estimates of text comprehensibility in real-world clinical scenarios.</p>
<p>Finally, the dynamic nature of AI models necessitates systematic long-term performance tracking. We propose a structured framework involving quarterly assessments across three key dimensions: guideline adherence using standardized clinical vignettes, temporal consistency in response quality through longitudinal analysis of identical queries, and adaptation to evolving medical evidence via dynamic benchmarking against updated IWGDF/IDSA guidelines.</p>
<p>Beyond such technical assessments, clinical validation represents an essential next step. Randomized controlled trials comparing LLM-assisted decision-making with standard care should examine patient-centered outcomes including diagnostic accuracy, antibiotic selection appropriateness, time to infection resolution, and patient satisfaction metrics. Concurrent evaluation of implementation challenges, particularly workflow integration and clinician acceptance, will be crucial for establishing the practical utility of AI-assisted approaches in diabetic foot infection management.</p>
</sec>
<sec id="s5" sec-type="conclusions">
<title>Conclusion</title>
<p>This study has confirmed the potential of tools such as ChatGPT-4o, DeepSeek-R1, Grok-3, and Claude-3.7 as auxiliary tools for the management of diabetic foot lesions. All models demonstrated high accuracy in meeting the standards of the International Working Group on the Diabetic Foot/Infectious Diseases Society of America (IWGDF/IDSA) guidelines. However, Grok-3 provided more contextual information and richer content in its responses. The text readability of all four models was very challenging, especially DeepSeek-R1, which might be more suitable for professional users who require detailed and domain-specific content. With the development of LLMs, they show potential as clinical auxiliary tools in the field of DFI management, but careful prompt design and domain validation remain crucial.</p>
<p>Furthermore, we propose directions for future research. These include long-term performance tracking, multilingual evaluations, and model adaptations for healthcare contexts. In conclusion, while AI models show promise in supporting DFI management, their integration into clinical practice requires careful validation, human oversight, and strategies to improve accessibility for diverse audiences to ensure effective and equitable healthcare communication.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="supplementary-material" rid="SM1"><bold>Supplementary Material</bold></xref>. Further inquiries can be directed to the corresponding authors.</p></sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>HXW: Funding acquisition, Methodology, Formal analysis, Software, Writing &#x2013; original draft, Data curation. JD: Data curation, Software, Writing &#x2013; original draft, Methodology. XQ: Formal analysis, Methodology, Writing &#x2013; original draft, Data curation. LX: Methodology, Writing &#x2013; original draft, Software. LL: Writing &#x2013; original draft, Data curation. MF: Data curation, Writing &#x2013; original draft. DY: Formal analysis, Writing &#x2013; original draft. CL: Validation, Writing &#x2013; original draft. ZC: Writing &#x2013; original draft, Validation. KW: Validation, Writing &#x2013; original draft. YW: Validation, Supervision, Writing &#x2013; review &amp; editing, Funding acquisition. HZ: Supervision, Writing &#x2013; review &amp; editing, Validation, Funding acquisition. LC: Methodology, Validation, Supervision, Writing &#x2013; review &amp; editing. HBW: Writing &#x2013; review &amp; editing, Funding acquisition, Supervision, Project administration.</p></sec>
<ack>
<title>Acknowledgments</title>
<p>We used AI tools ChatGPT-4o, DeepSeek-R1, Grok-3 and Claude-3.7 to answer questions from the research survey, and the full answers are attached to the <xref ref-type="supplementary-material" rid="SM1"><bold>Supplementary Materials</bold></xref>.</p>
</ack>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s10" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was used in the creation of this manuscript. During the preparation of this work the authors used ChatGPT-4o, DeepSeek-R1, Grok-3 and Claude-3.7 in order to assess the clinical utility of AI models in aligning with international guidelines for DFI management. After using this tool/service, the authors reviewed and edited the content as needed and take full responsibility for the content of the publication.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s11" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<sec id="s12" sec-type="supplementary-material">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fendo.2026.1667159/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fendo.2026.1667159/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Table1.docx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document"/></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Maity</surname> <given-names>S</given-names></name>
<name><surname>Leton</surname> <given-names>N</given-names></name>
<name><surname>Nayak</surname> <given-names>N</given-names></name>
<name><surname>Jha</surname> <given-names>A</given-names></name>
<name><surname>Anand</surname> <given-names>N</given-names></name>
<name><surname>Thompson</surname> <given-names>K</given-names></name>
<etal/>
</person-group>. 
<article-title>A systematic review of diabetic foot infections: pathogenesis, diagnosis, and management strategies</article-title>. <source>Front Clin Diabetes Healthc</source>. (<year>2024</year>) <volume>5</volume>:<elocation-id>1393309</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fcdhc.2024.1393309</pub-id>, PMID: <pub-id pub-id-type="pmid">39165660</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<label>2</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>P</given-names></name>
<name><surname>Lu</surname> <given-names>J</given-names></name>
<name><surname>Jing</surname> <given-names>Y</given-names></name>
<name><surname>Tang</surname> <given-names>S</given-names></name>
<name><surname>Zhu</surname> <given-names>D</given-names></name>
<name><surname>Bi</surname> <given-names>Y</given-names></name>
</person-group>. 
<article-title>Global epidemiology of diabetic foot ulceration: a systematic review and meta-analysis &#x2020;</article-title>. <source>Ann Med</source>. (<year>2017</year>) <volume>49</volume>:<page-range>106&#x2013;16</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1080/07853890.2016.1231932</pub-id>, PMID: <pub-id pub-id-type="pmid">27585063</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<label>3</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Peters</surname> <given-names>EJ</given-names></name>
<name><surname>Lipsky</surname> <given-names>BA</given-names></name>
</person-group>. 
<article-title>Diagnosis and management of infection in the diabetic foot</article-title>. <source>Med Clin North Am</source>. (<year>2013</year>) <volume>97</volume>:<page-range>911&#x2013;46</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.mcna.2013.04.005</pub-id>, PMID: <pub-id pub-id-type="pmid">23992901</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<label>4</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Senneville</surname> <given-names>&#xc9;</given-names></name>
<name><surname>Albalawi</surname> <given-names>Z</given-names></name>
<name><surname>van Asten</surname> <given-names>SA</given-names></name>
<name><surname>Abbas</surname> <given-names>ZG</given-names></name>
<name><surname>Allison</surname> <given-names>G</given-names></name>
<name><surname>Arag&#xf3;n-S&#xe1;nchez</surname> <given-names>J</given-names></name>
<etal/>
</person-group>. 
<article-title>IWGDF/IDSA guidelines on the diagnosis and treatment of diabetes-related foot infections (IWGDF/IDSA 2023)</article-title>. <source>Diabetes Metab Res Rev</source>. (<year>2024</year>) <volume>40</volume>:<fpage>e3687</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/dmrr.3687</pub-id>, PMID: <pub-id pub-id-type="pmid">37779323</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<label>5</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Mondal</surname> <given-names>A</given-names></name>
<name><surname>Naskar</surname> <given-names>A</given-names></name>
<name><surname>Roy Choudhury</surname> <given-names>B</given-names></name>
<name><surname>Chakraborty</surname> <given-names>S</given-names></name>
<name><surname>Biswas</surname> <given-names>T</given-names></name>
<name><surname>Sinha</surname> <given-names>S</given-names></name>
<etal/>
</person-group>. 
<article-title>Evaluating the performance and safety of large language models in generating type 2 diabetes mellitus management plans: A comparative study with physicians using real patient records</article-title>. <source>Cureus</source>. (<year>2025</year>) <volume>17</volume>:<fpage>e80737</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.7759/cureus.80737</pub-id>, PMID: <pub-id pub-id-type="pmid">40248538</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<label>6</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Aydin</surname> <given-names>S</given-names></name>
<name><surname>Karabacak</surname> <given-names>M</given-names></name>
<name><surname>Vlachos</surname> <given-names>V</given-names></name>
<name><surname>Margetis</surname> <given-names>K</given-names></name>
</person-group>. 
<article-title>Large language models in patient education: a scoping review of applications in medicine</article-title>. <source>Front Med (Lausanne)</source>. (<year>2024</year>) <volume>11</volume>:<elocation-id>1477898</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fmed.2024.1477898</pub-id>, PMID: <pub-id pub-id-type="pmid">39534227</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<label>7</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Moor</surname> <given-names>M</given-names></name>
<name><surname>Banerjee</surname> <given-names>O</given-names></name>
<name><surname>Abad</surname> <given-names>ZSH</given-names></name>
<name><surname>Krumholz</surname> <given-names>HM</given-names></name>
<name><surname>Leskovec</surname> <given-names>J</given-names></name>
<name><surname>Topol</surname> <given-names>EJ</given-names></name>
<etal/>
</person-group>. 
<article-title>Foundation models for generalist medical artificial intelligence</article-title>. <source>Nature</source>. (<year>2023</year>) <volume>616</volume>:<page-range>259&#x2013;65</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41586-023-05881-4</pub-id>, PMID: <pub-id pub-id-type="pmid">37045921</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<label>8</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Singhal</surname> <given-names>K</given-names></name>
<name><surname>Azizi</surname> <given-names>S</given-names></name>
<name><surname>Tu</surname> <given-names>T</given-names></name>
<name><surname>Mahdavi</surname> <given-names>SS</given-names></name>
<name><surname>Wei</surname> <given-names>J</given-names></name>
<name><surname>Chung</surname> <given-names>HW</given-names></name>
<etal/>
</person-group>. 
<article-title>Large language models encode clinical knowledge</article-title>. <source>Nature</source>. (<year>2023</year>) <volume>620</volume>:<page-range>172&#x2013;80</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id>, PMID: <pub-id pub-id-type="pmid">37438534</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<label>9</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Shah</surname> <given-names>HA</given-names></name>
<name><surname>Househ</surname> <given-names>M</given-names></name>
</person-group>. 
<article-title>Chain of thought strategy for smaller LLMs for medical reasoning</article-title>. <source>Stud Health Technol Inform</source>. (<year>2025</year>) <volume>327</volume>:<page-range>783&#x2013;7</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.3233/SHTI250466</pub-id>, PMID: <pub-id pub-id-type="pmid">40380574</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<label>10</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li&#xe9;vin</surname> <given-names>V</given-names></name>
<name><surname>Hother</surname> <given-names>CE</given-names></name>
<name><surname>Motzfeldt</surname> <given-names>AG</given-names></name>
<name><surname>Winther</surname> <given-names>O</given-names></name>
</person-group>. 
<article-title>Can large language models reason about medical questions</article-title>? <source>Patterns (N Y)</source>. (<year>2024</year>) <volume>5</volume>:<elocation-id>100943</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.patter.2024.100943</pub-id>, PMID: <pub-id pub-id-type="pmid">38487804</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<label>11</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Giuffr&#xe8;</surname> <given-names>M</given-names></name>
<name><surname>Kresevic</surname> <given-names>S</given-names></name>
<name><surname>You</surname> <given-names>K</given-names></name>
<name><surname>Dupont</surname> <given-names>J</given-names></name>
<name><surname>Huebner</surname> <given-names>J</given-names></name>
<name><surname>Grimshaw</surname> <given-names>AA</given-names></name>
<etal/>
</person-group>. 
<article-title>Systematic review: The use of large language models as medical chatbots in digestive diseases</article-title>. <source>Aliment Pharmacol Ther</source>. (<year>2024</year>) <volume>60</volume>:<page-range>144&#x2013;66</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1111/apt.18058</pub-id>, PMID: <pub-id pub-id-type="pmid">38798194</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<label>12</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Fatima</surname> <given-names>A</given-names></name>
<name><surname>Shafique</surname> <given-names>MA</given-names></name>
<name><surname>Alam</surname> <given-names>K</given-names></name>
<name><surname>Fadlalla Ahmed</surname> <given-names>TK</given-names></name>
<name><surname>Mustafa</surname> <given-names>MS</given-names></name>
</person-group>. 
<article-title>ChatGPT in medicine: A cross-disciplinary systematic review of ChatGPT&#x2019;s (artificial intelligence) role in research, clinical practice, education, and patient interaction</article-title>. <source>Medicine (Baltimore)</source>. (<year>2024</year>) <volume>103</volume>:<fpage>e39250</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1097/MD.0000000000039250</pub-id>, PMID: <pub-id pub-id-type="pmid">39121303</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<label>13</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Mejia</surname> <given-names>MR</given-names></name>
<name><surname>Arroyave</surname> <given-names>JS</given-names></name>
<name><surname>Saturno</surname> <given-names>M</given-names></name>
<name><surname>Ndjonko</surname> <given-names>LCM</given-names></name>
<name><surname>Zaidat</surname> <given-names>B</given-names></name>
<name><surname>Rajjoub</surname> <given-names>R</given-names></name>
<etal/>
</person-group>. 
<article-title>Use of ChatGPT for determining clinical and surgical treatment of lumbar disc herniation with radiculopathy: A North American Spine Society guideline comparison</article-title>. <source>Neurospine</source>. (<year>2024</year>) <volume>21</volume>:<page-range>149&#x2013;58</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.14245/ns.2347052.526</pub-id>, PMID: <pub-id pub-id-type="pmid">38291746</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<label>14</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gu</surname> <given-names>Z</given-names></name>
<name><surname>Jia</surname> <given-names>W</given-names></name>
<name><surname>Piccardi</surname> <given-names>M</given-names></name>
<name><surname>Yu</surname> <given-names>P</given-names></name>
</person-group>. 
<article-title>Empowering large language models for automated clinical assessment with generation-augmented retrieval and hierarchical chain-of-thought</article-title>. <source>Artif Intell Med</source>. (<year>2025</year>) <volume>162</volume>:<elocation-id>103078</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.artmed.2025.103078</pub-id>, PMID: <pub-id pub-id-type="pmid">39978047</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<label>15</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Sheng</surname> <given-names>B</given-names></name>
<name><surname>Pushpanathan</surname> <given-names>K</given-names></name>
<name><surname>Guan</surname> <given-names>Z</given-names></name>
<name><surname>Lim</surname> <given-names>QH</given-names></name>
<name><surname>Lim</surname> <given-names>ZW</given-names></name>
<name><surname>Yew</surname> <given-names>SME</given-names></name>
<etal/>
</person-group>. 
<article-title>Artificial intelligence for diabetes care: current and future prospects</article-title>. <source>Lancet Diabetes Endocrinol</source>. (<year>2024</year>) <volume>12</volume>:<page-range>569&#x2013;95</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/S2213-8587(24)00154-2</pub-id>, PMID: <pub-id pub-id-type="pmid">39054035</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<label>16</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Tao</surname> <given-names>Y</given-names></name>
<name><surname>Hou</surname> <given-names>J</given-names></name>
<name><surname>Zhou</surname> <given-names>G</given-names></name>
<name><surname>Zhang</surname> <given-names>D</given-names></name>
</person-group>. 
<article-title>Artificial intelligence applied to diabetes complications: a bibliometric analysis</article-title>. <source>Front Artif Intell</source>. (<year>2025</year>) <volume>8</volume>:<elocation-id>1455341</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/frai.2025.1455341</pub-id>, PMID: <pub-id pub-id-type="pmid">39959916</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<label>17</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Chen</surname> <given-names>YQ</given-names></name>
<name><surname>Yu</surname> <given-names>T</given-names></name>
<name><surname>Song</surname> <given-names>ZQ</given-names></name>
<name><surname>Wang</surname> <given-names>CY</given-names></name>
<name><surname>Luo</surname> <given-names>JT</given-names></name>
<name><surname>Xiao</surname> <given-names>Y</given-names></name>
<etal/>
</person-group>. 
<article-title>Application of large language models in drug-induced osteotoxicity prediction</article-title>. <source>J Chem Inf Model</source>. (<year>2025</year>) <volume>65</volume>:<page-range>3370&#x2013;9</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1021/acs.jcim.5c00275</pub-id>, PMID: <pub-id pub-id-type="pmid">40114317</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<label>18</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Duey</surname> <given-names>AH</given-names></name>
<name><surname>Nietsch</surname> <given-names>KS</given-names></name>
<name><surname>Zaidat</surname> <given-names>B</given-names></name>
<name><surname>Ren</surname> <given-names>R</given-names></name>
<name><surname>Ndjonko</surname> <given-names>LCM</given-names></name>
<name><surname>Shrestha</surname> <given-names>N</given-names></name>
<etal/>
</person-group>. 
<article-title>Thromboembolic prophylaxis in spine surgery: an analysis of ChatGPT recommendations</article-title>. <source>Spine J</source>. (<year>2023</year>) <volume>23</volume>:<page-range>1684&#x2013;91</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.spinee.2023.07.015</pub-id>, PMID: <pub-id pub-id-type="pmid">37499880</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<label>19</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Agarwal</surname> <given-names>M</given-names></name>
<name><surname>Sharma</surname> <given-names>P</given-names></name>
<name><surname>Wani</surname> <given-names>P</given-names></name>
</person-group>. 
<article-title>Evaluating the accuracy and reliability of large language models (ChatGPT, Claude, DeepSeek, Gemini, Grok, and Le Chat) in answering item-analyzed multiple-choice questions on blood physiology</article-title>. <source>Cureus</source>. (<year>2025</year>) <volume>17</volume>:<fpage>e81871</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.7759/cureus.81871</pub-id>, PMID: <pub-id pub-id-type="pmid">40342473</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<label>20</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Sandmann</surname> <given-names>S</given-names></name>
<name><surname>Hegselmann</surname> <given-names>S</given-names></name>
<name><surname>Fujarski</surname> <given-names>M</given-names></name>
<name><surname>Bickmann</surname> <given-names>L</given-names></name>
<name><surname>Wild</surname> <given-names>B</given-names></name>
<name><surname>Eils</surname> <given-names>R</given-names></name>
<etal/>
</person-group>. 
<article-title>Benchmark evaluation of DeepSeek large language models in clinical decision-making</article-title>. <source>Nat Med</source>. (<year>2025</year>) <volume>31</volume>:<page-range>2546&#x2013;9</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41591-025-03727-2</pub-id>, PMID: <pub-id pub-id-type="pmid">40267970</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<label>21</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>S</given-names></name>
<name><surname>Wang</surname> <given-names>Y</given-names></name>
<name><surname>Jiang</surname> <given-names>L</given-names></name>
<name><surname>Chang</surname> <given-names>Y</given-names></name>
<name><surname>Zhang</surname> <given-names>S</given-names></name>
<name><surname>Zhao</surname> <given-names>K</given-names></name>
<etal/>
</person-group>. 
<article-title>Assessing the clinical support capabilities of ChatGPT 4o and ChatGPT 4o mini in managing lumbar disc herniation</article-title>. <source>Eur J Med Res</source>. (<year>2025</year>) <volume>30</volume>:<fpage>45</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s40001-025-02296-x</pub-id>, PMID: <pub-id pub-id-type="pmid">39844276</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<label>22</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Cortes-Penfield</surname> <given-names>NW</given-names></name>
<name><surname>Armstrong</surname> <given-names>DG</given-names></name>
<name><surname>Brennan</surname> <given-names>MB</given-names></name>
<name><surname>Fayfman</surname> <given-names>M</given-names></name>
<name><surname>Ryder</surname> <given-names>JH</given-names></name>
<name><surname>Tan</surname> <given-names>TW</given-names></name>
<etal/>
</person-group>. 
<article-title>Evaluation and management of diabetes-related foot infections</article-title>. <source>Clin Infect Dis</source>. (<year>2023</year>) <volume>77</volume>:<fpage>e1</fpage>&#x2013;<lpage>e13</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/cid/ciad255</pub-id>, PMID: <pub-id pub-id-type="pmid">37306693</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<label>23</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Armstrong</surname> <given-names>DG</given-names></name>
<name><surname>Boulton</surname> <given-names>AJM</given-names></name>
<name><surname>Bus</surname> <given-names>SA</given-names></name>
</person-group>. 
<article-title>Diabetic foot ulcers and their recurrence</article-title>. <source>N Engl J Med</source>. (<year>2017</year>) <volume>376</volume>:<page-range>2367&#x2013;75</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1056/NEJMra1615439</pub-id>, PMID: <pub-id pub-id-type="pmid">28614678</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<label>24</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Edmonds</surname> <given-names>M</given-names></name>
<name><surname>Manu</surname> <given-names>C</given-names></name>
<name><surname>Vas</surname> <given-names>P</given-names></name>
</person-group>. 
<article-title>The current burden of diabetic foot disease</article-title>. <source>J Clin Orthop Trauma</source>. (<year>2021</year>) <volume>17</volume>:<fpage>88</fpage>&#x2013;<lpage>93</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jcot.2021.01.017</pub-id>, PMID: <pub-id pub-id-type="pmid">33680841</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<label>25</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Nauriyal</surname> <given-names>V</given-names></name>
<name><surname>Byers</surname> <given-names>K</given-names></name>
</person-group>. 
<article-title>Diabetic foot infections: Questions for an infectious disease consultant</article-title>. <source>Semin Vasc Surg</source>. (<year>2025</year>) <volume>38</volume>:<fpage>85</fpage>&#x2013;<lpage>93</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1053/j.semvascsurg.2025.01.009</pub-id>, PMID: <pub-id pub-id-type="pmid">40086926</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<label>26</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Bekaryssova</surname> <given-names>D</given-names></name>
<name><surname>Yessirkepov</surname> <given-names>M</given-names></name>
<name><surname>Rakisheva</surname> <given-names>AU</given-names></name>
<name><surname>Bakytzhan</surname> <given-names>A</given-names></name>
</person-group>. 
<article-title>Diabetic foot in the context of rheumatic diseases: pathogenesis and treatment approaches</article-title>. <source>Rheumatol Int</source>. (<year>2025</year>) <volume>45</volume>:<fpage>132</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s00296-025-05890-8</pub-id>, PMID: <pub-id pub-id-type="pmid">40314825</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<label>27</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Dickson</surname> <given-names>MC</given-names></name>
<name><surname>Skrepnek</surname> <given-names>GH</given-names></name>
</person-group>. 
<article-title>Hospitalization and health resource utilization in emergency department cases of diabetic foot infections in the U.S. from 2012 to 2021: A nationally representative analysis</article-title>. <source>J Clin Med</source>. (<year>2024</year>) <volume>13</volume>:<elocation-id>5361</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/jcm13185361</pub-id>, PMID: <pub-id pub-id-type="pmid">39336851</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<label>28</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hinostroza Fuentes</surname> <given-names>VG</given-names></name>
<name><surname>Karim</surname> <given-names>HA</given-names></name>
<name><surname>Tan</surname> <given-names>MJT</given-names></name>
<name><surname>AlDahoul</surname> <given-names>N</given-names></name>
</person-group>. 
<article-title>AI with agency: a vision for adaptive, efficient, and ethical healthcare</article-title>. <source>Front Digit Health</source>. (<year>2025</year>) <volume>7</volume>:<elocation-id>1600216</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fdgth.2025.1600216</pub-id>, PMID: <pub-id pub-id-type="pmid">40400543</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<label>29</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Dincer</surname> <given-names>HA</given-names></name>
<name><surname>Dogu</surname> <given-names>D</given-names></name>
</person-group>. 
<article-title>Evaluating artificial intelligence in patient education: DeepSeek-V3 versus ChatGPT-4o in answering common questions on laparoscopic cholecystectomy</article-title>. <source>ANZ J Surg</source>. (<year>2025</year>) <volume>95</volume>:<page-range>2322&#x2013;8</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1111/ans.70198</pub-id>, PMID: <pub-id pub-id-type="pmid">40495650</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<label>30</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Uldin</surname> <given-names>H</given-names></name>
<name><surname>Saran</surname> <given-names>S</given-names></name>
<name><surname>Gandikota</surname> <given-names>G</given-names></name>
<name><surname>Iyengar</surname> <given-names>KP</given-names></name>
<name><surname>Vaishya</surname> <given-names>R</given-names></name>
<name><surname>Parmar</surname> <given-names>Y</given-names></name>
<etal/>
</person-group>. 
<article-title>A comparison of performance of DeepSeek-R1 model-generated responses to musculoskeletal radiology queries against ChatGPT-4 and ChatGPT-4o - A feasibility study</article-title>. <source>Clin Imaging</source>. (<year>2025</year>) <volume>123</volume>:<elocation-id>110506</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.clinimag.2025.110506</pub-id>, PMID: <pub-id pub-id-type="pmid">40381536</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<label>31</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wagner</surname> <given-names>MW</given-names></name>
<name><surname>Ertl-Wagner</surname> <given-names>BB</given-names></name>
</person-group>. 
<article-title>Accuracy of information and references using ChatGPT-3 for retrieval of clinical radiological information</article-title>. <source>Can Assoc Radiol J</source>. (<year>2024</year>) <volume>75</volume>:<fpage>69</fpage>&#x2013;<lpage>73</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1177/08465371231171125</pub-id>, PMID: <pub-id pub-id-type="pmid">37078489</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<label>32</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wei</surname> <given-names>H</given-names></name>
<name><surname>Shi</surname> <given-names>P</given-names></name>
<name><surname>Miao</surname> <given-names>J</given-names></name>
<name><surname>Zhang</surname> <given-names>M</given-names></name>
<name><surname>Bai</surname> <given-names>G</given-names></name>
<name><surname>Qiu</surname> <given-names>J</given-names></name>
<etal/>
</person-group>. 
<article-title>CauDR: A causality-inspired domain generalization framework for fundus-based diabetic retinopathy grading</article-title>. <source>Comput Biol Med</source>. (<year>2024</year>) <volume>175</volume>:<elocation-id>108459</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.108459</pub-id>, PMID: <pub-id pub-id-type="pmid">38701588</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<label>33</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Landis</surname> <given-names>JR</given-names></name>
<name><surname>Koch</surname> <given-names>GG</given-names></name>
</person-group>. 
<article-title>The measurement of observer agreement for categorical data</article-title>. <source>Biometrics</source>. (<year>1977</year>) <volume>33</volume>:<page-range>159&#x2013;74</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.2307/2529310</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<label>34</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>L</given-names></name>
<name><surname>Zhai</surname> <given-names>Y</given-names></name>
<name><surname>Gao</surname> <given-names>J</given-names></name>
<name><surname>Wang</surname> <given-names>L</given-names></name>
<name><surname>Hou</surname> <given-names>L</given-names></name>
<name><surname>Zhao</surname> <given-names>J</given-names></name>
</person-group>. 
<article-title>Stacking-BERT model for Chinese medical procedure entity normalization</article-title>. <source>Math Biosci Eng</source>. (<year>2023</year>) <volume>20</volume>:<page-range>1018&#x2013;36</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.3934/mbe.2023047</pub-id>, PMID: <pub-id pub-id-type="pmid">36650800</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<label>35</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ji</surname> <given-names>Z</given-names></name>
<name><surname>Wei</surname> <given-names>Q</given-names></name>
<name><surname>Xu</surname> <given-names>H</given-names></name>
</person-group>. 
<article-title>BERT-based ranking for biomedical entity normalization</article-title>. <source>AMIA Jt Summits Transl Sci Proc</source>. (<year>2020</year>) <volume>2020</volume>:<page-range>269&#x2013;77</page-range>.
</mixed-citation>
</ref>
<ref id="B36">
<label>36</label>
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>van der Lee</surname> <given-names>C</given-names></name>
<name><surname>Gatt</surname> <given-names>A</given-names></name>
<name><surname>van Miltenburg</surname> <given-names>E</given-names></name>
<name><surname>Wubben</surname> <given-names>S</given-names></name>
<name><surname>Krahmer</surname> <given-names>E</given-names></name>
</person-group>. (<year>2019</year>). 
<article-title>Best practices for the human evaluation of automatically generated text</article-title>, in: <conf-name>Proceedings of the 12th International Conference on Natural Language Generation</conf-name>. <conf-loc>Tokyo, Japan</conf-loc>: 
<publisher-name>Association for Computational Linguistics</publisher-name>. p. <page-range>355&#x2013;68</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.18653/v1/W19-8643</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<label>37</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Sarangi</surname> <given-names>PK</given-names></name>
<name><surname>Lumbani</surname> <given-names>A</given-names></name>
<name><surname>Swarup</surname> <given-names>MS</given-names></name>
<name><surname>Panda</surname> <given-names>S</given-names></name>
<name><surname>Sahoo</surname> <given-names>SS</given-names></name>
<name><surname>Hui</surname> <given-names>P</given-names></name>
<etal/>
</person-group>. 
<article-title>Assessing ChatGPT&#x2019;s proficiency in simplifying radiological reports for healthcare professionals and patients</article-title>. <source>Cureus</source>. (<year>2023</year>) <volume>15</volume>:<fpage>e50881</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.7759/cureus.50881</pub-id>, PMID: <pub-id pub-id-type="pmid">38249202</pub-id>
</mixed-citation>
</ref>
<ref id="B38">
<label>38</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Grilo</surname> <given-names>A</given-names></name>
<name><surname>Marques</surname> <given-names>C</given-names></name>
<name><surname>Corte-Real</surname> <given-names>M</given-names></name>
<name><surname>Carolino</surname> <given-names>E</given-names></name>
<name><surname>Caetano</surname> <given-names>M</given-names></name>
</person-group>. 
<article-title>Assessing the quality and reliability of ChatGPT&#x2019;s responses to radiotherapy-related patient queries: comparative study with GPT-3.5 and GPT-4</article-title>. <source>JMIR Cancer</source>. (<year>2025</year>) <volume>11</volume>:<fpage>e63677</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.2196/63677</pub-id>, PMID: <pub-id pub-id-type="pmid">40239208</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2886848">Hong Sun</ext-link>, Jiaxing University, China</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2570577">Huseyin Canbolat</ext-link>, Ankara Y&#x131;ld&#x131;r&#x131;m Beyaz&#x131;t University, T&#xfc;rkiye</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2975661">Yasodha Krishna Janapati</ext-link>, United States International University - Africa, Kenya</p></fn>
</fn-group>
</back>
</article>