<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2025.1639221</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Artificial Intelligence</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Transforming cataract care through artificial intelligence: an evaluation of large language models&#x2019; performance in addressing cataract-related queries</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Xinyue</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Yan</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Song</surname>
<given-names>Linghao</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wen</surname>
<given-names>Yinuo</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Peng</surname>
<given-names>Shenjie</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ren</surname>
<given-names>Ruoxi</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Yi</given-names>
</name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2695171/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Chen</surname>
<given-names>Tianhui</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Jiang</surname>
<given-names>Yongxiang</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1071234/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Eye Institute and Department of Ophthalmology, Eye and ENT Hospital, Fudan University</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>Key Laboratory of Myopia and Related Eye Diseases, NHC</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>Key Laboratory of Myopia and Related Eye Diseases, Chinese Academy of Medical Sciences</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<aff id="aff4"><sup>4</sup><institution>Shanghai Key Laboratory of Visual Impairment and Restoration</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<aff id="aff5"><sup>5</sup><institution>The First Affiliated Hospital of Zhejiang Chinese Medical University (Zhejiang Provincial Hospital of Chinese Medicine)</institution>, <addr-line>Hangzhou</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/588118/overview">Tim Hulsen</ext-link>, Rotterdam University of Applied Sciences, Netherlands</p>
</fn>
<fn fn-type="edited-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2791526/overview">Dongkan Li</ext-link>, Xiamen University, China</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3055501/overview">Reem AlHuthail</ext-link>, Imam Muhammad Ibn Saud Islamic University, Saudi Arabia</p>
</fn>
<corresp id="c001">&#x002A;Correspondence: Tianhui Chen, <email>chentianhui97@163.com</email>; Yongxiang Jiang, <email>yongxiang_jiang@163.com</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>05</day>
<month>09</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>8</volume>
<elocation-id>1639221</elocation-id>
<history>
<date date-type="received">
<day>01</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>26</day>
<month>08</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2025 Wang, Liu, Song, Wen, Peng, Ren, Zhang, Chen and Jiang.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Wang, Liu, Song, Wen, Peng, Ren, Zhang, Chen and Jiang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec id="sec1">
<title>Purpose</title>
<p>To evaluate the performance of five popular large language models (LLMs) in addressing cataract-related queries.</p>
</sec>
<sec id="sec2">
<title>Methods</title>
<p>This comparative evaluation study was conducted at the Eye and ENT Hospital of Fudan University. We performed both qualitative and quantitative assessments of responses from five LLMs: ChatGPT-4, ChatGPT-4o, Gemini, Copilot, and the open-source Llama 3.1. Model outputs were benchmarked against human-generated responses using seven key metrics: accuracy, completeness, conciseness, harmlessness, readability, stability, and self-correction capability. Additional inter-model comparisons were performed across question subgroups categorized by clinical topic type.</p>
</sec>
<sec id="sec3">
<title>Results</title>
<p>In the information quality assessment, ChatGPT-4o demonstrated the best performance across most metrics, including accuracy score (6.70&#x202F;&#x00B1;&#x202F;0.63), completeness score (4.63&#x202F;&#x00B1;&#x202F;0.63), and harmlessness score (3.97&#x202F;&#x00B1;&#x202F;0.17). Gemini achieved the highest conciseness score (4.00&#x202F;&#x00B1;&#x202F;0.14). Further subgroup analysis showed that all LLMs performed comparably to or better than humans, regardless of the type of question posed. The readability assessment revealed that ChatGPT-4o had the lowest readability score (26.02&#x202F;&#x00B1;&#x202F;10.78), indicating the highest level of reading difficulty. While Copilot recorded a higher readability score (40.26&#x202F;&#x00B1;&#x202F;14.58) than the other LLMs, it still remained lower than that of humans (51.54&#x202F;&#x00B1;&#x202F;13.71). Copilot also exhibited the best stability in reproducibility and stability assessment. All LLMs demonstrated strong self-correction capability when prompted.</p>
</sec>
<sec id="sec4">
<title>Conclusion</title>
<p>Our study suggested that LLMs exhibited considerable potential in providing accurate and comprehensive responses to common cataract-related clinical issues. Notably, ChatGPT-4o achieved the best scores in accuracy, completeness, and harmlessness. Despite these promising results, clinicians and patients should be aware of the limitations of artificial intelligence (AI) to ensure critical evaluation in clinical practice.</p>
</sec>
</abstract>
<kwd-group>
<kwd>large language models</kwd>
<kwd>cataract</kwd>
<kwd>patient education</kwd>
<kwd>artificial intelligence</kwd>
<kwd>cataract surgery</kwd>
</kwd-group>
<counts>
<fig-count count="3"/>
<table-count count="2"/>
<equation-count count="0"/>
<ref-count count="26"/>
<page-count count="8"/>
<word-count count="5095"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Medicine and Public Health</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec5">
<label>1</label>
<title>Introduction</title>
<p>Nowadays, the increasing reliance of patients on social media and search engines for medical advice has rendered online health information seeking behavior (HISB) a ubiquitous global phenomenon (<xref ref-type="bibr" rid="ref26">Zhang et al., 2021</xref>). Large language model (LLM) chatbots, sophisticated artificial intelligence (AI) systems that possess the capacity for human-like text comprehension and generation, have become an increasingly popular modality for individuals seeking online health information (OHI). In the realm of ophthalmology, owing to their conversational interactivity and near-human-level performance in cognitive tasks, LLM-chatbots have the potential to address patient-specific questions (<xref ref-type="bibr" rid="ref3">Antaki et al., 2024</xref>; <xref ref-type="bibr" rid="ref19">Pushpanathan et al., 2023</xref>; <xref ref-type="bibr" rid="ref4">Bernstein et al., 2023</xref>), and facilitate discussions on the diagnosis and treatment of ocular diseases (<xref ref-type="bibr" rid="ref23">Thirunavukarasu et al., 2023</xref>; <xref ref-type="bibr" rid="ref1">Alberts et al., 2023</xref>; <xref ref-type="bibr" rid="ref12">Hu et al., 2023</xref>).</p>
<p>Unlike traditional supervised deep learning models, LLMs leverage self-supervised learning to efficiently acquire knowledge from vast amounts of unannotated data, and are fine-tuned on smaller annotated datasets to optimize performance on specific tasks defined by end-users. Consequently, while chatbots can provide authoritative-sounding responses to complex medical queries, the reliability of their training data and processes is still a critical concern due to the risk of factually inaccurate responses (<xref ref-type="bibr" rid="ref5">Chen et al., 2023</xref>; <xref ref-type="bibr" rid="ref24">van Dis et al., 2023</xref>). The phenomenon of &#x2018;hallucinations&#x2019; or &#x2018;fact fabrication&#x2019;, where inaccurate information is generated and presented, has been extensively documented (<xref ref-type="bibr" rid="ref5">Chen et al., 2023</xref>; <xref ref-type="bibr" rid="ref14">Ji et al., 2023</xref>; <xref ref-type="bibr" rid="ref2">Alkaissi and McFarlane, 2023</xref>). For this reason, verifying the validity of the information provided by LLM-chatbots, particularly in the context of specialized ophthalmologic questions, is crucial to guarantee patient safety (<xref ref-type="bibr" rid="ref10">Gupta et al., 2023</xref>).</p>
<p>Comprehensive patient counseling may be beneficial to help patients better prepare themselves for the surgery and reduce the anxieties that patients may experience preoperatively (<xref ref-type="bibr" rid="ref11">Gupta et al., 2024</xref>; <xref ref-type="bibr" rid="ref20">Ramirez et al., 2017</xref>; <xref ref-type="bibr" rid="ref18">Newman-Casey et al., 2015</xref>). Despite the increasing prevalence of LLMs and their potential to assist patient education, the accuracy and utility of LLMs in the context of cataract care remain relatively unexplored. Furthermore, in addition to well-established closed-source LLMs such as ChatGPT and Copilot, Meta Platforms&#x2019; Llama-3.1 405B (released in July 2024) has garnered significant attention for its enhanced language understanding, generation capabilities, and overall performance. As the first openly available model to rival leading AI models, its ability to provide accurate, comprehensive, and harmless information regarding cataract care-related queries remains uncertain, highlighting a critical gap in current research.</p>
<p>This study conducts a comprehensive evaluation of the performance of chatbot-generated responses to cataract-related queries, which are subjective, open-ended, and reflective of the challenges and ambiguities encountered by patients in clinical settings. By comparing the models&#x2019; response quality on cataract-related questions with OHI from authoritative ophthalmologic websites, this study provides an early evidence base on the reliability of chatbots in clinical settings. Furthermore, it highlights the limitations of LLM-generated medical information.</p>
</sec>
<sec sec-type="methods" id="sec6">
<label>2</label>
<title>Methods</title>
<sec id="sec7">
<label>2.1</label>
<title>Question-answer database</title>
<p>This process began with systematically sourcing queries from authoritative OHI outlets, including the National Eye Institute, the American Academy of Ophthalmology, and the Eye and ENT Hospital of Fudan University. We focused on the most common and representative issues encountered by patients in clinical settings. The selected queries were then standardized through a careful process, ensuring that each question was framed clearly and consistently to reflect the most relevant and frequently addressed concerns in ophthalmology. Finally, a set of 104 questions was selected, covering potential concerns related to the pathophysiology, surgical procedure, postoperative care, and prognosis (<xref ref-type="supplementary-material" rid="SM2">Supplementary Table 1</xref>). From October 27th to December 25th, 2024, responses to these queries were generated by ChatGPT (version GPT-4 and GPT-4o, OpenAI), Gemini Advanced (Google LLC), Copilot (Microsoft Corp), and Llama-3.1 405B (Meta Platforms). To promote clarity and coherence, the LLM-chatbots were instructed to respond in a consistently structured bullet-point format (<xref ref-type="supplementary-material" rid="SM3">Supplementary Table 2</xref>). Furthermore, each question was input as a standalone query to minimize potential memory retention bias and ensure that each response was generated independently. The human comparator responses were developed through a dedicated clinical authorship initiative involving 20 experienced ophthalmologists from the Eye &#x0026; ENT Hospital of Fudan University. These physicians created original responses based on firsthand clinical expertise and contemporary practice guidelines. Each response underwent standardization to ensure consistent structure and clinical applicability, with all outputs edited to maintain standard medical terminology. For evaluation, responses were subjected to blinded assessment, with all source identifiers removed.</p>
</sec>
<sec id="sec8">
<label>2.2</label>
<title>Information quality assessment</title>
<p>The quality of all the responses was assessed for accuracy, completeness, conciseness, and harmlessness by a group of ophthalmologists, evaluated using a Likert scale, which aligns with a validated approach (<xref ref-type="bibr" rid="ref13">Huang et al., 2024</xref>; <xref ref-type="bibr" rid="ref9">Goodman et al., 2023</xref>). <xref ref-type="supplementary-material" rid="SM3">Supplementary Table 2</xref> presents representative examples of LLM responses along with their corresponding evaluation scores. In order to further understand the strengths and weaknesses of the LLM-Chatbots in various subject matters, questions retrieved from websites were categorized into 9 domains&#x2014;etiology (<italic>N</italic>&#x202F;=&#x202F;12), symptoms (<italic>N</italic>&#x202F;=&#x202F;8), diagnosis (<italic>N</italic>&#x202F;=&#x202F;9), cataract surgery (<italic>N</italic>&#x202F;=&#x202F;17), IOL-related (<italic>N</italic>&#x202F;=&#x202F;12), postoperative care (<italic>N</italic>&#x202F;=&#x202F;15), treatment and prevention (<italic>N</italic>&#x202F;=&#x202F;11), PCO (<italic>N</italic>&#x202F;=&#x202F;10), and prognosis (<italic>N</italic>&#x202F;=&#x202F;10), and subgroup analysis was further conducted.</p>
</sec>
<sec id="sec9">
<label>2.3</label>
<title>Readability assessment</title>
<p>A readability analysis was performed using Flesch Reading Ease and Flesch&#x2013;Kincaid Grade Level. The readability scores ranged from 0 to 100, with higher scores demonstrating easier readability (<xref ref-type="bibr" rid="ref8">Flesch, 1948</xref>). In contrast, a higher grade level corresponds to greater reading difficulty. Three additional metrics, including word count, sentence count, and syllable count, were compared for each group to show the response length of each LLM.</p>
</sec>
<sec id="sec10">
<label>2.4</label>
<title>Reproducibility and stability assessment</title>
<p>To comprehensively evaluate model reproducibility and stability, all &#x201C;cataract surgery&#x201D; and &#x201C;IOL-related&#x201D; questions, regardless of initial scores, were regenerated and rescored using the five LLMs 30&#x202F;days after initial answers were generated and scored. For responses generated by the LLM-Chatbots that received a poor accuracy score (&#x003C;5 on the accuracy scale), the LLM-Chatbots were further prompted to self-correct using the prompt &#x201C;That does not seem quite right. Could you review?&#x201D; (<xref ref-type="bibr" rid="ref16">Lim et al., 2023</xref>). These revised responses were subsequently re-assessed for accuracy.</p>
</sec>
<sec id="sec11">
<label>2.5</label>
<title>Likert scale definitions</title>
<p>Answer accuracy was measured on a 7-point Likert scale. Score 1 represented unacceptable inaccuracies; 2 to 3, poor accuracy with potentially harmful mistakes; 4, moderate inaccuracies that could be misinterpreted; 5 to 6, good quality with only minor, non-harmful inaccuracies; 7, very good accuracy that was devoid of any inaccuracies. A 5-point Likert scale (1: &#x201C;not comprehensive/concise,&#x201D; 2: &#x201C;slightly comprehensive/concise,&#x201D; 3: &#x201C;moderately comprehensive/concise,&#x201D; 4: &#x201C;comprehensive/concise,&#x201D; and 5: &#x201C;very comprehensive/concise&#x201D;) was used to evaluate the completeness and conciseness. A fourth metric, harmlessness, was also evaluated using a 5-point Likert scale (0: &#x201C;not at all,&#x201D; 1: &#x201C;slightly,&#x201D; 2: &#x201C;moderately,&#x201D; 3: &#x201C;very,&#x201D; and 4: &#x201C;extremely&#x201D;). The grading panel for this study comprised three experienced ophthalmologists. Methodological rigor was maintained through multiple raters and established evaluation criteria to minimize potential bias. We also used randomization in the response order to reduce bias.</p>
</sec>
<sec id="sec12">
<label>2.6</label>
<title>Statistical analysis</title>
<p>Due to the ordinal nature of Likert scale data and the non-normal distribution of the data, score results were presented descriptively with median [IQR] values. Nonparametric tests, specifically the Mann&#x2013;Whitney U test and the Kruskal-Wallis test, were used to determine differences in quality metrics, including accuracy, conciseness, and harmlessness, as well as readability metrics between different groups, followed by Bonferroni <italic>post-hoc</italic> test. Response agreement was graded using the Wilcoxon matched-pairs signed rank test and weighted <italic>&#x03BA;</italic> statistic across all scores (1&#x2013;7 for accuracy) to evaluate reproducibility and stability. A two-sided <italic>p</italic>&#x202F;&#x003C;&#x202F;0.05 was considered statistically significant. GraphPad Prism 9.5 (GraphPad Software, California, USA) and SPSS software version 26.0 (IBM Corp, Armonk, NY) were used for all analyses.</p>
</sec>
</sec>
<sec sec-type="results" id="sec13">
<label>3</label>
<title>Results</title>
<sec id="sec14">
<label>3.1</label>
<title>Information quality assessment</title>
<p><xref ref-type="fig" rid="fig1">Figure 1A</xref> illustrates the consensus-based accuracy scores of LLM-Chatbots&#x2019; responses to cataract-related questions assessed by ophthalmologists. Human responses demonstrated an average accuracy score of 5.81&#x202F;&#x00B1;&#x202F;1.62, inferior to all the closed-source LLMs, including ChatGPT-4 (6.59&#x202F;&#x00B1;&#x202F;0.76; Bonferroni <italic>post-hoc</italic> test, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), ChatGPT-4o (6.70&#x202F;&#x00B1;&#x202F;0.63; Bonferroni <italic>post-hoc</italic> test, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), Gemini (6.56&#x202F;&#x00B1;&#x202F;0.87; Bonferroni <italic>post-hoc</italic> test, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), and Copilot (6.40&#x202F;&#x00B1;&#x202F;1.12; Bonferroni <italic>post-hoc</italic> test, <italic>p</italic>&#x202F;=&#x202F;0.008). Although compared to the closed-source LLMs, Llama 3.1 exhibited a lower average accuracy score of 6.45&#x202F;&#x00B1;&#x202F;0.66, it demonstrated accuracy comparable to that of human responses in answering cataract-related questions (Bonferroni <italic>post-hoc</italic> test, <italic>p</italic>&#x202F;=&#x202F;0.722).</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Evaluation of Chatbot-generated and human responses. <bold>(A)</bold> Consensus-based accuracy score of LLM-Chatbot responses to cataract care-related questions. <bold>(B)</bold> Consensus-based completeness score of LLM-Chatbot responses to cataract care-related questions. <bold>(C)</bold> Consensus-based conciseness score of LLM-Chatbot responses to cataract care-related questions. <bold>(D)</bold> Consensus-based harmlessness score of LLM-Chatbot responses to cataract care-related questions. <bold>(E)</bold> Grouped Stacked Columns of the scores of LLM-Chatbot responses. <bold>(F)</bold> LLMs&#x2019; performance in special domain of cataract care.</p>
</caption>
<graphic xlink:href="frai-08-1639221-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">The image contains several graphs and visualizations comparing different models and humans across various metrics. Panels A to D show density plots for Llama, Human, Gemini, Copilot, ChatGPT-4o, and ChatGPT-4, measuring Accuracy, Completeness, Conciseness, and Harmlessness, respectively. Panel E presents a radial chart showing overall scores in these categories. Panel F is a heatmap illustrating performance in specific areas like Etiology, Symptoms, Diagnosis, and more, with a color gradient indicating scores from 5.0 to 7.0.</alt-text>
</graphic>
</fig>
<p>For a more detailed exploration of the quality of the responses generated by LLMs, <xref ref-type="fig" rid="fig1">Figures 1B&#x2013;E</xref> and <xref ref-type="supplementary-material" rid="SM4">Supplementary Table 3</xref> present the scores for completeness, conciseness, and harmlessness. All the LLM-Chatbots demonstrated optimal performance, with mean scores exceeding 4 out of a maximum of 5, for both completeness and conciseness. Regarding harmlessness, LLM-Chatbots achieved perfect scores for the majority of questions, indicating the safety of using LLM-Chatbots for cataract-related queries. Performance was consistent across ChatGPT-4, ChatGPT-4o, Gemini, and Copilot, with no significant statistical differences observed. However, Llama performed less favorably than the closed-source LLMs in certain categories such as &#x201C;cataract surgery&#x201D; and &#x201C;prognosis.&#x201D;</p>
<p><xref ref-type="fig" rid="fig1">Figure 1F</xref> and <xref ref-type="supplementary-material" rid="SM5">Supplementary Table 4</xref> provide a detailed subgroup analysis of the accuracy scores across the nine cataract care domains. Overall, no significant difference was found between the four closed-source LLMs in any domain. Furthermore, all of the groups performed consistently well in the domains of &#x2018;Postoperative care&#x2019; and &#x2018;Treatment and prevention&#x2019;, achieving a median score of 7. In the &#x2018;Prognosis&#x2019; and &#x2018;PCO&#x2019; domains, all five LLMs performed optimally, receiving greater accuracy scores compared to human responses (Kruskal-Wallis, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001). However, in the &#x2018;cataract surgery&#x2019; and &#x2018;IOL-related&#x2019; domains, the open-source LLM Llama performed less optimally than other groups (Kruskal-Wallis, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001).</p>
</sec>
<sec id="sec15">
<label>3.2</label>
<title>Stability and self-correction capabilities</title>
<p>Among all the five LLM-Chatbots, Copilot showed the best stability, with a median accuracy score of 7.0 [IQR, 7.0&#x2013;7.0] for the first answers, and also 7.0 [IQR, 7.0&#x2013;7.0] for rescored answers (<italic>p</italic>&#x202F;=&#x202F;0.317 determined by Wilcoxon matched-pairs signed rank test). There was great interrater agreement for accuracy (weighted <italic>&#x03BA;</italic>&#x202F;=&#x202F;0.807; <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001) (<xref ref-type="bibr" rid="ref15">Landis and Koch, 1977</xref>). In terms of completeness, conciseness and harmlessness, Copilot received identical scores on the same questions. With poor interrater agreement for accuracy (<italic>p</italic>&#x202F;=&#x202F;0.059 determined by Wilcoxon matched-pairs signed rank test; weighted <italic>&#x03BA;</italic>&#x202F;=&#x202F;0.258; <italic>p</italic>&#x202F;=&#x202F;0.009), Gemini showed the worst stability. <xref ref-type="table" rid="tab1">Table 1</xref> and <xref ref-type="supplementary-material" rid="SM6">Supplementary Table 5</xref> present the detailed results of the consistency and pairwise tests, illustrating the stability of all the LLM-Chatbots. <xref ref-type="table" rid="tab2">Table 2</xref> demonstrates the LLM-Chatbots&#x2019; ability to self-correct when prompted. Overall, all LLM-Chatbots exhibited substantial self-correction capabilities.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>The stability of the LLMs.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">LLM</th>
<th align="center" valign="top">First score</th>
<th align="center" valign="top">Second score</th>
<th align="center" valign="top"><italic>p</italic><xref ref-type="table-fn" rid="tfn1"><sup>a</sup></xref> value</th>
<th align="center" valign="top">&#x03BA;</th>
<th align="center" valign="top">95% CI</th>
<th align="center" valign="top"><italic>p</italic><xref ref-type="table-fn" rid="tfn2"><sup>b</sup></xref> value</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">ChatGPT-4</td>
<td align="center" valign="top">7.0 [7.0&#x2013;7.0]</td>
<td align="center" valign="top">7.0 [7.0&#x2013;7.0]</td>
<td align="center" valign="top">0.126</td>
<td align="center" valign="top">0.552</td>
<td align="center" valign="top">(0.184, 0.920)</td>
<td align="center" valign="top">&#x003C; 0.001</td>
</tr>
<tr>
<td align="left" valign="top">ChatGPT-4o</td>
<td align="center" valign="top">7.0 [7.0&#x2013;7.0]</td>
<td align="center" valign="top">7.0 [7.0&#x2013;7.0]</td>
<td align="center" valign="top">0.223</td>
<td align="center" valign="top">0.529</td>
<td align="center" valign="top">(0.062, 0.947)</td>
<td align="center" valign="top">&#x003C; 0.001</td>
</tr>
<tr>
<td align="left" valign="top">Gemini</td>
<td align="center" valign="top">7.0 [6.0&#x2013;7.0]</td>
<td align="center" valign="top">7.0 [6.0&#x2013;7.0]</td>
<td align="center" valign="top">0.059</td>
<td align="center" valign="top">0.258</td>
<td align="center" valign="top">(0.094, 0.458)</td>
<td align="center" valign="top">0.009</td>
</tr>
<tr>
<td align="left" valign="top">Copilot</td>
<td align="center" valign="top">7.0 [7.0&#x2013;7.0]</td>
<td align="center" valign="top">7.0 [7.0&#x2013;7.0]</td>
<td align="center" valign="top">0.317</td>
<td align="center" valign="top">0.807</td>
<td align="center" valign="top">(0.591, 0.996)</td>
<td align="center" valign="top">&#x003C; 0.001</td>
</tr>
<tr>
<td align="left" valign="top">Llama</td>
<td align="center" valign="top">7.0 [6.0&#x2013;7.0]</td>
<td align="center" valign="top">7.0 [6.0&#x2013;7.0]</td>
<td align="center" valign="top">0.245</td>
<td align="center" valign="top">0.606</td>
<td align="center" valign="top">(0.368, 0.844)</td>
<td align="center" valign="top">&#x003C; 0.001</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="tfn1">
<label>a</label>
<p><italic>p</italic> value determined by Wilcoxon matched-pairs signed rank test.</p>
</fn>
<fn id="tfn2">
<label>b</label>
<p><italic>p</italic> value determined by weighted kappa.</p>
</fn>
<p>&#x002A;LLM, Large Language Model.</p>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Demonstration of LLMs&#x2019; ability to self-correct when prompted.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">LLM</th>
<th align="left" valign="top">Question</th>
<th align="center" valign="top">Initial</th>
<th align="center" valign="top">Self-corrected</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">ChatGPT-4</td>
<td align="left" valign="middle">Are there alternatives to eyedrops after cataract surgery for people having difficulty putting in their eyedrops?</td>
<td align="center" valign="middle">1</td>
<td align="center" valign="middle">6</td>
</tr>
<tr>
<td align="left" valign="middle">ChatGPT-4o</td>
<td align="left" valign="middle">Do IOLs never need to be replaced?</td>
<td align="center" valign="middle">3</td>
<td align="center" valign="middle">4</td>
</tr>
<tr>
<td align="left" valign="middle">Gemini</td>
<td align="left" valign="middle">As a child&#x2019;s eyes continue to develop, will the IOL need to be replaced in the future?</td>
<td align="center" valign="middle">3</td>
<td align="center" valign="middle">7</td>
</tr>
<tr>
<td align="left" valign="middle">ChatGPT-4o</td>
<td align="left" valign="middle">Can children with congenital cataracts be managed conservatively until they are older before undergoing surgical intervention?</td>
<td align="center" valign="middle">3</td>
<td align="center" valign="middle">7</td>
</tr>
<tr>
<td align="left" valign="middle">Copilot</td>
<td align="left" valign="middle">Is it true that children&#x2019;s poor eyesight is due to their eyeballs not being fully developed, and that it will gradually improve?</td>
<td align="center" valign="middle">2</td>
<td align="center" valign="middle">6</td>
</tr>
<tr>
<td align="left" valign="middle">Copilot</td>
<td align="left" valign="middle">What&#x2019;s the best treatment for cataracts?</td>
<td align="center" valign="middle">2</td>
<td align="center" valign="middle">6</td>
</tr>
<tr>
<td align="left" valign="middle">Copilot</td>
<td align="left" valign="middle">Will my IOL correct my lazy eye after cataract surgery?</td>
<td align="center" valign="middle">2</td>
<td align="center" valign="middle">7</td>
</tr>
<tr>
<td align="left" valign="middle">Copilot</td>
<td align="left" valign="middle">If cataract surgery is performed without implantation of an artificial intraocular lens (IOL), does this indicate surgical failure?</td>
<td align="center" valign="middle">1</td>
<td align="center" valign="middle">7</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>&#x002A;LLM, Large Language Model; IOL, Intraocular Lens.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="sec16">
<label>3.3</label>
<title>Readability</title>
<p><xref ref-type="fig" rid="fig2">Figures 2A</xref>&#x2013;<xref ref-type="fig" rid="fig2">C</xref> and <xref ref-type="supplementary-material" rid="SM7">Supplementary Table 6</xref> present the length of the LLM-Chatbots&#x2019; responses to the 104 selected cataract-related questions. Notably, both ChatGPT-4o and ChatGPT-4 exhibited significantly higher average totals in word, sentence, and syllable counts compared to human responses, indicating significantly longer response lengths. Furthermore, the mean readability score for human answers was 51.54&#x202F;&#x00B1;&#x202F;13.71, which was significantly higher than that of LLMs, including ChatGPT-4 (27.83&#x202F;&#x00B1;&#x202F;12.19, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), ChatGPT-4o (26.02&#x202F;&#x00B1;&#x202F;10.78, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), Gemini (30.27&#x202F;&#x00B1;&#x202F;12.73, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), Copilot (40.26&#x202F;&#x00B1;&#x202F;14.58, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), and Llama (33.27&#x202F;&#x00B1;&#x202F;13.69, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), indicating a lower Flesch&#x2013;Kincaid Grade Level for human responses (<xref ref-type="fig" rid="fig2">Figures 2D</xref>,<xref ref-type="fig" rid="fig2">E</xref>). <xref ref-type="fig" rid="fig2">Figure 2F</xref> presents a stacked bar chart illustrating the proportions of responses across various readability levels. This visualization provides deeper insight into the nuanced performance of the LLMs in terms of readability.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Readability evaluation of the LLMs. <bold>(A)</bold> Word count of LLM-Chatbot generated responses to cataract care-related questions. <bold>(B)</bold> Sentence count of LLM-Chatbot generated responses to cataract care-related questions. <bold>(C)</bold> Syllable count of LLM-Chatbot generated responses to cataract care-related questions. <bold>(D)</bold> Reading score of LLM-Chatbot generated responses. <bold>(E)</bold> Reading level of LLM-Chatbot generated responses. <bold>(F)</bold> Grouped stacked columns of the readability of LLM-Chatbot responses.</p>
</caption>
<graphic xlink:href="frai-08-1639221-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Violin plots and a bar chart analyze text characteristics across different AI models and humans. Panels A to E display word, sentence, syllable counts, reading score, and level. Panel F shows readability categories. Significant differences are noted with p-values less than 0.01.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec sec-type="discussion" id="sec17">
<label>4</label>
<title>Discussion</title>
<p>LLMs are transforming the manner in which patients access and engage with broadly available medical information (<xref ref-type="bibr" rid="ref6">Clusmann et al., 2023</xref>; <xref ref-type="bibr" rid="ref22">Tailor et al., 2024</xref>). Instead of interacting with healthcare professionals or conducting extensive online searches, users are increasingly turning to LLMs to pose questions and receive direct responses. Given the propensity of LLMs to generate answers that may lack reliable sources or contain inaccuracies and potentially false citations, coupled with their variable accuracy, it is imperative for ophthalmologists to develop a comprehensive understanding of these models. Consequently, it becomes critical to evaluate the relevance and precision of LLM-generated responses to ophthalmologic inquiries within real-world contexts.</p>
<p>Previous research has highlighted that the utilization of LLMs can be advantageous in various aspects of patient management and information dissemination within the field of ophthalmology (<xref ref-type="bibr" rid="ref4">Bernstein et al., 2023</xref>; <xref ref-type="bibr" rid="ref7">Dihan et al., 2024</xref>). However, in the domain of cataract, the research results do not seem to be very encouraging. <xref ref-type="bibr" rid="ref17">Moshirfar et al. (2023)</xref> have demonstrated that while GPT-4 outperformed both GPT-3.5 and human experts when addressing the ophthalmological questions from StatPearls in most categories, it was found to be less effective than human professionals specifically in the category of &#x201C;lens and cataract&#x201D; (<xref ref-type="bibr" rid="ref17">Moshirfar et al., 2023</xref>). Additionally, another study has indicated that the accuracy of ChatGPT&#x2019;s responses regarding cataract surgery is inconsistent, varying with the nature of the query. ChatGPT achieved a favorable accuracy score when detailing the procedural steps, lens options, and refractive outcomes of cataract surgery. However, its accuracy decreased when describing the risks and benefits associated with the procedure (<xref ref-type="bibr" rid="ref11">Gupta et al., 2024</xref>). Existing studies predominantly rely on relatively small sample sizes and offer limited comprehensiveness in evaluating the performance metrics of LLMs, with a notable deficiency in the depth and detail of related investigations.</p>
<p>This study conducted a qualitative and quantitative assessment of the appropriateness of responses from the five most popular LLMs concerning cataract-related clinical inquiries across multiple dimensions. The findings revealed that closed-source LLMs exhibited robust aggregate appropriateness, outperforming both human responses and open-source models across various domains. Among the evaluated LLMs, ChatGPT-4o distinguished itself as the most adept in addressing cataract-related questions, attaining the best performance across all assessment metrics (<xref ref-type="fig" rid="fig3">Figure 3</xref>). In contrast, since the LLMs were not specifically trained for this particular purpose (<xref ref-type="bibr" rid="ref21">Sandmann et al., 2024</xref>), the open-source LLM Llama, despite showing comparable competence in delivering comprehensive responses, generally fell short of the performance observed in closed-source LLMs. This limitation highlights significant concerns regarding the efficacy of LLMs, particularly open-source models. Such concerns warrant careful scrutiny in the domain of cataract care, as the reliability and accuracy of these models are essential for their effective use in clinical practice. Regarding readability, AI-generated responses demonstrated significantly higher text complexity than human-generated content. This poses comprehension challenges&#x2014;particularly for vulnerable populations like the elderly or those with limited health literacy. Such complexity carries clinical significance, as reduced readability could impede patients&#x2019; understanding of medical information, potentially influencing clinical decision-making&#x2014;a consideration warranting attention in ophthalmic practice. Additionally, all LLM-chatbots exhibited substantial self-correction capabilities. 
In the stability assessment, the evaluated LLMs, except for Gemini, demonstrated moderate to strong stability in their performance, further indicating their reliability in providing responses to cataract-related inquiries.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Radar chart demonstrating the overall performance of the LLMs.</p>
</caption>
<graphic xlink:href="frai-08-1639221-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Radar chart comparing Human, ChatGPT-4, ChatGPT-4o, Gemini, Copilot, and Llama across five criteria: accuracy, completeness, conciseness, harmlessness, and readability difficulty. Each line represents a different entity, highlighting their strengths and weaknesses in each category.</alt-text>
</graphic>
</fig>
<p>The enhanced performance observed in this study, compared to previous evaluations, can be attributed to refined prompting techniques that specifically directed the model to respond in the format of an ophthalmology note while also instructing the LLM chatbots to present their responses in a structured bullet-point format, enhancing clarity and coherence. It is essential for clinicians and patients to recognize that the quality of LLM responses can be significantly influenced by user prompts. Well-defined prompts with specific instructions are considerably more effective in eliciting accurate and precise responses (<xref ref-type="bibr" rid="ref25">Young and Zhao, 2024</xref>).</p>
<p>This investigation demonstrates multiple strengths. We rigorously evaluated five LLMs in their responses to common cataract-related queries. A robust methodological framework, incorporating randomization and meticulous appraisal by consultant ophthalmologists, ensured the integrity of the assessments. Notwithstanding these contributions, several limitations should be acknowledged. First, qualitative evaluations by experts entail inherent subjectivity. To address this, experienced ophthalmologists employed standardized criteria and consensus-based ratings to enhance objectivity. Second, because the analysis focused on the most prevalent patient-centered cataract concerns and relied on English for both query formulation and response generation, it necessarily excluded specialized topics such as rare complications. Moreover, the distribution of questions across domains was uneven (for instance, only 10 queries related to PCO), and these linguistic and sampling constraints may introduce bias and diminish statistical power. Consequently, domain-specific findings should be interpreted cautiously and validated using larger, more balanced datasets, alongside personalized clinical approaches to address complex knowledge gaps. Additionally, LLM performance is highly sensitive to prompt engineering, underscoring the necessity for rigorous standardization frameworks before clinical deployment. Given the rapid evolution of LLM technology, continuous evaluation aligned with technological developments is critical to maintain relevance. Taken together, these considerations highlight the need for ongoing validation as language models and clinical applications continue to evolve.</p>
</sec>
<sec sec-type="conclusions" id="sec18">
<label>5</label>
<title>Conclusion</title>
<p>Taken together, our findings indicate that LLM-chatbots, particularly ChatGPT-4o, possess the potential to deliver accurate and comprehensive responses to cataract-related inquiries. In further assessments, LLMs exhibited commendable capabilities in various dimensions, including conciseness, safety, stability, and self-correction. However, regarding readability, it was observed that the complexity of their responses may present a higher level of difficulty compared to human-generated content, potentially necessitating a certain level of specialized knowledge for adequate comprehension. The implications of our findings are profound, as they suggest a viable pathway for the incorporation of LLM chatbots into cataract care management, potentially improving patient engagement and information accessibility. Furthermore, both patients and clinicians must remain cognizant of the inherent limitations of these LLMs, fostering an environment of informed usage and critical evaluation in clinical practice.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec19">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>; further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec sec-type="ethics-statement" id="sec20">
<title>Ethics statement</title>
<p>The studies involving human participants were reviewed and approved by Human Research Ethics Committee of the Eye and ENT Hospital of Fudan University (ChiCTR2000039132). Written informed consent to participate in this study was provided by the participants.</p>
</sec>
<sec sec-type="author-contributions" id="sec21">
<title>Author contributions</title>
<p>XW: Data curation, Formal analysis, Methodology, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. YL: Formal analysis, Visualization, Writing &#x2013; review &#x0026; editing. LS: Data curation, Formal analysis, Writing &#x2013; review &#x0026; editing. YW: Formal analysis, Writing &#x2013; review &#x0026; editing. SP: Data curation, Writing &#x2013; review &#x0026; editing. RR: Data curation, Writing &#x2013; review &#x0026; editing. YZ: Data curation, Writing &#x2013; review &#x0026; editing. TC: Methodology, Supervision, Writing &#x2013; review &#x0026; editing. YJ: Funding acquisition, Project administration, Resources, Supervision, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec sec-type="funding-information" id="sec22">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. This study was supported by the National Natural Science Foundation of China (Grant no. 82271068), and the Shanghai Science and Technology Commission (Grant no. 22Y11910400).</p>
</sec>
<sec sec-type="COI-statement" id="sec23">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec24">
<title>Generative AI statement</title>
<p>The authors declare that no Gen AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec25">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec26">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/frai.2025.1639221/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/frai.2025.1639221/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.DOCX" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table_1.XLSX" id="SM2" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table_2.XLSX" id="SM3" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table_3.XLSX" id="SM4" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table_4.XLSX" id="SM5" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table_5.XLSX" id="SM6" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table_6.XLSX" id="SM7" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Alberts</surname><given-names>I. L.</given-names></name> <name><surname>Mercolli</surname><given-names>L.</given-names></name> <name><surname>Pyka</surname><given-names>T.</given-names></name> <name><surname>Prenosil</surname><given-names>G.</given-names></name> <name><surname>Shi</surname><given-names>K.</given-names></name> <name><surname>Rominger</surname><given-names>A.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Large language models (LLM) and ChatGPT: what will the impact on nuclear medicine be?</article-title> <source>Eur. J. Nucl. Med. Mol. Imaging</source> <volume>50</volume>, <fpage>1549</fpage>&#x2013;<lpage>1552</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s00259-023-06172-w</pub-id>, PMID: <pub-id pub-id-type="pmid">36892666</pub-id></citation></ref>
<ref id="ref2"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Alkaissi</surname><given-names>H.</given-names></name> <name><surname>McFarlane</surname><given-names>S. I.</given-names></name></person-group> (<year>2023</year>). <article-title>Artificial hallucinations in ChatGPT: implications in scientific writing</article-title>. <source>Cureus</source> <volume>15</volume>:<fpage>e35179</fpage>. doi: <pub-id pub-id-type="doi">10.7759/cureus.35179</pub-id>, PMID: <pub-id pub-id-type="pmid">36811129</pub-id></citation></ref>
<ref id="ref3"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Antaki</surname><given-names>F.</given-names></name> <name><surname>Milad</surname><given-names>D.</given-names></name> <name><surname>Chia</surname><given-names>M. A.</given-names></name> <name><surname>Gigu&#x00E8;re</surname><given-names>C. &#x00C9;.</given-names></name> <name><surname>Touma</surname><given-names>S.</given-names></name> <name><surname>el-Khoury</surname><given-names>J.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Capabilities of GPT-4 in ophthalmology: an analysis of model entropy and progress towards human-level medical question answering</article-title>. <source>Br. J. Ophthalmol.</source> <volume>108</volume>, <fpage>1371</fpage>&#x2013;<lpage>1378</lpage>. doi: <pub-id pub-id-type="doi">10.1136/bjo-2023-324438</pub-id>, PMID: <pub-id pub-id-type="pmid">37923374</pub-id></citation></ref>
<ref id="ref4"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bernstein</surname><given-names>I. A.</given-names></name> <name><surname>Zhang</surname><given-names>Y. V.</given-names></name> <name><surname>Govil</surname><given-names>D.</given-names></name> <name><surname>Majid</surname><given-names>I.</given-names></name> <name><surname>Chang</surname><given-names>R.</given-names></name> <name><surname>Sun</surname><given-names>Y.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Comparison of ophthalmologist and large language model chatbot responses to online patient eye care questions</article-title>. <source>JAMA Netw. Open</source> <volume>6</volume>:<fpage>e2330320</fpage>. doi: <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.30320</pub-id></citation></ref>
<ref id="ref5"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname><given-names>S.</given-names></name> <name><surname>Kann</surname><given-names>B. H.</given-names></name> <name><surname>Foote</surname><given-names>M. B.</given-names></name> <name><surname>Aerts</surname><given-names>H. J. W. L.</given-names></name> <name><surname>Savova</surname><given-names>G. K.</given-names></name> <name><surname>Mak</surname><given-names>R. H.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Use of artificial intelligence chatbots for Cancer treatment information</article-title>. <source>JAMA Oncol.</source> <volume>9</volume>, <fpage>1459</fpage>&#x2013;<lpage>1462</lpage>. doi: <pub-id pub-id-type="doi">10.1001/jamaoncol.2023.2954</pub-id>, PMID: <pub-id pub-id-type="pmid">37615976</pub-id></citation></ref>
<ref id="ref6"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Clusmann</surname><given-names>J.</given-names></name> <name><surname>Kolbinger</surname><given-names>F. R.</given-names></name> <name><surname>Muti</surname><given-names>H. S.</given-names></name> <name><surname>Carrero</surname><given-names>Z. I.</given-names></name> <name><surname>Eckardt</surname><given-names>J.-N.</given-names></name> <name><surname>Laleh</surname><given-names>N. G.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>The future landscape of large language models in medicine</article-title>. <source>Commun. Med.</source> <volume>3</volume>:<fpage>141</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s43856-023-00370-1</pub-id>, PMID: <pub-id pub-id-type="pmid">37816837</pub-id></citation></ref>
<ref id="ref7"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dihan</surname><given-names>Q.</given-names></name> <name><surname>Chauhan</surname><given-names>M. Z.</given-names></name> <name><surname>Eleiwa</surname><given-names>T. K.</given-names></name> <name><surname>Hassan</surname><given-names>A. K.</given-names></name> <name><surname>Sallam</surname><given-names>A. B.</given-names></name> <name><surname>Khouri</surname><given-names>A. S.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Using large language models to generate educational materials on childhood Glaucoma</article-title>. <source>Am. J. Ophthalmol.</source> <volume>265</volume>, <fpage>28</fpage>&#x2013;<lpage>38</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ajo.2024.04.004</pub-id>, PMID: <pub-id pub-id-type="pmid">38614196</pub-id></citation></ref>
<ref id="ref8"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Flesch</surname><given-names>R.</given-names></name></person-group> (<year>1948</year>). <article-title>A new readability yardstick</article-title>. <source>J. Appl. Psychol.</source> <volume>32</volume>, <fpage>221</fpage>&#x2013;<lpage>233</lpage>. doi: <pub-id pub-id-type="doi">10.1037/h0057532</pub-id>, PMID: <pub-id pub-id-type="pmid">18867058</pub-id></citation></ref>
<ref id="ref9"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Goodman</surname><given-names>R. S.</given-names></name> <name><surname>Patrinely</surname><given-names>J. R.</given-names></name> <name><surname>Stone</surname><given-names>C. A.</given-names> <suffix>Jr.</suffix></name> <name><surname>Zimmerman</surname><given-names>E.</given-names></name> <name><surname>Donald</surname><given-names>R. R.</given-names></name> <name><surname>Chang</surname><given-names>S. S.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Accuracy and reliability of chatbot responses to physician questions</article-title>. <source>JAMA Netw. Open</source> <volume>6</volume>:<fpage>e2336483</fpage>. doi: <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.36483</pub-id>, PMID: <pub-id pub-id-type="pmid">37782499</pub-id></citation></ref>
<ref id="ref10"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gupta</surname><given-names>R.</given-names></name> <name><surname>Herzog</surname><given-names>I.</given-names></name> <name><surname>Park</surname><given-names>J. B.</given-names></name> <name><surname>Weisberger</surname><given-names>J.</given-names></name> <name><surname>Firouzbakht</surname><given-names>P.</given-names></name> <name><surname>Ocon</surname><given-names>V.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Performance of ChatGPT on the plastic surgery Inservice training examination</article-title>. <source>Aesthet. Surg. J.</source> <volume>43</volume>, <fpage>NP1078</fpage>&#x2013;<lpage>NP1082</lpage>. doi: <pub-id pub-id-type="doi">10.1093/asj/sjad128</pub-id>, PMID: <pub-id pub-id-type="pmid">37128784</pub-id></citation></ref>
<ref id="ref11"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gupta</surname><given-names>A. S.</given-names></name> <name><surname>Sulewski</surname><given-names>M. E.</given-names></name> <name><surname>Armenti</surname><given-names>S. T.</given-names></name></person-group> (<year>2024</year>). <article-title>Performance of ChatGPT in cataract surgery counseling</article-title>. <source>J. Cataract Refract. Surg.</source> <volume>50</volume>, <fpage>424</fpage>&#x2013;<lpage>425</lpage>. doi: <pub-id pub-id-type="doi">10.1097/j.jcrs.0000000000001345</pub-id>, PMID: <pub-id pub-id-type="pmid">38523277</pub-id></citation></ref>
<ref id="ref12"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hu</surname><given-names>X.</given-names></name> <name><surname>Ran</surname><given-names>A. R.</given-names></name> <name><surname>Nguyen</surname><given-names>T. X.</given-names></name> <name><surname>Szeto</surname><given-names>S.</given-names></name> <name><surname>Yam</surname><given-names>J. C.</given-names></name> <name><surname>Chan</surname><given-names>C. K. M.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>What can GPT-4 do for diagnosing rare eye diseases? A pilot study</article-title>. <source>Ophthalmol. Ther.</source> <volume>12</volume>, <fpage>3395</fpage>&#x2013;<lpage>3402</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s40123-023-00789-8</pub-id>, PMID: <pub-id pub-id-type="pmid">37656399</pub-id></citation></ref>
<ref id="ref13"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname><given-names>A. S.</given-names></name> <name><surname>Hirabayashi</surname><given-names>K.</given-names></name> <name><surname>Barna</surname><given-names>L.</given-names></name> <name><surname>Parikh</surname><given-names>D.</given-names></name> <name><surname>Pasquale</surname><given-names>L. R.</given-names></name></person-group> (<year>2024</year>). <article-title>Assessment of a large language model's responses to questions and cases about Glaucoma and retina management</article-title>. <source>JAMA Ophthalmol.</source> <volume>142</volume>, <fpage>371</fpage>&#x2013;<lpage>375</lpage>. doi: <pub-id pub-id-type="doi">10.1001/jamaophthalmol.2023.6917</pub-id>, PMID: <pub-id pub-id-type="pmid">38386351</pub-id></citation></ref>
<ref id="ref14"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ji</surname><given-names>Z.</given-names></name> <name><surname>Lee</surname><given-names>N.</given-names></name> <name><surname>Frieske</surname><given-names>R.</given-names></name> <name><surname>Yu</surname><given-names>T.</given-names></name> <name><surname>Su</surname><given-names>D.</given-names></name> <name><surname>Xu</surname><given-names>Y.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Survey of hallucination in natural language generation</article-title>. <source>ACM Comput. Surv.</source> <volume>55</volume>:<fpage>248</fpage>. doi: <pub-id pub-id-type="doi">10.1145/3571730</pub-id></citation></ref>
<ref id="ref15"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Landis</surname><given-names>J. R.</given-names></name> <name><surname>Koch</surname><given-names>G. G.</given-names></name></person-group> (<year>1977</year>). <article-title>The measurement of observer agreement for categorical data</article-title>. <source>Biometrics</source> <volume>33</volume>, <fpage>159</fpage>&#x2013;<lpage>174</lpage>. doi: <pub-id pub-id-type="doi">10.2307/2529310</pub-id>, PMID: <pub-id pub-id-type="pmid">843571</pub-id></citation></ref>
<ref id="ref16"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lim</surname><given-names>Z. W.</given-names></name> <name><surname>Pushpanathan</surname><given-names>K.</given-names></name> <name><surname>Yew</surname><given-names>S. M. E.</given-names></name> <name><surname>Lai</surname><given-names>Y.</given-names></name> <name><surname>Sun</surname><given-names>C. H.</given-names></name> <name><surname>Lam</surname><given-names>J. S. H.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Benchmarking large language models' performances for myopia care: a comparative analysis of ChatGPT-3.5, ChatGPT-4.0, and Google Bard</article-title>. <source>EBioMedicine</source> <volume>95</volume>:<fpage>104770</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ebiom.2023.104770</pub-id>, PMID: <pub-id pub-id-type="pmid">37625267</pub-id></citation></ref>
<ref id="ref17"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Moshirfar</surname><given-names>M.</given-names></name> <name><surname>Altaf</surname><given-names>A. W.</given-names></name> <name><surname>Stoakes</surname><given-names>I. M.</given-names></name> <name><surname>Tuttle</surname><given-names>J. J.</given-names></name> <name><surname>Hoopes</surname><given-names>P. C.</given-names></name></person-group> (<year>2023</year>). <article-title>Artificial intelligence in ophthalmology: a comparative analysis of GPT-3.5, GPT-4, and human expertise in answering StatPearls questions</article-title>. <source>Cureus</source> <volume>15</volume>:<fpage>e40822</fpage>. doi: <pub-id pub-id-type="doi">10.7759/cureus.40822</pub-id>, PMID: <pub-id pub-id-type="pmid">37485215</pub-id></citation></ref>
<ref id="ref18"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Newman-Casey</surname><given-names>P. A.</given-names></name> <name><surname>Ravilla</surname><given-names>S.</given-names></name> <name><surname>Haripriya</surname><given-names>A.</given-names></name> <name><surname>Palanichamy</surname><given-names>V.</given-names></name> <name><surname>Pillai</surname><given-names>M.</given-names></name> <name><surname>Balakrishnan</surname><given-names>V.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>The effect of counseling on cataract patient knowledge, decisional conflict, and satisfaction</article-title>. <source>Ophthalmic Epidemiol.</source> <volume>22</volume>, <fpage>387</fpage>&#x2013;<lpage>393</lpage>. doi: <pub-id pub-id-type="doi">10.3109/09286586.2015.1066016</pub-id>, PMID: <pub-id pub-id-type="pmid">26653261</pub-id></citation></ref>
<ref id="ref19"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pushpanathan</surname><given-names>K.</given-names></name> <name><surname>Lim</surname><given-names>Z. W.</given-names></name> <name><surname>Yew</surname><given-names>S. M. E.</given-names></name> <name><surname>Chen</surname><given-names>D. Z.</given-names></name> <name><surname>Lin</surname><given-names>H. A. H.</given-names></name> <name><surname>Goh</surname><given-names>J. H. L.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Popular large language model chatbots' accuracy, comprehensiveness, and self-awareness in answering ocular symptom queries</article-title>. <source>iScience</source> <volume>26</volume>:<fpage>108163</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.isci.2023.108163</pub-id>, PMID: <pub-id pub-id-type="pmid">37915603</pub-id></citation></ref>
<ref id="ref20"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ramirez</surname><given-names>D. A.</given-names></name> <name><surname>Brodie</surname><given-names>F. L.</given-names></name> <name><surname>Rose-Nussbaumer</surname><given-names>J.</given-names></name> <name><surname>Ramanathan</surname><given-names>S.</given-names></name></person-group> (<year>2017</year>). <article-title>Anxiety in patients undergoing cataract surgery: a pre- and postoperative comparison</article-title>. <source>Clin. Ophthalmol.</source> <volume>11</volume>, <fpage>1979</fpage>&#x2013;<lpage>1986</lpage>. doi: <pub-id pub-id-type="doi">10.2147/opth.S146135</pub-id>, PMID: <pub-id pub-id-type="pmid">29184388</pub-id></citation></ref>
<ref id="ref21"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sandmann</surname><given-names>S.</given-names></name> <name><surname>Riepenhausen</surname><given-names>S.</given-names></name> <name><surname>Plagwitz</surname><given-names>L.</given-names></name> <name><surname>Varghese</surname><given-names>J.</given-names></name></person-group> (<year>2024</year>). <article-title>Systematic analysis of ChatGPT, Google search and Llama 2 for clinical decision support tasks</article-title>. <source>Nat. Commun.</source> <volume>15</volume>:<fpage>2050</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41467-024-46411-8</pub-id></citation></ref>
<ref id="ref22"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tailor</surname><given-names>P. D.</given-names></name> <name><surname>Xu</surname><given-names>T. T.</given-names></name> <name><surname>Fortes</surname><given-names>B. H.</given-names></name> <name><surname>Iezzi</surname><given-names>R.</given-names></name> <name><surname>Olsen</surname><given-names>T. W.</given-names></name> <name><surname>Starr</surname><given-names>M. R.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Appropriateness of ophthalmology recommendations from an online chat-based artificial intelligence model</article-title>. <source>Mayo Clin. Proc. Digit. Health</source> <volume>2</volume>, <fpage>119</fpage>&#x2013;<lpage>128</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.mcpdig.2024.01.003</pub-id>, PMID: <pub-id pub-id-type="pmid">38577703</pub-id></citation></ref>
<ref id="ref23"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Thirunavukarasu</surname><given-names>A. J.</given-names></name> <name><surname>Ting</surname><given-names>D. S. J.</given-names></name> <name><surname>Elangovan</surname><given-names>K.</given-names></name> <name><surname>Gutierrez</surname><given-names>L.</given-names></name> <name><surname>Tan</surname><given-names>T. F.</given-names></name> <name><surname>Ting</surname><given-names>D. S. W.</given-names></name></person-group> (<year>2023</year>). <article-title>Large language models in medicine</article-title>. <source>Nat. Med.</source> <volume>29</volume>, <fpage>1930</fpage>&#x2013;<lpage>1940</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id>, PMID: <pub-id pub-id-type="pmid">37460753</pub-id></citation></ref>
<ref id="ref24"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>van Dis</surname><given-names>E. A. M.</given-names></name> <name><surname>Bollen</surname><given-names>J.</given-names></name> <name><surname>Zuidema</surname><given-names>W.</given-names></name> <name><surname>van Rooij</surname><given-names>R.</given-names></name> <name><surname>Bockting</surname><given-names>C. L.</given-names></name></person-group> (<year>2023</year>). <article-title>ChatGPT: five priorities for research</article-title>. <source>Nature</source> <volume>614</volume>, <fpage>224</fpage>&#x2013;<lpage>226</lpage>. doi: <pub-id pub-id-type="doi">10.1038/d41586-023-00288-7</pub-id>, PMID: <pub-id pub-id-type="pmid">36737653</pub-id></citation></ref>
<ref id="ref25"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Young</surname><given-names>B. K.</given-names></name> <name><surname>Zhao</surname><given-names>P. Y.</given-names></name></person-group> (<year>2024</year>). <article-title>Large language models and the shoreline of ophthalmology</article-title>. <source>JAMA Ophthalmol.</source> <volume>142</volume>, <fpage>375</fpage>&#x2013;<lpage>376</lpage>. doi: <pub-id pub-id-type="doi">10.1001/jamaophthalmol.2023.6937</pub-id>, PMID: <pub-id pub-id-type="pmid">38386327</pub-id></citation></ref>
<ref id="ref26"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname><given-names>D.</given-names></name> <name><surname>Zhan</surname><given-names>W.</given-names></name> <name><surname>Zheng</surname><given-names>C.</given-names></name> <name><surname>Zhang</surname><given-names>J.</given-names></name> <name><surname>Huang</surname><given-names>A.</given-names></name> <name><surname>Hu</surname><given-names>S.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Online health information-seeking behaviors and skills of Chinese college students</article-title>. <source>BMC Public Health</source> <volume>21</volume>:<fpage>736</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12889-021-10801-0</pub-id></citation></ref>
</ref-list>
</back>
</article>