<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="brief-report">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Vet. Sci.</journal-id>
<journal-title>Frontiers in Veterinary Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Vet. Sci.</abbrev-journal-title>
<issn pub-type="epub">2297-1769</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fvets.2024.1490030</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Veterinary Science</subject>
<subj-group>
<subject>Brief Research Report</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Classification performance and reproducibility of GPT-4 omni for information extraction from veterinary electronic health records</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Wulcan</surname> <given-names>Judit M.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2750244/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Jacques</surname> <given-names>Kevin L.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2923008/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Lee</surname> <given-names>Mary Ann</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2897744/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Kovacs</surname> <given-names>Samantha L.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/623909/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Dausend</surname> <given-names>Nicole</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Prince</surname> <given-names>Lauren E.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Wulcan</surname> <given-names>Jonatan</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Marsilio</surname> <given-names>Sina</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Keller</surname> <given-names>Stefan M.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2159874/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Department of Pathology, Microbiology and Immunology, School of Veterinary Medicine, University of California, Davis</institution>, <addr-line>Davis, CA</addr-line>, <country>United States</country></aff>
<aff id="aff2"><sup>2</sup><institution>College of Veterinary Medicine and Biomedical Sciences, James L. Voss Veterinary Teaching Hospital, Colorado State University</institution>, <addr-line>Fort Collins, CO</addr-line>, <country>United States</country></aff>
<aff id="aff3"><sup>3</sup><institution>Department of Medicine and Epidemiology, School of Veterinary Medicine, University of California, Davis</institution>, <addr-line>Davis, CA</addr-line>, <country>United States</country></aff>
<aff id="aff4"><sup>4</sup><institution>Independent Researcher</institution>, <addr-line>Malm&#x000F6;</addr-line>, <country>Sweden</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Audrey Ruple, Virginia Tech, United States</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Andrea K. Wright, Zoetis, United States</p>
<p>Taran Rai, University of Surrey, United Kingdom</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Stefan M. Keller <email>smkeller&#x00040;ucdavis.edu</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>16</day>
<month>01</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>11</volume>
<elocation-id>1490030</elocation-id>
<history>
<date date-type="received">
<day>02</day>
<month>09</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>19</day>
<month>12</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2025 Wulcan, Jacques, Lee, Kovacs, Dausend, Prince, Wulcan, Marsilio and Keller.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Wulcan, Jacques, Lee, Kovacs, Dausend, Prince, Wulcan, Marsilio and Keller</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>Large language models (LLMs) can extract information from veterinary electronic health records (EHRs), but performance differences between models, the effect of hyperparameter settings, and the influence of text ambiguity have not been previously evaluated. This study addresses these gaps by comparing the performance of GPT-4 omni (GPT-4o) and GPT-3.5 Turbo under different conditions and by investigating the relationship between human interobserver agreement and LLM errors. The LLMs and five humans were tasked with identifying six clinical signs associated with feline chronic enteropathy in 250 EHRs from a veterinary referral hospital. When compared to the majority opinion of human respondents, GPT-4o demonstrated 96.9% sensitivity [interquartile range (IQR) 92.9&#x02013;99.3%], 97.6% specificity (IQR 96.5&#x02013;98.5%), 80.7% positive predictive value (IQR 70.8&#x02013;84.6%), 99.5% negative predictive value (IQR 99.0&#x02013;99.9%), 84.4% F1 score (IQR 77.3&#x02013;90.4%), and 96.3% balanced accuracy (IQR 95.0&#x02013;97.9%). The performance of GPT-4o was significantly better than that of its predecessor, GPT-3.5 Turbo, particularly with respect to sensitivity where GPT-3.5 Turbo only achieved 81.7% (IQR 78.9&#x02013;84.8%). GPT-4o demonstrated greater reproducibility than human pairs, with an average Cohen&#x00027;s kappa of 0.98 (IQR 0.98&#x02013;0.99) compared to 0.80 (IQR 0.78&#x02013;0.81) with humans. Most GPT-4o errors occurred in instances where humans disagreed [35/43 errors (81.4%)], suggesting that these errors were more likely caused by ambiguity of the EHR than explicit model faults. Using GPT-4o to automate information extraction from veterinary EHRs is a viable alternative to manual extraction, but requires validation for the intended setting to ensure accuracy and reliability.</p></abstract>
<kwd-group>
<kwd>machine learning</kwd>
<kwd>artificial intelligence</kwd>
<kwd>generative-pretrained transformers</kwd>
<kwd>Chat-GPT</kwd>
<kwd>text mining</kwd>
<kwd>feline chronic enteropathy</kwd>
<kwd>Real-World Evidence (RWE)</kwd>
<kwd>Real-World Data (RWD)</kwd>
</kwd-group>
<counts>
<fig-count count="3"/>
<table-count count="0"/>
<equation-count count="0"/>
<ref-count count="32"/>
<page-count count="9"/>
<word-count count="6063"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Comparative and Clinical Medicine</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Efficient, accurate, and scalable methods for extracting information from electronic health records (EHRs) are essential for conducting retrospective studies in veterinary medicine. Inaccurate information extraction can introduce bias and lead to inappropriate conclusions (<xref ref-type="bibr" rid="B1">1</xref>). Veterinary EHRs commonly lack standardized diagnostic codes, making automated information extraction challenging. This limitation has been identified as a barrier for leveraging routinely collected animal health data (Real-World Data) as a source for clinical evidence (Real-World Evidence) for veterinary medicine (<xref ref-type="bibr" rid="B2">2</xref>). Manual review, the current gold-standard for extracting information from free text, is time-consuming, tedious, and error-prone. In addition, there is a limit to the number of EHRs a human can assess, which hinders large-scale information extraction. Introducing an automated filtering step before the manual review can improve the efficiency and sensitivity of information extraction (<xref ref-type="bibr" rid="B3">3</xref>). Key-word searches, commonly used as a pre-filtering step, are a crude tool and risk excluding relevant EHRs. Rule-based programming (e.g., regular expressions) and supervised machine learning have shown improved classification performance over key-word searches (<xref ref-type="bibr" rid="B3">3</xref>). However, these methods are costly to develop, require large amounts of labeled data, and generalize poorly across different institutions and conditions. In addition, they require fine-tuning and retraining for new tasks, making them impractical for small observational studies (<xref ref-type="bibr" rid="B3">3</xref>).</p>
<p>A new and rapidly evolving tool for information extraction is the use of large language models (LLM), a form of unsupervised machine learning that can predict the next element in a text sequence after being trained on a large amount of unlabeled text (<xref ref-type="bibr" rid="B4">4</xref>). Early LLMs used a semi-supervised approach, where models trained on unlabeled text could be fine-tuned on labeled text for specific tasks (<xref ref-type="bibr" rid="B4">4</xref>). Modern LLMs can solve new tasks with few or no labeled training examples (<xref ref-type="bibr" rid="B5">5</xref>). However, LLMs can produce true-sounding falsehoods (hallucinations) or exhibit reasoning errors (<xref ref-type="bibr" rid="B6">6</xref>, <xref ref-type="bibr" rid="B7">7</xref>). A recent study demonstrated good performance of GPT-3.5 Turbo for extracting information from veterinary EHRs (<xref ref-type="bibr" rid="B8">8</xref>). However, the performance of GPT-3.5 Turbo was not compared to that of other models, the nature of errors was not explored, and the influence of hyperparameter settings and text ambiguity were not assessed (<xref ref-type="bibr" rid="B8">8</xref>). Identifying the strengths, weaknesses and cost of different models is important for model selection. Understanding the cause of errors and the context in which they occur is crucial for optimizing model performance, and for setting of realistic expectations.</p>
<p>The objective of this study was to assess the classification performance and reproducibility of GPT-4 omni (GPT-4o) for identifying six clinical signs associated with feline chronic enteropathy (FCE) from EHRs. In addition, we compare the classification performance of GPT-4o to that of GPT-3.5 Turbo, compare the reproducibility of GPT-4o to human respondents under different conditions and investigate the relationship between human interobserver agreement and LLM errors.</p></sec>
<sec id="s2">
<title>2 Methods</title>
<sec>
<title>2.1 Study design and sample size</title>
<p>Constructed as a retrospective cross-sectional study, the sample size was determined to estimate the sensitivity of a single test (<xref ref-type="bibr" rid="B9">9</xref>). The calculations accounted for a type I error rate of 5%, an acceptable margin of error of 7%, an expected sensitivity of 95% and an expected prevalence of 15%. Methods for prevalence estimation are detailed in the <xref ref-type="supplementary-material" rid="SM1">Supplementary material S1</xref> and prevalence estimates for each clinical sign are available in <xref ref-type="supplementary-material" rid="SM3">Supplementary Table S1</xref>.</p></sec>
<sec>
<title>2.2 Case material</title>
<p>A test set consisting of 250 EHRs was sampled from all feline visits at the Veterinary Medical Teaching Hospital (VMTH) at University of California Davis, between 1985 and 2023. EHRs without text in the &#x0201C;Pertinent history field&#x0201D; or those used for study planning were excluded. EHRs for patients already represented in the test set were also excluded and replaced with resampled EHRs. The EHRs in the test set included only the admission date, presenting complaint, and pertinent history field from the original EHRs. The EHRs were manually deidentified by redacting possibly identifying information (see <xref ref-type="supplementary-material" rid="SM1">Supplementary material S2</xref>).</p>
<p>Pilot sets, used for initial prevalence estimation, and a tuning set, used to refine prompts and model parameters, were used during the study planning phase. These sets were distinct from the test set and are outlined in detail in <xref ref-type="supplementary-material" rid="SM1">Supplementary material S3</xref> and <xref ref-type="supplementary-material" rid="SM2">Supplementary Figure S1</xref>.</p></sec>
<sec>
<title>2.3 Software, data type classification, scripts, and packages</title>
<p>The test set EHRs were analyzed using GPT-4o and GPT-3.5 Turbo (Open AI, San Francisco, CA, USA) accessed through Microsoft Azure&#x00027;s Open AI Application Programming Interface (API) (Microsoft Azure Redmond, WA, USA) provided by UC Davis AggieCloud Services. The account was commissioned based on a data type classification of &#x0201C;De-identified patient information (with negligible re-identification risk)&#x0201D;, a Protection Level Classification of P2 and an Availability Level Classification of A1 in accordance with UC Davis&#x00027; Information Security Policy 3 (IS-3) (request RITM0074868). IS-3 is based on security standards ISO 27001 and 27002 and supports cybersecurity compliance requirements NIST 800-171, PCI, and HIPAA.</p>
<p>The analysis was conducted using a custom Python script (<xref ref-type="bibr" rid="B10">10</xref>) that leveraged the chat-completion endpoint with API version 2024-02-01. The script utilized Python&#x00027;s standard library (<xref ref-type="bibr" rid="B11">11</xref>), as well as the openai (<xref ref-type="bibr" rid="B12">12</xref>) and tiktoken (<xref ref-type="bibr" rid="B13">13</xref>) packages. Human respondents accessed the test set EHRs through a custom online survey (Qualtrics, Provo, UT, USA). All data analysis and statistical computations and visualizations were conducted using R programming language within the RStudio integrated development environment (<xref ref-type="bibr" rid="B14">14</xref>, <xref ref-type="bibr" rid="B15">15</xref>). The custom R scripts were supported by a range of open-source packages for data science (<xref ref-type="bibr" rid="B16">16</xref>, <xref ref-type="bibr" rid="B17">17</xref>), data import and data export (<xref ref-type="bibr" rid="B18">18</xref>&#x02013;<xref ref-type="bibr" rid="B20">20</xref>), statistics (<xref ref-type="bibr" rid="B21">21</xref>, <xref ref-type="bibr" rid="B22">22</xref>), and visualization (<xref ref-type="bibr" rid="B23">23</xref>&#x02013;<xref ref-type="bibr" rid="B25">25</xref>). All scripts used in this study are available at GitHub (<ext-link ext-link-type="uri" xlink:href="https://github.com/ucdavis/llm_vet_records">https://github.com/ucdavis/llm_vet_records</ext-link>).</p></sec>
<sec>
<title>2.4 Model tasks and prompt engineering</title>
<p>Respondents (humans and LLMs) were asked to perform two tasks: (1) Determine whether an EHR mentioned the presence of six clinical signs associated with FCE (classification task) and (2) cite pertinent sections of the EHR supporting the decision (citation task). Detailed instructions for both tasks were developed through iterative evaluation and adjustment (prompt engineering), informed by human and GPT-4o responses to the tuning set. The prompt used for LLM analysis only differed from the instructions provided to human respondents in the specific instructions to output the response in a JSON format.</p>
<sec>
<title>2.4.1 Classification of presence of clinical signs</title>
<p>The clinical signs associated with FCE were selected based on a recent diagnostic consensus statement (<xref ref-type="bibr" rid="B26">26</xref>) and included decreased appetite, vomiting, weight loss, diarrhea, constipation and polyphagia. The instructions specified that a &#x0201C;current&#x0201D;, or &#x0201C;recently present&#x0201D; clinical sign qualified as &#x0201C;present&#x0201D; and allowed answers were &#x0201C;true&#x0201D; or &#x0201C;false&#x0201D;. A precise time cut-off for what was considered &#x0201C;recent&#x0201D; was not provided as preliminary experiments indicated that such a criterion led to false negative results for intermittent signs (see <xref ref-type="supplementary-material" rid="SM1">Supplementary material S4</xref>).</p></sec>
<sec>
<title>2.4.2 Citation of supporting text</title>
<p>To trace respondent decisions for classification error analysis, respondents were instructed to cite pertinent sections of the EHR. The instructions specified that only copy-pasted text should be provided, each text section should be enclosed in quotation marks, different portions of the text should be separated by white space, and ellipses should not be used to shorten the text.</p></sec></sec>
<sec>
<title>2.5 EHR analysis</title>
<sec>
<title>2.5.1 EHR analysis by LLM</title>
<p>The Azure Open AI API allows setting a &#x0201C;temperature&#x0201D; value between 0 and 2, where a temperature of 0 produces the most likely response, while higher values prompt the model to generate more varied and creative outputs (<xref ref-type="bibr" rid="B27">27</xref>). The test set was analyzed at temperatures 0, 0.5, and 1, which were chosen based on initial experiments that showed high failure rates and invalid JSON formats at temperatures 1.5 and 2 (<xref ref-type="supplementary-material" rid="SM2">Supplementary Figure S2</xref>). Each analysis was repeated five times at each temperature setting. In addition to question responses and text citations, the time to complete and the cost were documented.</p></sec>
<sec>
<title>2.5.2 EHR analysis by humans</title>
<p>The test set records were analyzed by five human respondents: two veterinary students (who had completed their second and third years of study, respectively), and three veterinarians (one recent graduate and two with 2 years of post-graduate experience each). The humans were blinded to each other&#x00027;s responses and to the responses of the LLMs.</p></sec></sec>
<sec>
<title>2.6 Assessment of classification performance</title>
<p>The majority opinion (mode) of human responses was considered the reference standard and the mode of the LLM responses was classified as either a true positive, false positive, true negative, or false negative. For each clinical sign, sensitivity (also referred to as &#x0201C;recall&#x0201D;), specificity, positive predictive value (PPV) (also referred to as &#x0201C;precision&#x0201D;), and negative predictive value (NPV) were calculated and reported along with 95% confidence intervals, using the &#x0201C;Wilson&#x0201D; method (<xref ref-type="bibr" rid="B28">28</xref>). The F1 score (the harmonic mean of sensitivity and PPV) and balanced accuracy (the arithmetic mean of sensitivity and specificity) were computed. To summarize classification performance across clinical signs, the median and interquartile range (IQR) were reported for each performance metric. The statistical significance of differences in responses between GPT-4o and GPT-3.5 Turbo at temperature 0, as well as between different temperature settings of GPT-4o was assessed with McNemar&#x00027;s chi square test with continuity correction and 1 degree of freedom.</p></sec>
<sec>
<title>2.7 Assessment of reproducibility</title>
<p>Reproducibility was analyzed for both human respondents and repeated runs of GPT-4o. Cohen&#x00027;s Kappa was calculated separately for each unique pair of respondents and averaged across human pairs and pairs of repeated GPT-4o runs at each temperature.</p></sec>
<sec>
<title>2.8 Assessment of compliance with instructions</title>
<p>Compliance with instructions was assessed in three main areas: (1) adherence to output format instructions, (2) providing a true or false response to classification questions, and (3) following citation instructions (citation compliance). The responses generated by the LLMs were assessed for all three areas, while human responses were only evaluated for citation compliance (see <xref ref-type="supplementary-material" rid="SM1">Supplementary material S5</xref>).</p></sec>
<sec>
<title>2.9 Assessment of classification errors</title>
<p>All instances where the mode LLM response differed from the majority opinion (mode) of human respondents were considered errors. If discrepant responses cited the same text sections, the cause of error was assumed to be a difference in interpretation (interpretation discrepancy). If the citations differed, it was assumed that some respondents missed relevant sections (citation discrepancy).</p>
<p>Additionally, errors were further categorized based on ambiguity in the alignment between the description of the clinical sign in the EHR and its definition. If the description could not be conclusively interpreted as meeting or not meeting the qualitative definition of a clinical sign, the error was classified as &#x0201C;qualitative ambiguity&#x0201D;. Similarly, if the timing or chronology described in the record could not be conclusively aligned with the temporal definition of a clinical sign (e.g., distinguishing between a historic vs. present sign), the error was classified as temporal ambiguity.</p></sec></sec>
<sec id="s3">
<title>3 Results</title>
<sec>
<title>3.1 Case material</title>
<p>The test set consisted of 250 EHRs from cat visits occurring between 1991 and 2023 and ranging in word length from 3 to 1262. Although all feline EHRs between 1985 and 2023 were initially considered, EHRs prior to 1991 contained no text in the &#x0201C;Pertinent History&#x0201D; section of the report and were thus excluded. The flow of EHR selection is depicted in <xref ref-type="supplementary-material" rid="SM2">Supplementary Figure S1</xref>.</p></sec>
<sec>
<title>3.2 Classification performance</title>
<p>The performance metrics were computed using the majority opinion of human respondents as a reference standard. The sensitivity, specificity, balanced accuracy and NPV for GPT-4o averaged over 96% across clinical signs, regardless of temperature (<xref ref-type="fig" rid="F1">Figure 1</xref>). The average PPV and F1 scores were lower (80.7% and 84.4% respectively at temperature 0) due to GPT-4o errors being dominated by false positives.</p></sec>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Classification performance metrics of GPT-4 omni (GPT-4o) for extracting the presence or absence of six clinical signs at different temperatures. Classification performance metrics for each clinical sign was computed by comparing the mode of GPT-4o responses from five repeated runs at each temperature to a reference standard composed of the majority opinion (mode) of five human respondents. Note the wide confidence intervals of classification performance metrics for three clinical signs of low prevalence in the test set (diarrhea, constipation, and weight loss) hindering interpretation of subtle variations of classification performance estimates across temperatures for these clinical signs. Error bars represent 95% confidence intervals. F1 scores and balanced accuracy are derivatives of sensitivity, specificity, and positive predictive value (PPV); therefore, are reported without confidence intervals. NPV, negative predictive value; PPV, positive predictive value.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fvets-11-1490030-g0001.tif"/>
</fig>
<p>The temperature setting did not significantly impact the classification performance (Temperature 0 vs. temperature 0.5: &#x003C7;<sup>2</sup> = 0.9, <italic>p</italic>-value = 0.34; temperature 0 vs. temperature 1: &#x003C7;<sup>2</sup> = 1.07, <italic>p</italic>-value = 0.3; temperature 0.5 vs. temperature 1: &#x003C7;<sup>2</sup> = 0, <italic>p</italic>-value = 1). GPT-3.5 Turbo performed significantly worse than GPT-4o at temperature 0 (&#x003C7;<sup>2</sup> = 29.9, <italic>p</italic>-value &#x0003C; 0.0001), particularly for sensitivity (81.7%, IQR 78.9&#x02013;84.8%).</p>
<p>The average (median) performance metrics for GPT-4o and GPT-3.5 Turbo, across all clinical signs, are reported together with interquartile ranges (IQRs) in <xref ref-type="supplementary-material" rid="SM3">Supplementary Table S2</xref>.</p></sec>
<sec>
<title>3.3 Reproducibility</title>
<p>The interobserver agreement of GPT-4o responses across consecutive runs decreased at higher temperature settings yet remained higher than human interobserver agreement even at the highest temperature setting (<xref ref-type="fig" rid="F2">Figure 2A</xref>). The average Cohen&#x00027;s kappa between repeated runs of GPT-4o ranged from 0.98 at temperature 0 to 0.93 at temperature 1, while the average Cohen&#x00027;s kappa between human respondents was 0.8. <xref ref-type="supplementary-material" rid="SM3">Supplementary Table S3</xref> contains the summary statistics as well as Cohen&#x00027;s Kappa per pair of respondents for human respondents and repeated runs of GPT-4o at each temperature.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Interobserver agreement and citation compliance for humans and GPT-4 omni (GPT-4o) at different temperatures. GPT-4o, GPT-4 omni. <bold>(A)</bold> Interobserver agreement. Cohen&#x00027;s Kappa was calculated for each unique pair of human respondents, and repeated runs of GPT-4o at different temperatures. GPT-4o showed a decline in agreement between consecutive runs at higher temperatures yet maintained higher agreement than human respondents even at the highest temperature. <bold>(B)</bold> Citation compliance. Citation compliance was assessed by ensuring each citation was properly enclosed in quotes, separated by white-space and matched exactly with the electronic health record (EHR) text. GPT-4o had slightly higher compliance than humans at temperature 0, but its compliance decreased at temperatures 0.5 and 1, falling below that of human respondents.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fvets-11-1490030-g0002.tif"/>
</fig>
</sec>
<sec>
<title>3.4 Compliance with instructions</title>
<p>Unlike preliminary experiments at temperatures 1.5 and 2.0 (<xref ref-type="supplementary-material" rid="SM2">Supplementary Figure S1</xref>), GPT-4o produced outputs for all questions and maintained correct JSON format for over 99% of questions at temperatures 0, 0.5, and 1.0. All output with incorrect JSON format were manually corrected. GPT-4o adhered to the instructions to provide &#x0201C;true&#x0201D; or &#x0201C;false&#x0201D; answers in over 99.9% of classification questions across all temperatures. However, it responded with &#x0201C;NA&#x0201D; to one question at temperature 0 and two questions at temperature 1. Compliance with citation instructions decreased as temperatures increased for GPT-4o (<xref ref-type="fig" rid="F2">Figure 2B</xref>). Human respondents complied with citation instructions less often than GPT-4o at temperature 0 but more frequently than GPT-4o at temperature 0.5 and 1. <xref ref-type="supplementary-material" rid="SM3">Supplementary Table S4</xref> contains details on compliance with instructions for GPT-4o and human respondents.</p>
<p>Citations for both human respondents and GPT-4o at temperature 0 that did not exactly match the EHR text were manually reviewed. All discrepancies were attributed to either minor deviations in quotations, capitalization, punctuation or spacing, shortening of the text, paraphrasing, or including the question or field name in the response. No hallucinations (citations not present in the EHR) were detected. <xref ref-type="supplementary-material" rid="SM3">Supplementary Table S5</xref> contains details on discrepancies between citations and EHR texts.</p>
<p>Only one instance of a citation by GPT-4o at temperature 0 altered the meaning of the text: the EHR stated &#x0201C;occasionally strains in litter box to defecate, no diarrhea&#x0201D; but GPT-4o shortened this to &#x0201C;occasional diarrhea&#x0201D;. Despite this change, GPT-4o correctly classified the case as &#x0201C;false&#x0201D; for diarrhea, indicating that the misquotation did not affect the classification outcome.</p></sec>
<sec>
<title>3.5 Classification errors</title>
<p>The mode response of repeated runs of GPT-4o at temperature 0 differed from the majority opinion response of the five human respondents in 43 out of 1,500 questions (2.9%) (<xref ref-type="fig" rid="F3">Figure 3A</xref>). Most errors were false positives [35 out of 43 total errors (81.4%)]. All human and GPT-4o responses at all temperatures are depicted in <xref ref-type="supplementary-material" rid="SM2">Supplementary Figures S3</xref>&#x02013;<xref ref-type="supplementary-material" rid="SM2">S8</xref>.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>Classification errors by GPT-4 omni (GPT-4o) at temperature 0. All questions where the mode GPT-4o classification response disagreed with the majority opinion (mode) of human respondents were considered errors. <bold>(A)</bold> Human and GPT-4o responses. Five human respondents and five repeated runs of GPT-4o responded to questions on the presence of six clinical signs. False positive errors (instances where GPT-4o answered &#x0201C;true&#x0201D; and the majority of humans answered &#x0201C;false&#x0201D;) were more common than false negative errors. Blue, true; Orange, false; white, NA; GPT-4o, GPT-4 omni; Temp, temperature. <bold>(B)</bold> Classification errors. Most errors occurred in questions where at least one human respondent disagreed with the majority opinion. Interpretation errors were more common than citation errors. For interpretation errors, temporal ambiguity was more common than qualitative ambiguity. Some citation errors involved electronic health records without ambiguity, suggesting that some respondents overlooked relevant sections of the text.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fvets-11-1490030-g0003.tif"/>
</fig>
<p>Most GPT-4o errors [35 out of 43 total errors (81.4%); 35 out of 1,500 questions (2.3%)] occurred in questions where at least one human assessor disagreed with the human majority, indicating potential ambiguity regarding the clinical sign in the EHR (<xref ref-type="fig" rid="F3">Figure 3B</xref>). In 22 of these errors [51.2%; 22 out of 1,500 questions (1.5%)], two human respondents disagreed with the human majority opinion and sided with GPT-4o. In 13 errors [30.2%; 13 out of 1,500 questions (0.9%)], one human assessor disagreed with the majority opinion and sided with GPT-4o. Only eight errors [18.6%; eight out of 1,500 questions (0.5%)] involved GPT-4o responding incorrectly to a question where all human respondents agreed. For instance, an EHR noting &#x0201C;owner found a spit-up phenobarbital pill on the ground&#x0201D; led all GPT-4o runs to answer &#x0201C;true&#x0201D; for vomiting, while all human respondents answered &#x0201C;false&#x0201D;, not citing the text section.</p>
<p>Interpretation errors, i.e., errors where humans and GPT-4o cited the same text but answered differently were more common than citation errors where humans and GPT-4o cited different text (<xref ref-type="fig" rid="F3">Figure 3B</xref>) [32/43 (74.4%) vs. 11/43 errors (25.6%) of total errors].</p>
<p>Interpretation errors arose more frequently from EHR texts with temporal ambiguity than from EHR texts with qualitative ambiguity [22/32 (68.8%) vs. 10/32 (31.2%) of interpretation errors]. For example, an EHR noting a cat was &#x0201C;polyphagic&#x0201D; at a previous visit, but &#x0201C;eating less&#x0201D; after treatment, leading to mixed responses among human respondents due to temporal ambiguity. In contrast, an EHR describing a cat with &#x0201C;1 bowel movement consisting of a hard, crusty piece of feces covered with a softer, outer layer&#x0201D; led to mixed responses among humans for constipation due to qualitative ambiguity.</p>
<p>While some citation errors involved EHRs with qualitative or temporal ambiguity, others did not, suggesting that some respondents overlooked relevant sections of the text (<xref ref-type="fig" rid="F3">Figure 3B</xref>). For example, an EHR noting, &#x0201C;weight 6/30/01 16.5 lb, weight 11/00 20.5 lb&#x0201D; led two out of five humans (and all GPT-4o runs) to answer &#x0201C;true&#x0201D; for weight loss, while three out of five humans answered &#x0201C;false&#x0201D; without citing the text. Notably, most of these instances involved the majority of human respondents answering &#x0201C;false&#x0201D; while GPT-4o answered &#x0201C;true&#x0201D;, suggesting that some errors might reflect flaws in the human majority opinion rather than true GPT-4o errors. Only one instance involved GPT-4o missing an explicit mention of a clinical sign that the majority of humans did not, indicating that GPT-4o was better at identifying relevant portions of the text than human respondents.</p></sec>
<sec>
<title>3.6 Time and cost</title>
<p>GPT-3.5 Turbo analysis was quicker and cheaper than GPT-4o analysis. The median time and cost per EHR were 1.6 s (IQR 1.4&#x02013;1.9 s) and 0.07 US cents (IQR 0.06&#x02013;0.08 cents) for GPT-3.5 Turbo and 2.5 s (IQR 1.9&#x02013;3.3 s) and 0.7 US cents (IQR 0.7&#x02013;0.9) for GPT-4o.</p></sec></sec>
<sec id="s4">
<title>4 Discussion</title>
<p>This study demonstrated a high classification performance of the LLM GPT-4o in identifying clinical signs consistent with FCE in EHRs from a single veterinary referral hospital. The findings indicate near perfect sensitivity and negative predictive value, an outcome favorable for the intended use of the model as a screening tool. These results align with two previous studies that analyzed the classification performance of GPT models using human manual review as the reference standard. One of the studies used GPT-3.5 Turbo to identify cases of obesity in veterinary medical EHRs, reporting a sensitivity of 100% (<xref ref-type="bibr" rid="B8">8</xref>), while another study with a previous version of GPT-4 (1106) achieved a 97% sensitivity in identifying comorbidities in human cancer patient EHRs (<xref ref-type="bibr" rid="B29">29</xref>). In both these studies, as well as the current one, errors were dominated by false positives, where the LLM indicated a clinical sign as present while the human reviewer considered it absent, resulting in lower PPVs and F1 scores.</p>
<p>In addition to an excellent classification performance, this study demonstrated good reproducibility, which was higher between repeated runs of GPT-4o than human respondents, regardless of temperature settings. At temperature 0, average Cohen&#x00027;s Kappa for repeated runs of GPT-4o approached perfect agreement, suggesting that a single analysis run is sufficient and that averaging the results over multiple runs may not be necessary at this temperature. In contrast to previous studies, this study employed multiple human reviewers, which enabled the evaluation of interobserver agreement, and the identification of challenging records characterized by low levels of consensus among reviewers.</p>
<p>Increasing the temperature did not negatively affect classification performance but reduced reproducibility and compliance, suggesting that lower temperature settings are better suited for this analytic task. Since the temperature setting cannot be adjusted in the web-based versions of ChatGPT, reproducing the experiments conducted in this study would require the use of an API interface.</p>
<p>Most GPT-4o errors occurred in instances where human respondents disagreed, suggesting that many of these errors were &#x0201C;reasonable interpretations&#x0201D; of ambiguous information. The majority of disagreements among human respondents involved temporal ambiguities. Notably, respondents were asked to assess the &#x0201C;presence&#x0201D; of a clinical sign and that a &#x0201C;current&#x0201D;, or &#x0201C;recently present&#x0201D; clinical sign qualified as &#x0201C;present&#x0201D;. Supplying a more stringent definition of &#x0201C;presence&#x0201D;, such as including time cut-offs for &#x0201C;current&#x0201D; or &#x0201C;recent&#x0201D;, might have reduced the frequency of errors. However, this approach was ultimately dismissed to avoid the systematic exclusion of chronic intermittent clinical signs, which are vital for identifying FCE.</p>
<p>The varying experience level of human respondents (students and veterinarians) may have contributed to interobserver variability. However, at our institution, manual review of EHRs for retrospective studies is often assigned to students, making their inclusion representative of real-world practice. In our experience students&#x00027; caution and attention to detail can offset their lack of experience and may even surpass veterinary specialists in record review.</p>
<p>Although the classification performance of human reviewers cannot be assessed when using human reviewers as a reference standard, some discrepant responses arose from humans missing relevant sections of the text. Similarly to a previous study (<xref ref-type="bibr" rid="B29">29</xref>), failing to cite explicit mentions of a clinical sign was much rarer for GPT-4o than humans suggesting that in some instances, GPT-4o may exceed human sensitivity.</p>
<p>These findings also highlight a broader issue: human reviewers, while serving as the reference standard, are not infallible. Any discrepancies between GPT-4o and human responses may reflect differences from an imperfect standard rather than definitive model failures. To address this issue, we complemented performance metrics with a detailed error analysis. This approach provided a more nuanced understanding of presumed model errors and revealed instances where ambiguities in the EHR, rather than true model errors, accounted for the observed differences.</p>
<p>Hallucinations, a major concern for utilizing LLMs for veterinary research (<xref ref-type="bibr" rid="B30">30</xref>), were not observed in this study. However, instances of paraphrasing, shortening, and non-adherence to instructions for citations occurred even at the lowest temperature setting of 0. Although these deviations did not affect the GPT-4o response in any of the observed cases, they underscore that LLMs, unlike rule-based computer programs, are probabilistic, and may not always comply with the instructions provided.</p>
<p>In our specific setting, GPT-4o outperformed GPT-3.5 Turbo but was 10 times more expensive to run. However, it is possible that the performances of both models are more similar when applied to other tasks. For instance, Fins et al. (<xref ref-type="bibr" rid="B8">8</xref>) achieved near perfect sensitivity for detection of mentions of obesity with GPT-3.5 Turbo suggesting that this more cost-efficient model may be sufficient for some applications. In addition, it is conceivable that LLMs not tested in the current study perform better than GPT-4o or perform similarly at lower cost. Open-source LLMs could be particularly appealing when evaluating large numbers of EHRs, where cost is a limiting factor.</p>
<p>Only EHRs from a single tertiary referral clinic were included in this study. The quality of EHRs can vary across different veterinary settings, and it is unclear if our findings would generalize to less detailed records. This highlights the importance of validating LLMs for the specific task and environment in which they are intended to be used.</p>
<p>In contrast to ChatGPT, the online chat version of GPT, API applications through OpenAI or Microsoft Azure can be configured to comply with federal privacy regulations for human EHRs. Previous studies using LLMs for this purpose have relied on de-identified medical records (<xref ref-type="bibr" rid="B8">8</xref>, <xref ref-type="bibr" rid="B29">29</xref>), synthetic data (<xref ref-type="bibr" rid="B31">31</xref>) or locally run open source LLMs (<xref ref-type="bibr" rid="B31">31</xref>). While the de-identification of EHRs is feasible for a dataset size such as used in the current study, de-identifying larger sets of records might pose a significant barrier and negate most benefits of using an LLM over manual review. Although privacy and security regulations are less stringent in veterinary medicine than in human medicine, obtaining approval to use these types of applications is an essential point to consider prior to study execution.</p>
<p>In conclusion, the use of GPT-4o to extract information from veterinary EHRs can be a reliable alternative to human manual extraction. While considerations for cost and data privacy remain, this technology unlocks new possibilities for retrospective data analysis at a scale previously unattainable. This capability is critical for transforming routinely collected Real-World Data into Real-World Evidence to inform clinical practice and research in veterinary medicine. Future work should focus on scaling up data mining in veterinary medicine, integrating data from multiple institutions, and developing guidelines for ongoing validation and comparison of LLMs in this field.</p></sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The datasets used in this study are not publicly available due to legal restrictions on sharing EHR data. However, the data can be provided upon request by contacting <email>smkeller&#x00040;ucdavis.edu</email>.</p>
</sec>
<sec sec-type="author-contributions" id="s6">
<title>Author contributions</title>
<p>JMW: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Project administration, Validation, Visualization, Writing&#x02014; original draft, Writing&#x02014; review &#x00026; editing. KLJ: Conceptualization, Methodology, Validation, Writing&#x02014; review &#x00026; editing. MAL: Validation, Writing&#x02014; review &#x00026; editing. SLK: Validation, Writing&#x02014; review &#x00026; editing. ND: Validation, Writing &#x02013; review &#x00026; editing. LEP: Validation, Writing&#x02014; review &#x00026; editing. JW: Conceptualization, Formal analysis, Methodology, Writing &#x02013; review &#x00026; editing. SM: Conceptualization, Writing&#x02014; review &#x00026; editing. SMK: Conceptualization, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Supervision, Validation, Visualization, Writing&#x02014; review &#x00026; editing.</p>
</sec>
<sec sec-type="funding-information" id="s7">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. JMW received support from the UC Davis Charles River Laboratories Digital Pathology Fellowship and the Peter C. Kennedy Endowed Fellowship for Veterinary Anatomic Pathology.</p>
</sec>
<ack><p>We would like to thank Danielle Harvey, Department of Public Health Science, University of California Davis, for statistical advice. Chat GPT free and Chat GPT teams, version 4o were used for manuscript and code editing. The initial version of this manuscript was published as a preprint on arXiv (<xref ref-type="bibr" rid="B32">32</xref>) prior to peer review.</p>
</ack>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s8">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="s9">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fvets.2024.1490030/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fvets.2024.1490030/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Presentation_1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink">
<label>Supplementary Presentation 1</label>
<caption><p>Subheaders and text citations updated to comply with format for other supplementary files. The order of two subsections was changed to comply with edits of the text.</p></caption>
</supplementary-material>
<supplementary-material xlink:href="Presentation_2.pdf" id="SM2" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink">
<label>Supplementary Presentation 2</label>
<caption><p>All figures compiled to a PDF. The order of Fig S1 and S2 changed (to comply with changes in the text). Legends updated to comply with format of main figures.</p></caption></supplementary-material>
<supplementary-material xlink:href="Presentation_3.pdf" id="SM3" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink">
<label>Supplementary Presentation 3</label>
<caption><p>Saved as PDF &#x00026; changes in legends to comply with format for main paper.</p></caption></supplementary-material>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Manuel</surname> <given-names>DG</given-names></name> <name><surname>Rosella</surname> <given-names>LC</given-names></name> <name><surname>Stukel</surname> <given-names>TA</given-names></name></person-group>. <article-title>Importance of accurately identifying disease in studies using electronic health records</article-title>. <source>BMJ.</source> (<year>2010</year>) <volume>341</volume>:<fpage>c4226</fpage>. <pub-id pub-id-type="doi">10.1136/bmj.c4226</pub-id><pub-id pub-id-type="pmid">20724404</pub-id></citation></ref>
<ref id="B2">
<label>2.</label>
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Center</surname> <given-names>for Veterinary Medicine</given-names></name></person-group>. <source>CVM GFI &#x00023;266 Use of Real-World Data and Real-World Evidence to Support Effectiveness of New Animal Drugs</source>. <publisher-loc>FDA</publisher-loc> (<year>2021</year>). Available at: <ext-link ext-link-type="uri" xlink:href="https://www.fda.gov/regulatory-information/search-fda-guidance-documents/cvm-gfi-266-use-real-world-data-and-real-world-evidence-support-effectiveness-new-animal-drugs">https://www.fda.gov/regulatory-information/search-fda-guidance-documents/cvm-gfi-266-use-real-world-data-and-real-world-evidence-support-effectiveness-new-animal-drugs</ext-link> (accessed December 17, 2024).<pub-id pub-id-type="pmid">37029504</pub-id></citation></ref>
<ref id="B3">
<label>3.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ford</surname> <given-names>E</given-names></name> <name><surname>Carroll</surname> <given-names>JA</given-names></name> <name><surname>Smith</surname> <given-names>HE</given-names></name> <name><surname>Scott</surname> <given-names>D</given-names></name> <name><surname>Cassell</surname> <given-names>JA</given-names></name></person-group>. <article-title>Extracting information from the text of electronic medical records to improve case detection: a systematic review</article-title>. <source>J Am Med Inform Assoc.</source> (<year>2016</year>) <volume>23</volume>:<fpage>1007</fpage>&#x02013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1093/jamia/ocv180</pub-id><pub-id pub-id-type="pmid">26911811</pub-id></citation></ref>
<ref id="B4">
<label>4.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>De Angelis</surname> <given-names>L</given-names></name> <name><surname>Baglivo</surname> <given-names>F</given-names></name> <name><surname>Arzilli</surname> <given-names>G</given-names></name> <name><surname>Privitera</surname> <given-names>GP</given-names></name> <name><surname>Ferragina</surname> <given-names>P</given-names></name> <name><surname>Tozzi</surname> <given-names>AE</given-names></name> <etal/></person-group>. <article-title>ChatGPT and the rise of large language models: the new AI-driven infodemic threat in public health</article-title>. <source>Front Public Health.</source> (<year>2023</year>) <volume>11</volume>:<fpage>1166120</fpage>. <pub-id pub-id-type="doi">10.3389/fpubh.2023.1166120</pub-id><pub-id pub-id-type="pmid">37181697</pub-id></citation></ref>
<ref id="B5">
<label>5.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Brown</surname> <given-names>TB</given-names></name> <name><surname>Mann</surname> <given-names>B</given-names></name> <name><surname>Ryder</surname> <given-names>N</given-names></name> <name><surname>Subbiah</surname> <given-names>M</given-names></name> <name><surname>Kaplan</surname> <given-names>J</given-names></name> <name><surname>Dhariwal</surname> <given-names>P</given-names></name> <etal/></person-group>. <article-title>Language models are few-shot learners</article-title>. <source>arXiv</source> [preprint]. (<year>2020</year>). <pub-id pub-id-type="doi">10.48550/arXiv.2005.14165</pub-id></citation>
</ref>
<ref id="B6">
<label>6.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>JP</given-names></name> <name><surname>Staats</surname> <given-names>C</given-names></name> <name><surname>Li</surname> <given-names>W</given-names></name> <name><surname>Szegedy</surname> <given-names>C</given-names></name> <name><surname>Weinberger</surname> <given-names>KQ</given-names></name> <name><surname>Wu</surname> <given-names>Y</given-names></name></person-group>. <article-title>Don&#x00027;t trust: verify &#x02013; grounding LLM quantitative reasoning with autoformalization</article-title>. <source>arXiv</source> [preprint]. (<year>2024</year>). <pub-id pub-id-type="doi">10.48550/arXiv.2403.18120</pub-id></citation>
</ref>
<ref id="B7">
<label>7.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Farquhar</surname> <given-names>S</given-names></name> <name><surname>Kossen</surname> <given-names>J</given-names></name> <name><surname>Kuhn</surname> <given-names>L</given-names></name> <name><surname>Gal</surname> <given-names>Y</given-names></name></person-group>. <article-title>Detecting hallucinations in large language models using semantic entropy</article-title>. <source>Nature.</source> (<year>2024</year>) <volume>630</volume>:<fpage>625</fpage>&#x02013;<lpage>30</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-024-07421-0</pub-id><pub-id pub-id-type="pmid">38898292</pub-id></citation></ref>
<ref id="B8">
<label>8.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fins</surname> <given-names>IS</given-names></name> <name><surname>Davies</surname> <given-names>H</given-names></name> <name><surname>Farrell</surname> <given-names>S</given-names></name> <name><surname>Torres</surname> <given-names>JR</given-names></name> <name><surname>Pinchbeck</surname> <given-names>G</given-names></name> <name><surname>Radford</surname> <given-names>AD</given-names></name> <etal/></person-group>. <article-title>Evaluating ChatGPT text mining of clinical records for companion animal obesity monitoring</article-title>. <source>Vet Rec.</source> (<year>2024</year>) <volume>194</volume>:<fpage>e3669</fpage>. <pub-id pub-id-type="doi">10.1002/vetr.3669</pub-id><pub-id pub-id-type="pmid">38058223</pub-id></citation></ref>
<ref id="B9">
<label>9.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Obuchowski</surname> <given-names>NA</given-names></name></person-group>. <article-title>Sample size calculations in studies of test accuracy</article-title>. <source>Stat Methods Med Res.</source> (<year>1998</year>) <volume>7</volume>:<fpage>371</fpage>&#x02013;<lpage>92</lpage>. <pub-id pub-id-type="doi">10.1191/096228098678080061</pub-id></citation>
</ref>
<ref id="B10">
<label>10.</label>
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Van Rossum</surname> <given-names>G</given-names></name></person-group>. <source>Python documentation.</source> Python Software Foundation. Python 3.9.13 The Python Language Reference (<year>2022</year>). Available at: <ext-link ext-link-type="uri" xlink:href="https://docs.python.org/3/reference/index.html">https://docs.python.org/3/reference/index.html</ext-link> (accessed December 18, 2024).</citation>
</ref>
<ref id="B11">
<label>11.</label>
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Van</surname> <given-names>Rossum G</given-names></name></person-group>. <source>Python Documentation</source>. Python 3.9.13 The Python Standard Library (<year>2022</year>). Available at: <ext-link ext-link-type="uri" xlink:href="https://docs.python.org/3/library/index.html">https://docs.python.org/3/library/index.html</ext-link> (accessed December 18, 2024).</citation>
</ref>
<ref id="B12">
<label>12.</label>
<citation citation-type="web"><person-group person-group-type="author"><collab>OpenAI</collab></person-group>. <source>openai</source> (<year>2024</year>). Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/openai/openai-python">https://github.com/openai/openai-python</ext-link> (accessed December 18, 2024).</citation>
</ref>
<ref id="B13">
<label>13.</label>
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Jain</surname> <given-names>S</given-names></name></person-group>. <source>OpenAI</source>. <publisher-loc>tiktoken</publisher-loc> (<year>2022</year>). Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/openai/tiktoken">https://github.com/openai/tiktoken</ext-link> (accessed December 18, 2024).</citation>
</ref>
<ref id="B14">
<label>14.</label>
<citation citation-type="web"><person-group person-group-type="author"><name><surname>R</surname> <given-names>Core Team</given-names></name></person-group>. <source>R: A Language and Environment for Statistical Computing</source>. <publisher-loc>Vienna</publisher-loc>: <publisher-name>R Foundation for Statistical Computing</publisher-name> (<year>2023</year>). Available at: <ext-link ext-link-type="uri" xlink:href="https://www.R-project.org/">https://www.R-project.org/</ext-link> (accessed April 4, 2024).</citation>
</ref>
<ref id="B15">
<label>15.</label>
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Posit</surname> <given-names>Team</given-names></name></person-group>. <source>R Studio: Integrated Development Environment for R</source>. Boston MA: Posit Software. PBC (<year>2023</year>). Available at: <ext-link ext-link-type="uri" xlink:href="http://www.posit.co">http://www.posit.co</ext-link> (accessed April 4, 2024).</citation>
</ref>
<ref id="B16">
<label>16.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wickham</surname> <given-names>H</given-names></name> <name><surname>Averick</surname> <given-names>M</given-names></name> <name><surname>Bryan</surname> <given-names>J</given-names></name> <name><surname>Chang</surname> <given-names>W</given-names></name> <name><surname>McGowan</surname> <given-names>LD</given-names></name> <name><surname>Fran&#x000E7;ois</surname> <given-names>R</given-names></name> <etal/></person-group>. <article-title>Welcome to the tidyverse</article-title>. <source>JOSS.</source> (<year>2019</year>) <volume>4</volume>:<fpage>1686</fpage>. <pub-id pub-id-type="doi">10.21105/joss.01686</pub-id></citation>
</ref>
<ref id="B17">
<label>17.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gagolewski</surname> <given-names>M</given-names></name></person-group>. <article-title>stringi: fast and portable character string processing in R</article-title>. <source>J Stat Softw.</source> (<year>2022</year>) <volume>103</volume>:<fpage>1</fpage>&#x02013;<lpage>59</lpage>. <pub-id pub-id-type="doi">10.18637/jss.v103.i02</pub-id></citation>
</ref>
<ref id="B18">
<label>18.</label>
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Ooms</surname> <given-names>J</given-names></name></person-group>. <source>writexl: Export Data Frames to Excel &#x0201C;xlsx&#x0201D; Format</source>. (<year>2023</year>). Available at: <ext-link ext-link-type="uri" xlink:href="https://CRAN.R-project.org/package=writexl">https://CRAN.R-project.org/package=writexl</ext-link> (accessed April 4, 2024).</citation>
</ref>
<ref id="B19">
<label>19.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ooms</surname> <given-names>J</given-names></name></person-group>. <article-title>The jsonlite package: a practical and consistent mapping between JSON data and R objects</article-title>. <source>arXiv</source> [preprint]. (<year>2014</year>). <pub-id pub-id-type="doi">10.48550/arXiv.1403.2805</pub-id></citation>
</ref>
<ref id="B20">
<label>20.</label>
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Wickham</surname> <given-names>H</given-names></name> <name><surname>Bryan</surname> <given-names>J</given-names></name></person-group>. <source>readxl: Read Excel Files</source> (<year>2023</year>). Available at: <ext-link ext-link-type="uri" xlink:href="https://CRAN.R-project.org/package=readxl">https://CRAN.R-project.org/package=readxl</ext-link> (accessed April 4, 2024).</citation>
</ref>
<ref id="B21">
<label>21.</label>
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Gamer</surname> <given-names>M</given-names></name> <name><surname>Lemon</surname> <given-names>J</given-names></name> <name><surname>Fellows</surname> <given-names>I</given-names></name> <name><surname>Singh</surname> <given-names>P</given-names></name></person-group>. <source>irr: Various Coefficients of Interrater Reliability and Agreement</source> (<year>2019</year>). Available at: <ext-link ext-link-type="uri" xlink:href="https://CRAN.R-project.org/package=irr">https://CRAN.R-project.org/package=irr</ext-link> (accessed July 25, 2024).</citation>
</ref>
<ref id="B22">
<label>22.</label>
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Sundar</surname> <given-names>DR</given-names></name></person-group>. <source>binom: Binomial Confidence Intervals for Several Parameterizations</source> (<year>2022</year>). Available at: <ext-link ext-link-type="uri" xlink:href="https://CRAN.R-project.org/package=binom">https://CRAN.R-project.org/package=binom</ext-link> (accessed July 25, 2024).</citation>
</ref>
<ref id="B23">
<label>23.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Brunson</surname> <given-names>JC</given-names></name></person-group>. <article-title>ggalluvial: layered grammar for alluvial plots</article-title>. <source>J Open Source Softw.</source> (<year>2020</year>) <volume>5</volume>:<fpage>2017</fpage>. <pub-id pub-id-type="doi">10.21105/joss.02017</pub-id><pub-id pub-id-type="pmid">36919162</pub-id></citation></ref>
<ref id="B24">
<label>24.</label>
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Neuwirth</surname> <given-names>E</given-names></name></person-group>. <source>RColorBrewer: ColorBrewer Palettes</source> (<year>2022</year>). Available at: <ext-link ext-link-type="uri" xlink:href="https://CRAN.R-project.org/package=RColorBrewer">https://CRAN.R-project.org/package=RColorBrewer</ext-link> (accessed April 4, 2024).</citation>
</ref>
<ref id="B25">
<label>25.</label>
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Augui&#x000E9;</surname> <given-names>B</given-names></name></person-group>. <source>gridExtra: Miscellaneous Functions for &#x0201C;Grid&#x0201D; Graphics</source> (<year>2017</year>). Available at: <ext-link ext-link-type="uri" xlink:href="https://CRAN.R-project.org/package=gridExtra">https://CRAN.R-project.org/package=gridExtra</ext-link> (accessed July 25, 2024).</citation>
</ref>
<ref id="B26">
<label>26.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Marsilio</surname> <given-names>S</given-names></name> <name><surname>Freiche</surname> <given-names>V</given-names></name> <name><surname>Johnson</surname> <given-names>E</given-names></name> <name><surname>Leo</surname> <given-names>C</given-names></name> <name><surname>Langerak</surname> <given-names>AW</given-names></name> <name><surname>Peters</surname> <given-names>I</given-names></name> <etal/></person-group>. <article-title>ACVIM consensus statement guidelines on diagnosing and distinguishing low-grade neoplastic from inflammatory lymphocytic chronic enteropathies in cats</article-title>. <source>J Vet Intern Med.</source> (<year>2023</year>) <volume>37</volume>:<fpage>794</fpage>&#x02013;<lpage>816</lpage>. <pub-id pub-id-type="doi">10.1111/jvim.16690</pub-id><pub-id pub-id-type="pmid">37130034</pub-id></citation></ref>
<ref id="B27">
<label>27.</label>
<citation citation-type="web"><person-group person-group-type="author"><collab>Azure OpenAI</collab></person-group>. <source>Azure OpenAI Service REST API Reference.</source> Azure OpenAI (<year>2024</year>). Available at: <ext-link ext-link-type="uri" xlink:href="https://learn.microsoft.com/en-us/azure/ai-services/openai/reference">https://learn.microsoft.com/en-us/azure/ai-services/openai/reference</ext-link> (accessed June 24, 2024).</citation>
</ref>
<ref id="B28">
<label>28.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Newcombe</surname> <given-names>RG</given-names></name></person-group>. <article-title>Two-sided confidence intervals for the single proportion: comparison of seven methods</article-title>. <source>Stat Med</source>. (<year>1998</year>) <volume>17</volume>:<fpage>857</fpage>&#x02013;<lpage>72</lpage>. <pub-id pub-id-type="doi">10.1002/(SICI)1097-0258(19980430)17:8&#x0003C;857::AID-SIM777&#x0003E;3.0.CO;2-E</pub-id></citation>
</ref>
<ref id="B29">
<label>29.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zurita</surname> <given-names>AW</given-names></name> <name><surname>Miras Del Rio</surname> <given-names>H</given-names></name> <name><surname>Ruiz De Aguirre</surname> <given-names>NU</given-names></name> <name><surname>Navarro</surname> <given-names>CN</given-names></name> <name><surname>Jim&#x000E9;nez</surname> <given-names>MR</given-names></name> <name><surname>Carmona</surname> <given-names>DM</given-names></name> <etal/></person-group>. <article-title>The transformative potential of large language models in mining electronic health records data</article-title>. <source>medRxiv</source> [preprint] (<year>2024</year>). <pub-id pub-id-type="doi">10.1101/2024.03.07.24303588</pub-id><pub-id pub-id-type="pmid">39746191</pub-id></citation></ref>
<ref id="B30">
<label>30.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chu</surname> <given-names>CP</given-names></name></person-group>. <article-title>ChatGPT in veterinary medicine: a practical guidance of generative artificial intelligence in clinics, education, and research</article-title>. <source>Front Vet Sci</source>. (<year>2024</year>) <volume>11</volume>:<fpage>1395934</fpage>. <pub-id pub-id-type="doi">10.3389/fvets.2024.1395934</pub-id><pub-id pub-id-type="pmid">38911678</pub-id></citation></ref>
<ref id="B31">
<label>31.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Guevara</surname> <given-names>M</given-names></name> <name><surname>Chen</surname> <given-names>S</given-names></name> <name><surname>Thomas</surname> <given-names>S</given-names></name> <name><surname>Chaunzwa</surname> <given-names>TL</given-names></name> <name><surname>Franco</surname> <given-names>I</given-names></name> <name><surname>Kann</surname> <given-names>BH</given-names></name> <etal/></person-group>. <article-title>Large language models to identify social determinants of health in electronic health records</article-title>. <source>npj Digit Med.</source> (<year>2024</year>) <volume>7</volume>:<fpage>1</fpage>&#x02013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1038/s41746-023-00970-0</pub-id><pub-id pub-id-type="pmid">38200151</pub-id></citation></ref>
<ref id="B32">
<label>32.</label>
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Wulcan</surname> <given-names>JM</given-names></name> <name><surname>Jacques</surname> <given-names>KL</given-names></name> <name><surname>Lee</surname> <given-names>MA</given-names></name> <name><surname>Kovacs</surname> <given-names>SL</given-names></name> <name><surname>Dausend</surname> <given-names>N</given-names></name> <name><surname>Prince</surname> <given-names>LE</given-names></name> <etal/></person-group>. <article-title>Classification performance and reproducibility of GPT-4 omni for information extraction from veterinary electronic health records [Internet]</article-title>. <source>arXiv</source> (<year>2024</year>). Available at: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2409.13727">http://arxiv.org/abs/2409.13727</ext-link> (accessed September 23, 2024).</citation>
</ref>
</ref-list>
</back>
</article> 