<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2026.1658575</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Improving reliability and accuracy of structured data extraction using a consensus large-language model approach&#x2013;a use case description in multiple sclerosis</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Poser</surname>
<given-names>Philip Lennart</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3120948"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Klimas</surname>
<given-names>Rafael</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Luerweg</surname>
<given-names>Justus</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Reuter</surname>
<given-names>Emilie</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hanefeld</surname>
<given-names>Christoph</given-names>
</name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Gold</surname>
<given-names>Ralf</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/670489/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Salmen</surname>
<given-names>Anke</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn0001"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/512226"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Motte</surname>
<given-names>Jeremias</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn0001"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/772693"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Neurology, St. Josef-Hospital, Ruhr-University Bochum</institution>, <city>Bochum</city>, <country country="DE">Germany</country></aff>
<aff id="aff2"><label>2</label><institution>Department of Internal Medicine, Katholisches Klinikum Bochum, Ruhr-University Bochum</institution>, <city>Bochum</city>, <country country="DE">Germany</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Philip Lennart Poser, <email xlink:href="mailto:philip.poser@rub.de">philip.poser@rub.de</email></corresp>
<fn fn-type="equal" id="fn0001">
<label>&#x2020;</label>
<p>These authors share senior authorship</p>
</fn>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-13">
<day>13</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>9</volume>
<elocation-id>1658575</elocation-id>
<history>
<date date-type="received">
<day>02</day>
<month>07</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>04</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>28</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Poser, Klimas, Luerweg, Reuter, Hanefeld, Gold, Salmen and Motte.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Poser, Klimas, Luerweg, Reuter, Hanefeld, Gold, Salmen and Motte</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-13">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Background</title>
<p>The absence of standardization in the documentation of routine clinical data complicates research usage of retrospective data on a large-scale basis. Medically trained personnel are required for interpretation and conversion into a structured format, making it time and cost intensive and creating a potential bias of such data. To address these challenges, we have developed a semi-automated approach for evaluating Multiple Sclerosis (MS) outpatient reports that utilizes different large-language models (LLM) and their consensus in comparison to manual evaluation.</p>
</sec>
<sec>
<title>Methods</title>
<p>We used several commercially available LLMs by OpenAI, Anthropic and Google to create a structured output of several variables with differing complexity of 30 anonymized outpatient reports with zero-shot-learning. We added a consensus output by combining the results of three different LLMs. Over several runs, we adapted the prompt, compared the results with a reference and assessed the error rate. Any deviation from the reference was considered an error. A true-error rate was determined for the LLM consensus output and the neurology specialist output, where only content deviations are counted as errors.</p>
</sec>
<sec>
<title>Results</title>
<p>Through 9 iterations of improving the structure and content of the prompt, we have seen a clear reduction in the error rate of the various LLMs. By creating an LLM consensus with the final prompt design, we were able to overcome a ceiling effect in reducing the error rate. With a true-error rate of 1.48%, the LLM consensus shows a similar error rate as neurologists (around 2%) in the creation of structured data.</p>
</sec>
<sec>
<title>Discussion</title>
<p>Our method enables fast and reliable LLM-based analysis of large clinical routine data sets of varying complexity with a low technical barrier to entry. By generating an LLM consensus, we were able to considerably improve the quality of the output making it comparable to data created by neurology specialists. This approach allows large amounts of unstructured data to be analyzed in a time and cost-efficient manner. Nevertheless, the evaluation of errors in results produced by LLM remains difficult. Scientific work using such methods must continue to be subject to strict testing of the validity of the method in the future.</p>
</sec>
</abstract>
<kwd-group>
<kwd>data extraction</kwd>
<kwd>large language model</kwd>
<kwd>multiple sclerosis</kwd>
<kwd>neurology</kwd>
<kwd>real world evidence</kwd>
<kwd>structured data</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="3"/>
<table-count count="3"/>
<equation-count count="0"/>
<ref-count count="26"/>
<page-count count="10"/>
<word-count count="8070"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Medicine and Public Health</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<p>The absence of structured documentation of clinical data is a relevant barrier to the collection and use of real-world data from medical care for scientific purposes. Most of the documentation of routine clinical data and findings is unstructured, e.g., medical history, daily visit documentation and diagnostic reports. In contrast, structured data are data that can be stored in a data organization tool such as a spreadsheet or a database. Both correctness of the content and correctness of the format are crucial for the scientific and statistical use of structured data. Information within routine clinical data is often not explicit, but only indirectly described. For example, real-world formulations to indicate the start of a treatment include vague time information such as &#x201C;In spring 2022,&#x201D; &#x201C;Mid-February 2023&#x201D; or &#x201C;The next MRI (magnetic resonance imaging) examination is planned 6&#x202F;months after start of treatment.&#x201D; Assumptions between different parts of a report can be drawn with sufficient accuracy, e.g., if a person with Multiple Sclerosis (MS) has an Expanded Disability Status Scale (EDSS) score of less than 2.0, by definition there can be no restriction of walking distance, even if this is not explicitly stated elsewhere. The structure, the information provided and the style of, e.g., medical reports depend on various factors, such as the given hospital IT infrastructure or personal habits of the physician.</p>
<p>Therefore, the extraction of data from clinical routine is often associated with data interpretation and conversion &#x2013; so called &#x201C;data transformation&#x201D; and &#x201C;data aggregation&#x201D; &#x2013; making it a complex task (<xref ref-type="bibr" rid="ref5">Capurro et al., 2014</xref>; <xref ref-type="bibr" rid="ref1">Adnan et al., 2020</xref>). The analysis of routine clinical data is frequently a very labor-intensive process requiring skilled human resources to review the data and convert it into a structured format (<xref ref-type="bibr" rid="ref22">Tayefi et al., 2021</xref>). An analysis of further parameters in this context often requires a repetition of the data review augmenting the workload. The rich data source derived from clinical routine is thus under-used and usually focused on sub-cohorts of special interest for a specific research question. This may represent a relevant source of bias in the analysis of real-world data (RWD) (<xref ref-type="bibr" rid="ref20">Sherman et al., 2016</xref>; <xref ref-type="bibr" rid="ref8">Ehrenstein et al., 2024</xref>).</p>
<p>Large-language models (LLMs) such as ChatGPT (OpenAI), Claude (Anthropic), Gemini (Google), Llama (Meta) and others represent an attractive artificial intelligence (AI-) supported solution to transfer text-based data sources into structured ready-to-use data for further analysis in various applications (<xref ref-type="bibr" rid="ref7">Dagdelen et al., 2024</xref>; <xref ref-type="bibr" rid="ref26">Woznicki et al., 2025</xref>). Since the concept behind LLMs is the understanding and reproduction of natural language and not the output of structured data, the quality and ability to output structured data differs depending on the model and provider (<xref ref-type="bibr" rid="ref12">Liu et al., 2024</xref>). The concept of using LLMs in the context of medical care is currently a rapidly expanding field of research (<xref ref-type="bibr" rid="ref10">Gencer and Gencer, 2025</xref>).</p>
<p>We set out to establish an LLM-based approach to the analysis of real-world, unstructured outpatient reports, analyzing variables of varying complexity in the field of neurology. In this study, we tested the options for outputting structured data within different LLMs. In a second step, we tested different methods in an MS use case to enable timely assessment of large retrospective datasets in MS. We aimed for a practical, easy-to-use approach for clinician-scientists to obtain structured data and use it in a scientific or clinical context, e.g., quality assurance. Our research is therefore not intended to represent a definitive methodology for extracting structured data from medical records, but rather to help develop an appropriate approach that can be implemented in one&#x2019;s own clinical research.</p>
</sec>
<sec sec-type="methods" id="sec2">
<label>2</label>
<title>Methods</title>
<sec id="sec3">
<label>2.1</label>
<title>Source data</title>
<p>Outpatient reports in German language, generated by eight different neurologists, were extracted from the clinical information system filtering for reports from our MS clinic of visits between 01-Jan-2023 to 31-Dec-2023 of an academic hospital with neuroimmunological focus. Reports were only included if the visit and report date matched. Reports of 30 patients were randomly selected, manually anonymized and used for primary LLM prompt analysis and reiteratively used for refinement of the prompt.</p>
</sec>
<sec id="sec4">
<label>2.2</label>
<title>Three-step human and LLM evaluation of source data</title>
<p>Data of interest were defined in a tabular form for human report evaluation. A first-generation prompt in plain language was generated to query the LLMs.</p>
<p>Medical data were analyzed in one run by trained medical personnel (TMP) and the LLMs (<xref ref-type="fig" rid="fig1">Figure 1a</xref>, orange path). The results of the LLMs were compared with those of the TMP by neurology specialists (NSPs) (<xref ref-type="fig" rid="fig1">Figure 1a</xref>, blue path). The latter extracted their structured results based on the reports, the prior human answers and LLM answers from the first prompt. These data were analyzed and a structured human reference was developed in agreement with the working group consisting of two specialists and two assistant physicians from the field of neurology (<xref ref-type="fig" rid="fig1">Figure 1a</xref>, green path). During the creation of the reference data, a unanimous decision was made for each variable. This reference was then used to iteratively improve the prompt (<xref ref-type="fig" rid="fig1">Figure 1b</xref>).</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p><bold>(a)</bold> Creation of the human reference by trained medical staff (orange) and neurology specialists (blue) and creation of the reference results (green) from the first prompt, <bold>(b)</bold> iterative improvement of the prompt until satisfactory result, <bold>(c)</bold> creation of an LLM consensus.</p>
</caption>
<graphic xlink:href="frai-09-1658575-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Figure with three flowcharts. Panel a shows 30 neurological outpatient reports processed by trained personnel and four AI models to generate structured results, with a working group reviewing results to establish a reference for neurology specialists. Panel b depicts an iterative workflow of generating structured output, comparing results to a reference, updating prompt design, and finalizing the prompt. Panel c shows three AI models processing the same 30 reports to provide structured results, which are combined into a consensus structured result.</alt-text>
</graphic>
</fig>
<p>The plain language prompt was converted into a JavaScript Object Notation (JSON)-format for the structured output functions of the different LLMs, which allows responses to also be returned in JSON format.</p>
</sec>
<sec id="sec5">
<label>2.3</label>
<title>Variables analyzed</title>
<p>We used a mix of different variable types with mixed complexity. To obtain an assessment of complexity, the working group in consensus gave a subjective assessment of the complexity of a variable, ranging from low-complex, medium-complex to high-complex. The variables analyzed were all related to the disease MS but were not specifically selected for the application of an LLM-supported evaluation. The selection of variables represents a classic evaluation of a retrospective data set in MS. The following 19 variables were examined: diagnosis (low), disease course (low), date of first manifestation (medium), date of first diagnosis (medium), current EDSS (low), oligoclonal bands (OCBs) status (low), Aquaporin-4(AQP4)-antibody (AB) status (low), myelin-oligodendrocyte-glycoprotein(MOG)-AB status (low), last cranial MRI (cMRI) date (medium), cMRI activity (medium), own cMRI report interpretation (high), current immunotherapy as active substance name (low), start date of current immunotherapy (medium), previous immunotherapies as active substance name (medium), other diagnoses (medium), comedication (low), walking distance (high) and walking aid (medium).</p>
<p>In the evaluation of the creation of structured outputs, we also considered the following additional variables: functional scores (FS) of the EDSS, results of the Multiple Sclerosis Functional Composite (MSFC). In total, all of these latter 13 variables were only mentioned in a very small proportion of the medical records. Therefore, we excluded them from the content analysis. We also excluded the question &#x201C;current neurological symptoms&#x201D; from the content analysis as a reliable and distinct evaluation was not possible due to the widely differing wording of the structured results.</p>
</sec>
<sec id="sec6">
<label>2.4</label>
<title>LLMs</title>
<p>We decided to use commercial LLMs because they allow access to models with a higher number of parameters without requiring extensive infrastructure. From our perspective, the use of commercial LLMs reflects the simplest approach to generating structured output with limited financial resources and without significant technical effort. In this study, we used the following commercially available versions of LLMs to analyze the data: claude-3-haiku-20240307 (Haiku; Anthropic), claude-3-opus-20240229 (Opus; Anthropic), claude-3-7-sonnet-20250219 (Sonnet-3.7; Anthropic), gemini-1.5-flash-002 (Gemini-1.5-flash; Google), gemini-1.5-pro-002 (Gemini-1.5-pro; Google), gemini-2.0-pro-exp-02-05 (Gemini-2-pro; Google), gpt-4o-2024-08-06 (4o; OpenAI), gpt-4o-mini-2024-07-18 (4o-mini; OpenAI) and o3-mini-2025-01-31 (o3-mini; OpenAI). The LLMs were accessed via the manufacturers&#x2019; application programming interfaces (APIs) when not stated otherwise. All requests were made with a temperature value of 0.7 and a top <italic>p</italic> value of 1 if applicable. Wherever possible, the data were analyzed as a batch analysis. The order of the models within this paper is based on alphabetical order and does not represent a qualitative ranking. Models are always listed in the same order as they are listed in the methods. We used the same prompt for the queries to the different LLMs. All requests for the final analysis were made between 22nd of March 2025 to 28th of March 2025.</p>
<p>For the consensus LLM output, we compared the output of the Sonnet-3.7, Gemini-2-pro and o3-mini models (<xref ref-type="fig" rid="fig1">Figure 1c</xref>). At the time of writing, these models were considered &#x201C;flagship&#x201D; models, offering the highest range of functionality and the best performance in the company&#x2019;s own benchmarks. The idea was to use LLMs from different providers, as these have different software architectures and different training data sets. To be included, the output of two or more models had to match. If no consensus could be reached, the output was excluded from the analysis and not counted as an error.</p>
</sec>
<sec id="sec7">
<label>2.5</label>
<title>Error evaluation</title>
<p>Errors of structure: Within the generation of a structured output, any deviation from the desired data format was considered an error. The accuracy of the content of the responses was not a factor in the evaluation of the correctness of the format.</p>
<p>Errors of content: Within the content evaluation, all deviations from the human consensus reference were evaluated as errors. All variables of the consensus output and the NSPs were individually compared with regard to the quality/severity of the error.</p>
<p>Statistical differences between the various evaluations were tested using Fisher&#x2019;s exact test.</p>
<p>For clarity, in addition to the number of errors, we have also specified an accuracy, which is calculated as 1-(number of errors/number of variables analyzed). Based on the variables collected, it was not possible to categorize the responses of the LLMs into true positives, false positives, true negatives, and false negatives. Therefore, a conscious decision was made not to calculate precision, recall, and F1 scores, and instead to use accuracy as a measure.</p>
</sec>
<sec id="sec8">
<label>2.6</label>
<title>Definition of a &#x201C;true-error-rate&#x201D;</title>
<p>The following definition was used to assess whether an error was a true-error:<list list-type="bullet">
<list-item>
<p>&#x201C;Enumerations&#x201D;: In the case of defined values from a set of possibilities, all deviations from the required answer were considered errors.</p>
</list-item>
<list-item>
<p>&#x201C;Dates&#x201D;: In the case of dates, all data that deviated by more than 1&#x202F;month from the specified date were considered incorrect. This is because wording in doctors&#x2019; letters such as &#x201C;at the end of the month&#x201D; leaves room for interpretation. A deviation of 1&#x202F;month did not represent a deviation for the variables we analyzed that would significantly distort the outcome of a statistical analysis.</p>
</list-item>
<list-item>
<p>For all other content errors, the comprehensibility of the deviation from the required answer was checked. If an answer could be verified with the help of the doctor&#x2019;s letter, it was considered correct. An example of such a case is the EDSS. If the score was not clearly stated, it could be determined based on the clinical examination findings and the medical history. This leaves some room for interpretation. If the determination of an EDSS value could be verified based on the medical history and the examination findings, it was considered correct.</p>
</list-item>
</list></p>
</sec>
<sec id="sec9">
<label>2.7</label>
<title>Ethical considerations</title>
<p>Retrospective chart analysis within our monocentric neuroimmunological registry has been approved by the ethics committee Westfalen-Lippe, Germany (registration number 2024-590-f-S). Strict data anonymization has been performed prior to any usage of the data, in particular prior to data entry into either of the LLMs.</p>
</sec>
</sec>
<sec sec-type="results" id="sec10">
<label>3</label>
<title>Results</title>
<sec id="sec11">
<label>3.1</label>
<title>Generating structured outputs</title>
<p>A first hurdle in the use of LLMs for the evaluation of routine clinical data is the reliable generation of a structured output. There are two different ways to generate a structured output: The first way is via a plain text prompt and the second way is via a built-in function of the LLM. This can be, for example, a function calling function or a structured output function, which is supported by most of the LLM providers.</p>
<p>As a proof-of-concept, we first tested the generation of structured outputs using the web interface of 4o-mini. We used a text-only prompt to generate an Excel spreadsheet. Our first observation was that due to the context window, we were limited in the number of medical records we could provide to the LLM. If we used a prompt at the beginning and pasted the medical records afterwards, the LLM would lose track of its task and the output would not comply with the task. We could observe that handing over the prompt with each medical record made the output much more reliable. In 2 out of 30 cases, the LLM was unable to produce an output which could be easily fixed by re-handing the task to the LLM.</p>
<p>As a first step, we wanted to generate a constant output of the correct columns. The columns of the spreadsheet mainly contained 3 different error types deviating from the desired format: A deviation from the column naming, a deviation from the number of columns (omitting or adding columns different from the prompt) and a deviation from the requested order. To reduce these errors, the first step was to adapt the prompt with an explicit reference to compliance with the structure. Contrary to our expectations, this increased the number of errors in 2 out of 3 error types. Overall, we were unable to generate a pure text prompt that would allow a reliable, uniformly structured output of the data (<xref ref-type="table" rid="tab1">Table 1</xref>).</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Number of structural errors of the columns in the generation of 30 structured outputs.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Prompt</th>
<th align="center" valign="top">Change of column order</th>
<th align="center" valign="top">Omitting of columns</th>
<th align="center" valign="top">Incorrect naming of columns</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="bottom">Unmodified prompt</td>
<td align="center" valign="bottom">30</td>
<td align="center" valign="bottom">18</td>
<td align="center" valign="bottom">113</td>
</tr>
<tr>
<td align="left" valign="bottom">Prompt with enforced structure</td>
<td align="center" valign="bottom">21</td>
<td align="center" valign="bottom">21</td>
<td align="center" valign="bottom">147</td>
</tr>
<tr>
<td align="left" valign="bottom">Structured output function</td>
<td align="center" valign="bottom">0</td>
<td align="center" valign="bottom">0</td>
<td align="center" valign="bottom">0</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The Prompts used were: 1. A text-only prompt without special emphasis on structured output, 2. A text-only prompt with special emphasis on structured output and 3. A prompt using the structured output function of the LLM. The test was performed with the LLM 4o-mini.</p>
</table-wrap-foot>
</table-wrap>
<p>As a second step, we converted the plain text prompt to a JSON format and used the built-in functions of different providers&#x2019; APIs to achieve a structured output. Many providers of LLMs offer a corresponding function to obtain a spreadsheet-like JSON output. With the help of structured output functions, we were able to drastically reduce the rate of faulty column outputs so that no more errors occurred in the structure (<xref ref-type="table" rid="tab1">Table 1</xref>). From there on, we tested our prompt with the 6 most commonly used LLMs at that time point: 4o, 4o-mini, Gemini-1.5-pro, Gemini-1.5-flash, Opus and Haiku. All further evaluations were carried out using the structured output functions.</p>
</sec>
<sec id="sec12">
<label>3.2</label>
<title>Improving the output data formats of LLMs</title>
<p>When generating a structured output through the methods described above we observed an important problem regarding possible further downstream analysis of the data: Data types often did not match the desired format (e.g., date formats, number formats etc.). In the next step, we therefore concentrated on generating consistent data formats.</p>
<p>In the first step, we started by detecting faulty data types and correcting them as best we could. The first option available for this within the structured output functions is to specify an expected data type. By setting the type of the data, we were able to drastically reduce incorrect data types. Although many LLMs allow the use of a common schema object, not all providers support the same functions. As the LLMs &#x201C;Haiku&#x201D; and &#x201C;Opus&#x201D; do not have a specific structured output function (but can be forced to create JSON files by forcing the use of tools) and therefore were not modified by specifying a return data type, we excluded them from further analysis.</p>
<p>Another way to improve the creation of a structured data set is to use enumerations. With the help of these, the LLM can be given certain answer options from which to choose. In our example, we searched for diseases in the context of multiple sclerosis. By limiting the possible answers to a known set of values, we were able to further reduce the number of data type errors.</p>
<p>As a last step, we adjusted the description of the task within the prompt and the system prompt and further emphasized the importance of sticking to the structured output.</p>
<p>By using these methods, we were able to reduce the rate of incorrect data formats by up to 88 percent (<xref ref-type="table" rid="tab2">Table 2</xref>). Nevertheless, even after several iterations of improvement, we were not able to generate completely error-free outputs for the three initially tested models with regard to the data format.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Number of errors produced in the different iterations of the improvement depending on the executor used.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th>Prompt modification</th>
<th align="center" valign="top">Haiku</th>
<th align="center" valign="top">Opus</th>
<th align="center" valign="top">Gemini-1.5-flash</th>
<th align="center" valign="top">Gemini-1.5-pro</th>
<th align="center" valign="top">Gemini-2.0-pro</th>
<th align="center" valign="top">4o</th>
<th align="center" valign="top">4o-mini</th>
<th align="center" valign="top">o3-mini</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">String-only return</td>
<td align="center" valign="top">100</td>
<td align="center" valign="top">12</td>
<td align="center" valign="top">81</td>
<td align="center" valign="top">32</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">8</td>
<td align="center" valign="top">9</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="top">Set return type</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">52</td>
<td align="center" valign="top">19</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="top">Set enum values</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">42</td>
<td align="center" valign="top">18</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="top">Revised description</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">37</td>
<td align="center" valign="top">13</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="top">Revised system prompt</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">34</td>
<td align="center" valign="top">12</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">0</td>
</tr>
<tr>
<td align="left" valign="top">Error reduction</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">58.02%</td>
<td align="center" valign="top">62.50%</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">87.50%</td>
<td align="center" valign="top">88.89%</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>&#x201C;&#x2013;&#x201D; indicates that an analysis was not conducted. String-only return: Use of the structured output function without defining a specific return data format, Set return type: As before, only using a set return data format, Set enum values: As before, only using specific predefined return values, Revised description: As before, only using a different description within the prompt with regard to the data format, Revised system prompt: As before, only using a different system prompt, Error reduction: Percentage reduction in errors from the first iteration.</p>
</table-wrap-foot>
</table-wrap>
<p>As a final step, we tested our JSON-format prompt with the newer LLMs o3-mini, Gemini-2.0-pro and Sonnet-3.7. Using these models, we could not detect any errors with regards to the structure.</p>
<p>With the help of the first two steps, we were able to create a prompt that was able to deliver mostly consistent results in terms of output. As we could already see in the first steps that both GPT-4o-mini and Gemini-1.5-flash performed the same as or worse than the larger LLMs in terms of output structure, we did not carry out the further analysis with these two. However, as the newer LLMs o3-mini, Gemini-2.0-experimental and Sonnet-3.7 performed better in terms of structured output, we also performed our evaluations with them.</p>
</sec>
<sec id="sec13">
<label>3.3</label>
<title>Improving and comparing output quality of LLMs</title>
<p>To evaluate content quality of the outputs, we compared the different models with a human reference created for the data set. Overall, we found that errors in the TMP control and in the NSP control were lower than in the LLM evaluations. Yet, the error types within the different variables were very similar: LLMs tended to make similar errors to the human control. In particular, the variables &#x201C;interpretation of cMRI activity&#x201D; (incorrect interpretation), &#x201C;current immunotherapy&#x201D; and &#x201C;previous immunotherapy&#x201D; (indication of drugs instead of drug names) and &#x201C;walking aid&#x201D; (distinction between missing and no walking aid) caused diverging results. Interestingly, the LLM-based output highlighted certain inaccuracies in the human reference evaluation by TMPs which was revealed by the evaluation of the results by the NSPs, particularly in complex variables such as MRI interpretation and treatment timelines (<xref ref-type="fig" rid="fig2">Figure 2</xref>; <xref ref-type="table" rid="tab3">Table 3</xref>).</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Heatmap of the error rate when evaluating the data using the first prompt in percent for the variables: a: Diagnosis, b: Disease course, c: Date of first manifestation, d: Date of first diagnosis, e: Current EDSS, f: OCB status, g: AQP4-AB status, h: MOG-AB status, i: Last cMRI date, j: cMRI activity, k: Own cMRI report interpretation, l: Current immunotherapy as active substance name, m: Start date of current immunotherapy, n: Previous immunotherapies as active substance, o: Other diagnoses, p: Comedication, q: Walking distance, r: Walking aid, s: Total errors. Especially the variables &#x201C;current immunotherapy&#x201D; and &#x201C;previous immunotherapies&#x201D; contained the most content errors in the evaluation by the LLMs.</p>
</caption>
<graphic xlink:href="frai-09-1658575-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Heat map comparing models Sonnet 3.7, Gemini 1.5 pro, Gemini 2.0 pro, 4o, o3-mini, TMP, and NSP across categories labeled a to s. Color intensity ranges from purple to yellow, indicating values from zero to one.</alt-text>
</graphic>
</fig>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>Number of errors depending on the prompt and LLM model used.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top" rowspan="2">Variable</th>
<th align="center" valign="top" colspan="7">First prompt</th>
<th align="center" valign="top" colspan="6">Last prompt</th>
</tr>
<tr>
<th align="center" valign="top">Sonnet 3.7</th>
<th align="center" valign="top">Gemini 1.5 pro</th>
<th align="center" valign="top">Gemini 2.0 pro</th>
<th align="center" valign="top">4o</th>
<th align="center" valign="top">o3-mini</th>
<th align="center" valign="top">TMP</th>
<th align="center" valign="top">NSP</th>
<th align="center" valign="top">Sonnet 3.7</th>
<th align="center" valign="top">Gemini 1.5 pro</th>
<th align="center" valign="top">Gemini 2.0 pro</th>
<th align="center" valign="top">4o</th>
<th align="center" valign="top">o3-mini</th>
<th align="center" valign="top">LLM consensus</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Diagnosis</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
</tr>
<tr>
<td align="left" valign="top">Disease course</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">1</td>
</tr>
<tr>
<td align="left" valign="top">Date of first manifestation</td>
<td align="center" valign="top">5</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">8</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">5</td>
<td align="center" valign="top">8</td>
<td align="center" valign="top">7</td>
<td align="center" valign="top">7</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">3</td>
</tr>
<tr>
<td align="left" valign="top">Date of first diagnosis</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">1</td>
</tr>
<tr>
<td align="left" valign="top">Current EDSS</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">0</td>
</tr>
<tr>
<td align="left" valign="top">OCB status</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
</tr>
<tr>
<td align="left" valign="top">AQP4-AB status</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
</tr>
<tr>
<td align="left" valign="top">MOG-AB status</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">0</td>
</tr>
<tr>
<td align="left" valign="top">Last cMRI date</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">10</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">1</td>
</tr>
<tr>
<td align="left" valign="top">cMRI activity</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">5</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">0</td>
</tr>
<tr>
<td align="left" valign="top">Own cMRI report interpretation</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">5</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">7</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">5</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">1</td>
</tr>
<tr>
<td align="left" valign="top">Current immunotherapy as active substance name</td>
<td align="center" valign="top">12</td>
<td align="center" valign="top">14</td>
<td align="center" valign="top">12</td>
<td align="center" valign="top">13</td>
<td align="center" valign="top">9</td>
<td align="center" valign="top">11</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">2</td>
</tr>
<tr>
<td align="left" valign="top">Start date of current immunotherapy</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">1</td>
</tr>
<tr>
<td align="left" valign="top">Previous immunotherapies as active substance name</td>
<td align="center" valign="top">20</td>
<td align="center" valign="top">21</td>
<td align="center" valign="top">18</td>
<td align="center" valign="top">19</td>
<td align="center" valign="top">15</td>
<td align="center" valign="top">20</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">12</td>
<td align="center" valign="top">10</td>
<td align="center" valign="top">12</td>
<td align="center" valign="top">8</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">7</td>
</tr>
<tr>
<td align="left" valign="top">Other diagnoses</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">5</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">7</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">8</td>
<td align="center" valign="top">7</td>
<td align="center" valign="top">8</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">4</td>
</tr>
<tr>
<td align="left" valign="top">Comedication</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">10</td>
<td align="center" valign="top">5</td>
<td align="center" valign="top">5</td>
<td align="center" valign="top">5</td>
<td align="center" valign="top">5</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">5</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">9</td>
<td align="center" valign="top">7</td>
<td align="center" valign="top">11</td>
<td align="center" valign="top">4</td>
</tr>
<tr>
<td align="left" valign="top">Walking distance</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">7</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">7</td>
<td align="center" valign="top">7</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">9</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">5</td>
</tr>
<tr>
<td align="left" valign="top">Walking aid</td>
<td align="center" valign="top">9</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">7</td>
<td align="center" valign="top">8</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">5</td>
<td align="center" valign="top">0</td>
<td align="center" valign="top">5</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">7</td>
<td align="center" valign="top">6</td>
</tr>
<tr>
<td align="left" valign="top">Total errors</td>
<td align="center" valign="top">71</td>
<td align="center" valign="top">75</td>
<td align="center" valign="top">72</td>
<td align="center" valign="top">95</td>
<td align="center" valign="top">60</td>
<td align="center" valign="top">79</td>
<td align="center" valign="top">22</td>
<td align="center" valign="top">58</td>
<td align="center" valign="top">64</td>
<td align="center" valign="top">60</td>
<td align="center" valign="top">67</td>
<td align="center" valign="top">52</td>
<td align="center" valign="top">36</td>
</tr>
<tr>
<td align="left" valign="top">Accuracy</td>
<td align="center" valign="top">86.85%</td>
<td align="center" valign="top">86.11%</td>
<td align="center" valign="top">86.67%</td>
<td align="center" valign="top">82.41%</td>
<td align="center" valign="top">88.89%</td>
<td align="center" valign="top">85.37%</td>
<td align="center" valign="top">95.93%</td>
<td align="center" valign="top">89.26%</td>
<td align="center" valign="top">88.15%</td>
<td align="center" valign="top">88.89%</td>
<td align="center" valign="top">87.59%</td>
<td align="center" valign="top">90.37%</td>
<td align="center" valign="top">93.30%</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Number of errors for the different variables depending on the prompt and LLM model used. In addition, accuracy is given as the proportion of correct answers out of all answers.</p>
</table-wrap-foot>
</table-wrap>
<p>We have further improved the prompt based on this feedback from the NSPs. The main changes include: The explicit specification of criteria for cMRI interpretation, the explicit naming of active substances and their trade names with the enforced request to name only active substances and the explanation to interpret the walking distance and walking aid also in the context of the remaining examination findings and the EDSS. Thereby, we were able to reduce the number of errors by more than 25%, from 15% to 11.1% on average. As this was the maximum we could achieve under several iterations of improvement, we compared an &#x201C;LLM consensus&#x201D; response with our reference as described in the methods part. Overall, no consensus could be found for 3 values out of a total of 540. Using this method, we were able to decrease the error rate by around another 30% compared to the best performing LLM. This method enabled us to overcome our ceiling effect and decrease the total percentage of errors to 6.7%. Nearly all variables profited from the consensus approach in terms of a reduction of the error rate (<xref ref-type="fig" rid="fig3">Figure 3</xref>; <xref ref-type="table" rid="tab3">Table 3</xref>).</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Heatmap of the error rate when evaluating the data using the second prompt in percent for the variables: a: Diagnosis, b: Disease course, c: Date of first manifestation, d: Date of first diagnosis, e: Current EDSS, f: OCB status, g: AQP4-AB status, h: MOG-AB status, i: Last cMRI date, j: cMRI activity, k: Own cMRI report interpretation, l: Current immunotherapy as active substance name, m: Start date of current immunotherapy, n: Previous immunotherapies as active substance, o: Other diagnoses, p: Comedication, q: Walking distance, r: Walking aid, s: Total errors. By adapting the prompt, a clear reduction in the content error rate was achieved, particularly in the variable &#x201C;current immunotherapy&#x201D; and &#x201C;previous immunotherapies&#x201D; can be observed in comparison to the first prompt (<xref ref-type="fig" rid="fig2">Figure 2</xref>). The LLM consensus generated the lowest percentage of errors in relation to the variable.</p>
</caption>
<graphic xlink:href="frai-09-1658575-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Heatmap comparing six models or methods listed vertically&#x2014;Sonnet 3.7, Gemini 1.5 pro, Gemini 2.0 pro, 4o, o3-mini, and LLM consensus&#x2014;across variables labeled a to s horizontally, with color intensity from purple to yellow representing values zero to one on the scale.</alt-text>
</graphic>
</fig>
<p>In a final step, we checked the content of all deviating answers from the LLM consensus and the NSPs individually. We were able to determine that of the 36 deviations categorized as errors, only 8 answers were actually real content-related errors. The remaining 28 answers rated as incorrect related to previously mentioned edge cases, differed only to a very small extent and could often be justified by a diverging interpretation of the unstructured data (e.g., a difference in the initial diagnosis of a few days or a few months), resulting in a true-error-rate of 1.48%. In comparison to that, out of the 22 errors by the NSPs, 11 answers were content-related errors (which were mostly caused by non-adherence to the requirements within the prompt), resulting in a true-error-rate of around 2%. The difference between the LLM consensus evaluation and the NSPs was not significant (<italic>p</italic>&#x202F;=&#x202F;0.6447; odds ratio&#x202F;=&#x202F;1.38; 95% CI 0.57&#x2013;3.31).</p>
</sec>
</sec>
<sec sec-type="discussion" id="sec14">
<label>4</label>
<title>Discussion</title>
<p>In our study, we were able to show that it is possible to use commercial LLMs with relatively simple means to transfer anonymized unstructured data from routine medical practice into a content-correct and reliable structured form for subsequent evaluation and clinical research. Over several iterations of improving the prompt (<xref ref-type="fig" rid="fig1">Figure 1b</xref>), a clear improvement in the outcome has been shown. In our small test data set, we were able to observe that LLMs are not inferior to medical professionals in the evaluation of clinical data in certain scenarios. This observation is consistent with previous studies in the area of LLM and unstructured medical data (<xref ref-type="bibr" rid="ref2">Alkhalaf et al., 2024</xref>; <xref ref-type="bibr" rid="ref11">Huang et al., 2024</xref>; <xref ref-type="bibr" rid="ref24">Wiest et al., 2024</xref>; <xref ref-type="bibr" rid="ref25">Wiest et al., 2024</xref>). These studies have so far shown that commercial LLMs are capable of evaluating routine clinical data. Previous studies tend to show higher accuracy in the evaluation of routine clinical data, although the analyses were also predominantly performed with older LLMs. One explanation for this could be that the prompt for analyzing the data was more specifically adapted to the data set. It is also conceivable that the input data was more homogeneous than our data. This would be the case, for example, with uniform diagnostic reports. However, our observation of higher accuracy of LLMs when using a consensus response is consistent with recent publications (<xref ref-type="bibr" rid="ref17">Omar et al., 2025</xref>; <xref ref-type="bibr" rid="ref13">MacKay et al., 2025</xref>). Our work differs from previous literature in particular in its use of a consensus approach and in the complexity and number of variables analyzed.</p>
<p>The first problem we encountered was the generation of a structured output. The fact that the creation of such an output is sometimes difficult and is not perceived as sufficient for many areas has already been described before (<xref ref-type="bibr" rid="ref12">Liu et al., 2024</xref>). We have observed that the approach we used, especially with the newer LLMs, produced a reliable output. However, for automated pipelines it should be kept in mind that using the methods we used does not give 100 percent certainty of obtaining a correct return format.</p>
<p>In the next step, we were able to show that a significant improvement of the content of the output could be achieved by adjusting the prompt. It is important to note that in our case the prompt was adapted with consideration to the inputs. For example, edge cases were analyzed in detail and considered by formulations within the prompt. It has already been shown that the outcomes of LLMs in a clinical context depend on the level of detail of the prompt (<xref ref-type="bibr" rid="ref4">Burford et al., 2024</xref>). At the same time a precise adjustment of the prompt also means canceling out the time advantages of the LLM (<xref ref-type="bibr" rid="ref19">Shah, 2024</xref>). A compromise must therefore always be found between accuracy and effort. Another important aspect of these findings is that the content reliability of a prompt only applies to a specific data set: the one it was developed for. Conversely, this also means that the traceability of the data creation is reduced. Another problem with the approach we have described is that adapting the prompt to the data set can cause overfitting. This can distort the results of the accuracy analysis. Adapting a prompt to the routine clinical data of a clinic also means that transferring it to other clinical data is likely to be possible only with significant adaptation, if at all. For these reasons, we see the need for a precise and comprehensible description of the establishment of an LLM-supported evaluation of clinical data. Such an evaluation should not only be performed at the beginning but should also be done throughout the whole data generation process.</p>
<p>Even though our focus was not on the comparison of the different LLMs, we were able to see that especially newer LLMs tended to perform better when analyzing the unstructured data. However, we cannot say conclusively from our study how the various LLMs perform with larger data sets. There are also reports of differences of the accuracy between certain LLMs (<xref ref-type="bibr" rid="ref16">Ntinopoulos et al., 2025</xref>). We are therefore unable to make a general statement as to which LLM should be used to evaluate medical data. Especially since we have limited ourselves to evaluating only a few selected LLMs, we cannot make any more precise statements about &#x201C;the best LLM.&#x201D; It would also be conceivable, for example, that different LLMs benefit from a specially tailored prompt and that using a universal prompt is not the best way to obtain the most accurate results. For reasons of feasibility, we have, for example, refrained from analyzing specialized medical LLMs such as Med-PaLM 2 or others (<xref ref-type="bibr" rid="ref21">Singhal et al., 2025</xref>). Fine-tuning an existing model could also be a way to increase the accuracy of LLM responses for a specific dataset (<xref ref-type="bibr" rid="ref3">Bui et al., 2025</xref>). An important finding in this context is that we were able to bridge a certain ceiling effect of correct answers through the pooled use of several LLMs from different providers introducing a consensus decision. The use of multiple LLMs for a consensus finding might hold great potential &#x2013; despite the increased costs &#x2013; as an approach to ensure the best possible accuracy of the data in terms of content and structure when collecting complex variables. This approach, the use of different intelligences and specializations to achieve the best outcome, is ultimately borrowed from medical practice, where so-called boards (e.g., tumor board or immunoboard) are used to solve complex medical cases. 
To our knowledge, this approach in LLMs is not yet established. However, further studies with larger datasets are needed to confirm our hypotheses.</p>
<p>Another problem we identified throughout our study is the evaluation of data accuracy &#x2013; particularly for variables which need interpretation. While counting outputs which deviate from a defined reference in our evaluation was a good way to improve the prompt, it also overrepresented errors defined as a deviation from a given standard, but not necessarily wrong in content. Most approaches to the use of LLMs in the healthcare context focus on diagnostics and not on the output of structured data (<xref ref-type="bibr" rid="ref14">Meng et al., 2024</xref>; <xref ref-type="bibr" rid="ref23">Ullah et al., 2024</xref>; <xref ref-type="bibr" rid="ref15">Nazi and Peng, 2024</xref>). There are general guidelines for publishing with the help of LLMs (<xref ref-type="bibr" rid="ref9">Gallifant et al., 2025</xref>). Nevertheless, we could observe that it is extremely difficult to describe the accuracy of a method sufficiently well. This difficulty in describing accuracy poses a threat to the comprehensibility of scientific data. Within the manuscript, a conscious decision was made against extensive statistical analysis. From our perspective, <italic>p</italic>-values can be misleading in the context of LLM evaluations. A specific adaptation of a prompt to a data set will probably always lead to high accuracy for that specific data set and result in a positive outcome in a statistical evaluation. However, this does not automatically mean that the results can be transferred to other data. Additionally, the use of LLMs bears several limitations and may introduce a qualitative bias into scientific data analyses due to a lack of accountability and transparency (<xref ref-type="bibr" rid="ref6">Clusmann et al., 2023</xref>). Repetitive inquiries may result in diverging output data due to performance fluctuation or updates. Hallucinations pose a risk of generating incorrect data (<xref ref-type="bibr" rid="ref18">Qiu et al., 2024</xref>). 
The knowledge of LLMs is limited to their training data and usually does not include the latest scientific findings. Furthermore, LLMs are usually operated by commercial providers, which is associated with risks regarding data protection and improper further data usage.</p>
<p>One issue that arises from our evaluation is the limited proven transferability. Using a small data set from a specific cohort, we were able to show that a consensus-based evaluation of physician letters is not inferior to a manual evaluation. The data and variables were not specifically selected for evaluation using LLM. Although it is conceivable in principle that a consensus LLM-based evaluation could also deliver better results in the context of other diseases and data sets, we cannot substantiate this thesis with our current results.</p>
<p>By using LLMs, we were able to reduce the time and costs required. An analysis of 30 medical records using batch analysis cost less than one dollar. At the same time, the cumulative time required for analysis by medical staff for all medical records was reduced from around 5&#x202F;h to just a few minutes. Nevertheless, it should be borne in mind that the use of a consensus model multiplies the costs. The use of three or more LLMs also means triple or higher costs. Despite the increased costs, the use of consensus in our application case was a significantly cheaper method than manual evaluation. Overall, LLMs show great potential for the evaluation of unstructured medical data but should be used with caution and a critical eye, especially when applied in complex situations.</p>
</sec>
<sec id="sec15">
<label>5</label>
<title>Limitations</title>
<p>Although we were able to test a range of different LLMs, this is only a small sample of the options currently available. Since our test data set was limited to 30 records, we cannot make a definitive statement about the best possible use of LLMs for the evaluation of unstructured medical data. A significantly higher number of records would be necessary to make a definitive statement about the comparison of different LLMs. Likewise, a significant increase in the number of different LLMs analyzed would be necessary. The data comes from a single-center analysis, which makes it difficult to compare with other clinics and diseases. Likewise, we only included MS-specific variables in our study. In addition, the iterative prompt improvement approach may result in overfitting of our prompt to the dataset. Our selection of variables represents only a minimal excerpt of the potentially collectible data. Therefore, studies with larger data sets are necessary to confirm our hypotheses and observations.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec16">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec sec-type="ethics-statement" id="sec17">
<title>Ethics statement</title>
<p>The studies involving humans were approved by ethics committee Westfalen-Lippe, Germany (registration no. 2024-590-f-S). The studies were conducted in accordance with the local legislation and institutional requirements. Written informed consent for participation was not required from the participants or the participants&#x2019; legal guardians/next of kin in accordance with the national legislation and institutional requirements.</p>
</sec>
<sec sec-type="author-contributions" id="sec18">
<title>Author contributions</title>
<p>PP: Writing &#x2013; original draft, Investigation, Writing &#x2013; review &#x0026; editing, Formal analysis. RK: Data curation, Investigation, Formal analysis, Writing &#x2013; review &#x0026; editing. JL: Writing &#x2013; review &#x0026; editing, Investigation, Formal analysis. ER: Writing &#x2013; review &#x0026; editing, Investigation. CH: Supervision, Writing &#x2013; review &#x0026; editing. RG: Writing &#x2013; review &#x0026; editing, Supervision. AS: Data curation, Supervision, Writing &#x2013; review &#x0026; editing, Writing &#x2013; original draft, Investigation, Formal analysis. JM: Writing &#x2013; review &#x0026; editing, Supervision, Writing &#x2013; original draft, Investigation, Data curation, Formal analysis.</p>
</sec>
<sec sec-type="COI-statement" id="sec19">
<title>Conflict of interest</title>
<p>PLP received funding by Gemeinnuetzige Hertie-Stiftung, none related to the article. RK received speaker honoraria for activities with Argenx; travel grants from Alnylam, Takeda and Grifols. His research is supported by LFB and Ruhr University Bochum. LJ reports no disclosures related to this project. ER reports no disclosures related to this project. CH reports no disclosures related to this project. RG holds shares in Merck, Novartis, Kyverna Therapeutics Inc. and Roche; consulting fees from Novartis, Merck, Roche and Biogen; honoraria from Novartis; lecture fees from Biogen, BMS, Eisai, Genesis, Janssen, Merck, Novartis, Roche, Sanofi, Sandoz, TIBUA and third-party funding from Biogen, Novartis, TIBUA and Sanofi. AS received speaker honoraria for activities with Bristol Myers Squibb, Merck, Neuraxpharm, Novartis, Roche, and Sanofi; consulting fees from Neuraxpharm and research support by the Baasch Medicus Foundation, the Medical Faculty of the University of Bern, the Swiss MS Society and the regional association of North Rhine-Westphalia of the German MS Society (DMSG Landesverband NRW). JM holds shares in Amgen, Bayer, Biontech, Edwards Lifesciences, Fresenius, Merck, Sanofi and received research funding from Ruhr University Bochum, Klaus Tschira Foundation, Biogen, Novartis, Kyverna, received travel grants from Biogen idec, Novartis AG, GBS/CIDP Foundation International neuraxfarm, Bristol Myers Squibb, Sanofi, Teva and Eisai GmbH, Candit, Johnson and Johnson, speaker honoraria and medical advisory honoraria from Alexion, Grifols, Novartis, Candit, Johnson and Johnson. A. Salmen has received speaker honoraria for activities with Merck, Neuraxpharm, Novartis, Roche, and Sanofi; consulting fees from Neuraxpharm; and research support from the regional association of North Rhine-Westphalia of the German Multiple Sclerosis Society (DMSG Landesverband NRW).</p>
</sec>
<sec sec-type="ai-statement" id="sec20">
<title>Generative AI statement</title>
<p>The author(s) declared that Generative AI was used in the creation of this manuscript. During the preparation of this work the author(s) used claude-3-haiku-20240307 (Haiku; Anthropic), claude-3-opus-20240229 (Opus; Anthropic), claude-3-7-sonnet-20250219 (Sonnet-3.7; Anthropic), gemini-1.5-flash-002 (Gemini-1.5-flash; Google), gemini-1.5-pro-002 (Gemini-1.5-pro; Google), gemini-2.0-pro-exp-02-05 (Gemini-2-pro; Google), gpt-4o-2024-08-06 (4o; OpenAI), gpt-4o-mini-2024-07-18 (4o-mini; OpenAI) and o3-mini-2025-01-31 (o3-mini; OpenAI) in order to create structured outputs from text. After using this tool/service, the authors reviewed and edited the content as needed and take(s) full responsibility for the content of the publication.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec21">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec22">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/frai.2026.1658575/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/frai.2026.1658575/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Data_Sheet_2.pdf" id="SM2" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Data_Sheet_3.docx" id="SM3" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Adnan</surname><given-names>K.</given-names></name> <name><surname>Akbar</surname><given-names>R.</given-names></name> <name><surname>Khor</surname><given-names>S. W.</given-names></name> <name><surname>Ali</surname><given-names>A. B. A.</given-names></name></person-group> (<year>2020</year>). <article-title>Role and challenges of unstructured big data in healthcare</article-title>. <publisher-loc>Singapore</publisher-loc>: <publisher-name>Springer</publisher-name>, pp. <fpage>301</fpage>&#x2013;<lpage>323</lpage>.</mixed-citation></ref>
<ref id="ref2"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Alkhalaf</surname><given-names>M.</given-names></name> <name><surname>Yu</surname><given-names>P.</given-names></name> <name><surname>Yin</surname><given-names>M.</given-names></name> <name><surname>Deng</surname><given-names>C.</given-names></name></person-group> (<year>2024</year>). <article-title>Applying generative AI with retrieval augmented generation to summarize and extract key clinical information from electronic health records</article-title>. <source>J. Biomed. Inform.</source> <volume>156</volume>:<fpage>104662</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jbi.2024.104662</pub-id>, <pub-id pub-id-type="pmid">38880236</pub-id></mixed-citation></ref>
<ref id="ref3"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bui</surname><given-names>N.</given-names></name> <name><surname>Nguyen</surname><given-names>G.</given-names></name> <name><surname>Nguyen</surname><given-names>N.</given-names></name> <name><surname>Vo</surname><given-names>B.</given-names></name> <name><surname>Vo</surname><given-names>L.</given-names></name> <name><surname>Huynh</surname><given-names>T.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Fine-tuning large language models for improved health communication in low-resource languages</article-title>. <source>Comput. Methods Prog. Biomed.</source> <volume>263</volume>:<fpage>108655</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cmpb.2025.108655</pub-id>, <pub-id pub-id-type="pmid">39987667</pub-id></mixed-citation></ref>
<ref id="ref4"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Burford</surname><given-names>K. G.</given-names></name> <name><surname>Itzkowitz</surname><given-names>N. G.</given-names></name> <name><surname>Ortega</surname><given-names>A. G.</given-names></name> <name><surname>Teitler</surname><given-names>J. O.</given-names></name> <name><surname>Rundle</surname><given-names>A. G.</given-names></name></person-group> (<year>2024</year>). <article-title>Use of generative AI to identify helmet status among patients with micromobility-related injuries from unstructured clinical notes</article-title>. <source>JAMA Netw. Open</source> <volume>7</volume>:<fpage>e2425981</fpage>. doi: <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.25981</pub-id>, <pub-id pub-id-type="pmid">39136946</pub-id></mixed-citation></ref>
<ref id="ref5"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Capurro</surname><given-names>D.</given-names></name> <name><surname>Yetisgen</surname><given-names>M.</given-names></name> <name><surname>van Eaton</surname><given-names>E.</given-names></name> <name><surname>Black</surname><given-names>R.</given-names></name> <name><surname>Tarczy-Hornoch</surname><given-names>P.</given-names></name></person-group> (<year>2014</year>). <article-title>Availability of structured and unstructured clinical data for comparative effectiveness research and quality improvement: a multisite assessment</article-title>. <source>EGEMS (Wash. DC)</source> <volume>2</volume>:<fpage>1079</fpage>. doi: <pub-id pub-id-type="doi">10.13063/2327-9214.1079</pub-id>, <pub-id pub-id-type="pmid">25848594</pub-id></mixed-citation></ref>
<ref id="ref6"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Clusmann</surname><given-names>J.</given-names></name> <name><surname>Kolbinger</surname><given-names>F. R.</given-names></name> <name><surname>Muti</surname><given-names>H. S.</given-names></name> <name><surname>Carrero</surname><given-names>Z. I.</given-names></name> <name><surname>Eckardt</surname><given-names>J. N.</given-names></name> <name><surname>Laleh</surname><given-names>N. G.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>The future landscape of large language models in medicine</article-title>. <source>Commun Med (Lond).</source> <volume>3</volume>:<fpage>141</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s43856-023-00370-1</pub-id>, <pub-id pub-id-type="pmid">37816837</pub-id></mixed-citation></ref>
<ref id="ref7"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dagdelen</surname><given-names>J.</given-names></name> <name><surname>Dunn</surname><given-names>A.</given-names></name> <name><surname>Lee</surname><given-names>S.</given-names></name> <name><surname>Walker</surname><given-names>N.</given-names></name> <name><surname>Rosen</surname><given-names>A. S.</given-names></name> <name><surname>Ceder</surname><given-names>G.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Structured information extraction from scientific text with large language models</article-title>. <source>Nat. Commun.</source> <volume>15</volume>:<fpage>1418</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41467-024-45563-x</pub-id>, <pub-id pub-id-type="pmid">38360817</pub-id></mixed-citation></ref>
<ref id="ref8"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ehrenstein</surname><given-names>V.</given-names></name> <name><surname>Hellfritzsch</surname><given-names>M.</given-names></name> <name><surname>Kahlert</surname><given-names>J.</given-names></name> <name><surname>Langan</surname><given-names>S. M.</given-names></name> <name><surname>Urushihara</surname><given-names>H.</given-names></name> <name><surname>Marinac-Dabic</surname><given-names>D.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Validation of algorithms in studies based on routinely collected health data: general principles</article-title>. <source>Am. J. Epidemiol.</source> <volume>193</volume>, <fpage>1612</fpage>&#x2013;<lpage>1624</lpage>. doi: <pub-id pub-id-type="doi">10.1093/aje/kwae071</pub-id>, <pub-id pub-id-type="pmid">38754870</pub-id></mixed-citation></ref>
<ref id="ref9"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gallifant</surname><given-names>J.</given-names></name> <name><surname>Afshar</surname><given-names>M.</given-names></name> <name><surname>Ameen</surname><given-names>S.</given-names></name> <name><surname>Aphinyanaphongs</surname><given-names>Y.</given-names></name> <name><surname>Chen</surname><given-names>S.</given-names></name> <name><surname>Cacciamani</surname><given-names>G.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>The TRIPOD-LLM reporting guideline for studies using large language models</article-title>. <source>Nat. Med.</source> <volume>31</volume>, <fpage>60</fpage>&#x2013;<lpage>69</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41591-024-03425-5</pub-id>, <pub-id pub-id-type="pmid">39779929</pub-id></mixed-citation></ref>
<ref id="ref10"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gencer</surname><given-names>G.</given-names></name> <name><surname>Gencer</surname><given-names>K.</given-names></name></person-group> (<year>2025</year>). <article-title>Large language models in healthcare: a bibliometric analysis and examination of research trends</article-title>. <source>J. Multidiscip. Healthc.</source> <volume>18</volume>, <fpage>223</fpage>&#x2013;<lpage>238</lpage>. doi: <pub-id pub-id-type="doi">10.2147/JMDH.S502351</pub-id>, <pub-id pub-id-type="pmid">39844924</pub-id></mixed-citation></ref>
<ref id="ref11"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname><given-names>J.</given-names></name> <name><surname>Yang</surname><given-names>D. M.</given-names></name> <name><surname>Rong</surname><given-names>R.</given-names></name> <name><surname>Nezafati</surname><given-names>K.</given-names></name> <name><surname>Treager</surname><given-names>C.</given-names></name> <name><surname>Chi</surname><given-names>Z.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>A critical assessment of using ChatGPT for extracting structured data from clinical notes</article-title>. <source>NPJ Digit. Med.</source> <volume>7</volume>:<fpage>106</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41746-024-01079-8</pub-id>, <pub-id pub-id-type="pmid">38693429</pub-id></mixed-citation></ref>
<ref id="ref12"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname><given-names>Y.</given-names></name> <name><surname>Li</surname><given-names>D.</given-names></name> <name><surname>Wang</surname><given-names>K.</given-names></name> <name><surname>Xiong</surname><given-names>Z.</given-names></name> <name><surname>Shi</surname><given-names>F.</given-names></name> <name><surname>Wang</surname><given-names>J.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Are LLMs good at structured outputs? A benchmark for evaluating structured output capabilities in LLMs</article-title>. <source>Inf. Process. Manag.</source> <volume>61</volume>:<fpage>103809</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ipm.2024.103809</pub-id></mixed-citation></ref>
<ref id="ref13"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>MacKay</surname><given-names>E. J.</given-names></name> <name><surname>Goldfinger</surname><given-names>S.</given-names></name> <name><surname>Chan</surname><given-names>T. J.</given-names></name> <name><surname>Grasfield</surname><given-names>R. H.</given-names></name> <name><surname>Eswar</surname><given-names>V. J.</given-names></name> <name><surname>Li</surname><given-names>K.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Automated structured data extraction from intraoperative echocardiography reports using large language models</article-title>. <source>Br. J. Anaesth.</source> <volume>134</volume>, <fpage>1308</fpage>&#x2013;<lpage>1317</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bja.2025.01.028</pub-id>, <pub-id pub-id-type="pmid">40037947</pub-id></mixed-citation></ref>
<ref id="ref14"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Meng</surname><given-names>X.</given-names></name> <name><surname>Yan</surname><given-names>X.</given-names></name> <name><surname>Zhang</surname><given-names>K.</given-names></name> <name><surname>Liu</surname><given-names>D.</given-names></name> <name><surname>Cui</surname><given-names>X.</given-names></name> <name><surname>Yang</surname><given-names>Y.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>The application of large language models in medicine: a scoping review</article-title>. <source>iScience</source> <volume>27</volume>:<fpage>109713</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.isci.2024.109713</pub-id>, <pub-id pub-id-type="pmid">38746668</pub-id></mixed-citation></ref>
<ref id="ref15"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nazi</surname><given-names>Z. A.</given-names></name> <name><surname>Peng</surname><given-names>W.</given-names></name></person-group> (<year>2024</year>). <article-title>Large language models in healthcare and medical domain: a review</article-title>. <source>Informatics</source> <volume>11</volume>:<fpage>57</fpage>. doi: <pub-id pub-id-type="doi">10.3390/informatics11030057</pub-id></mixed-citation></ref>
<ref id="ref16"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ntinopoulos</surname><given-names>V.</given-names></name> <name><surname>Rodriguez Cetina Biefer</surname><given-names>H.</given-names></name> <name><surname>Tudorache</surname><given-names>I.</given-names></name> <name><surname>Papadopoulos</surname><given-names>N.</given-names></name> <name><surname>Odavic</surname><given-names>D.</given-names></name> <name><surname>Risteski</surname><given-names>P.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Large language models for data extraction from unstructured and semi-structured electronic health records: a multiple model performance evaluation</article-title>. <source>BMJ Health Care Inform.</source> <volume>32</volume>:<fpage>1139</fpage>. doi: <pub-id pub-id-type="doi">10.1136/bmjhci-2024-101139</pub-id>, <pub-id pub-id-type="pmid">39832824</pub-id></mixed-citation></ref>
<ref id="ref17"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Omar</surname><given-names>M.</given-names></name> <name><surname>Glicksberg</surname><given-names>B. S.</given-names></name> <name><surname>Nadkarni</surname><given-names>G. N.</given-names></name> <name><surname>Klang</surname><given-names>E.</given-names></name></person-group> (<year>2025</year>). <article-title>Refining LLMs outputs with iterative consensus ensemble (ICE)</article-title>. <source>Comput. Biol. Med.</source> <volume>196</volume>:<fpage>110731</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2025.110731</pub-id>, <pub-id pub-id-type="pmid">40669284</pub-id></mixed-citation></ref>
<ref id="ref18"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Qiu</surname><given-names>J.</given-names></name> <name><surname>Yuan</surname><given-names>W.</given-names></name> <name><surname>Lam</surname><given-names>K.</given-names></name></person-group> (<year>2024</year>). <article-title>The application of multimodal large language models in medicine</article-title>. <source>Lancet Reg Health West Pac.</source> <volume>45</volume>:<fpage>101048</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.lanwpc.2024.101048</pub-id>, <pub-id pub-id-type="pmid">38524685</pub-id></mixed-citation></ref>
<ref id="ref19"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shah</surname><given-names>S. V.</given-names></name></person-group> (<year>2024</year>). <article-title>Accuracy, consistency, and hallucination of large language models when analyzing unstructured clinical notes in electronic medical records</article-title>. <source>JAMA Netw. Open</source> <volume>7</volume>:<fpage>e2425953</fpage>. doi: <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.25953</pub-id>, <pub-id pub-id-type="pmid">39136951</pub-id></mixed-citation></ref>
<ref id="ref20"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sherman</surname><given-names>R. E.</given-names></name> <name><surname>Anderson</surname><given-names>S. A.</given-names></name> <name><surname>Dal Pan</surname><given-names>G. J.</given-names></name> <name><surname>Gray</surname><given-names>G. W.</given-names></name> <name><surname>Gross</surname><given-names>T.</given-names></name> <name><surname>Hunter</surname><given-names>N. L.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>Real-world evidence - what is it and what can it tell us?</article-title> <source>N. Engl. J. Med.</source> <volume>375</volume>, <fpage>2293</fpage>&#x2013;<lpage>2297</lpage>. doi: <pub-id pub-id-type="doi">10.1056/NEJMsb1609216</pub-id>, <pub-id pub-id-type="pmid">27959688</pub-id></mixed-citation></ref>
<ref id="ref21"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Singhal</surname><given-names>K.</given-names></name> <name><surname>Tu</surname><given-names>T.</given-names></name> <name><surname>Gottweis</surname><given-names>J.</given-names></name> <name><surname>Sayres</surname><given-names>R.</given-names></name> <name><surname>Wulczyn</surname><given-names>E.</given-names></name> <name><surname>Amin</surname><given-names>M.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Toward expert-level medical question answering with large language models</article-title>. <source>Nat. Med.</source> <volume>31</volume>, <fpage>943</fpage>&#x2013;<lpage>950</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41591-024-03423-7</pub-id>, <pub-id pub-id-type="pmid">39779926</pub-id></mixed-citation></ref>
<ref id="ref22"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tayefi</surname><given-names>M.</given-names></name> <name><surname>Ngo</surname><given-names>P.</given-names></name> <name><surname>Chomutare</surname><given-names>T.</given-names></name> <name><surname>Dalianis</surname><given-names>H.</given-names></name> <name><surname>Salvi</surname><given-names>E.</given-names></name> <name><surname>Budrionis</surname><given-names>A.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Challenges and opportunities beyond structured data in analysis of electronic health records</article-title>. <source>WIREs Comput. Stat.</source> <volume>13</volume>:<fpage>e1549</fpage>. doi: <pub-id pub-id-type="doi">10.1002/wics.1549</pub-id></mixed-citation></ref>
<ref id="ref23"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ullah</surname><given-names>E.</given-names></name> <name><surname>Parwani</surname><given-names>A.</given-names></name> <name><surname>Baig</surname><given-names>M. M.</given-names></name> <name><surname>Singh</surname><given-names>R.</given-names></name></person-group> (<year>2024</year>). <article-title>Challenges and barriers of using large language models (LLM) such as ChatGPT for diagnostic medicine with a focus on digital pathology - a recent scoping review</article-title>. <source>Diagn. Pathol.</source> <volume>19</volume>:<fpage>43</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s13000-024-01464-7</pub-id>, <pub-id pub-id-type="pmid">38414074</pub-id></mixed-citation></ref>
<ref id="ref24"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wiest</surname><given-names>I. C.</given-names></name> <name><surname>Ferber</surname><given-names>D.</given-names></name> <name><surname>Zhu</surname><given-names>J.</given-names></name> <name><surname>van Treeck</surname><given-names>M.</given-names></name> <name><surname>Meyer</surname><given-names>S. K.</given-names></name> <name><surname>Juglan</surname><given-names>R.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Privacy-preserving large language models for structured medical information retrieval</article-title>. <source>NPJ Digit. Med.</source> <volume>7</volume>:<fpage>1233</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41746-024-01233-2</pub-id>, <pub-id pub-id-type="pmid">39304709</pub-id></mixed-citation></ref>
<ref id="ref25"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wiest</surname><given-names>I. C.</given-names></name> <name><surname>Wolf</surname><given-names>F.</given-names></name> <name><surname>Lessmann</surname><given-names>M. E.</given-names></name> <name><surname>Van Treeck</surname><given-names>M.</given-names></name> <name><surname>Ferber</surname><given-names>D.</given-names></name> <name><surname>Zhu</surname><given-names>J.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>LLM-AIx: an open source pipeline for information extraction from unstructured medical text based on privacy preserving large language models</article-title>. <source>medRxiv</source> <volume>3</volume>:<fpage>2917</fpage>. doi: <pub-id pub-id-type="doi">10.1101/2024.09.02.24312917</pub-id>, <pub-id pub-id-type="pmid">39281753</pub-id></mixed-citation></ref>
<ref id="ref26"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Woznicki</surname><given-names>P.</given-names></name> <name><surname>Laqua</surname><given-names>C.</given-names></name> <name><surname>Fiku</surname><given-names>I.</given-names></name> <name><surname>Hekalo</surname><given-names>A.</given-names></name> <name><surname>Truhn</surname><given-names>D.</given-names></name> <name><surname>Engelhardt</surname><given-names>S.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Automatic structuring of radiology reports with on-premise open-source large language models</article-title>. <source>Eur. Radiol.</source> <volume>35</volume>, <fpage>2018</fpage>&#x2013;<lpage>2029</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s00330-024-11074-y</pub-id>, <pub-id pub-id-type="pmid">39390261</pub-id></mixed-citation></ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0002">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2569733/overview">Mark Christiaan Scheper</ext-link>, Rotterdam University of Applied Sciences, Netherlands</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0003">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1220502/overview">Jianlin Shi</ext-link>, The University of Utah, United States</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2037933/overview">G&#x00FC;lcan Gencer</ext-link>, Afyonkarahisar Health Sciences University, T&#x00FC;rkiye</p>
</fn>
</fn-group>
</back>
</article>