<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article article-type="research-article" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Digit. Health</journal-id><journal-title-group>
<journal-title>Frontiers in Digital Health</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Digit. Health</abbrev-journal-title></journal-title-group>
<issn pub-type="epub">2673-253X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fdgth.2026.1761601</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>MEDAI-LLM-SUMM: a reporting checklist for medical text summarization studies using large language models</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes"><name><surname>Khoruzhaya</surname><given-names>Anna N.</given-names></name>
<xref ref-type="aff" rid="aff1"/>
<xref ref-type="corresp" rid="cor1">&#x002A;</xref><uri xlink:href="https://loop.frontiersin.org/people/3306328/overview"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Varyukhina</surname><given-names>Mariya D.</given-names></name>
<xref ref-type="aff" rid="aff1"/><uri xlink:href="https://loop.frontiersin.org/people/3396192/overview"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Erizhokov</surname><given-names>Rustam A.</given-names></name>
<xref ref-type="aff" rid="aff1"/><uri xlink:href="https://loop.frontiersin.org/people/3396219/overview"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Blokhin</surname><given-names>Ivan A.</given-names></name>
<xref ref-type="aff" rid="aff1"/><uri xlink:href="https://loop.frontiersin.org/people/1859957/overview"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Reshetnikov</surname><given-names>Roman V.</given-names></name>
<xref ref-type="aff" rid="aff1"/><uri xlink:href="https://loop.frontiersin.org/people/3225328/overview"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Kodenko</surname><given-names>Mariya R.</given-names></name>
<xref ref-type="aff" rid="aff1"/><uri xlink:href="https://loop.frontiersin.org/people/3396256/overview"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Pamova</surname><given-names>Anastasia P.</given-names></name>
<xref ref-type="aff" rid="aff1"/><uri xlink:href="https://loop.frontiersin.org/people/3023741/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Burtsev</surname><given-names>Tikhon A.</given-names></name>
<xref ref-type="aff" rid="aff1"/><uri xlink:href="https://loop.frontiersin.org/people/3396141/overview"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Arzamasov</surname><given-names>Kirill M.</given-names></name>
<xref ref-type="aff" rid="aff1"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Omelyanskaya</surname><given-names>Olga V.</given-names></name>
<xref ref-type="aff" rid="aff1"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Vladzymyrskyy</surname><given-names>Anton V.</given-names></name>
<xref ref-type="aff" rid="aff1"/><uri xlink:href="https://loop.frontiersin.org/people/655283/overview"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Vasilev</surname><given-names>Yuriy A.</given-names></name>
<xref ref-type="aff" rid="aff1"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
</contrib-group>
<aff id="aff1"><institution>Research and Practical Clinical Center for Diagnostics and Telemedicine Technologies of the Moscow Health Care Department, State Budget-Funded Health Care Institution of the City of Moscow</institution>, <city>Moscow</city>, <country country="RU">Russia</country></aff>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label><bold>Correspondence:</bold> Anna N. Khoruzhaya <email xlink:href="mailto:khoruzhayaAN@zdrav.mos.ru">khoruzhayaAN@zdrav.mos.ru</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-03-02"><day>02</day><month>03</month><year>2026</year></pub-date>
<pub-date publication-format="electronic" date-type="collection"><year>2026</year></pub-date>
<volume>8</volume><elocation-id>1761601</elocation-id>
<history>
<date date-type="received"><day>05</day><month>12</month><year>2025</year></date>
<date date-type="rev-recd"><day>30</day><month>01</month><year>2026</year></date>
<date date-type="accepted"><day>31</day><month>01</month><year>2026</year></date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2026 Khoruzhaya, Varyukhina, Erizhokov, Blokhin, Reshetnikov, Kodenko, Pamova, Burtsev, Arzamasov, Omelyanskaya, Vladzymyrskyy and Vasilev.</copyright-statement>
<copyright-year>2026</copyright-year><copyright-holder>Khoruzhaya, Varyukhina, Erizhokov, Blokhin, Reshetnikov, Kodenko, Pamova, Burtsev, Arzamasov, Omelyanskaya, Vladzymyrskyy and Vasilev</copyright-holder><license><ali:license_ref start_date="2026-03-02">https://creativecommons.org/licenses/by/4.0/</ali:license_ref><license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p></license>
</permissions>
<abstract><sec><title>Background</title>
<p>Medical text summarization using large language models (LLMs) has reached an inflection point in 2024&#x2013;2025, with adapted models demonstrating capability to match or exceed human expert performance in specific tasks. However, critical gaps persist in safety validation, evaluation frameworks, and clinical deployment readiness. A comprehensive review revealed that only 7&#x0025; of studies conducted external validation and 3&#x0025; performed patient safety assessments, with hallucination rates ranging from 1.47&#x0025; to 61.6&#x0025;. Existing reporting guidelines, including CONSORT-AI, SPIRIT-AI, TRIPOD-LLM, and DEAL, do not adequately address the specific requirements of medical text summarization tasks.</p>
</sec><sec><title>Objective</title>
<p>To develop MEDAI-LLM-SUMM, the first specialized reporting checklist for research on medical text summarization using LLMs, addressing critical gaps in existing reporting standards.</p>
</sec><sec><title>Methods</title>
<p>A modified iterative consensus approach was employed, comprising three sequential stages: (1) a systematic literature review of 216 publications from PubMed and eLibrary (2023&#x2013;2025) following PRISMA guidelines and an analysis of existing reporting standards (TRIPOD-LLM, DEAL, CONSORT-AI, SPIRIT-AI, TRIPOD&#x2009;&#x002B;&#x2009;AI, CLAIM, STARD-AI); (2) development of an initial 44-item, 7-section checklist by a supervisory group; (3) three rounds of face-to-face consensus discussions with a multidisciplinary expert panel of 11 specialists (3 radiologists, 2 clinicians, 3 medical informatics experts, 1 biostatistician, and 2 medical LLM developers). The consensus criterion required unanimous agreement from all panel members.</p>
</sec><sec><title>Results</title>
<p>The final MEDAI-LLM-SUMM checklist comprises 24 items organized into six sections: (A) Clinical validity (4 items addressing clinical task definition, expert involvement, hypothesis formulation, and medical expertise requirements); (B) Model Selection (5 items covering model justification, system requirements, deployment environment, LLM-as-judge approach, and prompt documentation); (C) Data (3 items on datasets, reference summaries with expert consensus, and data stratification); (D) Quality Assessment (8 items including evaluation metrics, clinical metrics, expert evaluation, hallucination detection, LLM-judge assessment, sample size justification, pilot testing, and limitations documentation); (E) Safety (2 items on ethical approval and data anonymization); and (F) Data Availability (2 items on code and dataset accessibility). Comparative analysis with six existing reporting standards demonstrated that MEDAI-LLM-SUMM uniquely addresses hallucination assessment requirements, reference summary creation methodology, LLM-as-judge validation protocols, and detailed pilot testing specifications.</p>
</sec>
</abstract>
<kwd-group>
<kwd>expert consensus</kwd>
<kwd>large language models</kwd>
<kwd>medical text summarization</kwd>
<kwd>patient safety</kwd>
<kwd>reporting guidelines</kwd>
<kwd>reproducibility</kwd>
</kwd-group><funding-group><funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This article was prepared by a team of authors within the framework of a scientific and practical project in the field of medicine (No. EGISU: 125051305989-8) &#x201C;A promising automated workplace of a radiologist based on generative artificial intelligence&#x201D;.</funding-statement></funding-group><counts>
<fig-count count="2"/>
<table-count count="1"/><equation-count count="0"/><ref-count count="24"/><page-count count="10"/><word-count count="0"/></counts><custom-meta-group><custom-meta><meta-name>section-at-acceptance</meta-name><meta-value>Health Informatics</meta-value></custom-meta></custom-meta-group>
</article-meta>
</front>
<body><sec id="s1" sec-type="intro"><label>1</label><title>Introduction</title>
<p>The use of large language models (LLMs) for medical text summarization reached a turning point in 2024&#x2013;2025. While just two years ago such models produced inconsistent summaries, struggled to extract key information with limited context window (<xref ref-type="bibr" rid="B1">1</xref>), their current performance matches that of human experts in specific tasks (<xref ref-type="bibr" rid="B2">2</xref>). Van Veen D. et al. showed that LLM-generated summaries of radiology reports and clinical records are comparable (45&#x0025;) or even superior (36&#x0025;) to those produced by medical experts (<xref ref-type="bibr" rid="B3">3</xref>). However, a more in-depth analysis revealed critical issues regarding testing, evaluation, safety, and readiness for clinical use (<xref ref-type="bibr" rid="B4">4</xref>).</p>
<p>Several studies have addressed the limitations and shortcomings of LLM summarization. A recent scoping review by Bednarczyk et al. analyzing 30 studies on medical summarization concluded that the field remains &#x201C;exploratory and limited in scope,&#x201D; despite significant commercial investment. While all studies conducted internal validation, only 7&#x0025; (<italic>n</italic>&#x2009;&#x003D;&#x2009;2) performed external validation&#x2014;evaluation on independent datasets from institutions or populations not involved in model development&#x2014;and only 3&#x0025; (<italic>n</italic>&#x2009;&#x003D;&#x2009;1) included patient safety risks analysis assessing potential clinical harms from model errors. Furthermore, 57&#x0025; focused exclusively on narrow tasks (e.g., radiology reports), 50&#x0025; utilized intensive care unit data, and 87&#x0025; processed only English-language text. Reported hallucination rate ranged from 1.47&#x0025; to 61.6&#x0025; depending on the task complexity and evaluation methodology, with 44&#x0025; of hallucinations potentially impacting diagnosis or patient management (<xref ref-type="bibr" rid="B5">5</xref>). This creates a gap between &#x201C;laboratory&#x201D; performance and clinical reality. In certain medical tasks (e.g., assigning ICD codes), GPT-4 barely reached 50&#x0025; accuracy (<xref ref-type="bibr" rid="B6">6</xref>). Certain clinical data entry programs [e.g., Dragon Ambient eXperience (DAX) Copilot], claimed to significantly reduce document processing time, have raised scepticism among outpatient physicians due to a lack of efficacy (<xref ref-type="bibr" rid="B7">7</xref>).</p>
<p>This underscores the importance of developing and deploying tools to guide and report studies on the use of LLM for medical data processing. Existing guidelines, including CONSORT-AI (<xref ref-type="bibr" rid="B8">8</xref>), SPIRIT-AI (<xref ref-type="bibr" rid="B9">9</xref>), and the recently introduced TRIPOD-LLM (<xref ref-type="bibr" rid="B10">10</xref>) and DEAL (<xref ref-type="bibr" rid="B11">11</xref>), fail to address some critical aspects of medical text summarization. They focus on general application of LLMs and lack subsections related to hallucinations and methodologies for selecting, testing, and validating summarization models. Furthermore, they lack guidance on how to approach the expert consensus methodology, have no established logic behind selecting and ranking evaluation metrics, and do not specify requirements and rules for pilot testing.</p>
<p>Using the literature data and expert panel conclusions, we developed a MEDAI-LLM-SUMM checklist, a tool for filtering out research papers on LLM-based medical text summarization that addresses critical shortcomings of the existing approaches. This checklist focuses on the content details of LLM summarization studies essential for reproducibility, safety assessment, and clinical applicability.</p>
<p>MEDAI-LLM-SUMM is designed for three primary user groups: (1) researchers conducting studies on LLM-based medical text summarization; (2) journal editors and peer reviewers evaluating manuscript submissions; (3) regulatory bodies and healthcare organizations evaluating commercial LLM summarization tools for clinical deployment.</p>
</sec>
<sec id="s2" sec-type="methods"><label>2</label><title>Materials and methods</title>
<sec id="s2a"><label>2.1</label><title>Study design</title>
<p>The checklist was developed using a modified iterative consensus approach (<xref ref-type="bibr" rid="B12">12</xref>), comprising three sequential stages: a systematic literature review, development of an initial version by a supervisory group, and an iterative consensus process involving a multidisciplinary expert panel.</p>
</sec>
<sec id="s2b"><label>2.2</label><title>Rationale for methodology selection</title>
<p>The modified approach to achieving consensus through open discussions was chosen over the classic anonymized Delphi method (<xref ref-type="bibr" rid="B13">13</xref>) due to the specific nature of the task. The checklist resulted from live discussions regarding the details of prompt engineering, model architectures, and hallucination metrics, a challenging task that could hardly be achieved through questionnaires alone. Three rounds of documented discussions ensured transparent decision-making comparable to the Delphi method, while offering the advantage of face-to-face interaction and interdisciplinary integration. The checklist development followed the general principles of the EQUATOR Network (<xref ref-type="bibr" rid="B14">14</xref>) for reporting guidelines.</p>
</sec>
<sec id="s2c"><label>2.3</label><title>Systematic literature review</title>
<p>The systematic review was conducted in accordance with the PRISMA guidelines (<xref ref-type="bibr" rid="B15">15</xref>). A search of PubMed and eLibrary databases for 2023&#x2013;2025 identified 216 relevant studies. For each study, we reviewed the methodology for the following elements: model selection rationale, prompt disclosure, hallucination detection methods, peer review protocols, safety strategies, and pilot testing. We also evaluated the existing reporting standards [TRIPOD-LLM (<xref ref-type="bibr" rid="B10">10</xref>), DEAL (<xref ref-type="bibr" rid="B11">11</xref>), CONSORT-AI (<xref ref-type="bibr" rid="B8">8</xref>), SPIRIT-AI (<xref ref-type="bibr" rid="B9">9</xref>), MI-CLAIM (<xref ref-type="bibr" rid="B16">16</xref>), STARD-AI (<xref ref-type="bibr" rid="B17">17</xref>)] to identify items applicable to summarization tasks and detect potential shortcomings.</p>
</sec>
<sec id="s2d"><label>2.4</label><title>Expert panel composition</title>
<p>A supervisory group of two experts (a medical informatics specialist with experience in radiological AI and a radiologist experienced in clinical trials investigating AI products) developed an initial version of the checklist, which included 44 items grouped into seven main sections: study planning and design, data curation, technical specification refinement, testing, data availability, and documentation.</p>
<p>The preliminary version was presented to a multidisciplinary expert panel of 11 specialists: three radiologists, two clinicians, three medical informatics experts, one biostatistician, and two developers of medical LLMs. The experts were required to have at least two years of relevant experience, publications on digital solutions for medicine, and practical experience with medical AI. On average, the experts possessed more than 3 years of experience and had authored five relevant publications. The 2-year threshold was introduced due to the relative novelty of medical LLM domain (widespread adoption since 2023) and the need to engage specialists working with emerging technologies.</p>
<p>The panel included 11 experts, which is consistent with the Delphi recommendations for homogeneous groups. In such studies, the optimal expert panel size is 10&#x2013;18 participants. When determining the panel size, rather than using traditional power analysis, we sought to achieve a robust consensus and to limit the contribution of each participant to the final distribution of responses. Narrative reviews indicate that most panels in healthcare consist of 8&#x2013;23 experts (<xref ref-type="bibr" rid="B18">18</xref>). Thus, our panel of 11 experts was designed to secure a balance between the statistical robustness and the consensus feasibility.</p>
</sec>
<sec id="s2e"><label>2.5</label><title>Consensus process</title>
<p>The consensus process involved three rounds of in-person discussions, each lasting up to two hours, spaced three weeks apart. The meetings were moderated by a member of the supervisory group, who ensured equal representation of opinions using a round-robin technique, where each participant took turns to present their position prior to the open discussion. The checklist items were discussed one-by-one. The experts were allowed to introduce or remove the items, modify wording, or alter the grouping. The arising disagreements were addressed through additional clarifications or rephrasing; otherwise, the decision was postponed until the next meeting and further literature review. The consensus implied a unanimous agreement by all the 11 panel members.</p>
<p>Between the meetings, the supervisors implemented the agreed-upon changes, drafted rationales with supporting literature references, and created a document tracking all modifications.</p>
<p>The key disagreements revolved around prompting strategies (resolved by requiring full prompt disclosure, either in the main text or an appendix), pilot testing (shifted from mandatory to recommended but with a clear indication), and hallucination identification criteria (where consensus required both quantitative evaluation of hallucination frequency and a qualitative evaluation of clinical impact).</p>
<p>The final version of the checklist was unanimously approved by all members of the multidisciplinary expert group.</p>
</sec>
</sec>
<sec id="s3" sec-type="results"><label>3</label><title>Results</title>
<sec id="s3a"><label>3.1</label><title>Literature review findings</title>
<p>A systematic review of 216 publications identified critical gaps in reporting (Vasilev et al., 2025). Quality assessment using PROBAST criteria identified that 98&#x0025; (211/216) of studies did not reference any reporting standards, while 89&#x0025; (192/216) demonstrated high risk of bias, primarily due to inadequate documentation of prompt engineering, model versioning, and evaluation methodology. The common lack of technical details regarding model configurations, prompting strategies, and evaluation protocols makes independent verification virtually impossible.</p>
</sec>
<sec id="s3b"><label>3.2</label><title>Consensus outcomes</title>
<p>Over the course of three consensus rounds, the checklist evolved from an initial 44 items across 7 sections to a final version comprising 24 items in 6 sections. Key revisions included consolidating overlapping items, removing items deemed unnecessary for context diversity, and adding requirements that emerged during expert discussions (<xref ref-type="fig" rid="F1">Figure&#x00A0;1</xref>).</p>
<fig id="F1" position="float"><label>Figure&#x00A0;1</label>
<caption><p>Checklist evolution.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1761601-g001.tif"><alt-text content-type="machine-generated">Flowchart illustrating four document versions, their item and section counts, and changes after each review round: Version 1 has forty-four items, seven sections; Version 2 has thirty-three items, five sections after removing eleven items and merging two sections; Version 3 has twenty-nine items, five sections after further item removal and definition modification; Version 4, final, has twenty-four items, six sections after additional edits.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3c"><label>3.3</label><title>Final checklist structure</title>
<p>The MEDAI-LLM-SUMM checklist comprises six sections covering the full lifecycle of medical summarization research, from concept to pilot testing. Each section includes recommended elements that a paper should contain to ensure maximal transparency and reproducibility (<xref ref-type="fig" rid="F2">Figure&#x00A0;2</xref>).</p>
<fig id="F2" position="float"><label>Figure&#x00A0;2</label>
<caption><p>MEDAI-LLM-SUMM checklist structure. Total: 24 items (20 core&#x2009;&#x002B;&#x2009;4 optional). Items marked with asterisk are recommended but optional depending on study design.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1761601-g002.tif"><alt-text content-type="machine-generated">Grid-style diagram presents six colored sections, each listing items relevant to evaluating medical summarization. Sections cover clinical validity, model selection, data, quality assessment, safety, and data availability, with bullet points detailing subtopics under each category.</alt-text>
</graphic>
</fig>
<sec id="s3c1"><label>3.3.1</label><title>Section A: clinical validity</title>
<p>Medical summarization fundamentally differs from general natural language processing (NLP) tasks due to the need to account for deep clinical context (e.g., extensive chronic disease history), the risks associated with inaccurately summarized information (or the omission of critical details), and the unique healthcare practices across different countries. This section ensures the clinical relevance of the summarization task and the study design prior to technical integration.</p>
<p>A1. General questions to determine the relevance and merit of the study. This section contains three questions related to the clinical task at hand, the availability of similar solutions, and the originality of the proposed method.</p>
<p>A2. Expert community involvement (optional). While this section is optional, the involvement of experts is recommended at all stages: (1) identifying critical domain-specific information that the model must consider; (2) developing evaluation criteria and error classification methodology; (3) output validation through assessing the clinical accuracy and safety of summaries; (4) interpretation using pre-determined error thresholds for specific use cases.</p>
<p>A3. Research hypothesis. The stated hypothesis should clearly specify the object of comparison (e.g., a study scenario comparing LLM summaries with those produced by other machine learning methods or medical experts), the comparison criteria (e.g., evaluation and clinical metrics), and the expected outcome (measurable indicators).</p>
<p>A4. Medical task addressed by summarization. This section is critical as various medical fields share different priorities that render universal quality assessment infeasible. For example, a radiology report summary must preserve the site and size of findings; summaries of medical records must contain vital signs, the onset and course of the patient&#x0027;s condition, and laboratory and imaging data; in oncology, discharge summaries, tumor staging, type of surgical intervention, chemotherapy, and molecular markers are prioritized. Without knowing the clinical context, it is impossible to determine whether a model has missed important information or filtered out redundant details.</p>
</sec>
<sec id="s3c2"><label>3.3.2</label><title>Section B: model selection</title>
<p>The quality of a medical text summary depends on the models and their parameters, which indirectly affect the reliability. This section should contain a clear rationale for choosing a specific model, its system requirements, and key configuration parameters critical for the reproducibility.</p>
<p>B1. Rationale for model selection. This section requires disclosure of model details and selection rationale. The rationale should include: a comparative analysis of available models and their suitability for the task; target language support (especially for non-English data); compliance with data privacy requirements and the feasibility of local deployment; outcomes of preliminary or pilot testing (if any) using representative clinical data; and the feasibility of domain-specific fine-tuning through additional training using special medical dictionaries. Where multiple models are considered, the section should contain a rationale for the resulting ensemble.</p>
<p>B2. System requirements. This includes disclosure of GPU/TPU configuration (type, number, VRAM size), system RAM capacity, requirements for data security infrastructure, operating system and driver versions, and data storage requirements for datasets and model versions. This information is critical for study reproducibility at other sites and for the estimation of resource constraints required for scaling.</p>
<p>B3. Deployment environment (optional). For retrained models, this section specifies key hyperparameters (learning rate, batch size, number of epochs, sequence length), the hyperparameter optimization method and search ranges, the optimizer and regularization parameters, seed values for reproducibility, and specific fine-tuning settings (layer freezing strategy, learning rate scheduling, etc.).</p>
<p>B4. LLM-as-a-judge (if applicable). LLM-as-a-Judge refers to the methodology of using one large language model to evaluate the outputs of another. Studies employing this methodology should report: (1) the evaluator model and version, (2) evaluation prompts used, (3) calibration against human judgment with inter-rater reliability metrics, and (4) agreement statistics between LLM and human assessors.</p>
<p>B5. Prompting strategy. Disclosing prompts is critical for the reproducibility of studies that use generative systems. The study should fully disclose the summarization prompts, the prompts used for LLM judge (including detailed assessment criteria and instructions for scale usage), sample prompts with use case examples, and prompt engineering strategies employed to ensure accuracy and safety. All prompts must be available in the main text, the appendix, or in an open repository.</p>
</sec>
<sec id="s3c3"><label>3.3.3</label><title>Section C: data</title>
<p>The quality and characteristics of the data govern the reliability and applicability of the study results. This section describes the requirements for the datasets, methodologies for creating reference summaries, and sample stratification.</p>
<p>C1. Datasets. Descriptions of datasets used for domain pre-training and task-specific fine-tuning, the mandatory stages of medical LLM training, must include all document types to be summarized (e.g., medical records, discharge summaries, laboratory and imaging test results, etc.), a statistical rationale for sample size to ensure clinical diversity, and the data type ratio (radiology reports, medical records, etc.).</p>
<p>C2. Reference summaries with expert consensus. This section is expected to detail the reference summary methodology, including: the selection process, the number and credentials of annotators, inter-annotator agreement and disagreement resolution, annotation guidelines, the size and representativeness of the sample of annotated documents, possible alternative approaches (e.g., manual vs. hybrid methods that utilise LLM-generated summaries refined by annotators, or synthetic data), and the rationale for the chosen approach. If existing datasets with reference summaries were used, the section should indicate their source, quality verification, and suitability for the study objectives.</p>
<p>C3. Data stratification. Data stratification may be necessary to achieve balance across different levels of clinical complexity. Unstratified datasets often yield models tailored for typical cases but underperforming in complex clinical scenarios. Disclosing the data stratification contributes to a better understanding of possible reproducibility challenges on similar data that would require specific datasets.</p>
</sec>
<sec id="s3c4"><label>3.3.4</label><title>Section D: quality assessment</title>
<p>The quality assessment section should outline a comprehensive methodology for summarizer validation, including evaluation metrics, clinical evaluation by experts, and pilot testing. This section is critical for demonstrating the system&#x0027;s readiness for clinical use and identifying its limitations.</p>
<p>D1. Evaluation metrics relevant to the medical task. The section requires listing the selected metrics (taking into account the task&#x0027;s context) and providing a detailed rationale for their applicability. Traditional NLP metrics, such as ROUGE (which measures n-gram overlap between the generated and reference summary) and BLEU (which assesses word-level accuracy), may be insufficient for the medical context as they focus on superficial similarity rather than preserving clinical meaning. Using an integrated approach requires specifying the methodology for calculating the final metric (e.g., weighted average and minimum value for all components) and providing the results for all the evaluation metrics with 95&#x0025; confidence intervals (where applicable). The threshold calculation method for the metrics must also be specified.</p>
<p>D2. Clinical metrics and involvement of medical experts. In addition to automated evaluation metrics, expert review of the clinical relevance is critical. The number of experts involved must be statistically justified. The minimum requirement is at least two independent experts for each review. Clinical metrics should reflect the specific domain requirements and may include: preservation of critical information (diagnoses, drug dosages, key tests); absence of clinically significant errors or distortions; time sequence adequately reflecting clinical events; correct medical terminology and compliance with clinical documentation standards; and potential impact on clinical decision-making.</p>
<p>D3. Review by domain experts and validated questionnaire. A validated questionnaire for systematic expert review with clearly defined criteria and scales should be adopted or designed. This section should present a detailed expert review methodology, including: blinding protocols regarding AI-generated and reference summaries; a randomised worklist; statistical methods for inter-rater agreement; and disagreement resolution. The paper should present the review findings indicating the mean values, standard deviations, and confidence intervals for each criterion. In the absence of inter-rater agreement, this decision must be justified or acknowledged as a limitation.</p>
<p>D4. Verification of outputs. An output data verification system is necessary to handle hallucinations. The study is expected to incorporate justified hallucination detection methods (fact-based approaches, entailment-based metrics, LLM-based detection), ensure factual consistency with the original data, verify medical claims against reliable sources (PubMed, clinical guidelines, etc.), and automatically track inconsistencies and unconfirmed claims. Hallucinations class thresholds and security frameworks (guardrails) should be established and outlined.</p>
<p>D5. LLM-as-a-judge evaluation (optional). If the LLM-as-a-judge approach (see Section B4) was used, its parameters and outputs should be disclosed. This should include: the agreement between the LLM and human expert expressed through correlation coefficients and agreement metrics; a detailed overview of the LLM summaries using pre-defined quality criteria; a comparison of LLM robustness using different data subsets; overview of discrepancies between the LLM and experts highlighting possible error patterns; and LLM stability across multiple runs (given the stochastic nature of LLM generation). It is important to note that while LLMs cannot completely replace human expertise, especially when the judgement relies on subtle clinical nuances, they can significantly speed up the evaluation of large volumes of summaries, provided their reliability is adequately validated.</p>
<p>D6. Size of the validation sample. The rationale for the validation sample size should be based on the power analysis that takes into account the expected effect, significance level (usually <italic>&#x03B1;</italic>&#x003D;0.05), and desired statistical power (usually 80&#x0025;&#x2013;90&#x0025;).</p>
<p>D7. Pilot testing (optional). Evaluation of LLM summarization performance within actual clinical workflows is recommended but recognized as exceeding the scope of typical research projects. Pilot testing should follow a defined design that specifies the study type (laboratory or clinical), duration of follow-up for prospective studies, and key performance indicators (e.g., time-saving for medical record review during diagnosis establishment). The systematic identification and documentation of limitations is mandatory, including technical constraints (context size, language limitations), clinical limitations (document types, specializations), potential biases, and peer review requirements. These limitations should be clearly articulated to ensure safe implementation in clinical practice.</p>
<p>D8. Limitations are investigated and documented. It should be stated whether the summarizer&#x0027;s limitations have been systematically identified and documented. Clearly defining the limitations is critical for safe implementation in clinical practice and for preventing the use of the system in inappropriate contexts. Disclosing the limitations does not detract from the value of the research; on the contrary, it demonstrates a responsible approach and helps practitioners make informed decisions. Limitations should be clearly stated both in the paper and in the user documentation.</p>
</sec>
<sec id="s3c5"><label>3.3.5</label><title>Section E: safety</title>
<p>The safety section covers key aspects of ethical and legal compliance during the development, deployment, and further training of AI systems for medical text summarization.</p>
<p>E1. Ethical approval. Obtaining ethical approval is mandatory for any research using AI to analyze medical data.</p>
<p>E2. Patient Data Protection. Authors must describe the data protection measures implemented, which may include: de-identification/pseudonymization, local model deployment, use of APIs with privacy guarantees, or other technical solutions. Quality control involves continuous monitoring of the LLM&#x0027;s performance using established evaluation systems.</p>
</sec>
<sec id="s3c6"><label>3.3.6</label><title>Section F: data availability</title>
<p>This section discusses access to data and model configurations required to assess the reproducibility of the findings. The paper should clearly indicate the availability of the software code and datasets. For each item that is available, the access format should be specified (GitHub repository, Zenodo, application, etc.); otherwise, a rationale for the unavailability is required (data confidentiality, license, commercial restrictions, etc.).</p>
</sec>
</sec>
<sec id="s3d"><label>3.4</label><title>Comparison with existing standards</title>
<p>For clarity, we compared the checklist with existing reporting standards (<xref ref-type="table" rid="T1">Table&#x00A0;1</xref>). The most relevant AI reporting standards in medicine were selected for comparison: TRIPOD-LLM (an LLM-specific tool), DEAL (LLM development and evaluation), MI-CLAIM (clinical applicability of AI), STARD-AI (diagnostic accuracy), CONSORT-AI and SPIRIT-AI (clinical trials with AI).</p>
<table-wrap id="T1" position="float"><label>Table&#x00A0;1</label>
<caption><p>Feature comparison across checklists.</p></caption>
<table>
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<tbody>
<tr>
<th valign="top" align="left">Property</th>
<th valign="top" align="center">MEDAI-LLM-SUMM</th>
<th valign="top" align="center">TRIPOD-LLM</th>
<th valign="top" align="center">DEAL</th>
<th valign="top" align="center">MI-CLAIM</th>
<th valign="top" align="center">STARD-AI</th>
<th valign="top" align="center">CONSORT-AI</th>
<th valign="top" align="center">SPIRIT-AI</th>
</tr>
<tr>
<td valign="top" align="left">Year of introduction</td>
<td valign="top" align="left">&#x2013;</td>
<td valign="top" align="left">2024</td>
<td valign="top" align="left">2025</td>
<td valign="top" align="left">2020</td>
<td valign="top" align="left">2025</td>
<td valign="top" align="left">2020</td>
<td valign="top" align="left">2020</td>
</tr>
<tr>
<td valign="top" align="left">Number of key items</td>
<td valign="top" align="left">24 (6 sections)</td>
<td valign="top" align="left">19 main items&#x2009;&#x002B;<sans-serif>&#x2009;50</sans-serif> subitems</td>
<td valign="top" align="left">2 versions (A and B)</td>
<td valign="top" align="left">6 main sections</td>
<td valign="top" align="left">18 new/modified items&#x2009;&#x002B;<sans-serif>&#x2009;STARD</sans-serif> 2015</td>
<td valign="top" align="left">14 additional items&#x2009;&#x002B;<sans-serif>&#x2009;CONSORT</sans-serif> 25</td>
<td valign="top" align="left">15 additional items&#x2009;&#x002B;<sans-serif>&#x2009;SPIRIT</sans-serif> 33</td>
</tr>
<tr>
<td valign="top" align="left">Domain</td>
<td valign="top" align="left">Medical text summarization</td>
<td valign="top" align="left">Predictive models, wide range of tasks</td>
<td valign="top" align="left">LLM development and evaluation</td>
<td valign="top" align="left">Clinical AI models</td>
<td valign="top" align="left">AI diagnostic accuracy</td>
<td valign="top" align="left">Clinical trials of AI tools</td>
<td valign="top" align="left">AI trial protocols</td>
</tr>
<tr>
<td valign="top" align="left">Design methodology</td>
<td valign="top" align="left">Modified consensus approach, 11 experts</td>
<td valign="top" align="left">Accelerated Delphi method</td>
<td valign="top" align="left">Literature review</td>
<td valign="top" align="left">Expert consensus</td>
<td valign="top" align="left">Multi-stage process, &#x003E;240 stakeholders</td>
<td valign="top" align="left">Consensus</td>
<td valign="top" align="left">Consensus</td>
</tr>
<tr>
<td valign="top" align="left">Focus on prompt engineering</td>
<td valign="top" align="left">Detailed (Section B5)</td>
<td valign="top" align="left">Included</td>
<td valign="top" align="left">Detailed</td>
<td valign="top" align="left">Not covered</td>
<td valign="top" align="left">Not covered</td>
<td valign="top" align="left">Not covered</td>
<td valign="top" align="left">Not covered</td>
</tr>
<tr>
<td valign="top" align="left">Hallucination handling</td>
<td valign="top" align="left">Mandatory (D4)</td>
<td valign="top" align="left">Mentioned</td>
<td valign="top" align="left">Mentioned</td>
<td valign="top" align="left">Not covered</td>
<td valign="top" align="left">Not covered</td>
<td valign="top" align="left">Not covered</td>
<td valign="top" align="left">Not covered</td>
</tr>
<tr>
<td valign="top" align="left">Expert verification</td>
<td valign="top" align="left">Mandatory with inter-expert agreement</td>
<td valign="top" align="left">Recommended</td>
<td valign="top" align="left">Mentioned</td>
<td valign="top" align="left">Recommended</td>
<td valign="top" align="left">Recommended</td>
<td valign="top" align="left">Recommended</td>
<td valign="top" align="left">Recommended</td>
</tr>
<tr>
<td valign="top" align="left">Pilot testing</td>
<td valign="top" align="left">Recommended and detailed (D7)</td>
<td valign="top" align="left">General recommendations</td>
<td valign="top" align="left">Mentioned</td>
<td valign="top" align="left">Not covered</td>
<td valign="top" align="left">Clinical validation</td>
<td valign="top" align="left">Mandatory</td>
<td valign="top" align="left">Mandatory</td>
</tr>
<tr>
<td valign="top" align="left">Reference summaries</td>
<td valign="top" align="left">Detailed requirements (C2)</td>
<td valign="top" align="left">Not covered</td>
<td valign="top" align="left">Not covered</td>
<td valign="top" align="left">Not applicable</td>
<td valign="top" align="left">Not applicable</td>
<td valign="top" align="left">Not applicable</td>
<td valign="top" align="left">Not applicable</td>
</tr>
<tr>
<td valign="top" align="left">LLM-as-a-judge</td>
<td valign="top" align="left">Detailed requirements (B4, D5)</td>
<td valign="top" align="left">Mentioned</td>
<td valign="top" align="left">Mentioned</td>
<td valign="top" align="left">Not applicable</td>
<td valign="top" align="left">Not applicable</td>
<td valign="top" align="left">Not applicable</td>
<td valign="top" align="left">Not applicable</td>
</tr>
<tr>
<td valign="top" align="left">Clinical validity</td>
<td valign="top" align="left">Dedicated section A</td>
<td valign="top" align="left">Integrated</td>
<td valign="top" align="left">General concepts</td>
<td valign="top" align="left">Detailed</td>
<td valign="top" align="left">Detailed</td>
<td valign="top" align="left">Core element</td>
<td valign="top" align="left">Core element</td>
</tr>
<tr>
<td valign="top" align="left">Evaluation metrics</td>
<td valign="top" align="left">Mandatory and summarization-specific (D1)</td>
<td valign="top" align="left">Mandatory and task-specific</td>
<td valign="top" align="left">Mandatory, not detailed</td>
<td valign="top" align="left">General requirements</td>
<td valign="top" align="left">Detailed for diagnostics</td>
<td valign="top" align="left">Trial outcomes</td>
<td valign="top" align="left">Planned metrics</td>
</tr>
<tr>
<td valign="top" align="left">Interactive tools</td>
<td valign="top" align="left">None</td>
<td valign="top" align="left">Available (tripod-llm.vercel.app)</td>
<td valign="top" align="left">None</td>
<td valign="top" align="left">None</td>
<td valign="top" align="left">None</td>
<td valign="top" align="left">None</td>
<td valign="top" align="left">None</td>
</tr>
<tr>
<td valign="top" align="left">Scope of application</td>
<td valign="top" align="left">Summarization only</td>
<td valign="top" align="left">Prediction, diagnostics, monitoring, screening</td>
<td valign="top" align="left">All LLM applications</td>
<td valign="top" align="left">All clinical AI applications</td>
<td valign="top" align="left">Diagnostics only</td>
<td valign="top" align="left">All AI interventions in RCTs</td>
<td valign="top" align="left">All AI trial protocols</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s4" sec-type="discussion"><label>4</label><title>Discussion</title>
<p>The MEDAI-LLM-SUMM checklist addresses a critical gap in medical AI research infrastructure. Our analysis reveals that the 24-item framework captures requirements not adequately covered by existing guidelines, with 8 items (33&#x0025;) representing entirely novel reporting elements specific to medical text summarization. A recent systematic review of 84 studies found significant heterogeneity in LLM performance across different medical tasks, highlighting the importance of context and task complexity for study planning (<xref ref-type="bibr" rid="B19">19</xref>). The proposed MEDAI-LLM-SUMM checklist structure is intended to standardize ongoing medical research and manuscripts based on their findings (the version for work is presented in <xref ref-type="sec" rid="s10">Supplementary Material 1</xref>).</p>
<p>This study focused on the summarization of medical texts. The narrow scope of application was attributed to several factors.</p>
<p>First, summarization is one of the most mature approaches in medical LLM applications, judging by the volume of studies published over the past two years. However, it also raises the most questions regarding the reliability and consistency of approaches to clinical utility (<xref ref-type="bibr" rid="B20">20</xref>). A systematic review by Bednarczyk et al. identified serious methodological issues related to widespread neglect of external testing and the virtual absence of patient safety testing (in only 3&#x0025; of studies) where clinical decisions were intended to rely on AI summaries. For example, 42&#x0025; of GPT-4 summaries contained hallucinations and 47&#x0025; missed critical information (<xref ref-type="bibr" rid="B5">5</xref>).</p>
<p>Second, the misleading ease of use (the temptation to reduce the amount of text and leave only the most important information) comes with significant technical complexities that require specific solutions. Tang et al. showed that evaluation metrics correlate weakly with the quality of medical summaries as LLMs are prone to contradictory statements (<xref ref-type="bibr" rid="B1">1</xref>). Croxford et al. confirm that in high-risk domains such as healthcare, &#x201C;good enough&#x201D; summaries are insufficient due to specific requirements for accuracy and clinical relevance (<xref ref-type="bibr" rid="B21">21</xref>). These challenges require specialized evaluation protocols (<xref ref-type="bibr" rid="B22">22</xref>) which may rely on customized LLM judges (<xref ref-type="bibr" rid="B23">23</xref>).</p>
<p>Third, quality control of summaries requires standardization. While quality and reliability metrics still draw from expert judgement, the ReproNLP study found that the reproducibility of results obtained using experts is understudied, raising concerns for NLP domains where human reference is common, including summarization (<xref ref-type="bibr" rid="B24">24</xref>).</p>
<p>All this leads to a peculiar paradox: while the technology is ready for deployment, there are no reliable methods for verifying its safety and effectiveness, despite the apparent need.</p>
<p>MEDAI-LLM-SUMM addresses several critical gaps not covered by existing reporting standards. It provides mandatory requirements for hallucination assessment (item D4), which no other standard explicitly requires. The checklist includes detailed specifications for reference summary creation methodology (item C2), addressing a fundamental challenge in summarization research. It provides specific requirements for LLM-as-judge validation (items B4 and D5), reflecting the growing use of this approach while ensuring rigor. The narrow focus on summarization allows for practical, detailed recommendations rather than general principles characteristic of universal guidelines.</p>
<p>Our approach to this checklist has several methodological limitations that should be considered during interpretation and application.</p>
<p>The most significant limitation is that all 11 members of the expert panel work for multiple organizations in a single country. This creates three types of potential problems: the risk of institutional bias, the limited regulatory context, and the risk of groupthink when reaching consensus. To minimize these risks, the checklist drew from a systematic review of international literature and international standards, rather than institutional preferences. Unanimous agreement by all experts was required, and transparency was ensured by documenting all disagreements and publishing interim versions.</p>
<p>We view the current version as a first iteration requiring international validation. Within 6&#x2013;12 months of publication, an advisory group with experts from other countries is scheduled to develop version 2.0 based on feedback. The panel size (<italic>n</italic>&#x2009;&#x003D;&#x2009;11) is relatively small compared to TRIPOD-LLM or CONSORT-AI, which involved 15&#x2013;30 experts. However, for the highly specialized task of medical text summarization, the multidisciplinary approach ensured adequate representation of key stakeholders.</p>
<p>Unlike the traditional Delphi method, open discussions were used without anonymizing opinions and without quantitatively grading the scale items (e.g., GRADE 1&#x2013;9). The use of non-anonymous consensus rounds may have introduced social desirability bias, with participants potentially conforming to opinions of senior colleagues. However, we note that this approach also enabled real-time clarification of complex technical concepts and immediate resolution of terminology disagreements, which was particularly valuable given the novel and rapidly evolving nature of the LLM field. For the version 2.0, we plan to have experts rate the importance of final items on a scale of 1&#x2013;9 using the GRADE methodology to retrospectively identify critical items.</p>
<p>The most significant methodological limitation of the current version is the lack of formal pilot testing on real publications, which we plan to conduct in the future. Practical applicability, completion time, consistency of item interpretation by different users, comprehensiveness of coverage of real-world methodological issues, and reproducibility of assessments between reviewers were not assessed. The decision to publish the checklist without prior pilot testing was driven by the pressing concern associated with reproducibility in medical LLM research and the adoption of a living document approach, where the current version is considered the first iteration.</p>
<p>We are adopting a regular checklist update approach (similar to TRIPOD-LLM) to maintain relevance amidst the rapid evolution of LLM technologies. Minor updates to clarify wording are planned quarterly as needed, medium updates with item additions are planned annually based on accumulated feedback, and major revisions every 2&#x2013;3 years or when significant technological changes occur. This iterative approach is consistent with the dynamic nature of the field and allows the checklist to evolve alongside technologies and methodological standards, gradually overcoming the limitations of the initial version.</p>
<p>The MEDAI-LLM-SUMM checklist represents the first specialized reporting standard for medical text summarization studies using LLMs. Its implementation will address three fundamental problems: the lack of reproducibility, the absence of a uniform approach to safety assessment, and the gap between laboratory performance and clinical readiness.</p>
<p>The standardized description of studies using the proposed checklist is intended to ensure the reproducibility by detailing the selection of models, prompts, evaluation methodologies, and hallucination detection protocols. The checklist will provide an objective tool for evaluating clinical applicability by incorporating requirements for pilot testing, expert validation, and limitations statement. The standardization of evaluation metrics and testing protocols will provide a methodological framework for regulatory approval and clinical guidelines (a worked example of manuscript evaluation is provided in <xref ref-type="sec" rid="s10">Supplementary Material 2</xref>).</p>
<p>Furthermore, the MEDAI-LLM-SUMM checklist could become a unified methodological tool for study comparison, which is critically important amidst the rapid growth of publications and commercial solutions. The narrow focus on the summarization task allowed us to create detailed and applicable recommendations instead of the general principles typical of universal guidelines. The results can be scaled to related subject areas. The proposed approach can serve as a model for specialized checklists for other medical NLP tasks (diagnostics, treatment planning, clinical coding), gradually covering the entire spectrum of medical LLM applications that share unique features and safety requirements.</p>
<p>MEDAI-LLM-SUMM complements rather than replaces existing guidelines. TRIPOD-LLM provides general reporting standards for clinical prediction models using LLMs, but does not address summarization-specific elements such as reference summary creation methodology, multi-document synthesis, or temporal consistency in longitudinal records. We recommend using MEDAI-LLM-SUMM alongside TRIPOD-LLM for comprehensive reporting.</p>
</sec>
</body>
<back>
<sec id="s5" sec-type="data-availability"><title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="sec" rid="s10">Supplementary Material</xref>, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec id="s6" sec-type="author-contributions"><title>Author contributions</title>
<p>AK: Data curation, Investigation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. MV: Data curation, Investigation, Writing &#x2013; review &#x0026; editing. RE: Investigation, Writing &#x2013; review &#x0026; editing. IB: Visualization, Writing &#x2013; review &#x0026; editing. RR: Formal analysis, Methodology, Writing &#x2013; review &#x0026; editing. MK: Formal analysis, Methodology, Writing &#x2013; review &#x0026; editing. AP: Data curation, Writing &#x2013; review &#x0026; editing. TB: Writing &#x2013; review &#x0026; editing. KA: Methodology, Software, Writing &#x2013; review &#x0026; editing. OO: Funding acquisition, Project administration, Resources, Writing &#x2013; review &#x0026; editing. AV: Conceptualization, Supervision, Writing &#x2013; review &#x0026; editing. YV: Conceptualization, Supervision, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec id="s8" sec-type="COI-statement"><title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s9" sec-type="ai-statement"><title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec id="s11" sec-type="disclaimer"><title>Publisher&#x0027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s10" sec-type="supplementary-material"><title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fdgth.2026.1761601/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fdgth.2026.1761601/full&#x0023;supplementary-material</ext-link></p>
<supplementary-material xlink:href="Table1.docx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document"/>
<supplementary-material xlink:href="Table2.docx" id="SM2" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document"/>
</sec>
<ref-list><title>References</title>
<ref id="B1"><label>1.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tang</surname> <given-names>L</given-names></name> <name><surname>Sun</surname> <given-names>Z</given-names></name> <name><surname>Idnay</surname> <given-names>B</given-names></name> <name><surname>Nestor</surname> <given-names>JG</given-names></name> <name><surname>Soroush</surname> <given-names>A</given-names></name> <name><surname>Elias</surname> <given-names>PA</given-names></name><etal/></person-group> <article-title>Evaluating large language models on medical evidence summarization</article-title>. <source>NPJ Digit Med</source>. (<year>2023</year>) <volume>6</volume>(<issue>1</issue>):<fpage>158</fpage>. <pub-id pub-id-type="doi">10.1038/s41746-023-00896-7</pub-id><pub-id pub-id-type="pmid">37620423</pub-id></mixed-citation></ref>
<ref id="B2"><label>2.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Croxford</surname> <given-names>E</given-names></name> <name><surname>Gao</surname> <given-names>Y</given-names></name> <name><surname>Pellegrino</surname> <given-names>N</given-names></name> <name><surname>Wong</surname> <given-names>K</given-names></name> <name><surname>Wills</surname> <given-names>G</given-names></name> <name><surname>First</surname> <given-names>E</given-names></name><etal/></person-group> <article-title>Current and future state of evaluation of large language models for medical summarization tasks</article-title>. <source>NPJ Health Syst</source>. (<year>2025</year>) <volume>2</volume>(<issue>1</issue>):<fpage>6</fpage>. <pub-id pub-id-type="doi">10.1038/s44401-024-00011-2</pub-id><pub-id pub-id-type="pmid">40124388</pub-id></mixed-citation></ref>
<ref id="B3"><label>3.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Van Veen</surname> <given-names>D</given-names></name> <name><surname>Van Uden</surname> <given-names>C</given-names></name> <name><surname>Blankemeier</surname> <given-names>L</given-names></name> <name><surname>Delbrouck</surname> <given-names>JB</given-names></name> <name><surname>Aali</surname> <given-names>A</given-names></name> <name><surname>Bluethgen</surname> <given-names>C</given-names></name><etal/></person-group> <article-title>Adapted large language models can outperform medical experts in clinical text summarization</article-title>. <source>Nat Med</source>. (<year>2024</year>) <volume>30</volume>(<issue>4</issue>):<fpage>1134</fpage>&#x2013;<lpage>42</lpage>. <pub-id pub-id-type="doi">10.1038/s41591-024-02855-5</pub-id><pub-id pub-id-type="pmid">38413730</pub-id></mixed-citation></ref>
<ref id="B4"><label>4.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hager</surname> <given-names>P</given-names></name> <name><surname>Jungmann</surname> <given-names>F</given-names></name> <name><surname>Holland</surname> <given-names>R</given-names></name> <name><surname>Bhagat</surname> <given-names>K</given-names></name> <name><surname>Hubrecht</surname> <given-names>I</given-names></name> <name><surname>Knauer</surname> <given-names>M</given-names></name><etal/></person-group> <article-title>Evaluation and mitigation of the limitations of large language models in clinical decision-making</article-title>. <source>Nat Med</source>. (<year>2024</year>) <volume>30</volume>(<issue>9</issue>):<fpage>2613</fpage>&#x2013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.1038/s41591-024-03097-1</pub-id><pub-id pub-id-type="pmid">38965432</pub-id></mixed-citation></ref>
<ref id="B5"><label>5.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bednarczyk</surname> <given-names>L</given-names></name> <name><surname>Reichenpfader</surname> <given-names>D</given-names></name> <name><surname>Gaudet-Blavignac</surname> <given-names>C</given-names></name> <name><surname>Ette</surname> <given-names>AK</given-names></name> <name><surname>Zaghir</surname> <given-names>J</given-names></name> <name><surname>Zheng</surname> <given-names>Y</given-names></name><etal/></person-group> <article-title>Scientific evidence for clinical text summarization using large language models: scoping review</article-title>. <source>J Med Internet Res</source>. (<year>2025</year>) <volume>27</volume>(<issue>1</issue>):<fpage>e68998</fpage>. <pub-id pub-id-type="doi">10.2196/68998</pub-id><pub-id pub-id-type="pmid">40371947</pub-id></mixed-citation></ref>
<ref id="B6"><label>6.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Soroush</surname> <given-names>A</given-names></name> <name><surname>Glicksberg</surname> <given-names>BS</given-names></name> <name><surname>Zimlichman</surname> <given-names>E</given-names></name> <name><surname>Barash</surname> <given-names>Y</given-names></name> <name><surname>Freeman</surname> <given-names>R</given-names></name> <name><surname>Charney</surname> <given-names>AW</given-names></name><etal/></person-group> <article-title>Large language models are poor medical coders &#x2014; benchmarking of medical code querying</article-title>. <source>NEJM AI</source>. (<year>2024</year>) <volume>1</volume>(<issue>5</issue>):<fpage>AIdbp2300040</fpage>. <pub-id pub-id-type="doi">10.1056/AIdbp2300040</pub-id></mixed-citation></ref>
<ref id="B7"><label>7.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>TL</given-names></name> <name><surname>Hetherington</surname> <given-names>TC</given-names></name> <name><surname>Dharod</surname> <given-names>A</given-names></name> <name><surname>Carroll</surname> <given-names>T</given-names></name> <name><surname>Bundy</surname> <given-names>R</given-names></name> <name><surname>Nguyen</surname> <given-names>H</given-names></name><etal/></person-group> <article-title>Does AI-powered clinical documentation enhance clinician efficiency? A longitudinal study</article-title>. <source>NEJM AI</source>. (<year>2024</year>) <volume>1</volume>(<issue>12</issue>):<fpage>AIoa2400659</fpage>. <pub-id pub-id-type="doi">10.1056/AIoa2400659</pub-id></mixed-citation></ref>
<ref id="B8"><label>8.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>X</given-names></name> <name><surname>Cruz Rivera</surname> <given-names>S</given-names></name> <name><surname>Moher</surname> <given-names>D</given-names></name> <name><surname>Calvert</surname> <given-names>MJ</given-names></name> <name><surname>Denniston</surname> <given-names>AK</given-names></name></person-group>. <article-title>Reporting guidelines for clinical trial reports for interventions involving artificial intelligence: the CONSORT-AI extension</article-title>. <source>Nat Med</source>. (<year>2020</year>) <volume>26</volume>(<issue>9</issue>):<fpage>1364</fpage>&#x2013;<lpage>74</lpage>. <pub-id pub-id-type="doi">10.1038/s41591-020-1034-x</pub-id><pub-id pub-id-type="pmid">32908283</pub-id></mixed-citation></ref>
<ref id="B9"><label>9.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cruz Rivera</surname> <given-names>S</given-names></name> <name><surname>Liu</surname> <given-names>X</given-names></name> <name><surname>Chan</surname> <given-names>AW</given-names></name> <name><surname>Denniston</surname> <given-names>AK</given-names></name> <name><surname>Calvert</surname> <given-names>MJ</given-names></name></person-group>. <article-title>Guidelines for clinical trial protocols for interventions involving artificial intelligence: the SPIRIT-AI extension</article-title>. <source>Nat Med</source>. (<year>2020</year>) <volume>26</volume>(<issue>9</issue>):<fpage>1351</fpage>&#x2013;<lpage>63</lpage>. <pub-id pub-id-type="doi">10.1038/s41591-020-1037-7</pub-id><pub-id pub-id-type="pmid">32908284</pub-id></mixed-citation></ref>
<ref id="B10"><label>10.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gallifant</surname> <given-names>J</given-names></name> <name><surname>Afshar</surname> <given-names>M</given-names></name> <name><surname>Ameen</surname> <given-names>S</given-names></name> <name><surname>Aphinyanaphongs</surname> <given-names>Y</given-names></name> <name><surname>Chen</surname> <given-names>S</given-names></name> <name><surname>Cacciamani</surname> <given-names>G</given-names></name><etal/></person-group> <article-title>The TRIPOD-LLM reporting guideline for studies using large language models</article-title>. <source>Nat Med</source>. (<year>2025</year>) <volume>31</volume>(<issue>1</issue>):<fpage>60</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1038/s41591-024-03425-5</pub-id><pub-id pub-id-type="pmid">39779929</pub-id></mixed-citation></ref>
<ref id="B11"><label>11.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tripathi</surname> <given-names>S</given-names></name> <name><surname>Alkhulaifat</surname> <given-names>D</given-names></name> <name><surname>Doo</surname> <given-names>FX</given-names></name> <name><surname>Rajpurkar</surname> <given-names>P</given-names></name> <name><surname>McBeth</surname> <given-names>R</given-names></name> <name><surname>Daye</surname> <given-names>D</given-names></name><etal/></person-group> <article-title>Development, evaluation, and assessment of large language models (DEAL) checklist: a technical report</article-title>. <source>NEJM AI</source>. (<year>2025</year>) <volume>2</volume>(<issue>6</issue>):<fpage>AIp2401106</fpage>. <pub-id pub-id-type="doi">10.1056/AIp2401106</pub-id></mixed-citation></ref>
<ref id="B12"><label>12.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hou</surname> <given-names>F</given-names></name> <name><surname>Triantaphyllou</surname> <given-names>E</given-names></name></person-group>. <article-title>An iterative approach for achieving consensus when ranking a finite set of alternatives by a group of experts</article-title>. <source>Eur J Oper Res</source>. (<year>2019</year>) <volume>275</volume>(<issue>2</issue>):<fpage>570</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1016/j.ejor.2018.11.047</pub-id></mixed-citation></ref>
<ref id="B13"><label>13.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Humphrey-Murto</surname> <given-names>S</given-names></name> <name><surname>Wood</surname> <given-names>TJ</given-names></name> <name><surname>Gonsalves</surname> <given-names>C</given-names></name> <name><surname>Mascioli</surname> <given-names>K</given-names></name> <name><surname>Varpio</surname> <given-names>L</given-names></name></person-group>. <article-title>The Delphi method</article-title>. <source>Acad Med</source>. (<year>2020</year>) <volume>95</volume>(<issue>1</issue>):<fpage>168</fpage>. <pub-id pub-id-type="doi">10.1097/ACM.0000000000002887</pub-id><pub-id pub-id-type="pmid">31335812</pub-id></mixed-citation></ref>
<ref id="B14"><label>14.</label><mixed-citation publication-type="other"><collab>EQUATOR Network</collab>. <comment>Enhancing the QUAlity and Transparency Of Health Research</comment>. <comment>Available online at:</comment> <ext-link ext-link-type="uri" xlink:href="https://www.equator-network.org/">https://www.equator-network.org/</ext-link> <comment>(Accessed November 27, 2025)</comment>.</mixed-citation></ref>
<ref id="B15"><label>15.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Vasilev</surname> <given-names>YA</given-names></name> <name><surname>Reshetnikov</surname> <given-names>RV</given-names></name><etal/></person-group> <article-title>Application of large language models in radiological diagnostics: a scoping review</article-title>. <source>Digit Diagn</source>. (<year>2025</year>) <volume>6</volume>(<issue>2</issue>):<fpage>268</fpage>&#x2013;<lpage>85</lpage>. <pub-id pub-id-type="doi">10.17816/DD678373</pub-id></mixed-citation></ref>
<ref id="B16"><label>16.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Norgeot</surname> <given-names>B</given-names></name> <name><surname>Quer</surname> <given-names>G</given-names></name> <name><surname>Beaulieu-Jones</surname> <given-names>BK</given-names></name> <name><surname>Torkamani</surname> <given-names>A</given-names></name> <name><surname>Dias</surname> <given-names>R</given-names></name> <name><surname>Gianfrancesco</surname> <given-names>M</given-names></name><etal/></person-group> <article-title>Minimum information about clinical artificial intelligence modeling: the MI-CLAIM checklist</article-title>. <source>Nat Med</source>. (<year>2020</year>) <volume>26</volume>(<issue>9</issue>):<fpage>1320</fpage>&#x2013;<lpage>4</lpage>. <pub-id pub-id-type="doi">10.1038/s41591-020-1041-y</pub-id><pub-id pub-id-type="pmid">32908275</pub-id></mixed-citation></ref>
<ref id="B17"><label>17.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sounderajah</surname> <given-names>V</given-names></name> <name><surname>Guni</surname> <given-names>A</given-names></name> <name><surname>Liu</surname> <given-names>X</given-names></name> <name><surname>Collins</surname> <given-names>GS</given-names></name> <name><surname>Karthikesalingam</surname> <given-names>A</given-names></name> <name><surname>Markar</surname> <given-names>SR</given-names></name><etal/></person-group> <article-title>The STARD-AI reporting guideline for diagnostic accuracy studies using artificial intelligence</article-title>. <source>Nat Med</source>. (<year>2025</year>) <volume>31</volume>(<issue>10</issue>):<fpage>3283</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1038/s41591-025-03953-8</pub-id><pub-id pub-id-type="pmid">40954311</pub-id></mixed-citation></ref>
<ref id="B18"><label>18.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shang</surname> <given-names>Z</given-names></name></person-group>. <article-title>Use of Delphi in health sciences research: a narrative review</article-title>. <source>Medicine (Baltimore)</source>. (<year>2023</year>) <volume>102</volume>(<issue>7</issue>):<fpage>e32829</fpage>. <pub-id pub-id-type="doi">10.1097/MD.0000000000032829</pub-id><pub-id pub-id-type="pmid">36800594</pub-id></mixed-citation></ref>
<ref id="B19"><label>19.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Alkalbani</surname> <given-names>AM</given-names></name> <name><surname>Alrawahi</surname> <given-names>AS</given-names></name> <name><surname>Salah</surname> <given-names>A</given-names></name> <name><surname>Haghighi</surname> <given-names>V</given-names></name> <name><surname>Zhang</surname> <given-names>Y</given-names></name> <name><surname>Alkindi</surname> <given-names>S</given-names></name><etal/></person-group> <article-title>A systematic review of large language models in medical specialties: applications, challenges and future directions</article-title>. <source>Information</source>. (<year>2025</year>) <volume>16</volume>(<issue>6</issue>):<fpage>489</fpage>. <pub-id pub-id-type="doi">10.3390/info16060489</pub-id></mixed-citation></ref>
<ref id="B20"><label>20.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jung</surname> <given-names>KH</given-names></name></person-group>. <article-title>Large language models in medicine: clinical applications, technical challenges, and ethical considerations</article-title>. <source>Healthc Inform Res</source>. (<year>2025</year>) <volume>31</volume>(<issue>2</issue>):<fpage>114</fpage>&#x2013;<lpage>24</lpage>. <pub-id pub-id-type="doi">10.4258/hir.2025.31.2.114</pub-id><pub-id pub-id-type="pmid">40384063</pub-id></mixed-citation></ref>
<ref id="B21"><label>21.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Croxford</surname> <given-names>E</given-names></name> <name><surname>Gao</surname> <given-names>Y</given-names></name> <name><surname>Pellegrino</surname> <given-names>N</given-names></name> <name><surname>Wong</surname> <given-names>K</given-names></name> <name><surname>Wills</surname> <given-names>G</given-names></name> <name><surname>First</surname> <given-names>E</given-names></name><etal/></person-group> <article-title>Development and validation of the provider documentation summarization quality instrument for large language models</article-title>. <source>J Am Med Inform Assoc</source>. (<year>2025</year>) <volume>32</volume>(<issue>6</issue>):<fpage>1050</fpage>&#x2013;<lpage>60</lpage>. <pub-id pub-id-type="doi">10.1093/jamia/ocaf068</pub-id><pub-id pub-id-type="pmid">40323321</pub-id></mixed-citation></ref>
<ref id="B22"><label>22.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Vasilev</surname> <given-names>YA</given-names></name> <name><surname>Vladzymyrskyy</surname> <given-names>AV</given-names></name> <name><surname>Omelyanskaya</surname> <given-names>OV</given-names></name> <name><surname>Alymova</surname> <given-names>YA</given-names></name> <name><surname>Akhmedzyanova</surname> <given-names>DA</given-names></name> <name><surname>Shumskaya</surname> <given-names>YF</given-names></name><etal/></person-group> <article-title>Development and validation of a questionnaire to evaluate AI-generated summaries for radiologists: ELEGANCE (expert-led evaluation of generative AI competence and ExcelleNCE)</article-title>. <source>AI</source>. (<year>2025</year>) <volume>6</volume>(<issue>11</issue>):<fpage>287</fpage>. <pub-id pub-id-type="doi">10.3390/ai6110287</pub-id></mixed-citation></ref>
<ref id="B23"><label>23.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Croxford</surname> <given-names>E</given-names></name> <name><surname>Gao</surname> <given-names>Y</given-names></name> <name><surname>First</surname> <given-names>E</given-names></name> <name><surname>Pellegrino</surname> <given-names>N</given-names></name> <name><surname>Schnier</surname> <given-names>M</given-names></name> <name><surname>Caskey</surname> <given-names>J</given-names></name><etal/></person-group> <article-title>Automating evaluation of AI text generation in healthcare with a large language model (LLM)-as-a-judge</article-title>. <source>medRxiv <italic>[Preprint]</italic></source>. (<year>2025</year>):<fpage>2025.04.22.25326219</fpage>. <pub-id pub-id-type="doi">10.1101/2025.04.22.25326219</pub-id></mixed-citation></ref>
<ref id="B24"><label>24.</label><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Belz</surname> <given-names>A</given-names></name> <name><surname>Thomson</surname> <given-names>C</given-names></name> <name><surname>Gonz&#x00E1;lez Corbelle</surname> <given-names>J</given-names></name> <name><surname>Ruelle</surname> <given-names>M</given-names></name></person-group>. <article-title>The 2025 ReproNLP shared task on reproducibility of evaluations in NLP: overview and results</article-title>. In: <person-group person-group-type="editor"><name><surname>Arviv</surname> <given-names>O</given-names></name> <name><surname>Clinciu</surname> <given-names>M</given-names></name> <name><surname>Dhole</surname> <given-names>K</given-names></name> <name><surname>Dror</surname> <given-names>R</given-names></name> <name><surname>Gehrmann</surname> <given-names>S</given-names></name> <name><surname>Habba</surname> <given-names>E</given-names></name><etal/></person-group> editors. <source>Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM<sup>2</sup>)</source>. <publisher-loc>Vienna, Austria</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name> (<year>2025</year>). p. <fpage>1002</fpage>&#x2013;<lpage>16</lpage>. <comment>Available online at:</comment> <ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2025.gem-1.78/">https://aclanthology.org/2025.gem-1.78/</ext-link> <comment>(Accessed November 27, 2025)</comment>.</mixed-citation></ref></ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by"><p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2048290/overview">Fahim Sufi</ext-link>, Monash University, Australia</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by"><p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3060106/overview">Parul Berry</ext-link>, Mayo Clinic, United States</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3319440/overview">Thomas Hartka</ext-link>, University of Virginia, United States</p></fn>
</fn-group>
</back>
</article>