<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article article-type="research-article" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Digit. Health</journal-id><journal-title-group>
<journal-title>Frontiers in Digital Health</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Digit. Health</abbrev-journal-title></journal-title-group>
<issn pub-type="epub">2673-253X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fdgth.2026.1741973</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Evaluating large language models for automated TNM staging from PET-CT reports: a multi-cancer comparative study</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes"><name><surname>Xu</surname><given-names>Wen</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="an1"><sup>&#x2020;</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/3265780/overview"/><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role></contrib>
<contrib contrib-type="author" equal-contrib="yes"><name><surname>Cao</surname><given-names>Lixiu</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="an1"><sup>&#x2020;</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/2999695/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role></contrib>
<contrib contrib-type="author"><name><surname>Shen</surname><given-names>Qijun</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/867004/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Shan</surname><given-names>Yanna</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/1060763/overview" /><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
<contrib contrib-type="author"><name><surname>Pan</surname><given-names>Shushu</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role></contrib>
<contrib contrib-type="author" corresp="yes"><name><surname>Ruan</surname><given-names>Mei</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="cor1">&#x002A;</xref><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role><role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role></contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Radiology, Hangzhou First People&#x2019;s Hospital</institution>, <city>Hangzhou</city>, <country country="CN">China</country></aff>
<aff id="aff2"><label>2</label><institution>Department of Nuclear Medicine Imaging, Tangshan People&#x2019;s Hospital</institution>, <city>Tangshan</city>, <country country="CN">China</country></aff>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label><bold>Correspondence:</bold> Mei Ruan <email xlink:href="mailto:echoruan1987@126.com">echoruan1987@126.com</email></corresp>
<fn fn-type="equal" id="an1"><label>&#x2020;</label><p>These authors share first authorship</p></fn>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-03-04"><day>04</day><month>03</month><year>2026</year></pub-date>
<pub-date publication-format="electronic" date-type="collection"><year>2026</year></pub-date>
<volume>8</volume><elocation-id>1741973</elocation-id>
<history>
<date date-type="received"><day>08</day><month>11</month><year>2025</year></date>
<date date-type="rev-recd"><day>09</day><month>01</month><year>2026</year></date>
<date date-type="accepted"><day>05</day><month>02</month><year>2026</year></date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2026 Xu, Cao, Shen, Shan, Pan and Ruan.</copyright-statement>
<copyright-year>2026</copyright-year><copyright-holder>Xu, Cao, Shen, Shan, Pan and Ruan</copyright-holder><license><ali:license_ref start_date="2026-03-04">https://creativecommons.org/licenses/by/4.0/</ali:license_ref><license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p></license>
</permissions>
<abstract><sec><title>Purpose</title>
<p>To evaluate three large language models (LLMs), including ChatGPT 5, ChatGPT 4o, and ChatGPT 3.5, in automating TNM staging from PET-CT reports across six cancer types, and to assess their clinical utility compared with junior radiologists.</p>
</sec><sec><title>Materials and methods</title>
<p>PET-CT reports from 552 treatment-naive patients in two institutions with confirmed primary malignancies (lung, breast, liver, pancreatic, renal, and prostate cancer) were analyzed. Three ChatGPT-series LLMs and five junior radiologists independently performed TNM staging. Reference standards were established by two senior radiologists according to the 8th edition of the American Joint Committee on Cancer (AJCC) staging system. Performance was evaluated using accuracy rates. Intra-model agreement was assessed by repeating each model three times per report with identical prompts, and inter-model agreement was evaluated using Cohen&#x0027;s <italic>&#x03BA;</italic> coefficients.</p>
</sec><sec><title>Results</title>
<p>ChatGPT 5 achieved the highest overall accuracy (82.1&#x0025;, 453/552), followed by ChatGPT 4o (74.3&#x0025;, 410/552), both significantly outperforming ChatGPT 3.5 (59.6&#x0025;, 329/552) and junior radiologists (77.0&#x0025;, 425/552; <italic>p</italic>&#x2009;&#x003D;&#x2009;0.041 for ChatGPT 5 vs. junior radiologists). Accuracy varied by cancer type, with the highest performance in lung cancer staging (88.5&#x0025;) and the lowest in pancreatic cancer (69.2&#x0025;). Across TNM categories, all models achieved the best performance in T staging, followed by N staging, with M staging remaining the most challenging. ChatGPT 5 showed near-perfect intra-model agreement (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.96), while inter-model agreement ranged from moderate between ChatGPT 3.5 and 4o (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.58) to substantial between ChatGPT 5 and 4o (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.78). ChatGPT 5 processed cases markedly faster than junior radiologists (8.3&#x2009;&#x00B1;&#x2009;3.2 vs. 92.5&#x2009;&#x00B1;&#x2009;21.7&#x2005;s per case; <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001).</p>
</sec><sec><title>Conclusion</title>
<p>Among the three LLMs, ChatGPT 5 demonstrated the highest accuracy, stability, and efficiency in automated TNM staging from PET-CT reports, achieving performance comparable to or slightly exceeding junior radiologists. Its advantages in T staging and lung cancer evaluation highlight its clinical utility as a potential decision-support tool.</p>
</sec>
</abstract>
<kwd-group>
<kwd>artificial intelligence</kwd>
<kwd>large language models</kwd>
<kwd>oncology</kwd>
<kwd>PET-CT</kwd>
<kwd>TNM staging</kwd>
</kwd-group><funding-group><funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This study was funded by the Medical Science and Technology Project of Zhejiang Province (No. 2025KY1041 and No. 2024KY194), and Medical Science Research Project of Hebei (No. 20250225).</funding-statement></funding-group><counts>
<fig-count count="4"/>
<table-count count="3"/><equation-count count="0"/><ref-count count="25"/><page-count count="9"/><word-count count="0"/></counts><custom-meta-group><custom-meta><meta-name>section-at-acceptance</meta-name><meta-value>Health Informatics</meta-value></custom-meta></custom-meta-group>
</article-meta>
</front>
<body><sec id="s1" sec-type="intro"><title>Introduction</title>
<p>The TNM staging system remains the cornerstone of cancer prognosis and treatment planning (<xref ref-type="bibr" rid="B1">1</xref>). Accurate staging guides therapeutic strategies, predicts outcomes, and informs multidisciplinary management. In oncology practice, staging derived from PET-CT is particularly important, as this modality combines functional and structural information into a single examination (<xref ref-type="bibr" rid="B1">1</xref>). However, translating narrative PET-CT reports into precise TNM categories is labor-intensive and subject to variability, especially among junior clinicians. Inconsistent staging can directly influence therapeutic recommendations, clinical trial eligibility, and longitudinal outcome assessment, making reproducibility an essential requirement in daily practice (<xref ref-type="bibr" rid="B2">2</xref>).</p>
<p>Large language models (LLMs) have emerged as promising tools for medical text interpretation (<xref ref-type="bibr" rid="B2">2</xref>&#x2013;<xref ref-type="bibr" rid="B4">4</xref>). The ChatGPT family in particular has attracted attention for its ability to generate coherent responses, structure radiology reports, and even pass board-style examinations (<xref ref-type="bibr" rid="B5">5</xref>, <xref ref-type="bibr" rid="B6">6</xref>). Beyond clinician-facing tasks, LLMs have also shown strong performance in answering patient care questions in oncology settings (<xref ref-type="bibr" rid="B7">7</xref>). Applications in protocol selection, multilingual report translation, and automated impression generation further illustrate the versatility of these systems within radiology (<xref ref-type="bibr" rid="B3">3</xref>, <xref ref-type="bibr" rid="B4">4</xref>). Such progress suggests that LLMs may be capable of tackling more structured and reasoning-intensive tasks, including mapping free-text PET-CT reports to standardized oncologic staging systems. Importantly, successive iterations from ChatGPT 3.5 to ChatGPT 4o and most recently ChatGPT 5 have shown marked gains in reasoning, factual consistency, and efficiency, underscoring the need to reassess performance with each new generation (<xref ref-type="bibr" rid="B8">8</xref>&#x2013;<xref ref-type="bibr" rid="B10">10</xref>).</p>
<p>Despite these advances, the role of LLMs in systematic oncologic staging remains poorly understood. Existing studies have generally addressed individual diagnostic questions or text summarization tasks, but systematic multi-organ TNM classification has not been thoroughly examined (<xref ref-type="bibr" rid="B5">5</xref>, <xref ref-type="bibr" rid="B6">6</xref>, <xref ref-type="bibr" rid="B11">11</xref>). Moreover, comparisons across different model generations are lacking, and head-to-head benchmarking against human readers remains scarce. Whether LLMs can provide reliable staging across cancers of different organ systems, and how their performance compares with that of junior radiologists, has yet to be established.</p>
<p>The aim of this study was therefore to evaluate the performance of ChatGPT 5, ChatGPT 4o, and ChatGPT 3.5 in automating TNM staging from PET-CT reports across six common cancer types. Reference standards were established by senior radiologists, and results were benchmarked against junior radiologists to assess accuracy, reproducibility, and efficiency in a clinically relevant setting.</p>
</sec>
<sec id="s2" sec-type="methods"><title>Materials and methods</title>
<sec id="s2a"><title>Patient cohort and data collection</title>
<p>This retrospective study included 552 PET-CT reports from patients with histologically confirmed primary malignancies, collected between January 2020 and December 2024 from two tertiary institutions. Cancer types comprised lung (<italic>n</italic>&#x2009;&#x003D;&#x2009;118), breast (<italic>n</italic>&#x2009;&#x003D;&#x2009;96), liver (<italic>n</italic>&#x2009;&#x003D;&#x2009;102), pancreatic (<italic>n</italic>&#x2009;&#x003D;&#x2009;72), renal (<italic>n</italic>&#x2009;&#x003D;&#x2009;76), and prostate (<italic>n</italic>&#x2009;&#x003D;&#x2009;88) cancers (<xref ref-type="fig" rid="F1">Figure&#x00A0;1</xref>).</p>
<fig id="F1" position="float"><label>Figure&#x00A0;1</label>
<caption><p>Flow diagram of patient selection and study design. PET-CT reports from 552 treatment-naive patients across six cancer types (lung, breast, liver, pancreatic, renal, and prostate) were retrospectively collected from two institutions. Reports were staged independently by three large language models (ChatGPT 5, ChatGPT 4o, and ChatGPT 3.5) and by junior radiologists, with senior radiologists providing the reference standard.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1741973-g001.tif"><alt-text content-type="machine-generated">Flowchart illustrating the study design for evaluating PET-CT reports, detailing inclusion of reports with complete tumor, lymph node, and metastasis information, exclusion criteria, cancer type categorization, model evaluation using LLMs and junior radiologists, and performance analysis by overall, cancer-specific, and stage-specific accuracy.</alt-text>
</graphic>
</fig>
<p>Inclusion criteria were: (1) treatment-naive patients; (2) availability of complete PET-CT reports with diagnostic-quality images; and (3) availability of a high-confidence reference TNM label, defined as senior radiologist&#x2013;adjudicated clinical TNM (cTNM) that was concordant with the final staging information documented in the medical record (including pathological staging when available). Reports with incomplete information or indeterminate staging were excluded.</p>
</sec>
<sec id="s2b"><title>Report format and text preprocessing</title>
<p>The PET-CT reports followed an institutionally standardized template with fixed section headers. For model input, we extracted only the &#x201C;Findings&#x201D; and &#x201C;Impression&#x201D; sections and removed administrative headers/footers and any patient identifiers. No manual rewriting, normalization of terminology, or rule-based restructuring of the clinical content was performed. The extracted text was then translated into English and reviewed to ensure that clinical meaning was preserved. The final model input consisted of the concatenated Findings and Impression text for each case.</p>
</sec>
<sec id="s2c"><title>Reference standard TNM staging</title>
<p>TNM staging was defined according to the 8th edition AJCC staging system. The reference standard was the consensus clinical TNM (cTNM) based on PET-CT, established by two senior radiologists (15 and 12 years of oncologic imaging experience) who independently reviewed the PET-CT images together with the corresponding report text. Clinical data were restricted to information necessary to apply the appropriate cancer-specific AJCC criteria and resolve staging ambiguities; no additional downstream pathologic upstaging information was used to assign TNM categories beyond what was supported by PET-CT findings. Discrepancies were resolved by consensus with a third radiologist (20 years of experience).</p>
</sec>
<sec id="s2d"><title>Large language model evaluation</title>
<p>Three LLMs were evaluated: ChatGPT 5 (version: gpt-5; August 2025), ChatGPT 4o (version: gpt-4o-2024-11-20; November 2024), and ChatGPT 3.5 (version: gpt-3.5-turbo-0125; January 2024). All analyses were conducted during a fixed study period in August 2025 to ensure consistency of model performance and to minimize potential influence of model updates.</p>
<p>A standardized prompt was applied to all models:</p><disp-quote>
<p>&#x201C;You are an expert radiologist. Based on the following PET-CT report, please determine the TNM stage according to the AJCC 8th edition criteria for [specific cancer type]. Provide separate assessments for T, N, and M with reasoning.&#x201D;</p></disp-quote>
<p>All experiments were executed via the OpenAI API. For each model, we used identical prompts and report text inputs across runs, with parameters held constant to reduce sampling variability (temperature&#x2009;&#x003D;&#x2009;0; top_<italic>p</italic>&#x2009;&#x003D;&#x2009;1.0; presence_penalty&#x2009;&#x003D;&#x2009;0; frequency_penalty&#x2009;&#x003D;&#x2009;0; max_tokens left at the API default). No fixed random seed was specified. Each report was submitted three consecutive times per model as independent API calls to quantify output stability; intra-model agreement was then calculated based on the three returned TNM outputs. For inter-model agreement, the most frequent prediction across the three runs was considered the model&#x0027;s final output. In cases where all three predictions differed, adjudication was performed by a senior radiologist.</p>
</sec>
<sec id="s2e"><title>Human reader evaluation</title>
<p>Five junior radiologists independently assigned T, N, and M categories from the PET-CT report text. All were trained in the AJCC 8th edition criteria but blinded to the reference standard and LLM results. In addition to reader-wise performance, an aggregated junior-radiologist comparator was constructed using majority voting: for each case, the T, N, and M categories were determined separately as the most frequently assigned label across the five readers. In the event of a tie, the reference standard category was used for adjudication. The aggregated TNM stage was considered correct only if all three components (T, N, and M) matched the reference standard. Reading time per case was recorded.</p>
</sec>
<sec id="s2f"><title>Statistical analysis</title>
<p>All statistical analyses were performed using Python (version 3.11.3) and SPSS (version 25.0; IBM, Chicago, IL). Model and human reader outputs were compared against the reference standard.</p>
<p>Agreement was evaluated at two levels: intra-model agreement (across three independent runs of each model) using Fleiss&#x0027; <italic>&#x03BA;</italic>, and inter-model agreement between different models using Cohen&#x0027;s <italic>&#x03BA;</italic>. Agreement with junior radiologists was also assessed using Cohen&#x0027;s <italic>&#x03BA;</italic>, with <italic>&#x03BA;</italic> values interpreted as slight (&#x2264;0.20), fair (0.21&#x2013;0.40), moderate (0.41&#x2013;0.60), substantial (0.61&#x2013;0.80), or almost perfect (&#x2265;0.81).</p>
<p>For comparisons of accuracy between LLMs and junior radiologists, the McNemar test was used. Processing times were compared using the Wilcoxon signed-rank test. Receiver operating characteristic (ROC) analysis was performed to calculate area under the curve (AUC) for overall staging accuracy, and DeLong test was applied for AUC comparisons. Subgroup analyses were performed for each cancer type. All statistical tests were two-sided, and a <italic>p</italic> value&#x2009;&#x003C;&#x2009;0.05 was considered statistically significant. Bonferroni correction was applied for multiple comparisons.</p>
</sec>
</sec>
<sec id="s3" sec-type="results"><title>Results</title>
<sec id="s3a"><title>Patient characteristics</title>
<p>A total of 552 patients with pathologically confirmed malignancies were included. The mean age was 58.4&#x2009;&#x00B1;&#x2009;13.2 years (range, 32&#x2013;82 years), with the majority (63.4&#x0025;) between 50 and 70 years of age. Demographic and staging characteristics for each cancer type are presented in <xref ref-type="table" rid="T1">Table&#x00A0;1</xref>.</p>
<table-wrap id="T1" position="float"><label>Table&#x00A0;1</label>
<caption><p>Clinical characteristics of patients by cancer type.</p></caption>
<table>
<colgroup>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Characteristic</th>
<th valign="top" align="center">Lung cancer (<italic>n</italic>&#x2009;&#x003D;&#x2009;118)</th>
<th valign="top" align="center">Breast cancer (<italic>n</italic>&#x2009;&#x003D;&#x2009;96)</th>
<th valign="top" align="center">Liver cancer (<italic>n</italic>&#x2009;&#x003D;&#x2009;102)</th>
<th valign="top" align="center">Pancreatic cancer (<italic>n</italic>&#x2009;&#x003D;&#x2009;72)</th>
<th valign="top" align="center">Kidney cancer (<italic>n</italic>&#x2009;&#x003D;&#x2009;76)</th>
<th valign="top" align="center">Prostate cancer (<italic>n</italic>&#x2009;&#x003D;&#x2009;88)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" style="background-color:#d9d9d9" colspan="7">Age (y)</td>
</tr>
<tr>
<td valign="top" align="left">Mean&#x2009;&#x00B1;&#x2009;SD</td>
<td valign="top" align="center">62.3&#x2009;&#x00B1;&#x2009;9.8</td>
<td valign="top" align="center">54.7&#x2009;&#x00B1;&#x2009;11.2</td>
<td valign="top" align="center">58.9&#x2009;&#x00B1;&#x2009;10.5</td>
<td valign="top" align="center">63.5&#x2009;&#x00B1;&#x2009;8.9</td>
<td valign="top" align="center">59.2&#x2009;&#x00B1;&#x2009;10.8</td>
<td valign="top" align="center">65.8&#x2009;&#x00B1;&#x2009;8.2</td>
</tr>
<tr>
<td valign="top" align="left">Range</td>
<td valign="top" align="center">42&#x2013;78</td>
<td valign="top" align="center">32&#x2013;76</td>
<td valign="top" align="center">38&#x2013;75</td>
<td valign="top" align="center">45&#x2013;79</td>
<td valign="top" align="center">35&#x2013;77</td>
<td valign="top" align="center">48&#x2013;82</td>
</tr>
<tr>
<td valign="top" align="left" style="background-color:#d9d9d9" colspan="7">Age group</td>
</tr>
<tr>
<td valign="top" align="left">&#x003C;50</td>
<td valign="top" align="center">16 (13.6)</td>
<td valign="top" align="center">24 (25.0)</td>
<td valign="top" align="center">18 (17.6)</td>
<td valign="top" align="center">8 (11.1)</td>
<td valign="top" align="center">14 (18.4)</td>
<td valign="top" align="center">6 (6.8)</td>
</tr>
<tr>
<td valign="top" align="left">50&#x2013;70</td>
<td valign="top" align="center">78 (66.1)</td>
<td valign="top" align="center">58 (60.4)</td>
<td valign="top" align="center">64 (62.7)</td>
<td valign="top" align="center">48 (66.7)</td>
<td valign="top" align="center">48 (63.2)</td>
<td valign="top" align="center">54 (61.4)</td>
</tr>
<tr>
<td valign="top" align="left">&#x003E;70</td>
<td valign="top" align="center">24 (20.3)</td>
<td valign="top" align="center">14 (14.6)</td>
<td valign="top" align="center">20 (19.6)</td>
<td valign="top" align="center">16 (22.2)</td>
<td valign="top" align="center">14 (18.4)</td>
<td valign="top" align="center">28 (31.8)</td>
</tr>
<tr>
<td valign="top" align="left" style="background-color:#d9d9d9" colspan="7">T stage</td>
</tr>
<tr>
<td valign="top" align="left">T1</td>
<td valign="top" align="center">24 (20.3)</td>
<td valign="top" align="center">20 (20.8)</td>
<td valign="top" align="center">22 (21.6)</td>
<td valign="top" align="center">10 (13.9)</td>
<td valign="top" align="center">18 (23.7)</td>
<td valign="top" align="center">16 (18.2)</td>
</tr>
<tr>
<td valign="top" align="left">T2</td>
<td valign="top" align="center">52 (44.1)</td>
<td valign="top" align="center">44 (45.8)</td>
<td valign="top" align="center">42 (41.2)</td>
<td valign="top" align="center">24 (33.3)</td>
<td valign="top" align="center">30 (39.5)</td>
<td valign="top" align="center">30 (34.1)</td>
</tr>
<tr>
<td valign="top" align="left">T3</td>
<td valign="top" align="center">28 (23.7)</td>
<td valign="top" align="center">22 (22.9)</td>
<td valign="top" align="center">26 (25.5)</td>
<td valign="top" align="center">30 (41.7)</td>
<td valign="top" align="center">18 (23.7)</td>
<td valign="top" align="center">28 (31.8)</td>
</tr>
<tr>
<td valign="top" align="left">T4</td>
<td valign="top" align="center">14 (11.9)</td>
<td valign="top" align="center">10 (10.4)</td>
<td valign="top" align="center">12 (11.8)</td>
<td valign="top" align="center">8 (11.1)</td>
<td valign="top" align="center">10 (13.2)</td>
<td valign="top" align="center">14 (15.9)</td>
</tr>
<tr>
<td valign="top" align="left" style="background-color:#d9d9d9" colspan="7">N stage</td>
</tr>
<tr>
<td valign="top" align="left">N0</td>
<td valign="top" align="center">30 (25.4)</td>
<td valign="top" align="center">26 (27.1)</td>
<td valign="top" align="center">66 (64.7)</td>
<td valign="top" align="center">16 (22.2)</td>
<td valign="top" align="center">52 (68.4)</td>
<td valign="top" align="center">56 (63.6)</td>
</tr>
<tr>
<td valign="top" align="left">N1</td>
<td valign="top" align="center">46 (39.0)</td>
<td valign="top" align="center">42 (43.8)</td>
<td valign="top" align="center">24 (23.5)</td>
<td valign="top" align="center">38 (52.8)</td>
<td valign="top" align="center">16 (21.1)</td>
<td valign="top" align="center">22 (25.0)</td>
</tr>
<tr>
<td valign="top" align="left">N2</td>
<td valign="top" align="center">28 (23.7)</td>
<td valign="top" align="center">18 (18.8)</td>
<td valign="top" align="center">8 (7.8)</td>
<td valign="top" align="center">12 (16.7)</td>
<td valign="top" align="center">6 (7.9)</td>
<td valign="top" align="center">8 (9.1)</td>
</tr>
<tr>
<td valign="top" align="left">N3</td>
<td valign="top" align="center">14 (11.9)</td>
<td valign="top" align="center">10 (10.4)</td>
<td valign="top" align="center">4 (3.9)</td>
<td valign="top" align="center">6 (8.3)</td>
<td valign="top" align="center">2 (2.6)</td>
<td valign="top" align="center">2 (2.3)</td>
</tr>
<tr>
<td valign="top" align="left" style="background-color:#d9d9d9" colspan="7">M stage</td>
</tr>
<tr>
<td valign="top" align="left">M0</td>
<td valign="top" align="center">84 (71.2)</td>
<td valign="top" align="center">74 (77.1)</td>
<td valign="top" align="center">84 (82.4)</td>
<td valign="top" align="center">44 (61.1)</td>
<td valign="top" align="center">62 (81.6)</td>
<td valign="top" align="center">72 (81.8)</td>
</tr>
<tr>
<td valign="top" align="left">M1</td>
<td valign="top" align="center">34 (28.8)</td>
<td valign="top" align="center">22 (22.9)</td>
<td valign="top" align="center">18 (17.6)</td>
<td valign="top" align="center">28 (38.9)</td>
<td valign="top" align="center">14 (18.4)</td>
<td valign="top" align="center">16 (18.2)</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="TF1"><p>Data are numbers of findings, with percentages in parentheses.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>For lung cancer (<italic>n</italic>&#x2009;&#x003D;&#x2009;118), most patients were staged as T2 (0.44), N1 (0.39), and M0 (0.71). For breast cancer (<italic>n</italic>&#x2009;&#x003D;&#x2009;96), T2 (0.46), N1 (0.44), and M0 (0.77) were predominant. In hepatocellular carcinoma (<italic>n</italic>&#x2009;&#x003D;&#x2009;102), T2 (0.41) was most frequent, with fewer nodal and distant metastases (N0: 0.65; M0: 0.82). Pancreatic cancer (<italic>n</italic>&#x2009;&#x003D;&#x2009;72) cases typically presented at more advanced stages, with T3 (0.42), N1 (0.53), and a relatively high proportion of distant metastases (M1: 0.39). For renal cell carcinoma (<italic>n</italic>&#x2009;&#x003D;&#x2009;76), early disease was common, with T1&#x2013;T2 (0.63) and low rates of nodal (N0: 0.68) and distant metastases (M0: 0.82). Prostate cancer (<italic>n</italic>&#x2009;&#x003D;&#x2009;88) demonstrated a more balanced distribution, with T2 (0.34) and T3 (0.32) most frequently observed.</p>
</sec>
<sec id="s3b"><title>Overall performance of LLMs and junior radiologists</title>
<p>ChatGPT 5 achieved the highest overall accuracy (0.82; 95&#x0025; CI: 0.79&#x2013;0.85), followed by ChatGPT 4o (0.74; 95&#x0025; CI: 0.71&#x2013;0.78) and ChatGPT 3.5 (0.60; 95&#x0025; CI: 0.56&#x2013;0.64). Junior radiologists obtained an accuracy of 0.77 (95&#x0025; CI: 0.73&#x2013;0.80). ChatGPT 5 significantly outperformed both ChatGPT 3.5 (<italic>p</italic>&#x2009;&#x003C;&#x2009;0.001) and junior radiologists (<italic>p</italic>&#x2009;&#x003D;&#x2009;0.041), whereas ChatGPT 4o performed better than ChatGPT 3.5 (<italic>p</italic>&#x2009;&#x003C;&#x2009;0.001) but not junior radiologists (<italic>p</italic>&#x2009;&#x003D;&#x2009;0.12) (<xref ref-type="table" rid="T2">Table&#x00A0;2</xref>).</p>
<table-wrap id="T2" position="float"><label>Table&#x00A0;2</label>
<caption><p>Overall performance of large language models and junior radiologists.</p></caption>
<table>
<colgroup>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Model/evaluator</th>
<th valign="top" align="center">Overall accuracy (95&#x0025; CI)</th>
<th valign="top" align="center">Processing time (s)</th>
<th valign="top" align="center">T Staging accuracy</th>
<th valign="top" align="center">N Staging accuracy</th>
<th valign="top" align="center">M Staging accuracy</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">ChatGPT 5</td>
<td valign="top" align="center">0.82 (0.79&#x2013;0.85)</td>
<td valign="top" align="center">8.3&#x2009;&#x00B1;&#x2009;3.2</td>
<td valign="top" align="center">0.84 (465/552)</td>
<td valign="top" align="center">0.79 (436/552)</td>
<td valign="top" align="center">0.76 (420/552)</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT 4o</td>
<td valign="top" align="center">0.74 (0.71&#x2013;0.78)</td>
<td valign="top" align="center">7.6&#x2009;&#x00B1;&#x2009;2.9</td>
<td valign="top" align="center">0.77 (425/552)</td>
<td valign="top" align="center">0.70 (386/552)</td>
<td valign="top" align="center">0.67 (370/552)</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT 3.5</td>
<td valign="top" align="center">0.60 (0.56&#x2013;0.64)</td>
<td valign="top" align="center">12.8&#x2009;&#x00B1;&#x2009;5.1</td>
<td valign="top" align="center">0.63 (348/552)</td>
<td valign="top" align="center">0.56 (309/552)</td>
<td valign="top" align="center">0.53 (292/552)</td>
</tr>
<tr>
<td valign="top" align="left">Junior Radiologists</td>
<td valign="top" align="center">0.77 (0.73&#x2013;0.80)</td>
<td valign="top" align="center">92.5&#x2009;&#x00B1;&#x2009;21.7</td>
<td valign="top" align="center">0.78 (430/552)</td>
<td valign="top" align="center">0.72 (397/552)</td>
<td valign="top" align="center">0.68 (375/552)</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Processing time per report was shortest for ChatGPT 5 (8.3&#x2009;&#x00B1;&#x2009;3.2&#x2005;s) and ChatGPT 4o (7.6&#x2009;&#x00B1;&#x2009;2.9&#x2005;s), both markedly faster than junior radiologists (92.5&#x2009;&#x00B1;&#x2009;21.7&#x2005;s; <italic>p</italic>&#x2009;&#x003C;&#x2009;0.001). ChatGPT 3.5 required 12.8&#x2009;&#x00B1;&#x2009;5.1&#x2005;s per case, still significantly faster than humans (<italic>p</italic>&#x2009;&#x003C;&#x2009;0.001). Reader-wise performance of the five junior radiologists and the majority-vote aggregate are summarized in <xref ref-type="sec" rid="s12">Supplementary Table S1</xref>.</p>
</sec>
<sec id="s3c"><title>Component-level performance</title>
<p>Staging accuracy differed by component (<xref ref-type="fig" rid="F2">Figure&#x00A0;2</xref>). T staging showed the highest accuracy across models: ChatGPT 5 0.84 (95&#x0025; CI: 0.81&#x2013;0.87), ChatGPT 4o 0.77 (95&#x0025; CI: 0.73&#x2013;0.80), and ChatGPT 3.5 0.63 (95&#x0025; CI: 0.59&#x2013;0.67). N staging ranked second: ChatGPT 5 0.79 (95&#x0025; CI: 0.75&#x2013;0.82), ChatGPT 4o 0.70 (95&#x0025; CI: 0.66&#x2013;0.74), and ChatGPT 3.5 0.56 (95&#x0025; CI: 0.52&#x2013;0.60). M staging was lowest: ChatGPT 5 0.76 (95&#x0025; CI: 0.72&#x2013;0.79), ChatGPT 4o 0.67 (95&#x0025; CI: 0.63&#x2013;0.71), and ChatGPT 3.5 0.53 (95&#x0025; CI: 0.49&#x2013;0.57). To clarify the sources of misclassification, we quantified component-level error rates for ChatGPT 5 stratified by cancer type (<xref ref-type="sec" rid="s12">Supplementary Table S2</xref>). Across cancer types, errors were more frequently attributable to N- and M-component misclassification than to T-component misclassification.</p>
<fig id="F2" position="float"><label>Figure&#x00A0;2</label>
<caption><p>Comparison of staging accuracy by TNM component <bold>(T, N, M)</bold> across three large language models and junior radiologists. ChatGPT 5 consistently achieved the highest accuracy, with T staging showing the best performance across all models. Error bars indicate 95&#x0025; confidence intervals. Asterisks denote statistically significant differences between models.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1741973-g002.tif"><alt-text content-type="machine-generated">Bar chart comparing the accuracy of ChatGPT 5, ChatGPT 4o, ChatGPT 3.5, and junior radiologists for T staging, N staging, and M staging. ChatGPT 5 consistently achieves the highest accuracy across all tasks, followed by ChatGPT 4o, junior radiologists, and ChatGPT 3.5, with annotated p-values indicating statistical significance between groups.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3d"><title>Stage-level performance</title>
<p>Heatmap analyses further illustrated variation in staging accuracy across subcategories (<xref ref-type="fig" rid="F3">Figure&#x00A0;3</xref>). For T staging, accuracy was highest in early disease (T1&#x2013;T2) and gradually decreased in advanced stages (T3&#x2013;T4) across all models. In N staging, recognition of N0 disease was consistently better than higher nodal burden (N2&#x2013;N3). For M staging, classification of M0 was more reliable than M1, with misclassifications clustering around borderline cases of suspected distant metastases. These trends were consistent across models and aligned with overall component-level accuracy patterns.</p>
<fig id="F3" position="float"><label>Figure&#x00A0;3</label>
<caption><p>Heatmaps showing classification accuracy by stage subgroup across TNM components and models. Staging accuracy is displayed for T (T1&#x2013;T4), N (N0&#x2013;N3), and M (M0&#x2013;M1) categories. Accuracy declined with increasing stage complexity, particularly for nodal and metastatic disease. ChatGPT 5 and ChatGPT 4o demonstrated higher consistency compared with ChatGPT 3.5 and junior radiologists.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1741973-g003.tif"><alt-text content-type="machine-generated">Heatmap comparing T, N, and M stage accuracy for ChatGPT 5, ChatGPT 4o, ChatGPT 3.5, and Junior Radiologists. ChatGPT 5 consistently shows the highest accuracy across all sub-stages and categories, while ChatGPT 3.5 shows the lowest. Color gradients indicate accuracy levels, ranging from blue (lower accuracy) to red (higher accuracy), with a legend bar on the right.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3e"><title>Cancer-specific performance</title>
<p>Accuracy varied by cancer type (<xref ref-type="fig" rid="F4">Figure&#x00A0;4</xref>, <xref ref-type="table" rid="T3">Table&#x00A0;3</xref>). Lung and breast cancers demonstrated the highest performance, with ChatGPT 5 exceeding 0.85 in both, significantly higher than ChatGPT 3.5 (<italic>p</italic>&#x2009;&#x003C;&#x2009;0.01). Liver cancer showed slightly lower performance, with ChatGPT 5 at 0.79 (95&#x0025; CI: 0.73&#x2013;0.84) and ChatGPT 4o at 0.77 (95&#x0025; CI: 0.71&#x2013;0.82), both outperforming ChatGPT 3.5 (0.59; 95&#x0025; CI: 0.52&#x2013;0.65). Pancreatic cancer was the most challenging, with ChatGPT 5 achieving 0.74 (95&#x0025; CI: 0.66&#x2013;0.81) and ChatGPT 3.5 dropping below 0.50 (95&#x0025; CI: 0.41&#x2013;0.54) (<italic>p</italic>&#x2009;&#x003C;&#x2009;0.001). Renal and prostate cancers showed intermediate results, with ChatGPT 5 consistently outperforming ChatGPT 4o by 0.06&#x2013;0.08, and both models substantially surpassing ChatGPT 3.5 (all <italic>p</italic>&#x2009;&#x003C;&#x2009;0.05). Representative correct and misclassified cases with sentence-level evidence citation and component-level error attribution are provided in <xref ref-type="sec" rid="s12">Supplementary Material I</xref>.</p>
<fig id="F4" position="float"><label>Figure&#x00A0;4</label>
<caption><p>Accuracy of TNM staging by cancer type across large language models and junior radiologists. ChatGPT 5 achieved the highest accuracy in all six malignancies, with lung and breast cancers showing the best performance and pancreatic cancer remaining the most challenging. Statistical significance markers indicate pairwise differences between models.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-08-1741973-g004.tif"><alt-text content-type="machine-generated">Bar chart comparing accuracy for six cancer types among ChatGPT 5, ChatGPT 4o, ChatGPT 3.5, and junior radiologists. ChatGPT 5 consistently achieves the highest accuracy, followed by ChatGPT 4o, junior radiologists, and ChatGPT 3.5, with p-values indicating statistical significance for multiple comparisons across all cancer types.</alt-text>
</graphic>
</fig>
<table-wrap id="T3" position="float"><label>Table&#x00A0;3</label>
<caption><p>Intra-model and inter-model agreement analysis.</p></caption>
<table>
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Agreement type</th>
<th valign="top" align="center">Models/comparison</th>
<th valign="top" align="center">Kappa (95&#x0025; CI)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" rowspan="4">Intra-model Agreement</td>
<td valign="top" align="left">ChatGPT 5</td>
<td valign="top" align="center">0.96 (0.93&#x2013;0.98)</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT 4o</td>
<td valign="top" align="center">0.89 (0.85&#x2013;0.92)</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT 3.5</td>
<td valign="top" align="center">0.77 (0.71&#x2013;0.82)</td>
</tr>
<tr>
<td valign="top" align="left">Junior Radiologists</td>
<td valign="top" align="center">0.81 (0.77&#x2013;0.85)</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="3">Inter-model Agreement</td>
<td valign="top" align="left">ChatGPT 5 vs. ChatGPT 4o</td>
<td valign="top" align="center">0.78 (0.73&#x2013;0.83)</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT 5 vs. ChatGPT 3.5</td>
<td valign="top" align="center">0.61 (0.56&#x2013;0.66)</td>
</tr>
<tr>
<td valign="top" align="left">ChatGPT 4o vs. ChatGPT 3.5</td>
<td valign="top" align="center">0.58 (0.52&#x2013;0.63)</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="TF2"><p>Data are <italic>&#x03BA;</italic> values, with 95&#x0025; confidence intervals. Intra-LLM agreement was measured using Fleiss <italic>&#x03BA;</italic>; inter-LLM agreement was measured using Cohen <italic>&#x03BA;</italic>. Agreement was categorized as slight (&#x2264;0.20), fair (0.21&#x2013;0.40), moderate (0.41&#x2013;0.60), substantial (0.61&#x2013;0.80), or almost perfect (0.81&#x2013;1.00).</p></fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3f"><title>Agreement analysis</title>
<p>Intra-model agreement was almost perfect for ChatGPT 5 (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.96; 95&#x0025; CI: 0.93&#x2013;0.98) and ChatGPT 4o (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.89; 95&#x0025; CI: 0.85&#x2013;0.92), and substantial for ChatGPT 3.5 (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.77; 95&#x0025; CI: 0.71&#x2013;0.82). Inter-model agreement was substantial for ChatGPT 5 vs. ChatGPT 4o (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.78; 95&#x0025; CI: 0.73&#x2013;0.83) and ChatGPT 5 vs. ChatGPT 3.5 (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.61; 95&#x0025; CI: 0.56&#x2013;0.66), and moderate for ChatGPT 4o vs. ChatGPT 3.5 (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.58; 95&#x0025; CI: 0.52&#x2013;0.63). Intra-observer agreement among junior radiologists was also almost perfect (<italic>&#x03BA;</italic>&#x2009;&#x003D;&#x2009;0.81; 95&#x0025; CI: 0.77&#x2013;0.85), although numerically lower than that of ChatGPT 5 and ChatGPT 4o.</p>
</sec>
</sec>
<sec id="s4" sec-type="discussion"><title>Discussion</title>
<p>This study assessed the ability of three large language models (ChatGPT 5, ChatGPT 4o, and ChatGPT 3.5) to automate TNM staging from PET-CT reports across six common malignancies. By directly comparing different generations of models with junior radiologists, we provide evidence on the evolving role of LLMs in radiologic oncology. The results show that ChatGPT 5 achieved the highest overall accuracy, consistently surpassing ChatGPT 4o and ChatGPT 3.5, and outperforming junior radiologists in both staging accuracy and intra-observer consistency. These findings suggest that the newest generation of models may be approaching the reliability needed for clinical applications where structured interpretation of free-text reports is required.</p>
<p>Earlier studies of ChatGPT in radiology have mostly emphasized report structuring, protocol selection, or answering board-style questions (<xref ref-type="bibr" rid="B12">12</xref>&#x2013;<xref ref-type="bibr" rid="B14">14</xref>). Few have examined its performance on systematic staging, a task that requires not only recognition of keywords but also integration of scattered observations into codified staging rules. Our results indicate that newer model generations can more reliably perform report-to-stage mapping under AJCC criteria at the case level. The contrast between ChatGPT 5 and its predecessors underscores the rapid progress in LLM capabilities, with accuracy gains evident across nearly all cancer types and staging components. In this regard, the current findings extend previous work by showing that LLMs are not limited to impression generation but can also support more logic-driven clinical tasks (<xref ref-type="bibr" rid="B15">15</xref>, <xref ref-type="bibr" rid="B16">16</xref>).</p>
<p>Differences across staging components provide useful insight into where LLMs succeed and where they struggle. T staging achieved the highest accuracy, reflecting the relatively standardized language used to describe tumor size and local invasion (<xref ref-type="bibr" rid="B17">17</xref>). Report phrases such as &#x201C;mass measuring 3.5&#x2005;cm&#x201D; or &#x201C;extension into chest wall&#x201D; are clear and map directly to staging thresholds, allowing even earlier models to perform reasonably well. In contrast, N staging posed greater challenges. Lymph node involvement is often described with variability and uncertainty, such as &#x201C;borderline enlarged&#x201D; or &#x201C;suspicious for metastasis,&#x201D; making consistent interpretation more difficult (<xref ref-type="bibr" rid="B18">18</xref>). This explains the lower accuracy across all models and highlights an area where automated systems may need additional refinement. M staging showed mixed results. Negative statements about distant disease were generally interpreted correctly, but scattered positive findings, particularly when presented with cautionary wording, were frequently misclassified (<xref ref-type="bibr" rid="B19">19</xref>). This mirrors the difficulties faced by less experienced radiologists, emphasizing that some challenges are inherent to the task itself rather than to the model alone.</p>
<p>Performance also varied by cancer type. Lung and breast cancers achieved the highest accuracy, likely reflecting the fact that their staging criteria are well standardized and frequently encountered in radiology reports used to train LLMs (<xref ref-type="bibr" rid="B20">20</xref>). Pancreatic cancer, by contrast, proved the most difficult. This is consistent with clinical experience, as pancreatic tumors often involve subtle findings such as vascular encasement or peritoneal spread that may not be consistently described in text (<xref ref-type="bibr" rid="B21">21</xref>). Intermediate results were seen for liver, kidney, and prostate cancers, again suggesting that the clarity and consistency of reporting language strongly influence model performance. These differences highlight the dual dependency of automated staging on both model reasoning and the structure of the source reports.</p>
<p>Reliability is as important as accuracy in any tool intended for clinical decision support. Intra-model agreement was strongest for ChatGPT 5, approaching the threshold of &#x201C;almost perfect&#x201D; agreement, while ChatGPT 4o and ChatGPT 3.5 showed progressively lower consistency (<xref ref-type="bibr" rid="B22">22</xref>). The stability of outputs across repeated runs is reassuring, given concerns about stochasticity in generative models (<xref ref-type="bibr" rid="B23">23</xref>). Interestingly, junior radiologists demonstrated lower repeatability than ChatGPT 5, underscoring that human interpretation is not free from variability even in structured tasks (<xref ref-type="bibr" rid="B24">24</xref>). These findings suggest that advanced LLMs may contribute not only speed but also consistency, potentially reducing inter-reader variability that has long challenged oncologic staging (<xref ref-type="bibr" rid="B25">25</xref>).</p>
<p>The clinical consequences of TNM mis-staging warrant careful consideration before deployment. Errors in N or M classification may carry disproportionate downstream impact, including inappropriate treatment intensity, incorrect eligibility assessment for clinical trials, and flawed longitudinal comparisons. These risks may be amplified by ambiguity or incomplete specification in narrative reports, which can complicate consistent mapping to AJCC staging rules. Accordingly, outputs from LLMs should be treated as decision-support suggestions rather than definitive staging. Practical guardrails can mitigate these risks by limiting use to human-verified decision support, requiring sentence-level evidence citation for each TNM component, automatically flagging uncertain or internally discordant outputs for senior review, and preserving an audit trail for quality assurance and governance.</p>
<p>Automated staging could provide rapid preliminary classifications, assisting junior radiologists in daily practice and accelerating preparation for multidisciplinary tumor boards (<xref ref-type="bibr" rid="B26">26</xref>). The substantial time savings observed with ChatGPT models compared with human readers further support their potential role in workflow optimization (<xref ref-type="bibr" rid="B27">27</xref>). In a practical workflow, the model could be deployed after report finalization to pre-populate structured TNM fields with sentence-level evidence from the report for rapid human verification before tumor board submission. This human-in-the-loop design preserves clinical accountability while leveraging the observed time savings and improving staging documentation consistency. More broadly, the ability to extract structured staging data from large numbers of free-text reports may facilitate clinical research, registry development, and quality monitoring (<xref ref-type="bibr" rid="B28">28</xref>). Importantly, these models are best conceived as augmentative rather than replacement tools. Their outputs can serve as staging suggestions that require expert confirmation, fitting naturally into the paradigm of augmented intelligence in radiology (<xref ref-type="bibr" rid="B29">29</xref>).</p>
<p>Our study has several limitations. The sample size, while spanning six malignancies, still provided limited representation of advanced stages such as T4 or N3 disease, which may restrict generalizability for these categories. The retrospective design relied on existing reports, and subtle features influencing staging could have been omitted or inconsistently described. All data were in English and derived from two institutions, which limits applicability to multilingual and more diverse clinical environments. Furthermore, we evaluated only one LLM family (ChatGPT) and did not benchmark against other competitive general-purpose or medical-oriented models. As a result, the absolute performance reported here may not directly translate to models with different training corpora, safety constraints, or inference settings, and our conclusions should be interpreted primarily as comparative evidence across ChatGPT generations. The models were tested through API calls without domain-specific fine-tuning, mirroring real-world use but leaving open the possibility of improved performance with adaptation. Finally, staging was assessed in isolation, whereas in clinical workflows decisions are integrated with pathology, laboratory, and clinical data. Future prospective evaluations in routine practice will be necessary before clinical deployment.</p>
<p>In conclusion, this study demonstrates that LLMs, and particularly ChatGPT 5, can achieve reliable performance in TNM staging from PET-CT reports across multiple cancer types. Accuracy was highest for T staging and lowest for nodal assessment, with clear differences across malignancies. Compared with junior radiologists, the newest model showed both higher accuracy and greater consistency, while processing cases substantially faster. Although limitations remain, these results suggest that LLMs are approaching a level of performance that could make them valuable as adjunctive tools in oncologic imaging, capable of enhancing efficiency, standardization, and reproducibility in cancer staging.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability"><title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="sec" rid="s12">Supplementary Material</xref>, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec id="s7" sec-type="ethics-statement"><title>Ethics statement</title>
<p>Ethical approval for this study was obtained from the medical ethics committee of the Hangzhou First People&#x0027;s Hospital. The studies were conducted in accordance with the local legislation and institutional requirements. Written informed consent for participation was not required from the participants or the participants&#x0027; legal guardians/next of kin in accordance with the national legislation and institutional requirements.</p>
</sec>
<sec id="s8" sec-type="author-contributions"><title>Author contributions</title>
<p>WX: Conceptualization, Methodology, Data curation, Writing &#x2013; original draft. LC: Conceptualization, Methodology, Data curation, Writing &#x2013; original draft. QS: Writing &#x2013; review &#x0026; editing. YS: Formal analysis, Writing &#x2013; review &#x0026; editing. SP: Data curation, Writing &#x2013; original draft. MR: Supervision, Conceptualization, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec id="s10" sec-type="COI-statement"><title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s11" sec-type="ai-statement"><title>Generative AI statement</title>
<p>The author(s) declared that generative AI was used in the creation of this manuscript. During the preparation of this work the authors used ChatGPT 5 (OpenAI, <ext-link ext-link-type="uri" xlink:href="https://chat.openai.com/">https://chat.openai.com/</ext-link>) as a supplementary tool to enhance the linguistic quality and readability of our work. After using this tool, the authors reviewed and edited the content as needed and take full responsibility for the content of the published article.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec id="s13" sec-type="disclaimer"><title>Publisher&#x0027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s12" sec-type="supplementary-material"><title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fdgth.2026.1741973/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fdgth.2026.1741973/full&#x0023;supplementary-material</ext-link></p>
<supplementary-material xlink:href="Table1.docx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document"/>
</sec>
<ref-list><title>References</title>
<ref id="B1"><label>1.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Amin</surname> <given-names>MB</given-names></name> <name><surname>Greene</surname> <given-names>FL</given-names></name> <name><surname>Edge</surname> <given-names>SB</given-names></name> <name><surname>Compton</surname> <given-names>CC</given-names></name> <name><surname>Gershenwald</surname> <given-names>JE</given-names></name> <name><surname>Brookland</surname> <given-names>RK</given-names></name><etal/></person-group> <article-title>The eighth edition AJCC cancer staging manual: continuing to build a bridge from a population-based to a more &#x201C;personalized&#x201D; approach to cancer staging</article-title>. <source>CA Cancer J Clin</source>. (<year>2017</year>) <volume>67</volume>(<issue>2</issue>):<fpage>93</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.3322/caac.21388</pub-id><pub-id pub-id-type="pmid">28094848</pub-id></mixed-citation></ref>
<ref id="B2"><label>2.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kim</surname> <given-names>S</given-names></name> <name><surname>Lee</surname> <given-names>CK</given-names></name> <name><surname>Kim</surname> <given-names>SS</given-names></name></person-group>. <article-title>Large language models: a guide for radiologists</article-title>. <source>Korean J Radiol</source>. (<year>2024</year>) <volume>25</volume>(<issue>2</issue>):<fpage>126</fpage>&#x2013;<lpage>33</lpage>. <pub-id pub-id-type="doi">10.3348/kjr.2023.0997</pub-id><pub-id pub-id-type="pmid">38288895</pub-id></mixed-citation></ref>
<ref id="B3"><label>3.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Adams</surname> <given-names>LC</given-names></name> <name><surname>Truhn</surname> <given-names>D</given-names></name> <name><surname>Busch</surname> <given-names>F</given-names></name> <name><surname>Kader</surname> <given-names>A</given-names></name> <name><surname>Niehues</surname> <given-names>SM</given-names></name> <name><surname>Makowski</surname> <given-names>MR</given-names></name><etal/></person-group> <article-title>Leveraging GPT-4 for <italic>post hoc</italic> transformation of free-text radiology reports into structured reporting: a multilingual feasibility study</article-title>. <source>Radiology</source>. (<year>2023</year>) <volume>307</volume>(<issue>4</issue>):<fpage>e230725</fpage>. <pub-id pub-id-type="doi">10.1148/radiol.230725</pub-id><pub-id pub-id-type="pmid">37014240</pub-id></mixed-citation></ref>
<ref id="B4"><label>4.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sun</surname> <given-names>Z</given-names></name> <name><surname>Ong</surname> <given-names>H</given-names></name> <name><surname>Kennedy</surname> <given-names>P</given-names></name> <name><surname>Tang</surname> <given-names>L</given-names></name> <name><surname>Chen</surname> <given-names>S</given-names></name> <name><surname>Elias</surname> <given-names>J</given-names></name><etal/></person-group> <article-title>Evaluating GPT-4 on impressions generation in radiology reports</article-title>. <source>Radiology</source>. (<year>2023</year>) <volume>307</volume>(<issue>5</issue>):<fpage>e231259</fpage>. <pub-id pub-id-type="doi">10.1148/radiol.231259</pub-id><pub-id pub-id-type="pmid">37367439</pub-id></mixed-citation></ref>
<ref id="B5"><label>5.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cozzi</surname> <given-names>A</given-names></name> <name><surname>Pinker</surname> <given-names>K</given-names></name> <name><surname>Hidber</surname> <given-names>A</given-names></name> <name><surname>Zhang</surname> <given-names>T</given-names></name> <name><surname>Bonomo</surname> <given-names>L</given-names></name> <name><surname>Lo Gullo</surname> <given-names>R</given-names></name><etal/></person-group> <article-title>BI-RADS category assignments by GPT-3.5, GPT-4, and google bard: a multilanguage study</article-title>. <source>Radiology</source>. (<year>2024</year>) <volume>311</volume>(<issue>1</issue>):<fpage>e232133</fpage>. <pub-id pub-id-type="doi">10.1148/radiol.232133</pub-id><pub-id pub-id-type="pmid">38687216</pub-id></mixed-citation></ref>
<ref id="B6"><label>6.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bhayana</surname> <given-names>R</given-names></name> <name><surname>Krishna</surname> <given-names>S</given-names></name> <name><surname>Bleakney</surname> <given-names>RR</given-names></name></person-group>. <article-title>Performance of ChatGPT on a radiology board-style examination: insights into current strengths and limitations</article-title>. <source>Radiology</source>. (<year>2023</year>) <volume>307</volume>(<issue>5</issue>):<fpage>e230582</fpage>. <pub-id pub-id-type="doi">10.1148/radiol.230582</pub-id><pub-id pub-id-type="pmid">37191485</pub-id></mixed-citation></ref>
<ref id="B7"><label>7.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yalamanchili</surname> <given-names>A</given-names></name> <name><surname>Sengupta</surname> <given-names>B</given-names></name> <name><surname>Song</surname> <given-names>J</given-names></name> <name><surname>Lim</surname> <given-names>S</given-names></name> <name><surname>Thomas</surname> <given-names>TO</given-names></name> <name><surname>Mittal</surname> <given-names>BB</given-names></name><etal/></person-group> <article-title>Quality of large language model responses to radiation oncology patient care questions</article-title>. <source>JAMA Netw Open</source>. (<year>2024</year>) <volume>7</volume>(<issue>4</issue>):<fpage>e244630</fpage>. <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.4630</pub-id><pub-id pub-id-type="pmid">38564215</pub-id></mixed-citation></ref>
<ref id="B8"><label>8.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Clusmann</surname> <given-names>J</given-names></name> <name><surname>Kolbinger</surname> <given-names>FR</given-names></name> <name><surname>Muti</surname> <given-names>HS</given-names></name> <name><surname>Carrero</surname> <given-names>ZI</given-names></name> <name><surname>Eckardt</surname> <given-names>JN</given-names></name> <name><surname>Laleh</surname> <given-names>NG</given-names></name><etal/></person-group> <article-title>The future landscape of large language models in medicine</article-title>. <source>Commun Med</source>. (<year>2023</year>) <volume>3</volume>:<fpage>141</fpage>. <pub-id pub-id-type="doi">10.1038/s43856-023-00370-1</pub-id><pub-id pub-id-type="pmid">37816837</pub-id></mixed-citation></ref>
<ref id="B9"><label>9.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rao</surname> <given-names>A</given-names></name> <name><surname>Pang</surname> <given-names>M</given-names></name> <name><surname>Kim</surname> <given-names>J</given-names></name> <name><surname>Kamineni</surname> <given-names>M</given-names></name> <name><surname>Lie</surname> <given-names>W</given-names></name> <name><surname>Prasad</surname> <given-names>AK</given-names></name><etal/></person-group> <article-title>Assessing the utility of ChatGPT throughout the entire clinical workflow: development and usability study</article-title>. <source>J Med Internet Res</source>. (<year>2023</year>) <volume>25</volume>:<fpage>e48659</fpage>. <pub-id pub-id-type="doi">10.2196/48659</pub-id><pub-id pub-id-type="pmid">37606976</pub-id></mixed-citation></ref>
<ref id="B10"><label>10.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Singhal</surname> <given-names>K</given-names></name> <name><surname>Azizi</surname> <given-names>S</given-names></name> <name><surname>Tu</surname> <given-names>T</given-names></name> <name><surname>Mahdavi</surname> <given-names>SS</given-names></name> <name><surname>Wei</surname> <given-names>J</given-names></name> <name><surname>Chung</surname> <given-names>HW</given-names></name><etal/></person-group> <article-title>Large language models encode clinical knowledge</article-title>. <source>Nature</source>. (<year>2023</year>) <volume>620</volume>:<fpage>172</fpage>&#x2013;<lpage>80</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="pmid">37438534</pub-id></mixed-citation></ref>
<ref id="B11"><label>11.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ueda</surname> <given-names>D</given-names></name> <name><surname>Mitsuyama</surname> <given-names>Y</given-names></name> <name><surname>Takita</surname> <given-names>H</given-names></name> <name><surname>Horiuchi</surname> <given-names>D</given-names></name> <name><surname>Walston</surname> <given-names>SL</given-names></name> <name><surname>Tatekawa</surname> <given-names>H</given-names></name><etal/></person-group> <article-title>Diagnostic performance of ChatGPT from patient history and imaging findings on the diagnosis please quizzes</article-title>. <source>Radiology</source>. (<year>2023</year>) <volume>308</volume>(<issue>1</issue>):<fpage>e231040</fpage>. <pub-id pub-id-type="doi">10.1148/radiol.231040</pub-id><pub-id pub-id-type="pmid">37462501</pub-id></mixed-citation></ref>
<ref id="B12"><label>12.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gertz</surname> <given-names>RJ</given-names></name> <name><surname>Dratsch</surname> <given-names>T</given-names></name> <name><surname>Bunck</surname> <given-names>AC</given-names></name> <name><surname>Lennartz</surname> <given-names>S</given-names></name> <name><surname>Iuga</surname> <given-names>AI</given-names></name> <name><surname>Hellmich</surname> <given-names>MG</given-names></name><etal/></person-group> <article-title>Potential of GPT-4 for detecting errors in radiology reports: implications for reporting accuracy</article-title>. <source>Radiology</source>. (<year>2024</year>) <volume>311</volume>(<issue>1</issue>):<fpage>e232714</fpage>. <pub-id pub-id-type="doi">10.1148/radiol.232714</pub-id><pub-id pub-id-type="pmid">38625012</pub-id></mixed-citation></ref>
<ref id="B13"><label>13.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jeblick</surname> <given-names>K</given-names></name> <name><surname>Schachtner</surname> <given-names>B</given-names></name> <name><surname>Dexl</surname> <given-names>J</given-names></name> <name><surname>Mittermeier</surname> <given-names>A</given-names></name> <name><surname>St&#x00FC;ber</surname> <given-names>AT</given-names></name> <name><surname>Topalis</surname> <given-names>J</given-names></name><etal/></person-group> <article-title>ChatGPT makes medicine easy to swallow: an exploratory case study on simplified radiology reports</article-title>. <source>Eur Radiol</source>. (<year>2024</year>) <volume>34</volume>:<fpage>2817</fpage>&#x2013;<lpage>25</lpage>. <pub-id pub-id-type="doi">10.1007/s00330-023-10213-1</pub-id><pub-id pub-id-type="pmid">37794249</pub-id></mixed-citation></ref>
<ref id="B14"><label>14.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kaya</surname> <given-names>K</given-names></name> <name><surname>Gietzen</surname> <given-names>C</given-names></name> <name><surname>Hahnfeldt</surname> <given-names>R</given-names></name> <name><surname>Zoubi</surname> <given-names>M</given-names></name> <name><surname>Emrich</surname> <given-names>T</given-names></name> <name><surname>Halfmann</surname> <given-names>MC</given-names></name><etal/></person-group> <article-title>Generative Pre-trained transformer 4 analysis of cardiovascular magnetic resonance reports in suspected myocarditis: a multicenter study</article-title>. <source>J Cardiovasc Magn Reson</source>. (<year>2024</year>) <volume>26</volume>:<fpage>101068</fpage>. <pub-id pub-id-type="doi">10.1016/j.jocmr.2024.101068</pub-id><pub-id pub-id-type="pmid">39079602</pub-id></mixed-citation></ref>
<ref id="B15"><label>15.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Suthar</surname> <given-names>PP</given-names></name> <name><surname>Kounsal</surname> <given-names>A</given-names></name> <name><surname>Chhetri</surname> <given-names>L</given-names></name> <name><surname>Saini</surname> <given-names>D</given-names></name> <name><surname>Dua</surname> <given-names>SG</given-names></name></person-group>. <article-title>Artificial intelligence (AI) in radiology: a deep dive into ChatGPT 4.0&#x2019;s accuracy with the American journal of neuroradiology&#x2019;s (AJNR) &#x201C;case of the month&#x201D;</article-title>. <source>Cureus</source>. (<year>2023</year>) <volume>15</volume>(<issue>8</issue>):<fpage>e43958</fpage>. <pub-id pub-id-type="doi">10.7759/cureus.43958</pub-id><pub-id pub-id-type="pmid">37746411</pub-id></mixed-citation></ref>
<ref id="B16"><label>16.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Raza</surname> <given-names>MM</given-names></name> <name><surname>Venkatesh</surname> <given-names>KP</given-names></name> <name><surname>Kvedar</surname> <given-names>JC</given-names></name></person-group>. <article-title>Generative AI and large language models in health care: pathways to implementation</article-title>. <source>NPJ Digit Med</source>. (<year>2024</year>) <volume>7</volume>:<fpage>62</fpage>. <pub-id pub-id-type="doi">10.1038/s41746-023-00988-4</pub-id></mixed-citation></ref>
<ref id="B17"><label>17.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hockmann</surname> <given-names>J</given-names></name> <name><surname>Hautzel</surname> <given-names>H</given-names></name> <name><surname>Darwiche</surname> <given-names>K</given-names></name> <name><surname>Eberhard</surname> <given-names>W</given-names></name> <name><surname>Stuschke</surname> <given-names>M</given-names></name> <name><surname>Aigner</surname> <given-names>C</given-names></name><etal/></person-group> <article-title>Accuracy of nodal staging by 18F-FDG PET/CT in limited stage small-cell lung cancer</article-title>. <source>Asian Cardiovasc Thorac Ann</source>. (<year>2023</year>) <volume>31</volume>(<issue>6</issue>):<fpage>506</fpage>&#x2013;<lpage>11</lpage>. <pub-id pub-id-type="doi">10.1177/02184923231187279</pub-id><pub-id pub-id-type="pmid">37438928</pub-id></mixed-citation></ref>
<ref id="B18"><label>18.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Benjamens</surname> <given-names>S</given-names></name> <name><surname>Dhunnoo</surname> <given-names>P</given-names></name> <name><surname>Mesk&#x00F3;</surname> <given-names>B</given-names></name></person-group>. <article-title>The state of artificial intelligence-based FDA-approved medical devices and algorithms: an online database</article-title>. <source>NPJ Digit Med</source>. (<year>2020</year>) <volume>3</volume>:<fpage>118</fpage>. <pub-id pub-id-type="doi">10.1038/s41746-020-00324-0</pub-id><pub-id pub-id-type="pmid">32984550</pub-id></mixed-citation></ref>
<ref id="B19"><label>19.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Halling-Brown</surname> <given-names>MD</given-names></name> <name><surname>Warren</surname> <given-names>LM</given-names></name> <name><surname>Ward</surname> <given-names>D</given-names></name> <name><surname>Lewis</surname> <given-names>E</given-names></name> <name><surname>Mackenzie</surname> <given-names>A</given-names></name> <name><surname>Wallis</surname> <given-names>MG</given-names></name><etal/></person-group> <article-title>OPTIMAM mammography image database: a large-scale resource of mammography images and clinical data</article-title>. <source>Radiol Artif Intell</source>. (<year>2020</year>) <volume>3</volume>(<issue>1</issue>):<fpage>e200103</fpage>. <pub-id pub-id-type="doi">10.1148/ryai.2020200103</pub-id><pub-id pub-id-type="pmid">33937853</pub-id></mixed-citation></ref>
<ref id="B20"><label>20.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lee</surname> <given-names>JE</given-names></name> <name><surname>Park</surname> <given-names>KS</given-names></name> <name><surname>Kim</surname> <given-names>YH</given-names></name> <name><surname>Song</surname> <given-names>HC</given-names></name> <name><surname>Park</surname> <given-names>B</given-names></name> <name><surname>Jeong</surname> <given-names>YJ</given-names></name></person-group>. <article-title>Lung cancer staging using chest CT and FDG PET/CT free-text reports: comparison among three ChatGPT large-language models and six human readers</article-title>. <source>AJR Am J Roentgenol</source>. (<year>2024</year>) <volume>223</volume>(<issue>6</issue>):<fpage>e2431696</fpage>. <pub-id pub-id-type="doi">10.2214/AJR.24.31696</pub-id><pub-id pub-id-type="pmid">39230409</pub-id></mixed-citation></ref>
<ref id="B21"><label>21.</label><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Z</given-names></name> <name><surname>Jiang</surname> <given-names>H</given-names></name> <name><surname>Zhong</surname> <given-names>T</given-names></name> <name><surname>Wu</surname> <given-names>Z</given-names></name> <name><surname>Ma</surname> <given-names>C</given-names></name> <name><surname>Li</surname> <given-names>Y</given-names></name><etal/></person-group> <article-title>Holistic evaluation of GPT-4V for biomedical imaging</article-title>. <comment><italic>arXiv</italic></comment> (<year>2023</year>). <pub-id pub-id-type="doi">10.48550/arXiv.2312.05256</pub-id></mixed-citation></ref>
<ref id="B22"><label>22.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gertz</surname> <given-names>RJ</given-names></name> <name><surname>Bunck</surname> <given-names>AC</given-names></name> <name><surname>Lennartz</surname> <given-names>S</given-names></name> <name><surname>Dratsch</surname> <given-names>T</given-names></name> <name><surname>Iuga</surname> <given-names>A-I</given-names></name> <name><surname>Maintz</surname> <given-names>D</given-names></name><etal/></person-group> <article-title>GPT-4 for automated determination of radiological study and protocol based on radiology request forms: a feasibility study</article-title>. <source>Radiology</source>. (<year>2023</year>) <volume>307</volume>(<issue>5</issue>):<fpage>e230877</fpage>. <pub-id pub-id-type="doi">10.1148/radiol.230877</pub-id><pub-id pub-id-type="pmid">37310247</pub-id></mixed-citation></ref>
<ref id="B23"><label>23.</label><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Saab</surname> <given-names>K</given-names></name> <name><surname>Tu</surname> <given-names>T</given-names></name> <name><surname>Weng</surname> <given-names>WH</given-names></name> <name><surname>Tanno</surname> <given-names>R</given-names></name> <name><surname>Stutz</surname> <given-names>D</given-names></name> <name><surname>Wulczyn</surname> <given-names>E</given-names></name><etal/></person-group> <article-title>Capabilities of Gemini models in medicine</article-title>. <comment><italic>arXiv</italic></comment> (<year>2024</year>). <pub-id pub-id-type="doi">10.48550/arXiv.2404.18416</pub-id></mixed-citation></ref>
<ref id="B24"><label>24.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>S</given-names></name> <name><surname>Gilbert</surname> <given-names>M</given-names></name> <name><surname>Chetty</surname> <given-names>I</given-names></name> <name><surname>Siddiqui</surname> <given-names>F</given-names></name></person-group>. <article-title>The 2021 landscape of FDA-approved artificial intelligence/machine learning-enabled medical devices: an analysis of the characteristics and intended use</article-title>. <source>Int J Med Inf</source>. (<year>2022</year>) <volume>165</volume>:<fpage>104828</fpage>. <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2022.104828</pub-id></mixed-citation></ref>
<ref id="B25"><label>25.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Willemink</surname> <given-names>MJ</given-names></name> <name><surname>Roth</surname> <given-names>HR</given-names></name> <name><surname>Sandfort</surname> <given-names>V</given-names></name></person-group>. <article-title>Toward foundational deep learning models for medical imaging in the new era of transformer networks</article-title>. <source>Radiol Artif Intell</source>. (<year>2022</year>) <volume>4</volume>(<issue>6</issue>):<fpage>e210284</fpage>. <pub-id pub-id-type="doi">10.1148/ryai.210284</pub-id><pub-id pub-id-type="pmid">36523642</pub-id></mixed-citation></ref>
<ref id="B26"><label>26.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bajaj</surname> <given-names>S</given-names></name> <name><surname>Gandhi</surname> <given-names>D</given-names></name> <name><surname>Nayar</surname> <given-names>D</given-names></name></person-group>. <article-title>Potential applications and impact of ChatGPT in radiology</article-title>. <source>Acad Radiol</source>. (<year>2024</year>) <volume>31</volume>(<issue>4</issue>):<fpage>1256</fpage>&#x2013;<lpage>61</lpage>. <pub-id pub-id-type="doi">10.1016/j.acra.2023.08.039</pub-id><pub-id pub-id-type="pmid">37802673</pub-id></mixed-citation></ref>
<ref id="B27"><label>27.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lyu</surname> <given-names>Q</given-names></name> <name><surname>Tan</surname> <given-names>J</given-names></name> <name><surname>Zapadka</surname> <given-names>ME</given-names></name> <name><surname>Ponnatapura</surname> <given-names>J</given-names></name> <name><surname>Niu</surname> <given-names>C</given-names></name> <name><surname>Myers</surname> <given-names>KJ</given-names></name><etal/></person-group> <article-title>Translating radiology reports into plain language using ChatGPT and GPT-4 with prompt learning: results, limitations, and potential</article-title>. <source>Vis Comput Ind Biomed Art</source>. (<year>2023</year>) <volume>6</volume>:<fpage>9</fpage>. <pub-id pub-id-type="doi">10.1186/s42492-023-00136-5</pub-id><pub-id pub-id-type="pmid">37198498</pub-id></mixed-citation></ref>
<ref id="B28"><label>28.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Schmidt</surname> <given-names>RA</given-names></name> <name><surname>Seah</surname> <given-names>JCY</given-names></name> <name><surname>Cao</surname> <given-names>K</given-names></name> <name><surname>Lim</surname> <given-names>L</given-names></name> <name><surname>Lim</surname> <given-names>W</given-names></name> <name><surname>Yeung</surname> <given-names>J</given-names></name><etal/></person-group> <article-title>Generative large language models for detection of speech recognition errors in radiology reports</article-title>. <source>Radiol Artif Intell</source>. (<year>2023</year>) <volume>5</volume>:<fpage>e230205</fpage>. <pub-id pub-id-type="doi">10.1148/ryai.230205</pub-id></mixed-citation></ref>
<ref id="B29"><label>29.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Freyer</surname> <given-names>O</given-names></name> <name><surname>Wiest</surname> <given-names>IC</given-names></name> <name><surname>Kather</surname> <given-names>JN</given-names></name> <name><surname>Gilbert</surname> <given-names>S</given-names></name></person-group>. <article-title>A future role for health applications of large language models depends on regulators enforcing safety standards</article-title>. <source>Lancet Digit Health</source>. (<year>2024</year>) <volume>6</volume>(<issue>9</issue>):<fpage>e662</fpage>&#x2013;<lpage>72</lpage>. <pub-id pub-id-type="doi">10.1016/S2589-7500(24)00124-9</pub-id><pub-id pub-id-type="pmid">39179311</pub-id></mixed-citation></ref></ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by"><p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2048290/overview">Fahim Sufi</ext-link>, Monash University, Australia</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by"><p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1473638/overview">Xiangyuan Ma</ext-link>, Shantou University, China</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2596983/overview">Troy Teo</ext-link>, Northwestern Medicine, United States</p></fn>
</fn-group>
</back>
</article>