<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Public Health</journal-id>
<journal-title-group>
<journal-title>Frontiers in Public Health</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Public Health</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-2565</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpubh.2026.1777577</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Decoupled quality and readability in skin cancer education from large language models</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Yanping</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Lei</given-names>
</name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Weiqiang</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Lan</surname>
<given-names>Weifeng</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2621001"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Plastic Surgery, Longyan First Affiliated Hospital of Fujian Medical University</institution>, <city>Longyan</city>, <country country="cn">China</country></aff>
<aff id="aff2"><label>2</label><institution>Department of Science and Education, Longyan First Hospital</institution>, <city>Longyan</city>, <country country="cn">China</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Weifeng Lan, <email xlink:href="mailto:1064801300@qq.com">1064801300@qq.com</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-20">
<day>20</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>14</volume>
<elocation-id>1777577</elocation-id>
<history>
<date date-type="received">
<day>29</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>22</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>27</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Zhang, Wang, Zhang and Lan.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Zhang, Wang, Zhang and Lan</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-20">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec id="sec1001">
<title>Introduction</title>
<p>Large language models (LLMs) are increasingly used by the public to obtain health information, yet the relationship between content quality and readability in LLM-generated patient education remains unclear.</p>
</sec>
<sec id="sec1002">
<title>Methods</title>
<p>We benchmarked five LLMs (Doubao, DeepSeek, Wenxin Yiyan, Tongyi Qianwen, and GPT-5) using an identical set of 20 Mandarin Chinese skin-cancer FAQs (100 total outputs). Quality was assessed using c-PEMAT-P and the Global Quality Scale (GQS), and readability was assessed using seven indices (ARI, FRES, GFOG, FKGL, CL, SMOG, and LW). Group differences and correlations were evaluated with appropriate statistical tests.</p>
</sec>
<sec id="sec1003">
<title>Results</title>
<p>Models showed comparable understandability/actionability (c-PEMAT-P), while overall quality (GQS) differed, with GPT-5 scoring highest. Readability varied substantially by both model and content category, and no single model performed best across all readability metrics. Correlation analyses indicated that quality and readability were largely decoupled.</p>
</sec>
<sec id="sec1004">
<title>Discussion</title>
<p>High-quality outputs do not necessarily have high readability. Optimizing AI-generated skin-cancer education requires multi-faceted strategies that jointly consider model choice and content topic.</p>
</sec>
</abstract>
<kwd-group>
<kwd>digital public health communication</kwd>
<kwd>health information quality (C-PEMAT, GQS)</kwd>
<kwd>large language models (LLMs)</kwd>
<kwd>readability assessment</kwd>
<kwd>skin cancer education</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="3"/>
<table-count count="3"/>
<equation-count count="0"/>
<ref-count count="24"/>
<page-count count="9"/>
<word-count count="6475"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Public Health Education and Promotion</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<title>Introduction</title>
<p>Public health education, essential for disease prevention and management, relies on disseminating information that is accurate, actionable, and accessible across diverse audiences (<xref ref-type="bibr" rid="ref1">1</xref>). Skin cancer, a globally prevalent malignancy, illustrates this requirement, given that public knowledge directly influences early detection and subsequent outcomes (<xref ref-type="bibr" rid="ref2">2</xref>, <xref ref-type="bibr" rid="ref3">3</xref>). Health communicators have long faced challenges balancing the dual imperatives of information quality&#x2014;encompassing clarity, utility, and reliability&#x2014;and textual readability, which governs cognitive accessibility (<xref ref-type="bibr" rid="ref4">4</xref>). Successfully developing materials that meet both these stringent criteria demands significant expertise and iterative refinement.</p>
<sec id="sec2">
<title>Target audience, language context, and model selection</title>
<p>This study focuses on Chinese-language (Mandarin) skin cancer educational materials intended for lay readers and patients, reflecting a common real-world scenario in which the public uses consumer-facing LLMs to obtain health information. We selected Doubao, DeepSeek, Wenxin Yiyan, and Tongyi Qianwen as representative publicly accessible LLM products widely available to Chinese users, and included GPT-5 as a widely used international reference model to enable cross-ecosystem comparison under identical prompts. Given the Chinese-language outputs, we employed the Chinese version of the Patient Education Materials Assessment Tool for Printable Materials (c-PEMAT-P) to support linguistically appropriate quality evaluation.</p>
<p>Generative Large Language Models (LLMs) offer unprecedented automation for creating health education content at scale (<xref ref-type="bibr" rid="ref5">5</xref>). While their text-generation capacity is evident, their responsible deployment in healthcare necessitates rigorous, multidimensional validation (<xref ref-type="bibr" rid="ref5">5</xref>). A pivotal question is whether LLMs consistently achieve the requisite integration of substantive quality and public readability (<xref ref-type="bibr" rid="ref6">6</xref>). Existing research typically examines these metrics in isolation, often assuming correlation or treating them as a unitary construct (<xref ref-type="bibr" rid="ref7">7</xref>, <xref ref-type="bibr" rid="ref8">8</xref>). This approach leaves a critical gap: the lack of systematic investigation into the explicit relationship between the quality and readability of LLM-generated health information. Without this understanding, the promise of LLMs in public health communication remains uncertain, creating a potential hazard since deficiencies in either dimension undermine essential educational objectives.</p>
<p>The pressing demand for reliable, accessible skin cancer information in dermatology drives interest in LLMs as a scalable source for generating consistent educational content (<xref ref-type="bibr" rid="ref6 ref7 ref8">6&#x2013;8</xref>). Initial LLM benchmarking utilizes standardized instruments&#x2014;such as the c-PEMAT-P (for understandability/actionability) and the GQS (Global Quality Scale, for overall quality)&#x2014;alongside readability formulas (e.g., Flesch&#x2013;Kincaid, SMOG) (<xref ref-type="bibr" rid="ref1">1</xref>, <xref ref-type="bibr" rid="ref9">9</xref>). However, the prevailing trend is to report these metrics separately, yielding a fragmented view of performance (<xref ref-type="bibr" rid="ref10">10</xref>). The fundamental and largely overlooked problem concerns the inherent link between these evaluative axes: Does superior quality inherently correlate with improved readability, or are they distinct characteristics influenced independently by model choice and content subdomain? Resolving this empirical gap is critical for developing optimized deployment protocols. If coupled, quality benchmarking suffices; if decoupled (the central hypothesis of this study), the strategy requires nuance, as high factual quality may yield overly complex text, and vice versa. Thus, a focused analysis is essential to disentangle the effects of model architecture, content topic, assessed quality, and measured readability within AI-generated skin cancer education. While large language models (LLMs) show promise in generating high-quality content for public health, it is essential to address the risks of hallucinations and biases inherent in these models. Such issues could result in the spread of incorrect or biased information, which may compromise the reliability of the generated content.</p>
</sec>
<sec id="sec3">
<title>Quality versus readability and clinical expectations</title>
<p>In this study, content quality refers to the clarity, usefulness/actionability, and perceived reliability of patient-oriented information (assessed using validated instruments such as c-PEMAT-P and GQS), whereas readability refers to cognitive accessibility approximated by surface-text complexity captured by readability formulas. In patient education practice, materials are commonly expected to use plain language, minimize jargon, and present information with clear structure and actionable steps. Because widely cited grade-level readability targets are primarily defined for English texts and there is no universally accepted grade-level threshold for Chinese patient materials, the readability indices in this study are interpreted mainly as comparative benchmarks across models and topics, and are complemented by c-PEMAT-P to reflect patient-oriented understandability/actionability.</p>
</sec>
<sec id="sec4">
<title>Research questions and hypotheses</title>
<p>Accordingly, we formulated three operational research questions (RQs). RQ1 (model effect): Under identical prompts, do different LLMs generate patient-oriented skin cancer education materials with significantly different quality and readability? RQ2 (topic effect): Do quality and readability vary across content categories (etiology/risk, clinical manifestations, diagnosis/screening, treatment/prognosis, and prevention/patient education)? RQ3 (quality&#x2013;readability relationship): Across all generated outputs, is content quality associated with readability? We hypothesized that (H1) both quality and readability would differ by model, (H2) readability (and potentially quality) would differ by content category, and (H3; central hypothesis) quality and readability would show weak or non-significant correlations, indicating partial decoupling.</p>
<p>Addressing this need, our study compared five prominent LLMs (Doubao, DeepSeek, Wenxin Yiyan, Tongyi Qianwen, and GPT-5), evaluating their skin cancer educational outputs using validated quality scales (c-PEMAT-P, GQS) and seven readability indices. Results show that while models yielded materials of comparable understandability/actionability (c-PEMAT-P), overall perceived quality (GQS) differed substantially; GPT-5 achieved the highest score. Crucially, readability fluctuated significantly by LLM and content category (e.g., Prevention/Treatment vs. Clinical Manifestations), revealing that no single model excelled across all metrics. Furthermore, correlation analysis established that quality and readability are largely independent dimensions. This disparity&#x2014;that high quality does not necessitate high readability&#x2014;indicates that optimizing AI-generated health education materials requires tailored, multifaceted strategies, necessitating joint consideration of the selected LLM and the specific content topic for intended public health applications. While expert evaluations offer valuable insights, it is equally important to validate AI-generated content with end-users (patients) to ensure its clarity and relevance in real-world healthcare contexts (<xref ref-type="bibr" rid="ref11">11</xref>).</p>
</sec>
</sec>
<sec sec-type="materials|methods" id="sec5">
<title>Materials and methods</title>
<sec id="sec6">
<title>Ethical considerations</title>
<p>Data utilized in this investigation were exclusively synthetic, derived from Large Language Models (LLMs). This methodology involved no human or animal experimentation, biological samples, or personal identifying information. Consequently, ethical review is waived under established academic standards.</p>
</sec>
<sec id="sec7">
<title>Research procedure</title>
<p>To evaluate the performance of Large Language Models (LLMs) in dermatological knowledge dissemination, two clinical experts compiled 20 frequently asked questions (FAQs) concerning skin cancer on November 14, 2025. These questions were systematically grouped into five categories: (1) Etiology and Risk Factors, (2) Clinical Manifestations and Classification, (3) Diagnosis and Screening, (4) Treatment and Prognosis, and (5) Prevention and Patient Education. The complete set is detailed in <xref ref-type="table" rid="tab1">Table 1</xref>. Researchers subsequently queried five publicly accessible LLMs&#x2014;Doubao, DeepSeek, Wenxin Yiyan, Tongyi Qianwen, and GPT-5&#x2014;with these FAQs. The resulting responses were analyzed across three critical metrics: readability, reliability, and quality. It must be acknowledged that using a fixed set of 20 FAQs may not fully capture the diversity of prompts and real-world contexts, thereby limiting the generalizability of the results. Furthermore, as the study was conducted in Chinese, its findings may not be entirely applicable to multilingual or cross-cultural settings. An overview of the study workflow is provided in <xref ref-type="supplementary-material" rid="SM1">Supplementary Figure S1</xref>.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Issue list.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Issue list</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">1. Etiology and risk factors</td>
</tr>
<tr>
<td align="left" valign="middle">1. Is long-term UV exposure the primary cause of skin cancer?</td>
</tr>
<tr>
<td align="left" valign="middle">2. Does a family history of skin cancer significantly increase the risk of developing the disease?</td>
</tr>
<tr>
<td align="left" valign="middle">3. Are chronic skin injuries or scars prone to inducing skin cancer?</td>
</tr>
<tr>
<td align="left" valign="middle">4. Do immunocompromised individuals have a higher risk of developing skin cancer?</td>
</tr>
<tr>
<td align="left" valign="middle">2. Clinical manifestations and classification</td>
</tr>
<tr>
<td align="left" valign="middle">1. Does basal cell carcinoma (BCC) often present as painless nodules or plaques?</td>
</tr>
<tr>
<td align="left" valign="middle">2. Does melanoma typically present with uneven pigmentation and irregular borders?</td>
</tr>
<tr>
<td align="left" valign="middle">3. Is squamous cell carcinoma (SCC) prone to skin ulceration and non-healing?</td>
</tr>
<tr>
<td align="left" valign="middle">4. Are skin cancer masses often accompanied by bleeding, itching, or pain?</td>
</tr>
<tr>
<td align="left" valign="middle">3. Diagnosis and screening</td>
</tr>
<tr>
<td align="left" valign="middle">1. Can dermoscopy improve the diagnostic accuracy of early-stage skin cancer?</td>
</tr>
<tr>
<td align="left" valign="middle">2. Is pathological tissue biopsy the sole gold standard for confirming skin cancer?</td>
</tr>
<tr>
<td align="left" valign="middle">3. Should skin screening for high-risk groups be included in routine physical examinations?</td>
</tr>
<tr>
<td align="left" valign="middle">4. Can PET-CT evaluate the distant metastasis of skin cancer?</td>
</tr>
<tr>
<td align="left" valign="middle">4. Treatment and prognosis</td>
</tr>
<tr>
<td align="left" valign="middle">1. What is the cure rate of surgical resection for early-stage skin cancer?</td>
</tr>
<tr>
<td align="left" valign="middle">2. Can targeted therapy be used for patients with advanced skin cancer?</td>
</tr>
<tr>
<td align="left" valign="middle">3. Does the prognosis of melanoma mainly depend on the depth of tumor invasion?</td>
</tr>
<tr>
<td align="left" valign="middle">4. Can palliative radiotherapy be an option for inoperable skin cancer?</td>
</tr>
<tr>
<td align="left" valign="middle">5. Prevention and patient education</td>
</tr>
<tr>
<td align="left" valign="middle">1. Can proper application of SPF30&#x202F;+&#x202F;sunscreen effectively prevent skin cancer?</td>
</tr>
<tr>
<td align="left" valign="middle">2. Can wearing physical sun protection gear outdoors reduce the risk of skin cancer?</td>
</tr>
<tr>
<td align="left" valign="middle">3. How can patients identify suspicious skin lesions that require urgent medical attention?</td>
</tr>
<tr>
<td align="left" valign="middle">4. How often should post-operative skin cancer patients undergo re-examinations to monitor recurrence?</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="sec8">
<title>LLM querying protocol and generation settings (for replication)</title>
<p>To strengthen reproducibility of the benchmarking procedure, we standardized the querying workflow across all five LLMs. For each of the 20 FAQs (<xref ref-type="table" rid="tab1">Table 1</xref>), the same prompt template was applied to every model, and each FAQ was submitted in an independent session (i.e., a new chat/conversation was initiated for every FAQ) to avoid contextual carryover. The prompt template specified the intended audience (lay public/patients), the requested output language (Mandarin Chinese), and basic formatting requirements (e.g., plain language, structured bullet points/headings, and avoidance of unnecessary jargon) (full prompt text provided in Appendix/<xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>).</p>
<p>Where the platform allowed user control of generation parameters (e.g., temperature, top-p, maximum output length/tokens), these parameters were set identically across models; when such parameters were not exposed in the public interface, default platform settings were used and explicitly recorded. We also recorded whether web browsing/search augmentation was enabled (if applicable and user-configurable) and ensured that the same setting was applied consistently within each model. The access date (and interface/channel) for each LLM was documented to account for possible model updates over time.</p>
<p>The unit of analysis was one model response per FAQ (20 FAQs &#x00D7; 5 models&#x202F;=&#x202F;100 outputs). Each output was exported as plain text for subsequent quality evaluation (c-PEMAT-P and GQS) and readability analysis. Complete model-specific access information and generation settings are summarized in <xref ref-type="supplementary-material" rid="SM1">Supplementary Table S1</xref>, and the full prompt set is provided in the Supplementary appendix to enable replication.</p>
</sec>
<sec id="sec9">
<title>Readability evaluation</title>
<p>Readability analysis of large language model (LLM) responses utilized multiple formulas derived from the Text Readability Assessment Tool.<xref ref-type="fn" rid="fn0001"><sup>1</sup></xref> Acknowledging the current absence of an authoritative gold standard or consensus regarding the most reliable metric, this study consequently employs the mainstream system documented extensively in prior literature (<xref ref-type="bibr" rid="ref9">9</xref>, <xref ref-type="bibr" rid="ref12">12</xref>, <xref ref-type="bibr" rid="ref13">13</xref>).</p>
<p>This study employed seven established readability metrics&#x2014;the Coleman-Liau Index (CL), Linsear Write (LW), Automated Readability Index (ARI), Simple Measure of Gobbledygook (SMOG), Gunning Fog Index (GFOG), Flesch Reading Ease Score (FRES), and Flesch&#x2013;Kincaid Grade Level (FKGL)&#x2014;to quantify the comprehensibility and linguistic proximity of large language model (LLM)-generated output to natural, everyday speech. All outputs were generated in Mandarin Chinese and analyzed without translation. As these readability formulas were originally developed for English, we use them primarily for relative comparison of text complexity across models/topics rather than for absolute grade-level interpretation in Chinese; this is complemented by c-PEMAT-P for linguistically appropriate assessment.</p>
<p>The study did not conduct external factual validation by comparing the generated content with current clinical guidelines or other authoritative sources, which could have strengthened the robustness of the findings.</p>
</sec>
<sec id="sec10">
<title>Quality assessment</title>
<p>The assessment of content quality and accessibility utilized two instruments: the c-PEMAT-P (Chinese version of the Patient Education Materials Assessment Tool for Printable Materials) and the GQS (Global Quality Scale) (<xref ref-type="bibr" rid="ref1">1</xref>, <xref ref-type="bibr" rid="ref14 ref15 ref16">14&#x2013;16</xref>). The c-PEMAT-P comprises 24 indicators across two dimensions: Comprehensibility (16 items, focusing on logical structure and terminology accessibility) and Practicality (8 items, evaluating specific action guidance and audience suitability). Scoring is binary (0/1), resulting in a 0&#x2013;24 total score, where higher values signify greater user accessibility. For analysis, c-PEMAT-P was calculated per output as the summed 0&#x2013;24 total score and then summarized as mean (&#x00B1;SD) across outputs within each model/category, yielding non-integer means. The GQS employs a 1&#x2013;5 point scale, ranging from &#x2018;Poor quality&#x2019; (Score 1: illogical content, no practical value) to &#x2018;Excellent quality&#x2019; (Score 5: rigorous logic, significant practical value). Two independent clinical experts, each with over 3 years of relevant clinical experience, conducted the evaluations. Inter-rater reliability was quantified using Cohen&#x2019;s Kappa (<italic>&#x03BA;</italic>), interpreted as follows: <italic>&#x03BA;</italic>&#x202F;&#x003E;&#x202F;0.75 indicated excellent agreement, 0.40&#x202F;&#x2264;&#x202F;<italic>&#x03BA;</italic>&#x202F;&#x2264;&#x202F;0.75 acceptable agreement, and <italic>&#x03BA;</italic>&#x202F;&#x003C;&#x202F;0.40 poor agreement. Discrepancies between the initial two raters were resolved through consensus discussions, with a third expert adjudicating the final rating when necessary. Both the c-PEMAT-P and GQS ratings achieved <italic>&#x03BA;</italic>&#x202F;&#x003E;&#x202F;0.75, indicating excellent inter-rater reliability (<xref ref-type="bibr" rid="ref17">17</xref>, <xref ref-type="bibr" rid="ref18">18</xref>). 
Because the study did not include real-world comprehension tests or patient validation, it does not directly assess health literacy from a patient&#x2019;s perspective.</p>
</sec>
<sec id="sec11">
<title>Statistical analysis</title>
<p>All statistical analyses were performed using IBM SPSS Statistics 25.0 (IBM Corp., Armonk, NY, USA). Continuous outcomes were summarized as mean &#x00B1; standard deviation when appropriate, or as median with interquartile range for non-normally distributed variables. Normality was assessed using the Shapiro&#x2013;Wilk test, and homogeneity of variance for ANOVA was assessed using Levene&#x2019;s test. For outcomes meeting parametric assumptions, one-way analysis of variance (ANOVA) was used to compare groups, followed by Tukey&#x2019;s honestly significant difference (HSD) test for post-hoc pairwise comparisons. For outcomes violating normality and/or homoscedasticity assumptions, the Kruskal&#x2013;Wallis H test was used for multi-group comparisons, followed by Dunn&#x2019;s post-hoc pairwise comparisons. Multiple comparisons were adjusted using the Bonferroni correction. Correlations between quality metrics (GQS and c-PEMAT-P) and readability indices were assessed using Pearson&#x2019;s correlation coefficient (r). All tests were two-tailed, and <italic>p</italic> &#x003C;&#x202F;0.05 was considered statistically significant.</p>
</sec>
</sec>
<sec sec-type="results" id="sec12">
<title>Results</title>
<sec id="sec13">
<title>Readability analysis</title>
<p>To compare the performance of five mainstream large language models (Doubao, DeepSeek, Wenxin Yiyan, Tongyi Qianwen, and GPT-5) when generating skin cancer education materials, we initially assessed model-level distinctions by utilizing the following metrics extracted from <xref ref-type="table" rid="tab2">Table 2</xref>: quality scores (c-PEMAT-P and GQS) and readability indices (ARI, FRES, GFOG, FKGL, CL, SMOG, and LW).</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Analysis results of different large models.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Variables</th>
<th align="left" valign="top">Total (<italic>n</italic>&#x202F;=&#x202F;100)</th>
<th align="left" valign="top">DeepSeek (<italic>n</italic>&#x202F;=&#x202F;20)</th>
<th align="left" valign="top">Doubao (<italic>n</italic>&#x202F;=&#x202F;20)</th>
<th align="left" valign="top">GPT-5 (<italic>n</italic>&#x202F;=&#x202F;20)</th>
<th align="left" valign="top">Tongyi Qianwen (<italic>n</italic>&#x202F;=&#x202F;20)</th>
<th align="left" valign="top">Wenxin Yiyan (<italic>n</italic>&#x202F;=&#x202F;20)</th>
<th align="left" valign="top">Statistic</th>
<th align="left" valign="top">
<italic>P</italic>
</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">C-PEMAT score, Mean &#x00B1; SD</td>
<td align="center" valign="top">8.92&#x202F;&#x00B1;&#x202F;0.86</td>
<td align="center" valign="top">8.80&#x202F;&#x00B1;&#x202F;0.77</td>
<td align="center" valign="top">8.75&#x202F;&#x00B1;&#x202F;1.21</td>
<td align="center" valign="top">9.40&#x202F;&#x00B1;&#x202F;0.50</td>
<td align="center" valign="top">8.95&#x202F;&#x00B1;&#x202F;0.83</td>
<td align="center" valign="top">8.70&#x202F;&#x00B1;&#x202F;0.73</td>
<td align="center" valign="top"><italic>F</italic> =&#x202F;2.29</td>
<td align="center" valign="top">0.065</td>
</tr>
<tr>
<td align="left" valign="top">GQS score, Mean &#x00B1; SD</td>
<td align="center" valign="top">3.09&#x202F;&#x00B1;&#x202F;1.08</td>
<td align="center" valign="top">3.30&#x202F;&#x00B1;&#x202F;0.57</td>
<td align="center" valign="top">3.45&#x202F;&#x00B1;&#x202F;0.60</td>
<td align="center" valign="top">4.40&#x202F;&#x00B1;&#x202F;0.68</td>
<td align="center" valign="top">2.20&#x202F;&#x00B1;&#x202F;0.83</td>
<td align="center" valign="top">2.10&#x202F;&#x00B1;&#x202F;0.64</td>
<td align="center" valign="top"><italic>F</italic> =&#x202F;40.50</td>
<td align="center" valign="top"><bold>&#x003C;.001</bold></td>
</tr>
<tr>
<td align="left" valign="top">ARI, M (Q&#x2081;, Q&#x2083;)</td>
<td align="center" valign="top">16.67 (14.86, 18.17)</td>
<td align="center" valign="top">16.14 (13.76, 18.23)</td>
<td align="center" valign="top">16.91 (16.20, 18.09)</td>
<td align="center" valign="top">18.22 (16.35, 19.12)</td>
<td align="center" valign="top">17.00 (15.55, 18.17)</td>
<td align="center" valign="top">14.72 (13.23, 16.35)</td>
<td align="center" valign="top">&#x03C7;<sup>2</sup> =&#x202F;17.82#</td>
<td align="center" valign="top"><bold>0.001</bold></td>
</tr>
<tr>
<td align="left" valign="top">FRES, M (Q&#x2081;, Q&#x2083;)</td>
<td align="center" valign="top">26.00 (13.00, 37.50)</td>
<td align="center" valign="top">30.50 (20.75, 39.25)</td>
<td align="center" valign="top">23.00 (12.75, 34.75)</td>
<td align="center" valign="top">21.00 (9.75, 31.00)</td>
<td align="center" valign="top">15.00 (7.75, 35.00)</td>
<td align="center" valign="top">38.50 (25.75, 44.75)</td>
<td align="center" valign="top">&#x03C7;<sup>2</sup> =&#x202F;16.52#</td>
<td align="center" valign="top"><bold>0.002</bold></td>
</tr>
<tr>
<td align="left" valign="top">GFOG, M (Q&#x2081;, Q&#x2083;)</td>
<td align="center" valign="top">15.20 (13.97, 17.50)</td>
<td align="center" valign="top">15.15 (12.78, 16.70)</td>
<td align="center" valign="top">15.85 (14.67, 16.60)</td>
<td align="center" valign="top">15.95 (14.20, 18.15)</td>
<td align="center" valign="top">16.65 (14.52, 18.07)</td>
<td align="center" valign="top">13.75 (12.33, 15.03)</td>
<td align="center" valign="top">&#x03C7;<sup>2</sup> =&#x202F;11.89#</td>
<td align="center" valign="top"><bold>0.018</bold></td>
</tr>
<tr>
<td align="left" valign="top">FKGL, M (Q&#x2081;, Q&#x2083;)</td>
<td align="center" valign="top">15.00 (12.95, 16.57)</td>
<td align="center" valign="top">14.96 (13.42, 16.68)</td>
<td align="center" valign="top">14.96 (13.53, 16.39)</td>
<td align="center" valign="top">16.07 (14.85, 17.63)</td>
<td align="center" valign="top">15.20 (13.24, 16.91)</td>
<td align="center" valign="top">13.29 (12.01, 15.64)</td>
<td align="center" valign="top">&#x03C7;<sup>2</sup> =&#x202F;11.09#</td>
<td align="center" valign="top"><bold>0.026</bold></td>
</tr>
<tr>
<td align="left" valign="top">CL, M (Q&#x2081;, Q&#x2083;)</td>
<td align="center" valign="top">15.53 (13.78, 17.78)</td>
<td align="center" valign="top">14.15 (12.53, 16.16)</td>
<td align="center" valign="top">16.19 (13.85, 17.43)</td>
<td align="center" valign="top">16.82 (15.07, 18.12)</td>
<td align="center" valign="top">18.23 (15.43, 19.04)</td>
<td align="center" valign="top">13.59 (12.40, 14.83)</td>
<td align="center" valign="top">&#x03C7;<sup>2</sup> =&#x202F;28.32#</td>
<td align="center" valign="top"><bold>&#x003C;.001</bold></td>
</tr>
<tr>
<td align="left" valign="top">SMOG, M (Q&#x2081;, Q&#x2083;)</td>
<td align="center" valign="top">13.32 (12.22, 14.57)</td>
<td align="center" valign="top">13.32 (11.98, 14.84)</td>
<td align="center" valign="top">13.28 (12.61, 14.11)</td>
<td align="center" valign="top">14.36 (13.47, 15.18)</td>
<td align="center" valign="top">13.02 (12.20, 14.14)</td>
<td align="center" valign="top">12.32 (10.96, 13.69)</td>
<td align="center" valign="top">&#x03C7;<sup>2</sup> =&#x202F;13.02#</td>
<td align="center" valign="top"><bold>0.011</bold></td>
</tr>
<tr>
<td align="left" valign="top">LW, M (Q&#x2081;, Q&#x2083;)</td>
<td align="center" valign="top">60.00 (56.75, 64.00)</td>
<td align="center" valign="top">62.00 (58.50, 64.50)</td>
<td align="center" valign="top">57.00 (54.00, 61.25)</td>
<td align="center" valign="top">58.50 (57.00, 61.00)</td>
<td align="center" valign="top">60.00 (56.75, 64.25)</td>
<td align="center" valign="top">62.50 (58.00, 64.75)</td>
<td align="center" valign="top">&#x03C7;<sup>2</sup> =&#x202F;7.74#</td>
<td align="center" valign="top">0.102</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>F, ANOVA; #, Kruskal&#x2013;Wallis test. SD: standard deviation, M: Median, Q&#x2081;: 1st Quartile, Q&#x2083;: 3rd Quartile. Bold values indicate statistically significant results (<italic>p</italic> &#x003C; 0.05).</p>
</table-wrap-foot>
</table-wrap>
<p>At the model level, GQS scores exhibited significant variation (<italic>F</italic>&#x202F;=&#x202F;40.50, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001). GPT-5 recorded the highest mean GQS score (4.40), underscoring its superior overall content quality compared to lower scores registered by Doubao (3.45), DeepSeek (3.30), Tongyi Qianwen (2.20), and Wenxin Yiyan (2.10). Conversely, C-PEMAT scores did not differ significantly across models (<italic>p</italic>&#x202F;=&#x202F;0.065), as evidenced by mean values clustered within a restricted range (8.70&#x2013;9.40), suggesting broad equivalence in understandability and actionability across the five models.</p>
<p>Readability metrics demonstrated greater heterogeneity across models. Six indices showed significant inter-model variation: ARI (GPT-5 median: 18.22 vs. DeepSeek: 16.14; &#x03C7;<sup>2</sup> =&#x202F;17.82, <italic>p</italic> =&#x202F;0.001), FRES (&#x03C7;<sup>2</sup> =&#x202F;16.52, <italic>p</italic> =&#x202F;0.002), CL (&#x03C7;<sup>2</sup> =&#x202F;28.32, <italic>p</italic> &#x003C;&#x202F;0.001), SMOG (&#x03C7;<sup>2</sup> =&#x202F;13.02, <italic>p</italic> =&#x202F;0.011), GFOG (&#x03C7;<sup>2</sup> =&#x202F;11.89, <italic>p</italic> =&#x202F;0.018), and FKGL (&#x03C7;<sup>2</sup> =&#x202F;11.09, <italic>p</italic> =&#x202F;0.026). The LW index, however, failed to reach statistical significance (<italic>p</italic> =&#x202F;0.102). Collectively, these findings confirm that the readability of skin cancer educational content is model-dependent, though no single model demonstrated consistent superiority across all indices. Post-hoc pairwise comparisons were conducted using Dunn&#x2019;s test with Bonferroni correction; detailed pairwise Z statistics and adjusted <italic>p</italic> values are provided in <xref ref-type="supplementary-material" rid="SM1">Supplementary Table S2</xref>.</p>
<p>Analysis of quality and readability by content category (<xref ref-type="table" rid="tab3">Table 3</xref>) revealed a significant divergence in C-PEMAT scores (<italic>F</italic>&#x202F;=&#x202F;3.86, <italic>p</italic>&#x202F;=&#x202F;0.006). The Prevention and Treatment category recorded the highest mean score (9.50), considerably exceeding the lower mean (8.65) of the Clinical Manifestation category. This disparity suggests model-generated prevention- and treatment-related content was inherently more understandable and actionable. Conversely, GQS scores exhibited no significant difference across content categories (<italic>F</italic>&#x202F;=&#x202F;0.36, <italic>p</italic>&#x202F;=&#x202F;0.836).</p>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>Analysis results by content category.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Variables</th>
<th align="center" valign="top">Total (<italic>n</italic> =&#x202F;100)</th>
<th align="center" valign="top">Clinical manifestations and classification (<italic>n</italic> =&#x202F;20)</th>
<th align="center" valign="top">Diagnosis and screening (<italic>n</italic> =&#x202F;20)</th>
<th align="center" valign="top">Etiology and risk factors (<italic>n</italic> =&#x202F;20)</th>
<th align="center" valign="top">Prevention and patient education (<italic>n</italic> =&#x202F;20)</th>
<th align="center" valign="top">Treatment and prognosis (<italic>n</italic> =&#x202F;20)</th>
<th align="center" valign="top">Statistic</th>
<th align="center" valign="top">
<italic>P</italic>
</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">C-PEMAT score, Mean &#x00B1; SD</td>
<td align="center" valign="top">8.92&#x202F;&#x00B1;&#x202F;0.86</td>
<td align="center" valign="top">8.65&#x202F;&#x00B1;&#x202F;0.88</td>
<td align="center" valign="top">8.90&#x202F;&#x00B1;&#x202F;0.85</td>
<td align="center" valign="top">8.95&#x202F;&#x00B1;&#x202F;0.94</td>
<td align="center" valign="top">9.50&#x202F;&#x00B1;&#x202F;0.51</td>
<td align="center" valign="top">8.60&#x202F;&#x00B1;&#x202F;0.82</td>
<td align="center" valign="top"><italic>F</italic> =&#x202F;3.86</td>
<td align="center" valign="top"><bold>0.006</bold></td>
</tr>
<tr>
<td align="left" valign="top">GQS score, Mean &#x00B1; SD</td>
<td align="center" valign="top">3.09&#x202F;&#x00B1;&#x202F;1.08</td>
<td align="center" valign="top">3.10&#x202F;&#x00B1;&#x202F;1.21</td>
<td align="center" valign="top">2.85&#x202F;&#x00B1;&#x202F;0.93</td>
<td align="center" valign="top">3.10&#x202F;&#x00B1;&#x202F;1.07</td>
<td align="center" valign="top">3.15&#x202F;&#x00B1;&#x202F;1.04</td>
<td align="center" valign="top">3.25&#x202F;&#x00B1;&#x202F;1.21</td>
<td align="center" valign="top"><italic>F</italic> =&#x202F;0.36</td>
<td align="center" valign="top">0.836</td>
</tr>
<tr>
<td align="left" valign="top">ARI, M (Q&#x2081;, Q&#x2083;)</td>
<td align="center" valign="top">16.67 (14.86, 18.17)</td>
<td align="center" valign="top">13.66 (13.03, 15.62)</td>
<td align="center" valign="top">17.77 (16.73, 18.84)</td>
<td align="center" valign="top">17.23 (16.14, 18.04)</td>
<td align="center" valign="top">16.75 (14.92, 18.00)</td>
<td align="center" valign="top">16.21 (15.18, 18.33)</td>
<td align="center" valign="top">&#x03C7;<sup>2</sup> =&#x202F;18.19#</td>
<td align="center" valign="top"><bold>0.001</bold></td>
</tr>
<tr>
<td align="left" valign="top">FRES, M (Q&#x2081;, Q&#x2083;)</td>
<td align="center" valign="top">26.00 (13.00, 37.50)</td>
<td align="center" valign="top">38.00 (25.50, 44.25)</td>
<td align="center" valign="top">13.00 (8.50, 21.00)</td>
<td align="center" valign="top">18.50 (13.00, 26.75)</td>
<td align="center" valign="top">36.50 (29.25, 45.75)</td>
<td align="center" valign="top">27.50 (9.00, 34.25)</td>
<td align="center" valign="top">&#x03C7;<sup>2</sup> =&#x202F;32.88#</td>
<td align="center" valign="top"><bold>&#x003C;.001</bold></td>
</tr>
<tr>
<td align="left" valign="top">GFOG, M (Q&#x2081;, Q&#x2083;)</td>
<td align="center" valign="top">15.20 (13.97, 17.50)</td>
<td align="center" valign="top">12.80 (11.78, 14.53)</td>
<td align="center" valign="top">17.40 (15.40, 18.47)</td>
<td align="center" valign="top">16.60 (15.88, 18.07)</td>
<td align="center" valign="top">14.15 (12.80, 14.62)</td>
<td align="center" valign="top">15.80 (14.67, 17.80)</td>
<td align="center" valign="top">&#x03C7;<sup>2</sup> =&#x202F;42.51#</td>
<td align="center" valign="top"><bold>&#x003C;.001</bold></td>
</tr>
<tr>
<td align="left" valign="top">FKGL, M (Q&#x2081;, Q&#x2083;)</td>
<td align="center" valign="top">15.00 (12.95, 16.57)</td>
<td align="center" valign="top">12.59 (11.62, 14.40)</td>
<td align="center" valign="top">16.52 (15.86, 17.85)</td>
<td align="center" valign="top">15.71 (14.77, 16.50)</td>
<td align="center" valign="top">13.82 (12.12, 15.23)</td>
<td align="center" valign="top">14.79 (13.60, 16.66)</td>
<td align="center" valign="top">&#x03C7;<sup>2</sup> =&#x202F;25.88#</td>
<td align="center" valign="top"><bold>&#x003C;.001</bold></td>
</tr>
<tr>
<td align="left" valign="top">CL, M (Q&#x2081;, Q&#x2083;)</td>
<td align="center" valign="top">15.53 (13.78, 17.78)</td>
<td align="center" valign="top">13.82 (12.47, 14.80)</td>
<td align="center" valign="top">17.24 (15.73, 18.32)</td>
<td align="center" valign="top">16.62 (15.47, 18.23)</td>
<td align="center" valign="top">13.94 (12.67, 16.14)</td>
<td align="center" valign="top">15.48 (13.32, 18.24)</td>
<td align="center" valign="top">&#x03C7;<sup>2</sup> =&#x202F;21.92#</td>
<td align="center" valign="top"><bold>&#x003C;.001</bold></td>
</tr>
<tr>
<td align="left" valign="top">SMOG, M (Q&#x2081;, Q&#x2083;)</td>
<td align="center" valign="top">13.32 (12.22, 14.57)</td>
<td align="center" valign="top">11.34 (10.60, 12.26)</td>
<td align="center" valign="top">14.11 (13.70, 14.84)</td>
<td align="center" valign="top">13.96 (13.07, 14.81)</td>
<td align="center" valign="top">12.44 (11.69, 13.47)</td>
<td align="center" valign="top">13.28 (12.68, 14.43)</td>
<td align="center" valign="top">&#x03C7;<sup>2</sup> =&#x202F;31.84#</td>
<td align="center" valign="top"><bold>&#x003C;.001</bold></td>
</tr>
<tr>
<td align="left" valign="top">LW, M (Q&#x2081;, Q&#x2083;)</td>
<td align="center" valign="top">60.00 (56.75, 64.00)</td>
<td align="center" valign="top">65.00 (60.00, 68.00)</td>
<td align="center" valign="top">56.00 (53.00, 61.00)</td>
<td align="center" valign="top">60.50 (59.00, 64.00)</td>
<td align="center" valign="top">61.50 (58.50, 66.50)</td>
<td align="center" valign="top">56.50 (53.00, 58.00)</td>
<td align="center" valign="top">&#x03C7;<sup>2</sup> =&#x202F;27.19#</td>
<td align="center" valign="top"><bold>&#x003C;.001</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>F, ANOVA; #, Kruskal&#x2013;Wallis test. Bold values indicate statistically significant results (<italic>p</italic> &#x003C; 0.05).</p>
</table-wrap-foot>
</table-wrap>
<p>Readability indices, however, varied markedly across content categories, exhibiting significant differences across all tested metrics: ARI (&#x03C7;<sup>2</sup>&#x202F;=&#x202F;18.19, <italic>p</italic>&#x202F;=&#x202F;0.001), FRES (&#x03C7;<sup>2</sup>&#x202F;=&#x202F;32.88, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), GFOG (&#x03C7;<sup>2</sup>&#x202F;=&#x202F;42.51, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), FKGL (&#x03C7;<sup>2</sup>&#x202F;=&#x202F;25.88, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), CL (&#x03C7;<sup>2</sup>&#x202F;=&#x202F;21.92, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), SMOG (&#x03C7;<sup>2</sup>&#x202F;=&#x202F;31.84, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), and LW (&#x03C7;<sup>2</sup>&#x202F;=&#x202F;27.19, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001). The Prevention and Treatment category, overall, demonstrated the most favorable performance across several of these metrics.</p>
<p>These findings demonstrate that GPT-5 achieves the highest overall Global Quality Scale (GQS) score in skin cancer science popularization. Conversely, readability depends significantly on both model choice and content category. Consequently, optimizing educational materials for specific applications necessitates the joint consideration of model selection and content tailoring.</p>
<p>Because these readability formulas were originally developed for English and there is no universally accepted grade-level threshold for Chinese patient materials, we interpret the indices primarily for relative comparisons across models and content categories. For transparency and comparability with prior English-language health communication literature, we additionally summarized the proportion of outputs with FKGL &#x2264; 8 (and &#x2264; 6) as an exploratory analysis; detailed proportions are reported in <xref ref-type="supplementary-material" rid="SM1">Supplementary Tables S4 and S5</xref>.</p>
</sec>
<sec id="sec14">
<title>Quality analysis</title>
<p>The C-PEMAT score distributions across the five models were visualized (<xref ref-type="fig" rid="fig1">Figure 1</xref>) to assess differences in the educational content&#x2019;s understandability and actionability. While visual inspection suggested variation in distribution shape and density&#x2014;GPT-5 scores, for instance, appeared more concentrated at higher values&#x2014;all pairwise comparisons proved statistically non-significant (&#x201C;ns&#x201D;). This outcome indicates that the C-PEMAT quality of educational materials generated by these large language models (LLMs) was broadly similar, demonstrating no statistically demonstrable advantage for any single model.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>c-PEMAT-P scores of different large models.</p>
</caption>
<graphic xlink:href="fpubh-14-1777577-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Violin plot comparing C PEMAT scores across five platforms: Doubao, Deep Seek, Wenxin Yiyan, Tongyi Qianwen, and GPT-5. Each platform is represented by a different color, with scores ranging from approximately 6 to 14. Statistical significance between groups is indicated above the plots, with "ns" for not significant and "&#x002A;&#x002A;" for p less than 0.01. A legend on the right associates each platform with its specific color.</alt-text>
</graphic>
</fig>
<p><xref ref-type="fig" rid="fig2">Figure 2</xref> illustrates the distribution of Global Quality Scale (GQS) scores, which reveal statistically significant disparities across the five models, a stark contrast to C-PEMAT results. Specifically, GPT-5 registered the highest median GQS score, correlating with a demonstrable upward shift in its distribution. Subsequent pairwise analyses confirmed GPT-5&#x2019;s consistent superior ranking over all tested competitors&#x2014;Doubao, DeepSeek, Wenxin Yiyan, and Tongyi Qianwen (all statistically significant, labeled &#x201C;&#x002A;&#x002A;&#x002A;&#x002A;&#x201D;)&#x2014;whereas Wenxin Yiyan consistently exhibited comparatively lower scores. These findings collectively establish GPT-5&#x2019;s superior overall perceived quality (GQS) for skin cancer educational content.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>GQS scores of different large models.</p>
</caption>
<graphic xlink:href="fpubh-14-1777577-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Violin plot comparing GQS scores for five platforms: Doubao, Deep Seek, Wenxin Yiyan, Tongyi Qianwen, and GPT-5. GPT-5 shows the highest median GQS score and Wenxin Yiyan the lowest. Horizontal brackets indicate statistically significant differences between most pairs, with asterisks denoting significance levels and "ns" indicating non-significance. Color legend on the right matches each platform.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec15">
<title>Correlation analysis</title>
<p>Correlation analyses were conducted using Pearson&#x2019;s correlation coefficient (r) across all outputs (<italic>n</italic> =&#x202F;100). The full correlation matrix (r) with two-tailed <italic>p</italic> values is provided in <xref ref-type="supplementary-material" rid="SM1">Supplementary Table S3</xref>, and the heatmap is shown in <xref ref-type="fig" rid="fig3">Figure 3</xref>. A correlation heatmap was employed to analyze the correlation structure between quality assessments and textual readability, specifically comparing the two quality metrics (GQS and C-PEMAT) against seven readability indices (ARI, FRES, GFOG, FKGL, CL, SMOG, and LW) across all skin cancer educational texts.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Heatmap of correlations among different indicators.</p>
</caption>
<graphic xlink:href="fpubh-14-1777577-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Correlation heatmap displaying relationships among ARI, FRES, GFOG, FKGL, CL, SMOG, LW, C PEMAT score, and GQS score, with values from -1 to 1. Red indicates positive correlation, blue indicates negative, and non-significant values are marked with an X.</alt-text>
</graphic>
</fig>
<p>The analysis revealed a strong positive correlation between GQS and C-PEMAT, demonstrating consistency in ranking overall content quality. Substantial shared variance in capturing textual complexity was similarly evident across several readability indices, where strong positive correlations emerged between SMOG and CL, GFOG and FKGL, and ARI and FRES.</p>
<p>By contrast, GQS and C-PEMAT demonstrated only weak, often negative, correlations with most readability indices; the associations were weakest with LW. The weak correlation between content quality (GQS/C-PEMAT) and readability indices suggests that these two factors function independently. While high-quality content can be comprehensive and accurate, it may not always meet simplified readability requirements, indicating that improving one aspect does not necessarily lead to improvement in the other. Consequently, optimizing AI-generated skin cancer educational materials requires complementary strategies that target these elements separately, as high quality does not correlate with inherent readability.</p>
</sec>
</sec>
<sec sec-type="discussion" id="sec16">
<title>Discussion</title>
<p>Large Language Models (LLMs) yield skin cancer education content of high quality; however, textual readability proves highly heterogeneous across models and topics, being only weakly correlated with content quality. Consequently, selecting and configuring LLMs for patient education mandates explicit consideration of both content quality and readability, as improvements in one dimension do not automatically translate to the other.</p>
<p>The long-standing observation that conventional skin cancer patient information often exceeds internationally recommended reading levels, even from professional organizations (<xref ref-type="bibr" rid="ref19 ref20 ref21">19&#x2013;21</xref>), motivated this study. Prior analyses of online resources consistently report median reading levels spanning tenth to twelfth grade (<xref ref-type="bibr" rid="ref19 ref20 ref21">19&#x2013;21</xref>), rendering materials inaccessible to the average reader despite the critical need for clarity in prevention and treatment decision-making. We therefore investigated whether contemporary Large Language Models (LLMs) could overcome these accessibility limitations while simultaneously maintaining informational quality. Results indicated GPT-5 achieved the highest Global Quality Scale (GQS) score, significantly outperforming Doubao, DeepSeek, Tongyi Qianwen, and Wenxin Yiyan, whereas C-PEMAT scores showed no statistically significant inter-model differences. This suggests GPT-5 provides superior comprehensiveness and currency, while all five models generate similarly understandable and actionable content when assessed using the validated Chinese version of the Patient Education Materials Assessment Tool (C-PEMAT) (<xref ref-type="bibr" rid="ref1">1</xref>, <xref ref-type="bibr" rid="ref22">22</xref>). These findings complement prior work demonstrating that LLMs match or exceed the quality of traditional oncology and dermatology patient resources (<xref ref-type="bibr" rid="ref6 ref7 ref8">6&#x2013;8</xref>), simultaneously underscoring significant non-interchangeability among models.</p>
<p>Our second contribution involves comparing readability systematically across models using seven established indices. Unlike the convergent C-PEMAT scores, these indices revealed pronounced cross-model variability: no single system emerged as best-performing across measures including the Automated Readability Index, Flesch&#x2013;Kincaid Grade Level, Gunning Fog, SMOG, Coleman&#x2013;Liau, Linsear Write, and Chinese-specific metrics. This parallels large-scale analyses showing that different readability formulas agree only moderately, and that most conventional health materials persistently exceed the recommended sixth- to eighth-grade level (<xref ref-type="bibr" rid="ref1">1</xref>, <xref ref-type="bibr" rid="ref19">19</xref>, <xref ref-type="bibr" rid="ref20">20</xref>). These findings also extend recent LLM-focused studies in dermatology and other specialties, where ChatGPT-generated handouts offered marginal readability improvements over legacy patient education materials but failed to reach the target level consistently (<xref ref-type="bibr" rid="ref5">5</xref>, <xref ref-type="bibr" rid="ref8">8</xref>, <xref ref-type="bibr" rid="ref22">22</xref>, <xref ref-type="bibr" rid="ref23">23</xref>). Crucially, by exposing substantial cross-model heterogeneity on identical prompts, our results underscore that model choice alone dictates the reading burden patients face, even under ostensibly similar generation instructions.</p>
<p>Thirdly, quality and readability strongly depend on the type of skin cancer content generated. Specifically, texts focusing on prevention and treatment achieved higher C-PEMAT scores and more favorable readability values than sections describing clinical manifestations, whereas GQS scores remained statistically uniform across content categories. This aligns with earlier work on internet-based skin cancer resources, where campaign-driven prevention and treatment pages were modestly easier to read than pathology- or prognosis-focused pages, though they still remained above recommended levels (<xref ref-type="bibr" rid="ref19">19</xref>, <xref ref-type="bibr" rid="ref24">24</xref>). Our results suggest that large language models inherit&#x2014;and may even amplify&#x2014;this imbalance: broad prompting leads models to prioritize mechanistic and diagnostic details that increase lexical and syntactic complexity, particularly in sections on clinical features. Crucially, the relatively uniform GQS scores across categories indicate expert perception of stable medical accuracy and usefulness. Consequently, these patterns reinforce the need for deliberate prompt engineering or post-processing when generating education materials specifically aimed at symptom recognition and early presentation.</p>
<p>The study&#x2019;s most conceptually significant finding is the decoupling of content quality from readability. A strong positive correlation between GQS and C-PEMAT demonstrated that comprehensive and reliable materials were perceived by expert raters as more understandable and actionable, a result consistent with prior PEMAT-based evaluations (<xref ref-type="bibr" rid="ref8">8</xref>, <xref ref-type="bibr" rid="ref19">19</xref>, <xref ref-type="bibr" rid="ref22">22</xref>). Conversely, these quality metrics showed only weak, predominantly negative correlations with most readability indices. That is, greater perceived quality and patient-friendliness were not systematically associated with simplified surface text structure or reduced grade-level scores. This finding refutes the common assumption within the LLM and health communication literature that optimizing readability metrics necessarily advances health literacy. For example, studies focusing on optimizing Flesch&#x2013;Kincaid or SMOG scores via ChatGPT re-writing reported numerical gains but failed to assess the accuracy and actionability of key messages directly (<xref ref-type="bibr" rid="ref5">5</xref>). Similarly, recent blinded comparisons of multiple LLMs generating dermatology handouts have emphasized mean readability and PEMAT scores but omitted formal modeling of their interrelationship (<xref ref-type="bibr" rid="ref6">6</xref>, <xref ref-type="bibr" rid="ref22">22</xref>). By jointly quantifying these dimensions in a single framework, our data strongly suggest that readability formulas capture only a narrow slice of the factors governing the genuine usability of LLM-generated content for patients.</p>
<p>Our results validate and nuance the existing literature positioning large language models (LLMs) as promising patient education tools. Scoping reviews and empirical evaluations routinely showcase LLMs&#x2019; flexibility, scalability, and high face validity across oncology and dermatology. Yet, these studies also raise concerns regarding hallucinations, lack of personalization, and potential biases (<xref ref-type="bibr" rid="ref6">6</xref>, <xref ref-type="bibr" rid="ref8">8</xref>, <xref ref-type="bibr" rid="ref22">22</xref>, <xref ref-type="bibr" rid="ref23">23</xref>). Building upon this foundation, our study offers a focused examination in skin cancer education that foregrounds the critical tension between quality and readability, rather than treating these metrics as interchangeable surrogates. Crucially, the finding that GPT-5 yielded the highest Global Quality Scale (GQS) score but failed to universally dominate readability suggests that future evaluations must report both outcomes and consider task-specific trade-offs&#x2014;e.g., maximizing accuracy for complex treatment discussions while allowing aggressive simplification for primary prevention messaging.</p>
<p>This study presents several critical limitations. First, our scope was narrow: the analysis restricted itself to a single disease area, a fixed set of standardized prompts, and content categories. Consequently, the observed patterns&#x2014;specifically the decoupling of quality and readability&#x2014;may not generalize across other diseases, languages, or cultural contexts. Second, we relied on expert ratings (GQS and C-PEMAT) as proxies for patient-centered outcomes; while these instruments are widely validated, they fail to directly capture patients&#x2019; comprehension, trust, or resultant behavioral change. Third, readability was assessed via automated indices applied solely to single model outputs, thereby ignoring potential variability stemming from differing sampling temperatures, prompt refinements, or iterative clinical co-creation. Fourth, the systematic assessment of factual accuracy, hallucination rates, or harmful content was limited to the implicit GQS ratings. Critically, LLM outputs were not compared with contemporaneous external resources, such as web pages or professionally authored pamphlets. Fifth, this study did not include a subgroup analysis based on education levels, limiting our ability to assess the practical relevance of the materials for low-literacy populations. Sixth, although this study provides valuable insights into the quality and readability of AI-generated health content, the findings are based on expert assessments rather than direct feedback from patients. Finally, given the rapid evolution of the commercial LLM landscape, our snapshot of five systems is inherently fleeting, risking rapid obsolescence as architectures and safety layers are updated.</p>
<p>Addressing these limitations requires several future research avenues. First, we must expand this approach to multiple diseases, languages, and health-care settings, simultaneously integrating both expert- and patient-level outcomes, including comprehension tests, decisional conflict, and behaviorally anchored measures of health literacy. Second, prospective trials should rigorously compare pipelines that leverage state-of-the-art models (e.g., GPT-5) for initial content generation, followed by algorithmic or interactive rewriting to target specific reading levels, while explicitly monitoring for C-PEMAT and GQS score degradation. Third, collaborations among clinicians, health communication specialists, and AI developers must define domain-specific prompt templates, guardrails, and evaluation dashboards that simultaneously track readability, quality, and safety. Fourth, future research should focus on incorporating patient-centered validation to refine AI-generated educational materials and ensure their relevance in real-world healthcare settings. Finally, at a systems level, embedding LLM-generated but clinician-reviewed skin cancer education within electronic health records and teledermatology platforms could close long-standing gaps in patient information, thereby ensuring accountability. Ultimately, by treating quality and readability as complementary rather than interchangeable design targets, our work establishes the foundation for safer and more equitable deployment of large language models in skin cancer education and patient education more broadly.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec17">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec sec-type="author-contributions" id="sec18">
<title>Author contributions</title>
<p>YZ: Data curation, Resources, Supervision, Visualization, Writing &#x2013; original draft. LW: Investigation, Methodology, Validation, Writing &#x2013; review &#x0026; editing. WZ: Writing &#x2013; review &#x0026; editing, Project administration, Software. WL: Conceptualization, Data curation, Funding acquisition, Investigation, Methodology, Software, Validation, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec sec-type="COI-statement" id="sec19">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec20">
<title>Generative AI statement</title>
<p>The author(s) declared that Generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec21">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec22">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/fpubh.2026.1777577/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/fpubh.2026.1777577/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Table_1.xlsx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"/>
<supplementary-material xlink:href="Table_2.docx" id="SM2" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document"/>
<supplementary-material xlink:href="Image_1.png" id="SM3" mimetype="image/png"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><label>1.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shan</surname><given-names>Y</given-names></name> <name><surname>Ji</surname><given-names>M</given-names></name> <name><surname>Dong</surname><given-names>Z</given-names></name> <name><surname>Xing</surname><given-names>Z</given-names></name> <name><surname>Wang</surname><given-names>D</given-names></name> <name><surname>Cao</surname><given-names>X</given-names></name></person-group>. <article-title>The Chinese version of the patient education materials assessment tool for printable materials: translation, adaptation, and validation study</article-title>. <source>J Med Internet Res</source>. (<year>2023</year>) <volume>25</volume>:<fpage>e39808</fpage>. doi: <pub-id pub-id-type="doi">10.2196/39808</pub-id>, <pub-id pub-id-type="pmid">37200085</pub-id></mixed-citation></ref>
<ref id="ref2"><label>2.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nadratowski</surname><given-names>A</given-names></name> <name><surname>Shoots-Reinhard</surname><given-names>B</given-names></name> <name><surname>Shafer</surname><given-names>A</given-names></name> <name><surname>Detweiler-Bedell</surname><given-names>J</given-names></name> <name><surname>Detweiler-Bedell</surname><given-names>B</given-names></name> <name><surname>Leachman</surname><given-names>S</given-names></name> <etal/></person-group>. <article-title>Evidence-based communication to increase melanoma knowledge and skin checks</article-title>. <source>JID Innov</source>. (<year>2023</year>) <volume>4</volume>:<fpage>100253</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.xjidi.2023.100253</pub-id></mixed-citation></ref>
<ref id="ref3"><label>3.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jawad</surname><given-names>R</given-names></name> <name><surname>Jawad</surname><given-names>R</given-names></name></person-group>. <article-title>Comparison feed forward back propagation networks (FFBPNs) with support vector machine (SVM) for diagnosis skin cancer based on images</article-title>. <source>Vokasi Unesa Bull Eng Technol Appl Sci</source>. (<year>2025</year>) <volume>2</volume>:<fpage>127</fpage>&#x2013;<lpage>35</lpage>. doi: <pub-id pub-id-type="doi">10.26740/vubeta.v2i2.36117</pub-id></mixed-citation></ref>
<ref id="ref4"><label>4.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mac</surname><given-names>O</given-names></name> <name><surname>Ayre</surname><given-names>J</given-names></name> <name><surname>Bell</surname><given-names>K</given-names></name> <name><surname>McCaffery</surname><given-names>K</given-names></name> <name><surname>Muscat</surname><given-names>DM</given-names></name></person-group>. <article-title>Comparison of readability scores for written health information across formulas using automated vs manual measures</article-title>. <source>JAMA Netw Open</source>. (<year>2022</year>) <volume>5</volume>:<fpage>e2246051</fpage>. doi: <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2022.46051</pub-id>, <pub-id pub-id-type="pmid">36508219</pub-id></mixed-citation></ref>
<ref id="ref5"><label>5.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Aydin</surname><given-names>S</given-names></name> <name><surname>Karabacak</surname><given-names>M</given-names></name> <name><surname>Vlachos</surname><given-names>V</given-names></name> <name><surname>Margetis</surname><given-names>K</given-names></name></person-group>. <article-title>Large language models in patient education: a scoping review of applications in medicine</article-title>. <source>Front Med</source>. (<year>2024</year>) <volume>11</volume>:<fpage>1477898</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fmed.2024.1477898</pub-id>, <pub-id pub-id-type="pmid">39534227</pub-id></mixed-citation></ref>
<ref id="ref6"><label>6.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lambert</surname><given-names>R</given-names></name> <name><surname>Choo</surname><given-names>Z-Y</given-names></name> <name><surname>Gradwohl</surname><given-names>K</given-names></name> <name><surname>Schroedl</surname><given-names>L</given-names></name> <name><surname>Ruiz De Luzuriaga</surname><given-names>A</given-names></name></person-group>. <article-title>Assessing the application of large language models in generating dermatologic patient education materials according to reading level: qualitative study</article-title>. <source>JMIR Dermatol</source>. (<year>2024</year>) <volume>7</volume>:<fpage>e55898</fpage>. doi: <pub-id pub-id-type="doi">10.2196/55898</pub-id>, <pub-id pub-id-type="pmid">38754096</pub-id></mixed-citation></ref>
<ref id="ref7"><label>7.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Roster</surname><given-names>K</given-names></name> <name><surname>Kann</surname><given-names>RB</given-names></name> <name><surname>Farabi</surname><given-names>B</given-names></name> <name><surname>Gronbeck</surname><given-names>C</given-names></name> <name><surname>Brownstone</surname><given-names>N</given-names></name> <name><surname>Lipner</surname><given-names>SR</given-names></name></person-group>. <article-title>Readability and health literacy scores for ChatGPT-generated dermatology public education materials: cross-sectional analysis of sunscreen and melanoma questions</article-title>. <source>JMIR Dermatol</source>. (<year>2024</year>) <volume>7</volume>:<fpage>e50163</fpage>. doi: <pub-id pub-id-type="doi">10.2196/50163</pub-id>, <pub-id pub-id-type="pmid">38446502</pub-id></mixed-citation></ref>
<ref id="ref8"><label>8.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chang</surname><given-names>CT</given-names></name> <name><surname>Ticknor</surname><given-names>IL</given-names></name> <name><surname>Spinelli</surname><given-names>J-A</given-names></name> <name><surname>Bhatia</surname><given-names>BK</given-names></name> <name><surname>Marwaha</surname><given-names>S</given-names></name> <etal/></person-group>. <article-title>Comparison of large language models in generating patient handouts for the dermatology clinic: a blinded study</article-title>. <source>JAAD Int</source>. (<year>2024</year>) <volume>15</volume>:<fpage>152</fpage>&#x2013;<lpage>4</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jdin.2024.02.010</pub-id>, <pub-id pub-id-type="pmid">38571697</pub-id></mixed-citation></ref>
<ref id="ref9"><label>9.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Steeb</surname><given-names>T</given-names></name> <name><surname>Reinhardt</surname><given-names>L</given-names></name> <name><surname>Harla</surname><given-names>M</given-names></name> <name><surname>Ting</surname><given-names>PS</given-names></name> <name><surname>Trivedi</surname><given-names>H</given-names></name> <name><surname>Vipani</surname><given-names>A</given-names></name> <etal/></person-group>. <article-title>Assessment of the quality, understandability, and reliability of YouTube videos as a source of information on basal cell carcinoma: web-based analysis</article-title>. <source>JMIR Cancer</source>. (<year>2022</year>) <volume>8</volume>:<fpage>e29581</fpage>. doi: <pub-id pub-id-type="doi">10.3350/cmh.2023.0089</pub-id></mixed-citation></ref>
<ref id="ref10"><label>10.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lee</surname><given-names>K</given-names></name> <name><surname>Yee</surname><given-names>D</given-names></name> <name><surname>Peterson</surname><given-names>H</given-names></name> <name><surname>Huang</surname><given-names>MY</given-names></name> <name><surname>Kingston</surname><given-names>P</given-names></name> <name><surname>Ag&#x00FC;ero</surname><given-names>R</given-names></name> <etal/></person-group>. <article-title>Readability, quality, and comprehensiveness of online health resources for skin cancer in skin of color</article-title>. <source>Int J Dermatol</source>. (<year>2023</year>) <volume>62</volume>:<fpage>e532</fpage>&#x2013;<lpage>4</lpage>. doi: <pub-id pub-id-type="doi">10.1111/ijd.16674</pub-id>, <pub-id pub-id-type="pmid">37039526</pub-id></mixed-citation></ref>
<ref id="ref11"><label>11.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hamam</surname><given-names>H</given-names></name></person-group>. <article-title>Rethinking intelligence: from human cognition to artificial futures</article-title>. <source>Vokasi Unesa Bull Eng Technol Appl Sci</source>. (<year>2025</year>) <volume>2</volume>:<fpage>531</fpage>&#x2013;<lpage>48</lpage>. doi: <pub-id pub-id-type="doi">10.26740/vubeta.v2i3.44232</pub-id></mixed-citation></ref>
<ref id="ref12"><label>12.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mondal</surname><given-names>H</given-names></name> <name><surname>Mondal</surname><given-names>S</given-names></name></person-group>. <article-title>ChatGPT in academic writing: maximizing its benefits and minimizing the risks</article-title>. <source>Indian J Ophthalmol</source>. (<year>2023</year>) <volume>71</volume>:<fpage>3600</fpage>&#x2013;<lpage>6</lpage>. doi: <pub-id pub-id-type="doi">10.4103/IJO.IJO_718_23</pub-id>, <pub-id pub-id-type="pmid">37991290</pub-id></mixed-citation></ref>
<ref id="ref13"><label>13.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hamnes</surname><given-names>B</given-names></name> <name><surname>van Eijk-Hustings</surname><given-names>Y</given-names></name> <name><surname>Primdahl</surname><given-names>J</given-names></name></person-group>. <article-title>Readability of patient information and consent documents in rheumatological studies</article-title>. <source>BMC Med Ethics</source>. (<year>2016</year>) <volume>17</volume>:<fpage>42</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12910-016-0126-0</pub-id>, <pub-id pub-id-type="pmid">27422433</pub-id></mixed-citation></ref>
<ref id="ref14"><label>14.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Barlas</surname><given-names>T</given-names></name> <name><surname>Ecem Avci</surname><given-names>D</given-names></name> <name><surname>Cinici</surname><given-names>B</given-names></name> <name><surname>Ozkilicaslan</surname><given-names>H</given-names></name> <name><surname>Muhittin Yalcin</surname><given-names>M</given-names></name> <name><surname>Eroglu Altinova</surname><given-names>A</given-names></name></person-group>. <article-title>The quality and reliability analysis of YouTube videos about insulin resistance</article-title>. <source>Int J Med Inform</source>. (<year>2023</year>) <volume>170</volume>:<fpage>104960</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2022.104960</pub-id>, <pub-id pub-id-type="pmid">36525801</pub-id></mixed-citation></ref>
<ref id="ref15"><label>15.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sun</surname><given-names>F</given-names></name> <name><surname>Zheng</surname><given-names>S</given-names></name> <name><surname>Wu</surname><given-names>J</given-names></name></person-group>. <article-title>Quality of information in gallstone disease videos on TikTok: cross-sectional study</article-title>. <source>J Med Internet Res</source>. (<year>2023</year>) <volume>25</volume>:<fpage>e39162</fpage>. doi: <pub-id pub-id-type="doi">10.2196/39162</pub-id>, <pub-id pub-id-type="pmid">36753307</pub-id></mixed-citation></ref>
<ref id="ref16"><label>16.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Subramanian</surname><given-names>T</given-names></name> <name><surname>Araghi</surname><given-names>K</given-names></name> <name><surname>Akosman</surname><given-names>I</given-names></name> <name><surname>Tuma</surname><given-names>O</given-names></name> <name><surname>Hassan</surname><given-names>A</given-names></name> <name><surname>Lahooti</surname><given-names>A</given-names></name> <etal/></person-group>. <article-title>Quality of spine surgery information on social media: a DISCERN analysis of TikTok videos</article-title>. <source>Neurospine</source>. (<year>2023</year>) <volume>20</volume>:<fpage>1443</fpage>&#x2013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.14245/ns.2346700.350</pub-id>, <pub-id pub-id-type="pmid">38171310</pub-id></mixed-citation></ref>
<ref id="ref17"><label>17.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lin</surname><given-names>I-T</given-names></name> <name><surname>Shen</surname><given-names>Y-M</given-names></name> <name><surname>Shih</surname><given-names>M-J</given-names></name> <etal/></person-group>. <article-title>Short video addiction on the interaction of creative self-efficacy and career interest to innovative design profession students</article-title>. <source>Healthcare</source>. (<year>2023</year>) <volume>11</volume>:<fpage>579</fpage>. doi: <pub-id pub-id-type="doi">10.3390/healthcare11040579</pub-id>, <pub-id pub-id-type="pmid">36833113</pub-id></mixed-citation></ref>
<ref id="ref18"><label>18.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname><given-names>L</given-names></name> <name><surname>Li</surname><given-names>Y</given-names></name> <name><surname>Gu</surname><given-names>J</given-names></name> <name><surname>Xiao</surname><given-names>L</given-names></name></person-group>. <article-title>A quality analysis of thyroid cancer videos available on TikTok</article-title>. <source>Front Public Health</source>. (<year>2023</year>) <volume>11</volume>:<fpage>1049728</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fpubh.2023.1049728</pub-id></mixed-citation></ref>
<ref id="ref19"><label>19.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Basch</surname><given-names>CH</given-names></name> <name><surname>Fera</surname><given-names>J</given-names></name> <name><surname>Ethan</surname><given-names>D</given-names></name> <name><surname>Garcia</surname><given-names>P</given-names></name> <name><surname>Perin</surname><given-names>D</given-names></name> <name><surname>Basch</surname><given-names>CE</given-names></name></person-group>. <article-title>Readability of online material related to skin cancer</article-title>. <source>Public Health</source>. (<year>2018</year>) <volume>163</volume>:<fpage>137</fpage>&#x2013;<lpage>40</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.puhe.2018.07.009</pub-id></mixed-citation></ref>
<ref id="ref20"><label>20.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>John</surname><given-names>AM</given-names></name> <name><surname>John</surname><given-names>ES</given-names></name> <name><surname>Hansberry</surname><given-names>DR</given-names></name> <name><surname>Lambert</surname><given-names>WC</given-names></name></person-group>. <article-title>Assessment of online patient education materials from major dermatologic associations</article-title>. <source>J Clin Aesthet Dermatol</source>. (<year>2016</year>) <volume>9</volume>:<fpage>23</fpage>&#x2013;<lpage>8</lpage>.</mixed-citation></ref>
<ref id="ref21"><label>21.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Steeb</surname><given-names>T</given-names></name> <name><surname>Reinhardt</surname><given-names>L</given-names></name> <name><surname>Harla</surname><given-names>M</given-names></name> <name><surname>Heppt</surname><given-names>MV</given-names></name> <name><surname>Meier</surname><given-names>F</given-names></name> <name><surname>Berking</surname><given-names>C</given-names></name></person-group>. <article-title>Assessment of the quality, understandability, and reliability of YouTube videos as a source of information on basal cell carcinoma: web-based analysis</article-title>. <source>JMIR Cancer</source>. (<year>2022</year>) <volume>8</volume>:<fpage>e29581</fpage>. doi: <pub-id pub-id-type="doi">10.2196/29581</pub-id></mixed-citation></ref>
<ref id="ref22"><label>22.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Furukawa</surname><given-names>E</given-names></name> <name><surname>Okuhara</surname><given-names>T</given-names></name> <name><surname>Liu</surname><given-names>M</given-names></name> <name><surname>Okada</surname><given-names>H</given-names></name> <name><surname>Kiuchi</surname><given-names>T</given-names></name></person-group>. <article-title>Evaluating online and offline health information with the patient education materials assessment tool: protocol for a systematic review</article-title>. <source>JMIR Res Protoc</source>. (<year>2025</year>) <volume>14</volume>:<fpage>e63489</fpage>. doi: <pub-id pub-id-type="doi">10.2196/63489</pub-id>, <pub-id pub-id-type="pmid">39813665</pub-id></mixed-citation></ref>
<ref id="ref23"><label>23.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Swisher</surname><given-names>AR</given-names></name> <name><surname>Wu</surname><given-names>AW</given-names></name> <name><surname>Liu</surname><given-names>GC</given-names></name> <name><surname>Lee</surname><given-names>MK</given-names></name> <name><surname>Carle</surname><given-names>TR</given-names></name> <name><surname>Tang</surname><given-names>DM</given-names></name></person-group>. <article-title>Enhancing health literacy: evaluating the readability of patient handouts revised by ChatGPT's large language model</article-title>. <source>Otolaryngol Head Neck Surg</source>. (<year>2024</year>) <volume>171</volume>:<fpage>1751</fpage>&#x2013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.1002/ohn.927</pub-id>, <pub-id pub-id-type="pmid">39105460</pub-id></mixed-citation></ref>
<ref id="ref24"><label>24.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Goorman</surname><given-names>E</given-names></name> <name><surname>Mittal</surname><given-names>S</given-names></name> <name><surname>Choi</surname><given-names>JN</given-names></name></person-group>. <article-title>Assessing readability of skin cancer screening resources: a comparison of online websites and ChatGPT responses</article-title>. <source>J Cancer Educ</source>. (<year>2025</year>) <volume>2025</volume>:<fpage>10</fpage>&#x2013;<lpage>1007</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s13187-025-02683-2</pub-id></mixed-citation></ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0002">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2999327/overview">Hewa Majeed Zangana</ext-link>, University of Duhok, Iraq</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0003">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3330850/overview">Widi Aribowo</ext-link>, Surabaya State University, Indonesia</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3331898/overview">Anik Vitianingsih</ext-link>, Universitas Dr. Soetomo, Indonesia</p>
</fn>
</fn-group>
<fn-group>
<fn id="fn0001">
<label>1</label>
<p><ext-link xlink:href="http://readabilityformulas.com/" ext-link-type="uri">http://readabilityformulas.com/</ext-link></p>
</fn>
</fn-group>
</back>
</article>