<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Public Health</journal-id>
<journal-title-group>
<journal-title>Frontiers in Public Health</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Public Health</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-2565</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpubh.2026.1760872</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Can large language models be trusted? Reliability and readability of responses to perinatal depression FAQs</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author"><name><surname>Huang</surname> <given-names>Jingyu</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3305138"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author"><name><surname>Yu</surname> <given-names>Hua</given-names></name><xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author"><name><surname>Chen</surname> <given-names>Junjian</given-names></name><xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author"><name><surname>Wang</surname> <given-names>Xinyue</given-names></name><xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author"><name><surname>Huang</surname> <given-names>Lizhi</given-names></name><xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author"><name><surname>Wen</surname> <given-names>Junjie</given-names></name><xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author" corresp="yes"><name><surname>Li</surname> <given-names>Hui</given-names></name><xref ref-type="aff" rid="aff2"><sup>2</sup></xref><xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Faculty of Health Sciences, University of Macau</institution>, <city>Taipa</city>, <country country="cn">China</country></aff>
<aff id="aff2"><label>2</label><institution>Department of Nursing, Ruikang Hospital Affiliated to Guangxi University of Chinese Medicine</institution>, <city>Nanning</city>, <country country="cn">China</country></aff>
<aff id="aff3"><label>3</label><institution>Second Affiliated Hospital of Guangxi Medical University</institution>, <city>Nanning</city>, <country country="cn">China</country></aff>
<aff id="aff4"><label>4</label><institution>GuangXi University of Chinese Medicine</institution>, <city>Nanning</city>, <country country="cn">China</country></aff>
<aff id="aff5"><label>5</label><institution>Department of Medical Informatics, Harbin Medical University</institution>, <city>Harbin</city>, <country country="cn">China</country></aff>
<aff id="aff6"><label>6</label><institution>Southwest Jiaotong University Hope College</institution>, <city>Chengdu</city>, <country country="cn">China</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Hui Li, <email xlink:href="mailto:907365337@qq.com">907365337@qq.com</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-23">
<day>23</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>14</volume>
<elocation-id>1760872</elocation-id>
<history>
<date date-type="received">
<day>04</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>09</day>
<month>02</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>10</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Huang, Yu, Chen, Wang, Huang, Wen and Li.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Huang, Yu, Chen, Wang, Huang, Wen and Li</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-23">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Objective</title>
<p>Large language models (LLMs), a core technology of generative artificial intelligence (AI), are increasingly used in health education and promotion. Although they may expand access to medical information, concerns remain regarding the reliability and readability of AI generated content for the public. This study evaluated the reliability and readability of answers generated by five LLMs to common questions about perinatal depression. The primary aims were to determine (1) the reliability of LLM responses to frequently asked questions about perinatal depression and (2) whether the readability of the generated content aligns with public health literacy levels.</p>
</sec>
<sec>
<title>Methods</title>
<p>Twenty-seven frequently asked questions were derived from Google Trends and patient-facing resources from the American College of Obstetricians and Gynecologists (ACOG). Each question was submitted to ChatGPT-5, Gemini-2.5, Microsoft Copilot, Grok4, and DeepSeek. Two obstetricians independently rated responses using five validated instruments (DISCERN, EQIP, JAMA, GQS, and HONCODE) and inter-rater agreement was quantified using the intraclass correlation coefficient (ICC). Readability was assessed using six indices: ARI, GFI, CLI, OLWF, LWGLF, and FRF. Differences among models were analyzed using the Friedman test.</p>
</sec>
<sec>
<title>Results</title>
<p>Inter-rater agreement was high across 27 perinatal depression questions. ICC values ranged from 0.729 to 0.847. Significant between-model differences emerged for DISCERN, EQIP, and HONCODE. All had <italic>p</italic> less than 0.001. No overall differences were found for JAMA and GQS. Grok4 scored highest on DISCERN at 60.33&#x202F;&#x00B1;&#x202F;5.48. DeepSeek scored highest on EQIP at 53.04&#x202F;&#x00B1;&#x202F;4.91. Copilot scored highest on HONCODE at 9.26&#x202F;&#x00B1;&#x202F;1.85. These results highlight distinct strengths in quality constructs across instruments. Readability posed a common limitation. All models exceeded the NIH recommended sixth grade level on grade-based indices (for example, ARI ranged from 13.49&#x202F;&#x00B1;&#x202F;2.92 to 15.81&#x202F;&#x00B1;&#x202F;3.25). Similarly, OLWF scores fell well below the sixth-grade benchmark of 94 (ranging from 61.44&#x202F;&#x00B1;&#x202F;6.80 to 72.96&#x202F;&#x00B1;&#x202F;10.39, where higher scores denote easier reading). Most models produced empathetic and informative content. However, they fell short in fully addressing clinical safety standards.</p>
</sec>
<sec>
<title>Conclusion</title>
<p>Most LLMs demonstrated moderate to high reliability when responding to perinatal depression questions, supporting their potential as supplementary sources of health information. However, readability levels above recommended benchmarks suggest that current outputs may remain challenging for individuals with lower health literacy. While LLMs improve information accessibility, further improvements in readability, source attribution, and ethical transparency are needed to maximize public benefit and support equitable health communication. Future work should focus on defining and standardizing safety behaviors in high-risk mental health contexts to enable reliable clinical deployment.</p>
</sec>
</abstract>
<kwd-group>
<kwd>generative artificial intelligence</kwd>
<kwd>health information quality</kwd>
<kwd>large language models</kwd>
<kwd>perinatal depression</kwd>
<kwd>postpartum depression</kwd>
<kwd>readability</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="2"/>
<table-count count="5"/>
<equation-count count="0"/>
<ref-count count="44"/>
<page-count count="10"/>
<word-count count="7808"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Digital Public Health</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<p>Perinatal depression refers to depressive symptoms occurring during pregnancy or within 1&#x202F;year postpartum, encompassing both antenatal and postpartum depression (<xref ref-type="bibr" rid="ref1">1</xref>). This mood disorder not only undermines maternal mental health but also affects fetal and infant development (<xref ref-type="bibr" rid="ref2">2</xref>). It may disrupt neural networks involved in emotion regulation and is associated with poorer offspring outcomes, including reduced attentional control, impaired emotion regulation, and altered social responsiveness (<xref ref-type="bibr" rid="ref3">3</xref>). Globally, approximately 10%&#x2013;20% of women experience perinatal depression (<xref ref-type="bibr" rid="ref4">4</xref>). In low and middle-income countries, the overall prevalence is 24.7%, substantially higher than in high-income countries (<xref ref-type="bibr" rid="ref5">5</xref>).</p>
<p>Among high-risk groups, including racial/ethnic minorities, low-income populations, adolescent mothers, individuals with unintended pregnancies, and survivors of intimate partner violence, the prevalence of perinatal depression is higher. A 2025 study reported that the prevalence of postpartum depression among adolescent mothers reached 40% and was even higher among minority and low-income adolescents (<xref ref-type="bibr" rid="ref6">6</xref>). Unintended pregnancy is associated with an approximately 51%&#x2013;59% increase in the relative risk of perinatal depression (<xref ref-type="bibr" rid="ref7">7</xref>), and the prevalence among individuals exposed to intimate partner violence has been reported as 38.9% (<xref ref-type="bibr" rid="ref8">8</xref>). Overall, these high-risk populations commonly exhibit prevalence rates in the range of approximately 30%&#x2013;40%.</p>
<p>Access to treatment for perinatal depression remains insufficient worldwide, and minority groups and low-income women face substantial barriers to obtaining mental health services during pregnancy and the postpartum period (<xref ref-type="bibr" rid="ref9 ref10 ref11">9&#x2013;11</xref>). A systematic review highlighted the effectiveness of psychotherapy for postpartum depression but noted that, due to limited service availability, cultural barriers, and disparities in resources, only a minority of patients can receive psychotherapy (<xref ref-type="bibr" rid="ref12">12</xref>). Psychological, economic, and social factors often deter individuals from seeking professional help. Common psychological barriers include stigma, fear, and misconceptions about perinatal depression, all of which significantly reduce willingness to seek mental health support among perinatal women (<xref ref-type="bibr" rid="ref13 ref14 ref15">13&#x2013;15</xref>). Financial constraints, such as inadequate insurance coverage and the out-of-pocket cost of psychotherapy, also impede help seeking (<xref ref-type="bibr" rid="ref16">16</xref>). In low- and middle-income countries, the economic burden of perinatal depression is substantial, including direct treatment costs and productivity losses, and represents a major structural barrier to care (<xref ref-type="bibr" rid="ref17">17</xref>). Social stigma and cultural attitudes, such as limited support from partners or family, fear of community judgment, and the stigmatization of postpartum mental health problems, further restrict access to care (<xref ref-type="bibr" rid="ref13">13</xref>, <xref ref-type="bibr" rid="ref18">18</xref>, <xref ref-type="bibr" rid="ref19">19</xref>).</p>
<p>With the rapid development of artificial intelligence, obtaining health information through AI has become a simple and potentially effective alternative, and is increasingly adopted by the public (<xref ref-type="bibr" rid="ref20">20</xref>). Generative artificial intelligence (GenAI) refers to a new generation of machine learning systems, such as large language models, that can generate novel, human like text rather than merely retrieving pre-existing information from a database. These models can synthesize complex medical information into conversational responses, making them increasingly popular for patient health education (<xref ref-type="bibr" rid="ref21">21</xref>). As an emerging technology, GenAI has been reported to enhance the accessibility and dissemination of health information (<xref ref-type="bibr" rid="ref22">22</xref>).</p>
<p>Generative AI (GenAI) has attracted substantial attention in recent years as a key branch of artificial intelligence in healthcare, with applications spanning diagnostic support, health education, image synthesis, and clinical decision making (<xref ref-type="bibr" rid="ref23">23</xref>, <xref ref-type="bibr" rid="ref24">24</xref>). In perinatal mental health, GenAI has been explored for potential use in delivering personalized informational support and assisting with screening-oriented interactions aimed at identifying and addressing postpartum depression, which may improve the efficiency of psychosocial support (<xref ref-type="bibr" rid="ref25">25</xref>). It can generate patient friendly health summaries in response to user queries, potentially supporting clinical communication and self-management (<xref ref-type="bibr" rid="ref26">26</xref>). A systematic review noted that in perinatal mental health settings, where risk communication and information reliability are particularly critical, GenAI may improve accessibility to information and the comprehensibility of content, but its real world effectiveness and safety boundaries require more rigorous evaluation (<xref ref-type="bibr" rid="ref27">27</xref>). In addition, GenAI chatbots may provide health education and conversational support, which could help improve access to information and health literacy; however, evidence for benefits on clinical outcomes, such as psychological recovery or sustained behavior change, remains limited and warrants further validation under appropriate governance and risk controls (<xref ref-type="bibr" rid="ref28">28</xref>).</p>
<p>Despite the established potential benefits of GenAI, research on the quality and readability of AI-generated responses to perinatal depression&#x2013;related queries remains limited. One study evaluated AI models and search engines in the context of postpartum depression using a single quality metric and did not assess readability (<xref ref-type="bibr" rid="ref29">29</xref>). Overall, evidence regarding the quality and readability of LLM responses to frequently asked questions about perinatal depression is still scarce. Given the public health burden of perinatal depression, the present study systematically compared the reliability and readability of responses generated by different models. We benchmarked five state-of-the-art LLMs, ChatGPT-5, Gemini-2.5, Microsoft Copilot, Grok4, and DeepSeek, on 27 commonly searched perinatal depression queries derived from Google Trends and the American College of Obstetricians and Gynecologists (ACOG) website. This work provides foundational evidence to inform the use of AI-generated health information and to support future research on safety and effectiveness.</p>
</sec>
<sec sec-type="materials|methods" id="sec2">
<label>2</label>
<title>Materials and methods</title>
<p>Perinatal depression includes both antenatal and postpartum depression. We identified 27 publicly available questions related to perinatal depression using a two-step approach. First, we queried Google Trends using the terms &#x201C;prenatal depression&#x201D; and &#x201C;postpartum depression,&#x201D; with the time range set from 2020 to May 2025 and the geographic scope set to worldwide, which yielded 25 candidate questions. Duplicate, irrelevant, or nonsensical entries were removed. Second, we searched the American College of Obstetricians and Gynecologists (ACOG) website for the sections on &#x201C;Postpartum Depression&#x201D; and &#x201C;Depression During Pregnancy&#x201D; (<xref ref-type="bibr" rid="ref30">30</xref>, <xref ref-type="bibr" rid="ref31">31</xref>). After excluding items overlapping with the Google Trends results and screening the remaining items, we obtained a final set of 27 questions. The 27 unique questions (<xref ref-type="table" rid="tab1">Table 1</xref>) were submitted verbatim to each LLM.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Common public questions about perinatal depression (<italic>n</italic>&#x202F;=&#x202F;27).</p>
</caption>
<table frame="hsides" rules="groups">
<tbody>
<tr>
<td align="left" valign="top">1. What is depression?</td>
</tr>
<tr>
<td align="left" valign="top">2. What is postpartum depression?</td>
</tr>
<tr>
<td align="left" valign="top">3. What is perinatal depression?</td>
</tr>
<tr>
<td align="left" valign="top">4. How common is depression during pregnancy?</td>
</tr>
<tr>
<td align="left" valign="top">5. What are the signs of depression during pregnancy?</td>
</tr>
<tr>
<td align="left" valign="top">6. How can untreated depression affect me during pregnancy?</td>
</tr>
<tr>
<td align="left" valign="top">7. How can untreated depression affect my fetus and newborn?</td>
</tr>
<tr>
<td align="left" valign="top">8. How can I get help for depression during pregnancy?</td>
</tr>
<tr>
<td align="left" valign="top">9. How is depression during pregnancy treated?</td>
</tr>
<tr>
<td align="left" valign="top">10. What is psychotherapy?</td>
</tr>
<tr>
<td align="left" valign="top">11. What are antidepressants?</td>
</tr>
<tr>
<td align="left" valign="top">12. What should I know about taking an antidepressant during pregnancy?</td>
</tr>
<tr>
<td align="left" valign="top">13. Can antidepressants pass to a baby through breast milk?</td>
</tr>
<tr>
<td align="left" valign="top">14. Can antidepressants cause side effects?</td>
</tr>
<tr>
<td align="left" valign="top">15. What other mental health conditions are common during pregnancy?</td>
</tr>
<tr>
<td align="left" valign="top">16. What is perinatal anxiety?</td>
</tr>
<tr>
<td align="left" valign="top">17. What is perinatal mental health?</td>
</tr>
<tr>
<td align="left" valign="top">18. What is prenatal depression?</td>
</tr>
<tr>
<td align="left" valign="top">19. What are perinatal depression symptoms?</td>
</tr>
<tr>
<td align="left" valign="top">20. What are the baby blues?</td>
</tr>
<tr>
<td align="left" valign="top">21. How long do the baby blues usually last?</td>
</tr>
<tr>
<td align="left" valign="top">22. When does postpartum depression occur?</td>
</tr>
<tr>
<td align="left" valign="top">23. What causes postpartum depression?</td>
</tr>
<tr>
<td align="left" valign="top">24. I think I have postpartum depression. What should I do?</td>
</tr>
<tr>
<td align="left" valign="top">25. How is postpartum depression treated?</td>
</tr>
<tr>
<td align="left" valign="top">26. What can be done to help prevent postpartum depression in women with a history of depression?</td>
</tr>
<tr>
<td align="left" valign="top">27. What support is available to help me cope with postpartum depression?</td>
</tr>
</tbody>
</table>
</table-wrap>
<sec id="sec3">
<label>2.1</label>
<title>Experimental setup and model evaluation</title>
<p>All interactions were conducted in Macau, China, between November 1 and November 10, 2025. To approximate typical end user usage, all models were evaluated via their official web interfaces rather than <italic>via</italic> APIs. The evaluated products included ChatGPT using GPT-5, released on August 7, 2025; Gemini 2.5 Pro, released on June 17, 2025; DeepSeek (V3.1), released on August 21, 2025; Grok 4, released on July 9, 2025; and Microsoft Copilot. All products were commercially deployed closed source systems, so training data and model parameters were not accessible and generation settings such as temperature and top p remained at platform defaults. Each question was asked in a single turn without follow up prompts and submitted in a new chat session to minimize carry over effects. Outputs were captured verbatim via scripted logging with manual cross checking. We did not actively enable any explicit browsing or citation modes. However, in closed source web products the backend retrieval and routing behavior cannot be fully verified; Microsoft documentation indicates that Copilot Chat runs on GPT-5 by default and automatically routes prompts to the best performing models for each task, so the exact model and routing path used for any given response cannot be reliably identified or controlled, which may limit strict reproducibility.</p>
</sec>
<sec id="sec4">
<label>2.2</label>
<title>Data handling and analysis</title>
<p>This study employed a double-blind design to ensure objective evaluation and minimize bias. The data from each model&#x2019;s output were anonymized by an independent third party to eliminate any identifying information. The scoring process was carried out by two trained clinical evaluators who were blinded to the model type, prompt variations, search augmentation, generation parameters, and any metadata. Both evaluators, each with more than 10&#x202F;years of clinical experience and based at tertiary (Grade A) hospitals in China, independently assessed the outputs. They had extensive experience in obstetrics and some research experience in perinatal maternal mental health. Prior to scoring, the evaluators received standardized training on the use of the assessment instruments. Any discrepancies in ratings were adjudicated by a third senior physician. Reliability was assessed using five validated tools; detailed scoring criteria are provided in <xref ref-type="table" rid="tab2">Table 2</xref>.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Scoring ranges and interpretation thresholds for reliability quality instruments used to evaluate LLM generated responses.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Tool</th>
<th align="center" valign="top">Score range</th>
<th align="center" valign="top">Excellent</th>
<th align="center" valign="top">Good</th>
<th align="center" valign="top">Fair</th>
<th align="center" valign="top">Poor</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">DISCERN</td>
<td align="center" valign="top">16&#x2013;80</td>
<td align="center" valign="top">63&#x2013;80</td>
<td align="center" valign="top">50&#x2013;62</td>
<td align="center" valign="top">31&#x2013;49</td>
<td align="center" valign="top">16&#x2013;30</td>
</tr>
<tr>
<td align="left" valign="middle">EQIP</td>
<td align="center" valign="top">0&#x2013;100</td>
<td align="center" valign="top">76&#x2013;100</td>
<td align="center" valign="top">51&#x2013;75</td>
<td align="center" valign="top">26&#x2013;50</td>
<td align="center" valign="top">0&#x2013;25</td>
</tr>
<tr>
<td align="left" valign="middle">JAMA</td>
<td align="center" valign="top">0&#x2013;4</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">0&#x2013;1</td>
</tr>
<tr>
<td align="left" valign="middle">GQS</td>
<td align="center" valign="top">1&#x2013;5</td>
<td align="center" valign="top">5</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">1&#x2013;2</td>
</tr>
<tr>
<td align="left" valign="middle">HONCODE</td>
<td align="center" valign="top">0&#x2013;16</td>
<td align="center" valign="top">13&#x2013;16</td>
<td align="center" valign="top">9&#x2013;12</td>
<td align="center" valign="top">5&#x2013;8</td>
<td align="center" valign="top">0&#x2013;4</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="sec5">
<label>2.3</label>
<title>Evaluation criteria and statistical analysis</title>
<p>DISCERN is used to evaluate consumer health publications, particularly the quality of information on treatment choices (<xref ref-type="bibr" rid="ref32">32</xref>). The instrument comprises 16 items grouped into three domains: reliability of the publication, quality of information on treatment options, and an overall rating. Each item is scored from 1 (lowest) to 5 (highest), yielding a total score ranging from 16 to 80.</p>
<p>The EQIP (Ensuring Quality Information for Patients) tool assesses the quality of online patient information. Developed by Moult et al. (<xref ref-type="bibr" rid="ref33">33</xref>) in 2004, EQIP includes 20 items designed to evaluate the completeness, presentation, and usability of written patient information, and it has been widely used in health information quality assessment studies (<xref ref-type="bibr" rid="ref34">34</xref>, <xref ref-type="bibr" rid="ref35">35</xref>). Each item is rated using a binary yes/no criterion (&#x201C;yes&#x201D;&#x202F;=&#x202F;5 points; &#x201C;no&#x201D;&#x202F;=&#x202F;0 points). The overall score is calculated as the proportion of &#x201C;yes&#x201D; responses multiplied by 100, resulting in a total score ranging from 0 to 100.</p>
<p>The JAMA benchmarks, proposed by the <italic>Journal of the American Medical Association</italic> in 1997, provide a simple framework for assessing the quality of online health information sources (<xref ref-type="bibr" rid="ref36">36</xref>). They evaluate four transparency components: authorship, attribution, disclosure, and currency. Each criterion is scored as 1 if present and 0 if absent, yielding a total score from 0 to 4, with higher scores indicating better information quality.</p>
<p>The Global Quality Score (GQS), developed by Bernard and colleagues in 2007, is a subjective overall rating tool commonly used to assess the quality of online health information, particularly in internet-based health promotion research (<xref ref-type="bibr" rid="ref37">37</xref>). It provides a rapid global judgment of medical accuracy, completeness, and educational usefulness. GQS is typically rated on a 5-point Likert scale, where 1 indicates very poor/misleading information and 5 indicates excellent/highly useful information.</p>
<p>HONCODE is maintained by the Health on the Net Foundation (founded in 1995) and is one of the oldest and most widely recognized certification standards for online medical and health information. It assesses compliance with eight ethical and transparency principles. Each principle is scored as 0, 1, or 2 (0&#x202F;=&#x202F;not compliant; 1&#x202F;=&#x202F;partially compliant; 2&#x202F;=&#x202F;fully compliant), producing a total score ranging from 0 to 16, with higher scores indicating greater reliability and transparency (<xref ref-type="bibr" rid="ref38">38</xref>).</p>
<p>Readability was evaluated using six indices, each reflecting a different aspect of textual complexity. The Automated Readability Index (ARI) estimates the grade level required to comprehend a text based on sentence length and word length. The Gunning Fog Index (GFI) reflects reading difficulty using average sentence length and the proportion of complex words. The Coleman&#x2013;Liau Index (CLI) estimates grade level based on characters per word and sentences per text. The Original Linsear Write Formula (OLWF) captures the balance between word length and sentence length on a 0&#x2013;100 scale. The Linsear Write Grade Level Formula (LWGLF) reflects difficulty based on the proportion of simple versus complex words. The FORCAST Readability Formula (FRF) evaluates readability based on word complexity and sentence structure.</p>
<p>The National Institutes of Health (NIH) recommends that public health information be written at approximately a sixth to seventh grade reading level (<xref ref-type="bibr" rid="ref39">39</xref>). Therefore, we adopted the sixth-grade level as our benchmark. For grade-based indices (ARI, GFI, CLI, LWGLF, and FRF), the output approximates U.S. school grade levels, with higher scores indicating more difficult text. Accordingly, a target value of 6 was used, and scores closer to 6 indicate better alignment with recommended public readability levels. For OLWF, higher scores indicate easier reading; based on the published correspondence between OLWF values and sixth-grade materials, we used OLWF&#x202F;=&#x202F;94 as the sixth-grade benchmark. Using these benchmarks, we compared model outputs against the reference values in the Results: grade-based indices substantially above 6 indicate higher reading burden, whereas OLWF values substantially below 94 indicate insufficient readability.</p>
<p>Means and standard deviations were used as descriptive statistics to summarize reliability and readability scores. Given the repeated-measures design, we used the Friedman test to assess overall differences among models, followed by paired Wilcoxon signed-rank tests with Bonferroni adjustment for <italic>post hoc</italic> pairwise comparisons. Effect sizes (<italic>r</italic>) were calculated to quantify the magnitude of pairwise differences. Inter-rater agreement between the two physicians for reliability ratings was assessed using the intraclass correlation coefficient (ICC). Data analyses were performed in R (version 4.5.2).</p>
</sec>
</sec>
<sec sec-type="results" id="sec6">
<label>3</label>
<title>Results</title>
<sec id="sec7">
<label>3.1</label>
<title>Reliability analysis</title>
<p>Reliability was assessed using five instruments: DISCERN, EQIP, JAMA, GQS, and HONCODE. Inter rater agreement between the two obstetricians was high across instruments, with ICC values of 0.787 for DISCERN, 0.847 for GQS, 0.797 for JAMA, 0.729 for EQIP, and 0.777 for HONCODE. <xref ref-type="fig" rid="fig1">Figure 1</xref> summarizes model scores, and <xref ref-type="table" rid="tab3">Table 3</xref> reports overall between model differences using Friedman tests. Significant differences were observed for DISCERN, EQIP, and HONCODE (all <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), whereas overall differences were not significant for JAMA and GQS.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Distribution of reliability quality scores across five large language models (<italic>n</italic>&#x202F;=&#x202F;27 questions). Scores were evaluated using DISCERN, EQIP, JAMA, GQS, and HONCODE. Higher scores indicate better information quality/credibility across instruments.</p>
</caption>
<graphic xlink:href="fpubh-14-1760872-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Five grouped box and violin plots compare scoring results for five AI models&#x2014;ChatGPT-5, Copilot, Deepseek, Gemini-2.5, and Grok4&#x2014;across five criteria: DISCERN, EQIP, GQS, HONCODE, and JAMA. Different colors represent each model, and vertical axes show each metric&#x2019;s numeric range for the corresponding scoring standard.</alt-text>
</graphic>
</fig>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>Reliability scores (Mean &#x00B1; SD) of large language models and overall comparisons using Friedman tests.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model</th>
<th align="center" valign="top">DISCERN</th>
<th align="center" valign="top">EQIP</th>
<th align="center" valign="top">JAMA</th>
<th align="center" valign="top">GQS</th>
<th align="center" valign="top">HONCODE</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">ChatGPT-5</td>
<td align="center" valign="middle">44.07&#x202F;&#x00B1;&#x202F;5.31</td>
<td align="center" valign="middle">50.11&#x202F;&#x00B1;&#x202F;4.11</td>
<td align="center" valign="middle">1.07&#x202F;&#x00B1;&#x202F;0.55</td>
<td align="center" valign="middle">3.67&#x202F;&#x00B1;&#x202F;1.04</td>
<td align="center" valign="middle">4.59&#x202F;&#x00B1;&#x202F;2.22</td>
</tr>
<tr>
<td align="left" valign="middle">Copilot</td>
<td align="center" valign="middle">54.48&#x202F;&#x00B1;&#x202F;6.06</td>
<td align="center" valign="middle">52.41&#x202F;&#x00B1;&#x202F;4.48</td>
<td align="center" valign="middle">1.48&#x202F;&#x00B1;&#x202F;1.09</td>
<td align="center" valign="middle">3.96&#x202F;&#x00B1;&#x202F;1.02</td>
<td align="center" valign="middle">9.26&#x202F;&#x00B1;&#x202F;1.85</td>
</tr>
<tr>
<td align="left" valign="middle">DeepSeek</td>
<td align="center" valign="middle">46.63&#x202F;&#x00B1;&#x202F;5.47</td>
<td align="center" valign="middle">53.04&#x202F;&#x00B1;&#x202F;4.91</td>
<td align="center" valign="middle">0.96&#x202F;&#x00B1;&#x202F;0.76</td>
<td align="center" valign="middle">3.70&#x202F;&#x00B1;&#x202F;0.78</td>
<td align="center" valign="middle">3.70&#x202F;&#x00B1;&#x202F;2.32</td>
</tr>
<tr>
<td align="left" valign="middle">Gemini-2.5</td>
<td align="center" valign="middle">51.11&#x202F;&#x00B1;&#x202F;4.17</td>
<td align="center" valign="middle">47.81&#x202F;&#x00B1;&#x202F;3.63</td>
<td align="center" valign="middle">1.37&#x202F;&#x00B1;&#x202F;1.08</td>
<td align="center" valign="middle">3.78&#x202F;&#x00B1;&#x202F;0.89</td>
<td align="center" valign="middle">5.41&#x202F;&#x00B1;&#x202F;1.87</td>
</tr>
<tr>
<td align="left" valign="middle">Grok4</td>
<td align="center" valign="middle">60.33&#x202F;&#x00B1;&#x202F;5.48</td>
<td align="center" valign="middle">52.18&#x202F;&#x00B1;&#x202F;6.43</td>
<td align="center" valign="middle">0.93&#x202F;&#x00B1;&#x202F;0.96</td>
<td align="center" valign="middle">4.00&#x202F;&#x00B1;&#x202F;0.83</td>
<td align="center" valign="middle">8.89&#x202F;&#x00B1;&#x202F;2.36</td>
</tr>
<tr>
<td align="left" valign="middle">Friedman <italic>&#x03C7;<sup>2</sup></italic> (df)</td>
<td align="center" valign="middle">70.79 (4)</td>
<td align="center" valign="middle">19.08 (4)</td>
<td align="center" valign="middle">4.16 (4)</td>
<td align="center" valign="middle">4.07 (4)</td>
<td align="center" valign="middle">60.38 (4)</td>
</tr>
<tr>
<td align="left" valign="middle"><italic>p</italic></td>
<td align="center" valign="middle">&#x003C;0.001</td>
<td align="center" valign="middle">&#x003C;0.001</td>
<td align="center" valign="middle">0.385</td>
<td align="center" valign="middle">0.396</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>For DISCERN, which reflects the completeness and quality of information about treatment options, models differed significantly in performance. Grok4 and Copilot achieved the highest scores, suggesting that their responses were more clearly structured and comprehensive, particularly in describing treatment options as well as potential risks and benefits. ChatGPT-5 scored the lowest on this dimension; overall scores fell in the good to fair range, implying room for improvement in content depth and information organization.</p>
<p>For EQIP, DeepSeek performed best in clarity of expression, formatting standardization, and logical flow, followed by Copilot and Grok4. Gemini-2.5 scored the lowest, reflecting limitations in readability, information structure, and explanatory detail. Overall, except for Gemini-2.5 and ChatGPT-5, most models were within the good range, indicating generally acceptable clarity but with opportunities for further optimization.</p>
<p>For JAMA, which emphasizes authority and transparency, Copilot and Gemini-2.5 performed relatively better, particularly with respect to authorship identification, source attribution, and disclosure statements, aligning more closely with standards commonly expected in medical publishing. Grok4 scored the lowest on this dimension, which may reflect more limited or conservative practices in providing explicit attribution and disclosures within its responses.</p>
<p>For GQS, Grok4 and Copilot scored slightly higher than other models in overall perceived information quality, including usefulness and user perceived value. ChatGPT-5 was relatively weaker, suggesting that it may require improvement in conveying key clinical details effectively. Most models were rated in the fair range, and mean differences were modest, indicating that the generated health information was generally adequate but not consistently high quality.</p>
<p>For HONCODE, which evaluates ethical and transparency principles of health information, Copilot and Grok4 performed best, scoring significantly higher than other models, suggesting better alignment with expectations related to transparency, accountability, and information governance. Gemini-2.5 and ChatGPT-5 were slightly above average, whereas DeepSeek scored the lowest, indicating that ethical and transparency aspects may warrant further strengthening for some models.</p>
<p>Based on the Friedman results shown in <xref ref-type="table" rid="tab3">Table 3</xref> and the effect size analysis in <xref ref-type="supplementary-material" rid="SM1">Supplementary document</xref>, significant between model differences were observed in overall quality assessments, including DISCERN (<italic>&#x03C7;<sup>2</sup></italic>&#x202F;=&#x202F;70.79, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), EQIP (<italic>&#x03C7;<sup>2</sup></italic>&#x202F;=&#x202F;19.08, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001), and HONCODE (<italic>&#x03C7;<sup>2</sup></italic>&#x202F;=&#x202F;60.38, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001). These findings indicate that the evaluated models differed in their performance on quality constructs relevant to complex medical information.</p>
<p><italic>Post hoc</italic> pairwise comparisons further showed that the magnitude of these differences varied across instruments. For DISCERN and HONCODE, several comparisons yielded large effect sizes (<italic>r</italic>&#x202F;&#x003E;&#x202F;0.6), particularly for model pairs involving Grok4 and Copilot, suggesting substantial differences in reliability related performance. For EQIP, effect sizes spanned large, moderate, and small ranges, indicating that this instrument provided meaningful discrimination among models. In contrast, no statistically significant overall differences were detected for the simpler, more subjective scales such as GQS (<italic>p</italic>&#x202F;=&#x202F;0.396) and JAMA (<italic>p</italic>&#x202F;=&#x202F;0.385). Consistent with this, <italic>post hoc</italic> analyses generally showed small or negligible effect sizes, suggesting that model performance was broadly comparable when evaluated using these basic quality benchmarks.</p>
<p><xref ref-type="table" rid="tab4">Table 4</xref> presents Bonferroni-corrected <italic>post hoc</italic> pairwise comparisons of reliability scores among the five large language models. Significant pairwise differences were primarily observed for DISCERN, EQIP, and HONCODE, whereas no significant pairwise differences were detected for JAMA and GQS. Notably, DeepSeek and ChatGPT-5 showed no significant differences across most reliability metrics, while Grok4 differed significantly from several other models on multiple measures.</p>
<table-wrap position="float" id="tab4">
<label>Table 4</label>
<caption>
<p>Post hoc pairwise comparisons of reliability scores among five LLMs using paired Wilcoxon signed rank tests (Bonferroni-adjusted <italic>p</italic>-values).</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Comparison (model A <italic>vs.</italic> model B)</th>
<th align="center" valign="top">DISCERN</th>
<th align="center" valign="top">EQIP</th>
<th align="center" valign="top">JAMA</th>
<th align="center" valign="top">GQS</th>
<th align="center" valign="top">HONCODE</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">DeepSeek&#x2014;Copilot</td>
<td align="center" valign="top">&#x003C;0.01</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">0.81</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">&#x003C;0.01</td>
</tr>
<tr>
<td align="left" valign="middle">DeepSeek&#x2014;ChatGPT-5</td>
<td align="center" valign="top">0.20</td>
<td align="center" valign="top">0.33</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
</tr>
<tr>
<td align="left" valign="middle">DeepSeek&#x2014;Gemini-2.5</td>
<td align="center" valign="top">0.04</td>
<td align="center" valign="top">&#x003C;0.01</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">0.08</td>
</tr>
<tr>
<td align="left" valign="middle">DeepSeek&#x2014;Grok4</td>
<td align="center" valign="top">&#x003C;0.01</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">&#x003C;0.01</td>
</tr>
<tr>
<td align="left" valign="middle">Copilot&#x2014;ChatGPT-5</td>
<td align="center" valign="top">&#x003C;0.01</td>
<td align="center" valign="top">0.93</td>
<td align="center" valign="top">0.74</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">&#x003C;0.01</td>
</tr>
<tr>
<td align="left" valign="middle">Copilot&#x2014;Gemini-2.5</td>
<td align="center" valign="top">0.07</td>
<td align="center" valign="top">&#x003C;0.01</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">&#x003C;0.01</td>
</tr>
<tr>
<td align="left" valign="middle">Copilot&#x2014;Grok4</td>
<td align="center" valign="top">&#x003C;0.01</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">0.75</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
</tr>
<tr>
<td align="left" valign="middle">ChatGPT-5&#x2014;Gemini-2.5</td>
<td align="center" valign="top">&#x003C;0.01</td>
<td align="center" valign="top">0.45</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
</tr>
<tr>
<td align="left" valign="middle">ChatGPT-5&#x2014;Grok4</td>
<td align="center" valign="top">&#x003C;0.01</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">&#x003C;0.01</td>
</tr>
<tr>
<td align="left" valign="middle">Gemini-2.5&#x2014;Grok4</td>
<td align="center" valign="top">&#x003C;0.01</td>
<td align="center" valign="top">&#x003C;0.01</td>
<td align="center" valign="top">0.99</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">&#x003C;0.01</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Values are Bonferroni-adjusted <italic>p</italic>-values (<italic>p</italic>_adj). Statistical significance was set at <italic>p</italic>_adj &#x003C; 0.05.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="sec8">
<label>3.2</label>
<title>Readability analysis</title>
<p>Across the five AI models and six readability indices, detailed scores and Friedman test results are presented in <xref ref-type="table" rid="tab5">Table 5</xref> and <xref ref-type="fig" rid="fig2">Figure 2</xref>. None of the models achieved the recommended sixth-grade benchmark on ARI, GFI, CLI, OLWF, LWGLF, or FRF. Unlike the grade-level indices, higher OLWF scores indicate easier readability; therefore, OLWF values substantially below the sixth-grade benchmark reflect increased reading difficulty. Overall, this indicates that text generated by all models exceeded the complexity expected for a sixth-grade reading level.</p>
<table-wrap position="float" id="tab5">
<label>Table 5</label>
<caption>
<p>Readability scores (Mean &#x00B1; SD) of LLM generated responses across six indices, including a sixth-grade benchmark and overall comparisons using the Friedman test.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model</th>
<th align="center" valign="top">ARI</th>
<th align="center" valign="top">GFI</th>
<th align="center" valign="top">CLI</th>
<th align="center" valign="top">OLWF</th>
<th align="center" valign="top">LWGLF</th>
<th align="center" valign="top">FRF</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">ChatGPT-5</td>
<td align="center" valign="top">13.98&#x202F;&#x00B1;&#x202F;2.39</td>
<td align="center" valign="top">13.25&#x202F;&#x00B1;&#x202F;2.39</td>
<td align="center" valign="top">14.88&#x202F;&#x00B1;&#x202F;2.18</td>
<td align="center" valign="top">65.04&#x202F;&#x00B1;&#x202F;7.42</td>
<td align="center" valign="top">13.47&#x202F;&#x00B1;&#x202F;7.64</td>
<td align="center" valign="top">12.65&#x202F;&#x00B1;&#x202F;0.94</td>
</tr>
<tr>
<td align="left" valign="top">Copilot</td>
<td align="center" valign="top">13.49&#x202F;&#x00B1;&#x202F;2.92</td>
<td align="center" valign="top">13.00&#x202F;&#x00B1;&#x202F;2.79</td>
<td align="center" valign="top">13.77&#x202F;&#x00B1;&#x202F;3.39</td>
<td align="center" valign="top">72.96&#x202F;&#x00B1;&#x202F;10.39</td>
<td align="center" valign="top">14.10&#x202F;&#x00B1;&#x202F;7.78</td>
<td align="center" valign="top">12.83&#x202F;&#x00B1;&#x202F;1.12</td>
</tr>
<tr>
<td align="left" valign="top">DeepSeek</td>
<td align="center" valign="top">13.88&#x202F;&#x00B1;&#x202F;2.08</td>
<td align="center" valign="top">12.94&#x202F;&#x00B1;&#x202F;1.63</td>
<td align="center" valign="top">13.91&#x202F;&#x00B1;&#x202F;1.65</td>
<td align="center" valign="top">66.63&#x202F;&#x00B1;&#x202F;5.77</td>
<td align="center" valign="top">12.74&#x202F;&#x00B1;&#x202F;3.39</td>
<td align="center" valign="top">12.19&#x202F;&#x00B1;&#x202F;0.60</td>
</tr>
<tr>
<td align="left" valign="top">Gemini-2.5</td>
<td align="center" valign="top">15.12&#x202F;&#x00B1;&#x202F;2.90</td>
<td align="center" valign="top">13.07&#x202F;&#x00B1;&#x202F;2.35</td>
<td align="center" valign="top">13.95&#x202F;&#x00B1;&#x202F;2.20</td>
<td align="center" valign="top">64.96&#x202F;&#x00B1;&#x202F;10.07</td>
<td align="center" valign="top">15.74&#x202F;&#x00B1;&#x202F;10.72</td>
<td align="center" valign="top">12.21&#x202F;&#x00B1;&#x202F;0.95</td>
</tr>
<tr>
<td align="left" valign="top">Grok4</td>
<td align="center" valign="top">15.81&#x202F;&#x00B1;&#x202F;3.25</td>
<td align="center" valign="top">13.89&#x202F;&#x00B1;&#x202F;2.39</td>
<td align="center" valign="top">15.37&#x202F;&#x00B1;&#x202F;1.64</td>
<td align="center" valign="top">61.44&#x202F;&#x00B1;&#x202F;6.80</td>
<td align="center" valign="top">15.44&#x202F;&#x00B1;&#x202F;4.81</td>
<td align="center" valign="top">12.79&#x202F;&#x00B1;&#x202F;0.76</td>
</tr>
<tr>
<td align="left" valign="middle">6th grade level score</td>
<td align="center" valign="middle">6</td>
<td align="center" valign="middle">6</td>
<td align="center" valign="middle">6</td>
<td align="center" valign="middle">94</td>
<td align="center" valign="middle">6</td>
<td align="center" valign="middle">6</td>
</tr>
<tr>
<td align="left" valign="middle"><italic>p</italic></td>
<td align="center" valign="middle">&#x003C;0.001</td>
<td align="center" valign="middle">0.064</td>
<td align="center" valign="middle">&#x003C;0.001</td>
<td align="center" valign="middle">&#x003C;0.001</td>
<td align="center" valign="middle">0.003</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="middle">Friedman <italic>&#x03C7;</italic><sup>2</sup> (<italic>df</italic>)</td>
<td align="center" valign="middle">34.90 (4)</td>
<td align="center" valign="middle">8.88 (4)</td>
<td align="center" valign="middle">26.22 (4)</td>
<td align="center" valign="middle">33.74 (4)</td>
<td align="center" valign="middle">15.92 (4)</td>
<td align="center" valign="middle">21.62 (4)</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Distribution of readability metrics across five large language models with sixth-grade benchmarks (<italic>n</italic>&#x202F;=&#x202F;27 questions). Indices include ARI, GFI, CLI, OLWF, LWGLF, and FRF. Higher scores indicate greater reading difficulty for ARI/GFI/CLI/LWGLF/FRF, whereas higher OLWF indicates better readability. Sixth-grade benchmark values are shown as reference lines.</p>
</caption>
<graphic xlink:href="fpubh-14-1760872-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Violin box plots compare five AI models&#x2014;DeepSeek, Grok4, Copilot, Gemini-2.5, and ChatGPT-5&#x2014;across six metrics: ARI, GFI, CLI, OLWF, LWGLF, and FRF, showing distributions and median scores for each metric.</alt-text>
</graphic>
</fig>
<p>Grok4 and Gemini-2.5 obtained the highest scores on multiple indices, suggesting that they produced the most complex text and thus imposed the highest reading burden. Copilot and DeepSeek had slightly lower scores, but remained well above recommended levels, indicating marginally easier text that may nonetheless be unsuitable for individuals with lower health literacy. ChatGPT-5 was closer to the overall average on some indices, yet its outputs still tended toward higher complexity, particularly on CLI and ARI. Collectively, these findings suggest that model generated responses were generally complex and associated with a relatively high readability threshold.</p>
<p>The Friedman test results indicated statistically significant between model differences for ARI, CLI, OLWF, LWGLF, and FRF (all <italic>p</italic>&#x202F;&#x003C;&#x202F;0.01), whereas GFI did not reach statistical significance (<italic>p</italic>&#x202F;=&#x202F;0.064). The non-significant GFI result likely reflects a shared characteristic across models: when responding to perinatal depression&#x2013;related queries, all models inevitably used a substantial number of multisyllabic medical terms (e.g., &#x201C;postpartum,&#x201D; &#x201C;antidepressant&#x201D;), resulting in broadly similar levels of complex vocabulary. In contrast, indices more directly related to reading difficulty, such as ARI, CLI, and FRF, showed distinguishable differences across models, suggesting variability in linguistic complexity and reading burden.</p>
<p>By comparison, differences were smaller for indices that emphasize sentence structure and word combination features, such as GFI and LWGLF, indicating that models were broadly similar in average sentence length and in their overall control of complex word proportions. Effect size analyses further suggested that the magnitude of these differences was generally small to moderate, implying limited practical separation in readability across models. <italic>Post hoc</italic> pairwise comparisons showed that differences varied by readability index and did not yield a consistent pattern of superiority across all measures.</p>
</sec>
</sec>
<sec sec-type="discussion" id="sec9">
<label>4</label>
<title>Discussion</title>
<p>The use of generative AI in mental health is expanding, with emerging applications in risk identification and early assessment of psychiatric disorders, personalized intervention and treatment support, and conversational emotional support. Prior research has shown that access to healthcare services among individuals with perinatal depression remains limited, suggesting a public need for AI assisted responses to perinatal depression&#x2013;related questions. Therefore, the reliability and readability of AI generated information are critical for determining whether such content can meaningfully support the public. In this study, we evaluated the reliability and readability of information generated by five widely used AI models to characterize their performance on reliability related outcomes.</p>
<sec id="sec10">
<label>4.1</label>
<title>Safety related response characteristics of LLMs</title>
<p>There were substantial differences across large language models in how they handled safety critical perinatal mental health scenarios, particularly with respect to clinical appropriateness, risk communication, and referral to professional care. Although all models included general disclaimers that they cannot replace clinicians, they varied markedly in the depth of their clinical framing and in how closely they aligned with established care pathways.</p>
<p>Copilot and Grok4 provided the most comprehensive responses and were most consistent with clinical practice. Both models repeatedly emphasized that treatment decisions during pregnancy require individualized risk&#x2013;benefit assessment under joint supervision by obstetric and psychiatric clinicians. Their responses explicitly acknowledged the risks of untreated perinatal depression for both the mother and the fetus, which is consistent with consensus guidance, while appropriately avoiding directive or prescriptive medical advice. ChatGPT-5 consistently emphasized that it cannot replace a clinician and, in the context of postpartum depression, demonstrated strong safety awareness by encouraging timely professional evaluation, highlighting warning signs of self-harm, and directing users to crisis resources when indicated.</p>
<p>Grok4 performed particularly well in escalation behaviors, proactively providing region specific crisis hotlines and explicitly encouraging urgent care when severe symptoms were mentioned. It also attempted to identify nearby counseling or medical services by providing local addresses and phone numbers. However, Grok4&#x2019;s responses were more variable in structure and sometimes prioritized extensive resource lists over a systematic clinical explanation, which may pose comprehension challenges for users with lower health literacy.</p>
<p>Gemini-2.5 generally recognized perinatal depression as a medical condition and encouraged help seeking; however, its responses were less consistent across similar prompts. Variation in emphasis across related questions suggests weaker internal consistency, and recommendations for crisis resources were less consistently embedded within an explicit clinical risk framework. Such inconsistency may limit the reliability of model outputs in high-risk mental health contexts, where stable and consistent safety signaling is essential.</p>
<p>DeepSeek&#x2019;s responses were generally supportive and non-harmful, but they often remained at a broad educational level. Compared with other models, it less frequently provided explicit guidance on when urgent or emergency care is warranted, and its referral pathways were less specific. While this level of response may be acceptable for general psychoeducation, it offers more limited support for risk recognition, stratification, and action planning when risk is elevated or safety signals are ambiguous.</p>
</sec>
<sec id="sec11">
<label>4.2</label>
<title>Reliability of information</title>
<p>Based on our results, Grok4 achieved the highest score under the DISCERN framework. As Grok4 is a relatively new model, there is currently limited published evidence available for direct comparison of DISCERN scores. The scoring profile of Grok4 suggests that its strengths are not primarily reflected in presentation style or formal citation conventions, but rather in its safety-oriented responses in high-risk scenarios. This characteristic may carry potential clinical value in perinatal mental health contexts, while also indicating room for improvement in the standardized presentation expected for public facing health education materials. In this sense, its outputs appear more aligned with a &#x201C;clinical communication&#x201D; style than an &#x201C;educational material&#x201D; style.</p>
<p>Among the remaining models, Copilot achieved the highest DISCERN score, outperforming Gemini-2.5 and DeepSeek, which is consistent with findings from two prior studies (<xref ref-type="bibr" rid="ref40">40</xref>, <xref ref-type="bibr" rid="ref41">41</xref>). Copilot frequently emphasizes source attribution and often references academic journals and official guidelines; its strong performance on the JAMA benchmarks further suggests that it strikes a balance between quality, breadth, and prioritization of key information. Although Gemini-2.5 obtained a relatively high JAMA score, it performed less well on DISCERN. This discrepancy may reflect the fact that DISCERN evaluates not only accuracy but also the structured, user oriented, and comprehensive presentation of information, including discussion of risks and alternative options. Thus, while Gemini-2.5 may perform well on transparency related criteria captured by JAMA, limitations in organization, user centered framing, or comprehensiveness may contribute to a lower DISCERN score. DeepSeek performed particularly well on EQIP, a pattern also reported in prior work. Its outputs resemble a &#x201C;health education/health literacy&#x201D; mode rather than a &#x201C;clinical decision support&#x201D; mode, which may be well suited for public health education.</p>
<p>Joint analysis of DISCERN and HONCODE showed a positive association between the two scores, consistent with previous findings (<xref ref-type="bibr" rid="ref42">42</xref>). High performing models such as Grok4 and Copilot ranked higher on both instruments, whereas lower-performing models such as ChatGPT-5 tended to score lower on both. This suggests some overlap between information quality and adherence to established standards in consumer health contexts: higher-quality information is often more likely to align with transparency and accountability principles. A high DISCERN score indicates that the model&#x2019;s content is reliable, well organized, and comprehensive, while a high HONCODE score indicates stronger alignment with ethical and transparency standards, such as attribution and authority. Together, these properties may increase public confidence and reduce exposure to misleading or incorrect information. If AI systems can further improve transparency and authority signaling, they may help bridge the gap between public understanding and professional medical knowledge&#x2014;an especially important goal in an environment of information overload, uneven online content quality, lower health literacy in parts of the population, and persistent information asymmetry.</p>
</sec>
<sec id="sec12">
<label>4.3</label>
<title>Readability of information</title>
<p>Readability is critical to determining whether health information can be effectively communicated. The NIH explicitly recommends that public health materials be written at approximately a sixth to seventh grade reading level to ensure that the public, particularly individuals with lower health literacy, can understand medical content (<xref ref-type="bibr" rid="ref43">43</xref>). In our study, readability scores for all AI generated responses exceeded the sixth-grade level, with most approaching a high school reading level. This pattern is inconsistent with the NIH recommended benchmark for public facing health information. When the readability of health information exceeds the public&#x2019;s reading ability, several problems may arise. First, comprehension barriers may occur: information may be available but not truly accessible, and readers may struggle with vocabulary, sentence structure, or logical organization. Even well-educated readers may misinterpret specialized medical terminology, treatment recommendations, or risk explanations, leading to misunderstanding or disregard of key information. Second, trust may be undermined: overly technical content can appear less trustworthy, and highly complex text may create psychological distance or trigger skepticism, especially when source attribution, update dates, or author credentials are unclear, thereby reducing acceptability and communication effectiveness (<xref ref-type="bibr" rid="ref44">44</xref>). Third, health behaviors may be impeded: information may be encountered but not translated into action. Poor readability reduces usability, making it difficult for lay readers to extract actionable points, which can weaken adherence and limit behavior change. This concern is particularly salient for perinatal women, who may experience substantial psychological stress; when health information is obscure or difficult to understand, uncertainty and anxiety may increase.
In contrast, clear and plain language information can be reassuring. Low readability may also lead pregnant or postpartum individuals to misinterpret advice regarding medications, diet, or psychological interventions, potentially resulting in poor adherence or avoidable risk. By improving readability, AI based tools may help reduce such misunderstandings. Moreover, high quality, readable information may enable healthcare professionals to guide women toward reliable resources more efficiently, potentially reducing clinical workload. Therefore, to maximize the public health value of AI generated content, future efforts should prioritize improving readability without compromising information quality, ensuring that medical information is both accurate and understandable.</p>
</sec>
<sec id="sec13">
<label>4.4</label>
<title>Strengths</title>
<p>This study examined the reliability and readability of AI-generated responses to perinatal depression-related questions. Although numerous information quality assessments have been conducted in other medical domains, evidence in perinatal mental health remains limited. We evaluated five widely used, up-to-date AI models using well-established instruments for reliability and readability, providing practical evidence to inform public-facing consultation and the use of AI-generated health information in this context.</p>
</sec>
<sec id="sec14">
<label>4.5</label>
<title>Limitations</title>
<p>This study has several limitations. First, we interacted with commercially deployed models through their official web interfaces; as a result, generation parameters such as temperature and top-p were governed by platform defaults, and the generation process could not be fully standardized. Second, closed-source web products such as Microsoft Copilot may employ platform-level model selection and dynamic routing, and specific underlying model versions and routing behaviors are not fully disclosed. Therefore, in a web-based setting, researchers cannot reliably identify or control the exact model and routing pathway used for each response, which may affect strict comparability and reproducibility across models. In addition, LLM outputs may vary over time due to silent platform updates, policy changes, or routing adjustments; thus, our findings reflect performance within a specific time window. Third, this evaluation was conducted only in English and used a limited FAQ set derived from Google Trends and ACOG resources, which may not capture different cultural contexts, varying levels of health literacy, or higher-risk clinical scenarios. Finally, while we focused on reliability and readability, we did not systematically quantify safety-critical elements in perinatal mental health contexts, such as crisis resource guidance, risk communication, or referral recommendations. Future studies should extend this work across multiple languages and time points, under more controllable generation settings, and incorporate explicit safety metrics to further validate and generalize our findings.</p>
</sec>
</sec>
<sec sec-type="conclusions" id="sec15">
<label>5</label>
<title>Conclusion</title>
<p>This study indicates that most AI models performed reasonably well when answering questions about perinatal depression. However, even the strongest-performing models exhibited notable weaknesses; for example, Grok4 would benefit from improved performance on the JAMA benchmark criteria, and Copilot would benefit from improvement on EQIP. Overall, readability across models was suboptimal. To enable the public to use AI more effectively and conveniently to obtain health information, further technical advances are needed to improve the readability of generated content. In addition, models should provide clearer source attribution by citing authoritative journals and official guidelines and by standardizing how sources and evidence are presented. With continued technical development, AI systems may better support public access to reliable health information.</p>
<p>Overall, although the evaluated LLMs were generally capable of generating empathetic and information-rich content related to perinatal mental health, only some models consistently met key clinical safety expectations. These include appropriate referral guidance, discouraging self-directed medical decision-making, and maintaining alignment with established care pathways. Such variability underscores the need for cautious deployment of generative AI in maternal mental health and highlights the importance of evaluating specific models rather than assuming equivalence across systems. Future research should focus on defining and standardizing safety behaviors, particularly in high-risk mental health scenarios, to ensure that AI-generated health information supports informed decisions without inadvertently undermining safe clinical care.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec16">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec sec-type="author-contributions" id="sec17">
<title>Author contributions</title>
<p>JH: Data curation, Formal analysis, Methodology, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. HY: Data curation, Writing &#x2013; original draft. JC: Data curation, Writing &#x2013; original draft. XW: Data curation, Writing &#x2013; original draft. LH: Formal analysis, Visualization, Writing &#x2013; original draft. JW: Software, Writing &#x2013; original draft. HL: Methodology, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec sec-type="COI-statement" id="sec18">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec19">
<title>Generative AI statement</title>
<p>The author(s) declared that Generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec20">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec21">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/fpubh.2026.1760872/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/fpubh.2026.1760872/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.csv" id="SM1" mimetype="text/comma-separated-values"/>
<supplementary-material xlink:href="Data_Sheet_2.csv" id="SM2" mimetype="text/comma-separated-values"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><label>1.</label> <mixed-citation publication-type="journal"><person-group person-group-type="author"><collab id="coll1">The Lancet</collab></person-group>. <article-title>Perinatal depression: a neglected aspect of maternal health</article-title>. <source>Lancet</source>. (<year>2023</year>) <volume>402</volume>:<fpage>667</fpage>. doi: <pub-id pub-id-type="doi">10.1016/S0140-6736(23)01786-5</pub-id></mixed-citation></ref>
<ref id="ref2"><label>2.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>O'Hara</surname> <given-names>M</given-names></name> <name><surname>Wisner</surname> <given-names>K</given-names></name></person-group>. <article-title>Perinatal mental illness: definition, description and aetiology</article-title>. <source>Best Pract Res Clin Obstet Gynaecol</source>. (<year>2014</year>) <volume>28</volume>:<fpage>3</fpage>&#x2013;<lpage>12</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bpobgyn.2013.09.002</pub-id>, <pub-id pub-id-type="pmid">24140480</pub-id></mixed-citation></ref>
<ref id="ref3"><label>3.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Koc</surname> <given-names>D</given-names></name> <name><surname>Hermans</surname> <given-names>APC</given-names></name> <name><surname>Xu</surname> <given-names>B</given-names></name> <name><surname>Muetzel</surname> <given-names>RL</given-names></name> <name><surname>El Marroun</surname> <given-names>H</given-names></name> <name><surname>Tiemeier</surname> <given-names>H</given-names></name></person-group>. <article-title>Perinatal maternal depressive symptoms and brain connectivity among 9- to 15-year-old offspring</article-title>. <source>JAMA Netw Open</source>. (<year>2025</year>) <volume>8</volume>:<fpage>e2523978</fpage>. doi: <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2025.23978</pub-id>, <pub-id pub-id-type="pmid">40742589</pub-id></mixed-citation></ref>
<ref id="ref4"><label>4.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Br&#x00E4;nn</surname> <given-names>E</given-names></name> <name><surname>Shen</surname> <given-names>Q</given-names></name> <name><surname>Lu</surname> <given-names>D</given-names></name></person-group>. <article-title>Perinatal depression and its health impact</article-title>. <source>BMJ</source>. (<year>2024</year>) <volume>384</volume>:<fpage>2777</fpage>. doi: <pub-id pub-id-type="doi">10.1136/bmj.p2777</pub-id>, <pub-id pub-id-type="pmid">38195147</pub-id></mixed-citation></ref>
<ref id="ref5"><label>5.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mitchell</surname> <given-names>AR</given-names></name> <name><surname>Gordon</surname> <given-names>H</given-names></name> <name><surname>Lindquist</surname> <given-names>A</given-names></name> <name><surname>Walker</surname> <given-names>SP</given-names></name> <name><surname>Homer</surname> <given-names>CSE</given-names></name> <name><surname>Middleton</surname> <given-names>A</given-names></name> <etal/></person-group>. <article-title>Prevalence of perinatal depression in low- and middle-income countries: a systematic review and meta-analysis</article-title>. <source>JAMA Psychiatry</source>. (<year>2023</year>) <volume>80</volume>:<fpage>425</fpage>&#x2013;<lpage>31</lpage>. doi: <pub-id pub-id-type="doi">10.1001/jamapsychiatry.2023.0069</pub-id>, <pub-id pub-id-type="pmid">36884232</pub-id></mixed-citation></ref>
<ref id="ref6"><label>6.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lesinskien&#x0117;</surname> <given-names>S</given-names></name> <name><surname>Andru&#x0161;kevi&#x010D;</surname> <given-names>J</given-names></name> <name><surname>Butvilait&#x0117;</surname> <given-names>A</given-names></name></person-group>. <article-title>Adolescent pregnancies and perinatal mental health: needs and complex support options: a literature review</article-title>. <source>J Clin Med</source>. (<year>2025</year>) <volume>14</volume>:<fpage>2334</fpage>. doi: <pub-id pub-id-type="doi">10.3390/jcm14072334</pub-id>, <pub-id pub-id-type="pmid">40217788</pub-id></mixed-citation></ref>
<ref id="ref7"><label>7.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nelson</surname> <given-names>HD</given-names></name> <name><surname>Darney</surname> <given-names>BG</given-names></name> <name><surname>Ahrens</surname> <given-names>K</given-names></name> <name><surname>Burgess</surname> <given-names>A</given-names></name> <name><surname>Jungbauer</surname> <given-names>RM</given-names></name> <name><surname>Cantor</surname> <given-names>A</given-names></name> <etal/></person-group>. <article-title>Associations of unintended pregnancy with maternal and infant health outcomes: a systematic review and meta-analysis</article-title>. <source>JAMA</source>. (<year>2022</year>) <volume>328</volume>:<fpage>1714</fpage>&#x2013;<lpage>29</lpage>. doi: <pub-id pub-id-type="doi">10.1001/jama.2022.19097</pub-id>, <pub-id pub-id-type="pmid">36318133</pub-id></mixed-citation></ref>
<ref id="ref8"><label>8.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Al-Abri</surname> <given-names>K</given-names></name> <name><surname>Edge</surname> <given-names>D</given-names></name> <name><surname>Armitage</surname> <given-names>CJ</given-names></name></person-group>. <article-title>Prevalence and correlates of perinatal depression</article-title>. <source>Soc Psychiatry Psychiatr Epidemiol</source>. (<year>2023</year>) <volume>58</volume>:<fpage>1581</fpage>&#x2013;<lpage>90</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s00127-022-02386-9</pub-id>, <pub-id pub-id-type="pmid">36646936</pub-id></mixed-citation></ref>
<ref id="ref9"><label>9.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Luo</surname> <given-names>M</given-names></name> <name><surname>Chai</surname> <given-names>Q</given-names></name> <name><surname>Fei</surname> <given-names>Y</given-names></name></person-group>. <article-title>Unintended pregnancy and maternal and infant health outcomes</article-title>. <source>JAMA</source>. (<year>2023</year>) <volume>329</volume>:<fpage>765</fpage>. doi: <pub-id pub-id-type="doi">10.1001/jama.2022.24654</pub-id></mixed-citation></ref>
<ref id="ref10"><label>10.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rokicki</surname> <given-names>S</given-names></name> <name><surname>Patel</surname> <given-names>M</given-names></name> <name><surname>Suplee</surname> <given-names>PD</given-names></name> <name><surname>D&#x2019;Oria</surname> <given-names>R</given-names></name></person-group>. <article-title>Racial and ethnic disparities in access to community-based perinatal mental health programs: results from a cross-sectional survey</article-title>. <source>BMC Public Health</source>. (<year>2024</year>) <volume>24</volume>:<fpage>1094</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12889-024-18517-7</pub-id>, <pub-id pub-id-type="pmid">38643069</pub-id></mixed-citation></ref>
<ref id="ref11"><label>11.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Solomonov</surname> <given-names>N</given-names></name> <name><surname>Kerchner</surname> <given-names>D</given-names></name> <name><surname>Dai</surname> <given-names>Y</given-names></name> <name><surname>Kwon</surname> <given-names>M</given-names></name> <name><surname>Callaghan</surname> <given-names>DG</given-names></name> <name><surname>Schier</surname> <given-names>MM</given-names></name> <etal/></person-group>. <article-title>Prevalence and trajectories of perinatal anxiety and depression in a large urban medical center</article-title>. <source>JAMA Netw Open</source>. (<year>2025</year>) <volume>8</volume>:<fpage>e2533111</fpage>. doi: <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2025.33111</pub-id>, <pub-id pub-id-type="pmid">40982279</pub-id></mixed-citation></ref>
<ref id="ref12"><label>12.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dennis</surname> <given-names>C-L</given-names></name> <name><surname>Singla</surname> <given-names>DR</given-names></name> <name><surname>Brown</surname> <given-names>HK</given-names></name> <name><surname>Savel</surname> <given-names>K</given-names></name> <name><surname>Clark</surname> <given-names>CT</given-names></name> <name><surname>Grigoriadis</surname> <given-names>S</given-names></name> <etal/></person-group>. <article-title>Postpartum depression: a clinical review of impact and current treatment solutions</article-title>. <source>Drugs</source>. (<year>2024</year>) <volume>84</volume>:<fpage>645</fpage>&#x2013;<lpage>59</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s40265-024-02038-z</pub-id>, <pub-id pub-id-type="pmid">38811474</pub-id></mixed-citation></ref>
<ref id="ref13"><label>13.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Daehn</surname> <given-names>D</given-names></name> <name><surname>Rudolf</surname> <given-names>S</given-names></name> <name><surname>Pawils</surname> <given-names>S</given-names></name> <name><surname>Renneberg</surname> <given-names>B</given-names></name></person-group>. <article-title>Perinatal mental health literacy: knowledge, attitudes, and help-seeking among perinatal women and the public: a systematic review</article-title>. <source>BMC Pregnancy Childbirth</source>. (<year>2022</year>) <volume>22</volume>:<fpage>574</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12884-022-04865-y</pub-id>, <pub-id pub-id-type="pmid">35854232</pub-id></mixed-citation></ref>
<ref id="ref14"><label>14.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Pinar</surname> <given-names>S</given-names></name> <name><surname>Bedford</surname> <given-names>H</given-names></name> <name><surname>Ersser</surname> <given-names>S</given-names></name> <name><surname>McMillan</surname> <given-names>D</given-names></name></person-group>. <article-title>Women&#x2019;s experiences of perinatal depression: symptoms, barriers and enablers to disclosure, and effects on daily life and interaction within the family</article-title>. <source>Midwifery</source>. (<year>2022</year>) <volume>112</volume>:<fpage>103389</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.midw.2022.103389</pub-id>, <pub-id pub-id-type="pmid">35709676</pub-id></mixed-citation></ref>
<ref id="ref15"><label>15.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Place</surname> <given-names>JMS</given-names></name> <name><surname>Renbarger</surname> <given-names>K</given-names></name> <name><surname>Van De Griend</surname> <given-names>K</given-names></name> <name><surname>Guinn</surname> <given-names>M</given-names></name> <name><surname>Wheatley</surname> <given-names>C</given-names></name> <name><surname>Holmes</surname> <given-names>O</given-names></name></person-group>. <article-title>Barriers to help-seeking for postpartum depression mapped onto the socio-ecological model and recommendations to address barriers</article-title>. <source>Front Glob Womens Health</source>. (<year>2024</year>) <volume>5</volume>:<fpage>1335437</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fgwh.2024.1335437</pub-id>, <pub-id pub-id-type="pmid">38855482</pub-id></mixed-citation></ref>
<ref id="ref16"><label>16.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zivin</surname> <given-names>K</given-names></name> <name><surname>Zhang</surname> <given-names>X</given-names></name> <name><surname>Tilea</surname> <given-names>A</given-names></name> <name><surname>Hall</surname> <given-names>SV</given-names></name> <name><surname>Admon</surname> <given-names>LK</given-names></name> <name><surname>Vance</surname> <given-names>AJ</given-names></name> <etal/></person-group>. <article-title>Perinatal psychotherapy use and costs before and after federally mandated health insurance coverage</article-title>. <source>JAMA Netw Open</source>. (<year>2024</year>) <volume>7</volume>:<fpage>e2426802</fpage>. doi: <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.26802</pub-id>, <pub-id pub-id-type="pmid">39120900</pub-id></mixed-citation></ref>
<ref id="ref17"><label>17.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bauer</surname> <given-names>A</given-names></name> <name><surname>Knapp</surname> <given-names>M</given-names></name> <name><surname>Alvi</surname> <given-names>M</given-names></name> <name><surname>Chaudhry</surname> <given-names>N</given-names></name> <name><surname>Gregoire</surname> <given-names>A</given-names></name> <name><surname>Malik</surname> <given-names>A</given-names></name> <etal/></person-group>. <article-title>Economic costs of perinatal depression and anxiety in a lower middle income country: Pakistan</article-title>. <source>J Affect Disord</source>. (<year>2024</year>) <volume>357</volume>:<fpage>60</fpage>&#x2013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jad.2024.04.061</pub-id>, <pub-id pub-id-type="pmid">38642903</pub-id></mixed-citation></ref>
<ref id="ref18"><label>18.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zou</surname> <given-names>Q</given-names></name> <name><surname>Yang</surname> <given-names>Y</given-names></name> <name><surname>Liu</surname> <given-names>X</given-names></name> <name><surname>Wang</surname> <given-names>T</given-names></name> <name><surname>Chen</surname> <given-names>R</given-names></name> <name><surname>Duan</surname> <given-names>X</given-names></name></person-group>. <article-title>Factors influencing spousal support for women with perinatal depression in seeking formal assistance: a qualitative study</article-title>. <source>Front Public Health</source>. (<year>2024</year>) <volume>12</volume>:<fpage>1493300</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fpubh.2024.1493300</pub-id>, <pub-id pub-id-type="pmid">39618948</pub-id></mixed-citation></ref>
<ref id="ref19"><label>19.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Howard</surname> <given-names>L</given-names></name> <name><surname>Khalifeh</surname> <given-names>H</given-names></name></person-group>. <article-title>Perinatal mental health: a review of progress and challenges</article-title>. <source>World Psychiatry</source>. (<year>2020</year>) <volume>19</volume>:<fpage>313</fpage>&#x2013;<lpage>27</lpage>. doi: <pub-id pub-id-type="doi">10.1002/wps.20769</pub-id>, <pub-id pub-id-type="pmid">32931106</pub-id></mixed-citation></ref>
<ref id="ref20"><label>20.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>C</given-names></name> <name><surname>Wang</surname> <given-names>X</given-names></name> <name><surname>Xu</surname> <given-names>H</given-names></name> <name><surname>Li</surname> <given-names>Y</given-names></name></person-group>. <article-title>Effectiveness of digital psychological interventions in reducing perinatal depression: a systematic review of meta-analyses</article-title>. <source>Arch Womens Ment Health</source>. (<year>2023</year>) <volume>2</volume>:<fpage>423</fpage>&#x2013;<lpage>39</lpage>. doi: <pub-id pub-id-type="doi">10.1093/oodh/oqae026</pub-id>, <pub-id pub-id-type="pmid">40237016</pub-id></mixed-citation></ref>
<ref id="ref21"><label>21.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Howell</surname> <given-names>MD</given-names></name> <name><surname>Corrado</surname> <given-names>GS</given-names></name> <name><surname>DeSalvo</surname> <given-names>KB</given-names></name></person-group>. <article-title>Three epochs of artificial intelligence in health care</article-title>. <source>JAMA</source>. (<year>2024</year>) <volume>331</volume>:<fpage>242</fpage>&#x2013;<lpage>4</lpage>. doi: <pub-id pub-id-type="doi">10.1001/jama.2023.25057</pub-id>, <pub-id pub-id-type="pmid">38227029</pub-id></mixed-citation></ref>
<ref id="ref22"><label>22.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yoga Ratnam</surname> <given-names>KK</given-names></name></person-group>. <article-title>Generative artificial intelligence in public health research and scientific communication: a narrative review of real applications and future directions</article-title>. <source>Digit Health</source>. (<year>2025</year>) <volume>11</volume>:<fpage>20552076251362070</fpage>. doi: <pub-id pub-id-type="doi">10.1177/20552076251362070</pub-id>, <pub-id pub-id-type="pmid">40842935</pub-id></mixed-citation></ref>
<ref id="ref23"><label>23.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>L</given-names></name> <name><surname>Li</surname> <given-names>J</given-names></name> <name><surname>Zhuang</surname> <given-names>B</given-names></name> <name><surname>Huang</surname> <given-names>S</given-names></name> <name><surname>Fang</surname> <given-names>M</given-names></name> <name><surname>Wang</surname> <given-names>C</given-names></name> <etal/></person-group>. <article-title>Accuracy of large language models when answering clinical research questions: systematic review and network meta-analysis</article-title>. <source>J Med Internet Res</source>. (<year>2025</year>) <volume>27</volume>:<fpage>e64486</fpage>. doi: <pub-id pub-id-type="doi">10.2196/64486</pub-id>, <pub-id pub-id-type="pmid">40305085</pub-id></mixed-citation></ref>
<ref id="ref24"><label>24.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Omiye</surname> <given-names>JA</given-names></name> <name><surname>Gui</surname> <given-names>H</given-names></name> <name><surname>Rezaei</surname> <given-names>SJ</given-names></name> <name><surname>Zou</surname> <given-names>J</given-names></name> <name><surname>Daneshjou</surname> <given-names>R</given-names></name></person-group>. <article-title>Large language models in medicine: the potentials and pitfalls: a narrative review</article-title>. <source>Ann Intern Med</source>. (<year>2024</year>) <volume>177</volume>:<fpage>210</fpage>&#x2013;<lpage>20</lpage>. doi: <pub-id pub-id-type="doi">10.7326/M23-2772</pub-id></mixed-citation></ref>
<ref id="ref25"><label>25.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Thirunavukarasu</surname> <given-names>AJ</given-names></name> <name><surname>Ting</surname> <given-names>DSJ</given-names></name> <name><surname>Elangovan</surname> <given-names>K</given-names></name> <name><surname>Gutierrez</surname> <given-names>L</given-names></name> <name><surname>Tan</surname> <given-names>TF</given-names></name> <name><surname>Ting</surname> <given-names>DSW</given-names></name></person-group>. <article-title>Large language models in medicine</article-title>. <source>Nat Med</source>. (<year>2023</year>) <volume>29</volume>:<fpage>1930</fpage>&#x2013;<lpage>40</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id></mixed-citation></ref>
<ref id="ref26"><label>26.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mesk&#x00F3;</surname> <given-names>B</given-names></name> <name><surname>G&#x00F6;r&#x00F6;g</surname> <given-names>M</given-names></name></person-group>. <article-title>A short guide for medical professionals in the era of artificial intelligence</article-title>. <source>NPJ Digit Med</source>. (<year>2020</year>) <volume>3</volume>:<fpage>126</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41746-020-00333-z</pub-id>, <pub-id pub-id-type="pmid">33043150</pub-id></mixed-citation></ref>
<ref id="ref27"><label>27.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Morrow</surname> <given-names>E</given-names></name> <name><surname>Zidaru</surname> <given-names>T</given-names></name> <name><surname>Ross</surname> <given-names>F</given-names></name> <name><surname>Mason</surname> <given-names>C</given-names></name> <name><surname>Patel</surname> <given-names>KD</given-names></name> <name><surname>Ream</surname> <given-names>M</given-names></name> <etal/></person-group>. <article-title>Artificial intelligence technologies and compassion in healthcare: a systematic scoping review</article-title>. <source>Front Psychol</source>. (<year>2023</year>) <volume>13</volume>:<fpage>971044</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fpsyg.2022.971044</pub-id>, <pub-id pub-id-type="pmid">36733854</pub-id></mixed-citation></ref>
<ref id="ref28"><label>28.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sallam</surname> <given-names>M</given-names></name></person-group>. <article-title>ChatGPT utility in healthcare education, research, and practice: systematic review on the promising perspectives and valid concerns</article-title>. <source>Healthcare</source>. (<year>2023</year>) <volume>11</volume>:<fpage>887</fpage>. doi: <pub-id pub-id-type="doi">10.3390/healthcare11060887</pub-id>, <pub-id pub-id-type="pmid">36981544</pub-id></mixed-citation></ref>
<ref id="ref29"><label>29.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sezgin</surname> <given-names>E</given-names></name> <name><surname>Chekeni</surname> <given-names>F</given-names></name> <name><surname>Lee</surname> <given-names>J</given-names></name> <name><surname>Keim</surname> <given-names>S</given-names></name></person-group>. <article-title>Clinical accuracy of large language models and Google search responses to postpartum depression questions: cross-sectional study</article-title>. <source>J Med Internet Res</source>. (<year>2023</year>) <volume>25</volume>:<fpage>e49240</fpage>. doi: <pub-id pub-id-type="doi">10.2196/49240</pub-id>, <pub-id pub-id-type="pmid">37695668</pub-id></mixed-citation></ref>
<ref id="ref30"><label>30.</label><mixed-citation publication-type="other"><person-group person-group-type="author"><collab id="coll2">American College of Obstetricians and Gynecologists</collab></person-group>. <source>Postpartum depression: FAQs</source>. <publisher-loc>Washington, DC</publisher-loc>: <publisher-name>American College of Obstetricians and Gynecologists</publisher-name> (<year>2025</year>).</mixed-citation></ref>
<ref id="ref31"><label>31.</label><mixed-citation publication-type="other"><person-group person-group-type="author"><collab id="coll3">American College of Obstetricians and Gynecologists</collab></person-group>. <source>Depression during pregnancy: FAQs</source>. <publisher-loc>Washington, DC</publisher-loc>: <publisher-name>American College of Obstetricians and Gynecologists</publisher-name> (<year>2025</year>).</mixed-citation></ref>
<ref id="ref32"><label>32.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rees</surname> <given-names>CE</given-names></name> <name><surname>Ford</surname> <given-names>JE</given-names></name> <name><surname>Sheard</surname> <given-names>CE</given-names></name></person-group>. <article-title>Evaluating the reliability of DISCERN: a tool for assessing the quality of written patient information on treatment choices</article-title>. <source>Patient Educ Couns</source>. (<year>2002</year>) <volume>47</volume>:<fpage>273</fpage>&#x2013;<lpage>5</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S0738-3991(01)00225-7</pub-id>, <pub-id pub-id-type="pmid">12088606</pub-id></mixed-citation></ref>
<ref id="ref33"><label>33.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Moult</surname> <given-names>B</given-names></name> <name><surname>Franck</surname> <given-names>LS</given-names></name> <name><surname>Brady</surname> <given-names>H</given-names></name></person-group>. <article-title>Ensuring quality information for patients: development and preliminary validation of a new instrument to improve the quality of written health care information</article-title>. <source>Health Expect</source>. (<year>2004</year>) <volume>7</volume>:<fpage>165</fpage>&#x2013;<lpage>75</lpage>. doi: <pub-id pub-id-type="doi">10.1111/j.1369-7625.2004.00273.x</pub-id>, <pub-id pub-id-type="pmid">15117391</pub-id></mixed-citation></ref>
<ref id="ref34"><label>34.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Fan</surname> <given-names>KS</given-names></name> <name><surname>Ghani</surname> <given-names>SA</given-names></name> <name><surname>Machairas</surname> <given-names>N</given-names></name> <name><surname>Lenti</surname> <given-names>L</given-names></name> <name><surname>Fan</surname> <given-names>KH</given-names></name> <name><surname>Richardson</surname> <given-names>D</given-names></name> <etal/></person-group>. <article-title>COVID-19 prevention and treatment information on the internet: a systematic analysis and quality assessment</article-title>. <source>BMJ Open</source>. (<year>2020</year>) <volume>10</volume>:<fpage>e040487</fpage>. doi: <pub-id pub-id-type="doi">10.1136/bmjopen-2020-040487</pub-id>, <pub-id pub-id-type="pmid">32912996</pub-id></mixed-citation></ref>
<ref id="ref35"><label>35.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ghani</surname> <given-names>S</given-names></name> <name><surname>Fan</surname> <given-names>KS</given-names></name> <name><surname>Fan</surname> <given-names>KH</given-names></name> <name><surname>Lenti</surname> <given-names>L</given-names></name> <name><surname>Raptis</surname> <given-names>D</given-names></name></person-group>. <article-title>Using the ensuring quality information for patients tool to assess patient information on appendicitis websites: systematic search and evaluation</article-title>. <source>J Med Internet Res</source>. (<year>2021</year>) <volume>23</volume>:<fpage>e22618</fpage>. doi: <pub-id pub-id-type="doi">10.2196/22618</pub-id>, <pub-id pub-id-type="pmid">33729160</pub-id></mixed-citation></ref>
<ref id="ref36"><label>36.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Silberg</surname> <given-names>W</given-names></name> <name><surname>Lundberg</surname> <given-names>G</given-names></name> <name><surname>Musacchio</surname> <given-names>R</given-names></name></person-group>. <article-title>Assessing, controlling, and assuring the quality of medical information on the internet: caveant lector et viewor: let the reader and viewer beware</article-title>. <source>JAMA</source>. (<year>1997</year>) <volume>277</volume>:<fpage>1244</fpage>&#x2013;<lpage>5</lpage>. doi: <pub-id pub-id-type="doi">10.1001/jama.1997.03540390074039</pub-id>, <pub-id pub-id-type="pmid">9103351</pub-id></mixed-citation></ref>
<ref id="ref37"><label>37.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Benigeri</surname> <given-names>M</given-names></name> <name><surname>Brodeur</surname> <given-names>JM</given-names></name></person-group>. <article-title>L'utilisation des technologies de l'information et des communications (TIC) en sant&#x00E9; publique</article-title>. <source>Can J Public Health</source>. (<year>2001</year>) <volume>92</volume>:<fpage>313</fpage>&#x2013;<lpage>6</lpage>. doi: <pub-id pub-id-type="doi">10.1007/BF03404969</pub-id>, <pub-id pub-id-type="pmid">11962120</pub-id></mixed-citation></ref>
<ref id="ref38"><label>38.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Oydanich</surname> <given-names>M</given-names></name> <name><surname>Kuklinski</surname> <given-names>E</given-names></name> <name><surname>Asbell</surname> <given-names>P</given-names></name></person-group>. <article-title>Assessing the quality, reliability, and readability of online information on dry eye disease</article-title>. <source>Cornea</source>. (<year>2022</year>) <volume>41</volume>:<fpage>1023</fpage>&#x2013;<lpage>8</lpage>. doi: <pub-id pub-id-type="doi">10.1097/ICO.0000000000003034</pub-id>, <pub-id pub-id-type="pmid">35344972</pub-id></mixed-citation></ref>
<ref id="ref39"><label>39.</label><mixed-citation publication-type="book"><person-group person-group-type="author"><collab id="coll4">National Institutes of Health</collab></person-group>. <source>Clear and simple: developing effective print materials for low-literate readers</source>. <publisher-loc>Bethesda</publisher-loc>: <publisher-name>U.S. Department of Health and Human Services</publisher-name> (<year>2019</year>).</mixed-citation></ref>
<ref id="ref40"><label>40.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Y&#x0131;ld&#x0131;z</surname> <given-names>HA</given-names></name> <name><surname>S&#x00F6;&#x011F;&#x00FC;tdelen</surname> <given-names>E</given-names></name></person-group>. <article-title>AI chatbots as sources of STD information: a study on reliability and readability</article-title>. <source>J Med Syst</source>. (<year>2025</year>) <volume>49</volume>:<fpage>43</fpage>. doi: <pub-id pub-id-type="doi">10.1007/s10916-025-02178-z</pub-id>, <pub-id pub-id-type="pmid">40178771</pub-id></mixed-citation></ref>
<ref id="ref41"><label>41.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gupta</surname> <given-names>S</given-names></name> <name><surname>Tarapore</surname> <given-names>R</given-names></name> <name><surname>Haislup</surname> <given-names>B</given-names></name> <name><surname>Fillar</surname> <given-names>A</given-names></name></person-group>. <article-title>Microsoft Copilot provides more accurate and reliable information about anterior cruciate ligament injury and repair than ChatGPT and Google Gemini; however, no resource was overall the best</article-title>. <source>Arthrosc Sports Med Rehabil</source>. (<year>2025</year>) <volume>7</volume>:<fpage>101043</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.asmr.2024.101043</pub-id>, <pub-id pub-id-type="pmid">40297090</pub-id></mixed-citation></ref>
<ref id="ref42"><label>42.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Khazaal</surname> <given-names>Y</given-names></name> <name><surname>Chatton</surname> <given-names>A</given-names></name> <name><surname>Zullino</surname> <given-names>D</given-names></name> <name><surname>Khan</surname> <given-names>R</given-names></name></person-group>. <article-title>HON label and DISCERN as content quality indicators of health-related websites</article-title>. <source>Psychiatr Q</source>. (<year>2012</year>) <volume>83</volume>:<fpage>15</fpage>&#x2013;<lpage>27</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11126-011-9179-x</pub-id>, <pub-id pub-id-type="pmid">21547515</pub-id></mixed-citation></ref>
<ref id="ref43"><label>43.</label><mixed-citation publication-type="other"><person-group person-group-type="author"><collab id="coll5">National Institutes of Health</collab></person-group>. <source>Clear and simple: developing effective print materials for low-literate readers</source>. (<year>2011</year>). Available online at: <ext-link xlink:href="https://www.nih.gov/institutes-nih/nih-office-director/office-communications-public-liaison/clear-communication/clear-simple" ext-link-type="uri">https://www.nih.gov/institutes-nih/nih-office-director/office-communications-public-liaison/clear-communication/clear-simple</ext-link> (Accessed October 16, 2025)</mixed-citation></ref>
<ref id="ref44"><label>44.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tandar</surname> <given-names>C</given-names></name> <name><surname>Lin</surname> <given-names>J</given-names></name> <name><surname>Stanford</surname> <given-names>F</given-names></name></person-group>. <article-title>Combating medical misinformation and rebuilding trust in the USA</article-title>. <source>Lancet Digit Health</source>. (<year>2024</year>) <volume>6</volume>:<fpage>e773</fpage>&#x2013;<lpage>4</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S2589-7500(24)00197-3</pub-id>, <pub-id pub-id-type="pmid">39379267</pub-id></mixed-citation></ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2048290/overview">Fahim Sufi</ext-link>, Monash University, Australia</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1355578/overview">Carlos Alberto Pereira de Oliveira</ext-link>, Rio de Janeiro State University, Brazil</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3139821/overview">Debra Winberg</ext-link>, Georgetown University, United States</p>
</fn>
</fn-group>
</back>
</article>