<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Psychiatry</journal-id>
<journal-title-group>
<journal-title>Frontiers in Psychiatry</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Psychiatry</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-0640</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpsyt.2026.1621532</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>AI-generated documentation of psychiatric interviews: a proof-of-concept study</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>G&#xfc;legen</surname><given-names>Bengican</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2962961/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Haaf</surname><given-names>Raoul</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Schl&#xfc;&#xdf;ler</surname><given-names>Emanuel</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name><surname>K&#xf6;hler</surname><given-names>Stephan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Psychiatry and Psychotherapy, Charit&#xe9; &#x2013; Universit&#xe4;tsmedizin Berlin</institution>, <city>Berlin</city>,&#xa0;<country country="DE">Germany</country></aff>
<aff id="aff2"><label>2</label><institution>Amnexis Digital Solutions GmbH</institution>, <city>Berlin</city>,&#xa0;<country country="DE">Germany</country></aff>
<aff id="aff3"><label>3</label><institution>Alexianer St Joseph Berlin-Wei&#xdf;ensee GmbH</institution>, <city>Berlin</city>,&#xa0;<country country="DE">Germany</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Bengican G&#xfc;legen, <email xlink:href="mailto:bengican.guelegen@charite.de">bengican.guelegen@charite.de</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-11">
<day>11</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>17</volume>
<elocation-id>1621532</elocation-id>
<history>
<date date-type="received">
<day>01</day>
<month>05</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>13</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>25</day>
<month>12</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 G&#xfc;legen, Haaf, Schl&#xfc;&#xdf;ler and K&#xf6;hler.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>G&#xfc;legen, Haaf, Schl&#xfc;&#xdf;ler and K&#xf6;hler</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-11">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Background</title>
<p>The documentation process in psychiatric interviews is laborious and often compromises the quality of patient care. Addressing this challenge, we explored the potential of artificial intelligence (AI) to automate documentation tasks and improve efficiency in psychiatric practice.</p>
</sec>
<sec>
<title>Methods</title>
<p>Six simulated psychiatric interviews were transcribed and summarized using an AI model and compared to a gold standard, together with reports written by humans. Reports were decomposed into binary items using a predefined codebook covering patient information, current complaints, psychiatric history, medical history, medication, substance use, social history, family history, vegetative symptoms, psychopathology, and preliminary diagnoses. Transcription accuracy, performance, and inter-rater reliability were evaluated.</p>
</sec>
<sec>
<title>Results</title>
<p>The AI achieved a high transcription accuracy with a mean word error rate of 9.44% and a Levenshtein score of 0.966, aligning with current voice-to-text transcription standards. Inter-rater reliability was high overall. The mean Cohen&#x2019;s &#x3ba; was 0.80 (SD = 0.33), the mean percent agreement was 0.96 (SD = 0.07), and the mean Gwet&#x2019;s AC1 was 0.93 (SD = 0.12). Across all categories, human reports showed substantially higher agreement with the gold standard than AI reports. The mean accuracy was 0.94 (SD = 0.01) for human reports and 0.78 (SD = 0.08) for AI reports, t(5) = 6.33, p = .003. The mean F1 scores were also higher for human reports (M = 0.89, SD = 0.02) than for AI reports (M = 0.55, SD = 0.13), t(5) = 7.38, p = .001. Occasionally, AI reports provided more detailed contextual information than human reports. However, AI reports also introduced clinically relevant inaccuracies and struggled in complex domains such as psychopathology.</p>
</sec>
<sec>
<title>Conclusions</title>
<p>While our findings suggest promising prospects for AI-driven documentation in psychiatry, further development is essential to enhance the model&#x2019;s ability to comprehensively assess and document psychopathological features. Importantly, some AI-generated inaccuracies were clinically significant, underscoring the necessity of a final clinical review by a qualified professional. These findings are limited by the very small number of highly controlled simulated interviews. Larger studies with real patients, diverse clinicians, and routine clinical workflows will be required. Nonetheless, AI-supported documentation has the potential to considerably reduce time demands and alleviate the documentation burden in psychiatric care.</p>
</sec>
</abstract>
<kwd-group>
<kwd>artificial intelligence</kwd>
<kwd>clinical documentation</kwd>
<kwd>electronic medical records</kwd>
<kwd>natural language processing</kwd>
<kwd>neural language models</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. The authors declare that this study received funding from Amnexis Digital Solutions GmbH. The funder was not involved in the study design, data collection, data analysis, interpretation of data, the writing of this article, or the decision to submit it for publication. Amnexis Digital Solutions GmbH also covered the article processing charges (APC).</funding-statement>
</funding-group>
<counts>
<fig-count count="4"/>
<table-count count="5"/>
<equation-count count="0"/>
<ref-count count="48"/>
<page-count count="13"/>
<word-count count="8358"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Digital Mental Health</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<sec id="s1_1">
<label>1.1</label>
<title>Documentation burden</title>
<p>Documentation burden poses a particularly significant challenge across various domains of medicine, exerting high pressure on healthcare providers and impacting the delivery of quality patient care (<xref ref-type="bibr" rid="B1">1</xref>). In clinical settings, precise documentation serves as a cornerstone for effective communication, continuity of care, and legal compliance (<xref ref-type="bibr" rid="B2">2</xref>). However, the process of capturing and recording patient information is often plagued by its labor-intensive and time-demanding nature. This burden manifests in diverse ways, from the need to manually write detailed clinical encounters to the exhausting task of navigating complex electronic health record systems. As such, physicians devote an estimated one-third to two-thirds of their workday to electronic health records, having to spend more time on health records than on direct patient contact (<xref ref-type="bibr" rid="B3">3</xref>). For example, a study regarding the documentation burden for physicians in the USA showed a mean of 1.77 (95% CI, 1.67&#x2013;1.87) hours daily spent on documentation outside of office hours (<xref ref-type="bibr" rid="B1">1</xref>). Another US-based study reported 59% of the on average 1.5 hours after work spent on electronic healthcare record (EHR) (<xref ref-type="bibr" rid="B3">3</xref>). During office hours, 49.2% of physicians&#x2019; total time was spent on EHR and desk work. That, in turn, can contribute to clinician burnout (<xref ref-type="bibr" rid="B4">4</xref>&#x2013;<xref ref-type="bibr" rid="B6">6</xref>) and lack of documentation quality, which can even lead to clinical errors (<xref ref-type="bibr" rid="B7">7</xref>).</p>
<p>While documentation burden can be pervasive throughout all fields of medicine, its repercussions may be particularly pronounced within the field of psychiatry. Unlike many other medical specialties, psychiatry revolves around the nuanced exploration of patients&#x2019; mental health and emotional well-being, relying heavily on interpersonal communication and rapport-building during interviews (<xref ref-type="bibr" rid="B8">8</xref>). Additionally, legal requirements for documentation play a growing role in psychiatry (<xref ref-type="bibr" rid="B9">9</xref>). As such, the documentation process in psychiatry is not merely a logistical hurdle but a crucial component of holistic patient care, underscoring the need for innovative solutions to streamline and enhance documentation practices in psychiatry and psychotherapy.</p>
</sec>
<sec id="s1_2">
<label>1.2</label>
<title>AI solutions</title>
<p>Artificial intelligence (AI) presents promising solutions for addressing the documentation burden pervasive in psychiatric practice and medicine at large. With the power of machine learning algorithms and natural language processing techniques, AI technologies offer the potential to optimize clinical documentation workflows. Through automated transcription, summarization, and analysis of clinical encounters, AI systems may alleviate the manual burden of documentation, freeing up clinicians&#x2019; time for more meaningful patient interactions and therapeutic interventions. Moreover, AI-driven documentation solutions have the capacity to enhance the accuracy, completeness, and consistency of patient records, mitigating the risks associated with human error and variability in documentation practices (<xref ref-type="bibr" rid="B10">10</xref>).</p>
<p>Lin et&#xa0;al. (2018) presented a theoretical prototype for an AI recording patient&#x2013;physician encounters via speech recognition and summarizing, sorting, and assembling clinical information. The authors went even further, suggesting the AI&#x2019;s ability to make clinical recommendations, predict clinical risks, calculate scores, and derive ICD codes (<xref ref-type="bibr" rid="B11">11</xref>). They concluded that many technologies necessary for such an autoscribe already exist. However, they noted that it remains unclear how exactly autoscribes can be developed, given the large amount of data necessary. In the meantime, some AI tools have been evaluated in other fields of medicine. For example, a proof-of-concept study demonstrated the ability of a software tool to automatically create surgical reports (<xref ref-type="bibr" rid="B12">12</xref>). In the field of radiology, several products have been discussed to improve clinical documentation via AI, for example, through voice-recording examination rooms or smart watches that transcribe conversations and create clinical notes (<xref ref-type="bibr" rid="B10">10</xref>). However, there have been no AI solutions proposed for the psychiatric field.</p>
</sec>
<sec id="s1_3">
<label>1.3</label>
<title>Aims and hypotheses</title>
<p>In this study, we aimed to evaluate the functionality and potential applicability of an AI software solution that is currently in development and has been utilized across various other medical specialties (<xref ref-type="bibr" rid="B13">13</xref>). While AI technologies have demonstrated promising results in automating clinical documentation tasks (<xref ref-type="bibr" rid="B14">14</xref>, <xref ref-type="bibr" rid="B15">15</xref>), their utilization and effectiveness in psychiatry remain relatively unexplored. Recognizing the unique challenges and nuances of psychiatric practice, we aimed to assess the feasibility and performance of this AI software within the context of psychiatric interviews and their documentation. By conducting a proof-of-concept evaluation, we aimed to elaborate on the strengths and limitations of implementing AI-driven documentation solutions in psychiatric settings.</p>
<p>We aimed to examine how effective the AI software would be in voice-to-text transcription and summarization by calculating its error rate, and especially in how accurately content would be assigned to its respective category within the psychiatric report, by comparing AI reports to human-written reports. Our goal was to explore not only overall quality and accuracy but also differences regarding different categories of psychiatric reports, to, for example, individually test the capability of recognizing and interpreting psychopathology as a key aspect of psychiatric evaluation.</p>
</sec>
</sec>
<sec id="s2">
<label>2</label>
<title>Methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>AI software</title>
<p>We utilized an AI software called QUIXXS, which is currently under development by Amnexis Digital Solutions GmbH, an Ireland-based tech company specializing in digital health solutions utilizing AI (<xref ref-type="bibr" rid="B13">13</xref>). The software is already in use in several clinical fields, but has not yet been evaluated in psychiatric environments and has not been tested in clinical studies. The AI software runs on smartphones as an Android or iOS application and utilizes the recording function of the smartphone.</p>
<p>During a doctor&#x2013;patient encounter, the smartphone is positioned in such a way that both the patient and the doctor can be heard well. The application can also be used for dictations without patients. The recorded audio data can be converted into a transcript using speech-to-text functions. To meet high data protection requirements, transcripts are initially pseudonymized. Afterward, transcription is performed using the Web API Whisper, a neural net automatic speech recognition (ASR) system by OpenAI, which has been proven to show human-like accuracy and robustness (<xref ref-type="bibr" rid="B16">16</xref>). Employing GPT-4, a large language model (<xref ref-type="bibr" rid="B17">17</xref>), a medical information filter compresses the pseudonymized transcript by extracting medically relevant information. A stream-based approach was employed, in which the transcript is divided into segments of approximately 5 minutes each. Every segment is compressed using a domain-specific summarization prompt. The resulting partial summaries are then sequentially linked together into an overall synthesis, which serves as the essential basis for form filling and further AI functions (e.g., cross-sectional analyses). In the final step, a previously selected form template is completed with the obtained information. Templates are used to specify the external form, structure, and desired content of a report. A data/form matcher fills the individual fields of the template with the medical information, thus completing the creation process. <xref ref-type="fig" rid="f1"><bold>Figure 1</bold></xref> depicts the processing pipeline of QUIXXS from audio input to clinical report generation.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>End-to-end processing pipeline for transforming recorded clinical speech into a structured medical report. Audio input is captured and transcribed, pseudonymised to protect patient identity, filtered to extract medically relevant content, and summarised. The resulting information is then matched against predefined templates and model knowledge to generate a standardised clinical report.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-17-1621532-g001.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a process: voice capture creates an audio file, which is converted to text, forming a raw transcript. This undergoes pseudonymisation, producing a pseudonymised transcript. A medical information filter generates a medical summary. Information form matcher integrates with model knowledge, using templates to produce a report. Arrows indicate process flow.</alt-text>
</graphic></fig>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Interview mode</title>
<p>For this proof-of-concept study, we simulated psychiatric interviews, as they would occur during the admission of patients for inpatient treatment at the clinic for psychiatry and psychotherapy of the University Hospital Charit&#xe9; Berlin Mitte. We conducted six interviews, one of which was held by a final-year medical student who was just completing their psychiatric rotation and conducting psychiatric interviews on the acute ward regularly, and two by a psychiatric resident. Interview partners, simulating a patient, were psychiatric nurses, residents, and volunteers with no medical background whatsoever. Those who were simulating a patient were instructed to present themselves as patients looking for psychiatric help because of some, not further specified, psychiatric symptoms. Those who were conducting the interview did not know beforehand which diagnosis the patients would introduce themselves with.</p>
<p>The interviews were held in the same manner as during a real, first physician&#x2013;patient encounter on our acute psychiatric ward. Interviews were held in German and in accordance with standardized internal guidelines and protocols of our clinic, which were designed for psychiatric assessments. Patients were asked about their main symptoms, their psychiatric and medical history, their sociobiography, their family&#x2019;s psychiatric history, and their current and recent medication. Psychopathology was assessed using a semi-structured interview according to the AMDP System: Manual for Assessment and Documentation of Psychopathology in Psychiatry (<xref ref-type="bibr" rid="B18">18</xref>).</p>
<p>During the interview, audio was recorded and then processed using QUIXXS. From the recorded and transcribed audio files, QUIXXS then identified the roles of the physician and the patient. Based on the transcribed and processed documents, QUIXXS assigned information to predetermined categories, such as patient information (including their name, age, and gender), current complaints or reason for presentation, psychiatric history, medical history, allergies, medication, social history, family history, vegetative symptoms, psychopathology, and preliminary diagnosis. The reports created automatically by QUIXXS were then saved as HTML files. After the interview, the physician wrote a report with the same, above-mentioned categories, as for a real inpatient treatment. The human reports were saved as Word files for future comparison. All reports were written in German, but excerpts for this paper were always translated into English for better understandability.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Measures</title>
<p>To test the quality of the AI reports, we tested error-proneness for two different steps of the report creation: transcription and reporting.</p>
<p>To test the accuracy of the voice-to-text transcription, the word error rate (WER) was used, which is the ratio of errors in a transcript to the total words spoken, the most common way to report the quality of voice-to-text transcription (<xref ref-type="bibr" rid="B19">19</xref>). Errors in this case would be substitutions, deletions, or insertions. For comparison, a transcript was created by a human as a ground-truth transcript. Both the human and AI transcripts were normalized: punctuation was deleted, and uppercase letters were changed to lowercase letters. Additionally, the following changes were made: abbreviations were written out, slang words (such as &#x201c;nah&#x201d; instead of &#x201c;no&#x201d;) and contractions (&#x201c;can&#x2019;t&#x201d; instead of &#x201c;cannot&#x201d;, and &#x201c;gonna&#x201d; instead of &#x201c;going to&#x201d;) were written out, filler words (such as &#x201c;hmm&#x201d; or &#x201c;uhm&#x201d;) were deleted, and non-lexical conversational sounds were replaced with their equivalent (e.g., &#x201c;no&#x201d; instead of &#x201c;uh-uh&#x201d;). Grammatical errors of non-native speakers were corrected. Word error rate was then calculated using Python v3.12.3 (<xref ref-type="bibr" rid="B20">20</xref>) and the Speechmatics Python client v1.14.8 (<xref ref-type="bibr" rid="B21">21</xref>). In addition to WER, the Levenshtein score was calculated, which measures the similarity between the human and AI transcripts based on the minimum number of single-word edits (insertions, deletions, or substitutions) required to transform one into the other (<xref ref-type="bibr" rid="B22">22</xref>). This score provides a normalized similarity metric between 0 and 1, with 1 indicating perfect similarity. The calculation was performed using the python-Levenshtein library (<xref ref-type="bibr" rid="B23">23</xref>).</p>
<p>To assess the accuracy and quality of the reports, a reference standard was defined using the original audio recordings of the clinical interviews. A structured codebook was developed to operationalize systematic comparisons between this gold standard, the manually written reports, and the AI-generated reports. The codebook was grounded in standard domains of psychiatric assessment and comprised the following sections: personal information, current complaints, psychiatric history, somatic history, allergies, medication, substance use, social history, family history, vegetative symptoms, psychopathology, and preliminary diagnoses. Several sections of the codebook used multiple-choice response formats that combined fixed answer options with an additional free-text field. Fixed options covered the most frequently occurring categories, for example, commonly prescribed medications such as sertraline or mirtazapine, while free-text entries allowed entries that were not represented among the predefined options. Other sections relied exclusively on fixed single-choice options. An example is the assessment of visual hallucinations, with predefined responses of present and not present. If a report did not address visual hallucinations at all, no option was selected, and the corresponding item remained blank. Each response option within the codebook was treated as an independent binary item. For instance, within the substance use domain, nicotine use, alcohol use, and cocaine use were coded as separate variables. This design enabled a fine-grained, item-level analysis and deliberately avoided reliance on global free-text similarity measures. The exported codebook data were processed programmatically, and all responses were converted into string-based representations. Multi-select fields were automatically decomposed into individual binary indicators, resulting in a one-hot encoded item matrix in which each response option corresponded to exactly one analyzable variable. 
For each case and each item, concordance was assessed between the gold standard and the manual report, as well as between the gold standard and the AI-generated report. Items documented in a report but not supported by the audio recording were classified as false positives, whereas items present in the audio recording but absent from a report were classified as false negatives.</p>
<p>Structured codebook development and independent double coding, as employed here, follow recognized frameworks for quantitative content analysis in clinical research, providing a transparent and reproducible method for annotating and comparing key clinical concepts across sources.</p>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Inter-rater reliability</title>
<p>To assess the inter-rater reliability of the proposed evaluation framework, a second independent rater re-rated all manual and AI-generated reports using the same codebook and rating procedure as the primary rater. Both raters were psychiatrically trained and blinded to each other&#x2019;s ratings. Ratings were based on the structured item matrix derived from the original codebook, in which each answer option was represented as an individual binary item. Before statistical comparison, ratings were manually normalized to ensure semantic equivalence across raters, particularly for items derived from free-text inputs. This included harmonizing different phrasings referring to the same concept, for example, treating &#x201c;leg fracture&#x201d; and &#x201c;broken leg&#x201d; as equivalent. For each item, agreement between raters was computed on a per-case basis. Inter-rater reliability was primarily quantified using Cohen&#x2019;s &#x3ba;, which estimates agreement beyond chance for binary ratings (<xref ref-type="bibr" rid="B24">24</xref>). Because &#x3ba; is sensitive to prevalence and marginal distributions, particularly in sparse or highly imbalanced items, two additional complementary measures were calculated. First, percent agreement was reported to provide an intuitive measure of absolute concordance (<xref ref-type="bibr" rid="B25">25</xref>). Second, Gwet&#x2019;s AC1 was computed as a more robust chance-corrected agreement coefficient that is less affected by prevalence effects and is therefore recommended for reliability analyses involving binary clinical data with skewed distributions (<xref ref-type="bibr" rid="B26">26</xref>).</p>
</sec>
<sec id="s2_5">
<label>2.5</label>
<title>Statistical analyses</title>
<p>Performance was evaluated at both the global level and the section level. Accuracy was defined as the proportion of correctly classified items among all evaluated items. To account for class imbalance and to distinguish between overreporting and underreporting of clinical information, precision, recall, and the F1 score were additionally computed, with the F1 score defined as the harmonic mean of precision and recall. All performance metrics were calculated separately for manual reports relative to the gold standard and for AI-generated reports relative to the gold standard. In addition, absolute counts of false positive and false negative items were reported to characterize systematic error patterns. This evaluation framework, while novel in the psychiatric domain, follows established approaches for the objective assessment of medical AI outputs, in which performance is quantified using item-based accuracy, precision, recall, and F1 scores derived from false-positive and false-negative counts (<xref ref-type="bibr" rid="B27">27</xref>&#x2013;<xref ref-type="bibr" rid="B29">29</xref>).</p>
<p>For performance comparison between manual and AI-generated reports, paired two-tailed t-tests were conducted. These analyses were used to test for differences in mean accuracy, precision, recall, and F1 scores between report types. All analyses and visualizations were performed using Python version 3.12.3 (<xref ref-type="bibr" rid="B30">30</xref>), and figures were generated using Matplotlib (<xref ref-type="bibr" rid="B31">31</xref>).</p>
</sec>
</sec>
<sec id="s3" sec-type="results">
<label>3</label>
<title>Results</title>
<sec id="s3_1">
<label>3.1</label>
<title>Transcription</title>
<p>WER and Levenshtein scores are reported in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>. The mean WER was 9.44% (SD = 2.13%). Interviews contained an average of 2,684.33 words (SD = 800.68). Substitutions (M = 3.34%; SD = 1.60%) and deletions (M = 3.31%; SD = 1.21%) occurred more often than insertions (M = 2.77%; SD = 0.64%). The mean Levenshtein score was 0.966 (SD = 0.006).</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Report for the accuracy benchmarking for all six interviews, measuring the word error rate (WER) and Levenshtein score.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">No.</th>
<th valign="middle" align="left">Diagnosis</th>
<th valign="middle" align="left">Native speaker</th>
<th valign="middle" align="left">WER</th>
<th valign="middle" align="left">Levenshtein score</th>
<th valign="middle" align="left">Words (#)</th>
<th valign="middle" align="left">Subst. (%)</th>
<th valign="middle" align="left">Delet. (%)</th>
<th valign="middle" align="left">Insert. (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">1</td>
<td valign="middle" align="left">DEP, SP, PD</td>
<td valign="middle" align="left">No</td>
<td valign="middle" align="left">0.08</td>
<td valign="middle" align="left">0.9727</td>
<td valign="middle" align="left">3,104</td>
<td valign="middle" align="left">3.834</td>
<td valign="middle" align="left">2.223</td>
<td valign="middle" align="left">1.707</td>
</tr>
<tr>
<td valign="middle" align="left">2</td>
<td valign="middle" align="left">DEP, SP, PD, OCD</td>
<td valign="middle" align="left">Yes</td>
<td valign="middle" align="left">0.09</td>
<td valign="middle" align="left">0.9637</td>
<td valign="middle" align="left">2,325</td>
<td valign="middle" align="left">3.484</td>
<td valign="middle" align="left">2.968</td>
<td valign="middle" align="left">2.538</td>
</tr>
<tr>
<td valign="middle" align="left">3</td>
<td valign="middle" align="left">DEP</td>
<td valign="middle" align="left">Yes</td>
<td valign="middle" align="left">0.12</td>
<td valign="middle" align="left">0.9614</td>
<td valign="middle" align="left">4,015</td>
<td valign="middle" align="left">6.077</td>
<td valign="middle" align="left">3.163</td>
<td valign="middle" align="left">3.163</td>
</tr>
<tr>
<td valign="middle" align="left">4</td>
<td valign="middle" align="left">ADHD</td>
<td valign="middle" align="left">Yes</td>
<td valign="middle" align="left">0.11</td>
<td valign="middle" align="left">0.9576</td>
<td valign="middle" align="left">1,685</td>
<td valign="middle" align="left">2.789</td>
<td valign="middle" align="left">5.697</td>
<td valign="middle" align="left">2.967</td>
</tr>
<tr>
<td valign="middle" align="left">5</td>
<td valign="middle" align="left">BPAD</td>
<td valign="middle" align="left">Yes</td>
<td valign="middle" align="left">0.07</td>
<td valign="middle" align="left">0.9739</td>
<td valign="middle" align="left">2,653</td>
<td valign="middle" align="left">1.282</td>
<td valign="middle" align="left">2.902</td>
<td valign="middle" align="left">2.676</td>
</tr>
<tr>
<td valign="middle" align="left">6</td>
<td valign="middle" align="left">SCHIZ</td>
<td valign="middle" align="left">Yes</td>
<td valign="middle" align="left">0.09</td>
<td valign="middle" align="left">0.9670</td>
<td valign="middle" align="left">2,324</td>
<td valign="middle" align="left">2.582</td>
<td valign="middle" align="left">2.926</td>
<td valign="middle" align="left">3.571</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>WER consists of substitutions (Subst.), deletions (Delet.), and insertions (Insert.). Diagnoses: depression (DEP), social phobia (SP), panic disorder (PD), obsessive&#x2013;compulsive disorder (OCD), attention deficit hyperactivity disorder (ADHD), bipolar affective disorder (BPAD), and schizophrenia (SCHIZ).</p></fn>
</table-wrap-foot>
</table-wrap>
<p>However, some relevant errors occurred, which can be seen in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>. For example, when a patient was asked to spell the word &#x201c;radio&#x201d; backward to test for impairment of concentration, he was able to do so correctly. The AI transcript, however, only detected three of five letters correctly, therefore leading to a transcript that would suggest impairment of concentration, where there is none. Another example of significant mistakes was the misunderstanding of medical diseases. For instance, &#x201c;hypertension&#x201d; was once mistaken for &#x201c;hypothermia&#x201d;, which could lead to a faulty medical history. Also, in many cases, names of medication were transcribed incorrectly, for example, &#x201c;metazapine&#x201d; instead of &#x201c;mirtazapine&#x201d; or &#x201c;Zuprexa&#x201d; instead of &#x201c;Zyprexa&#x201d;.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Comparison of the AI transcript and the ground truth, with differences highlighted in bold.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">AI transcript</th>
<th valign="middle" align="center">Ground truth</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Okay, could you spell the word radio backwards? Radio backwards? <bold>O I D.</bold> Okay great.</td>
<td valign="middle" align="left">Okay, could you spell the word radio backwards? Radio backwards? <bold>O I D A R</bold>. Okay great.</td>
</tr>
<tr>
<td valign="middle" align="left">My general practitioner prescribes me something against blood pressure. How is it called? It is called Ramipril, I think. Ramipril, okay, that means you have a known <bold>hypothermia</bold>?</td>
<td valign="middle" align="left">My general practitioner prescribes me something against blood pressure. How is it called? It is called Ramipril, I think. Ramipril, okay, that means you have a known <bold>hypertension</bold>?</td>
</tr>
<tr>
<td valign="middle" align="left">Now that I take the <bold>Metazapine</bold>, I sometimes feel a bit hungry again.</td>
<td valign="middle" align="left">Now that I take the <bold>Mirtazapine</bold>, I sometimes feel a bit hungry again.</td>
</tr>
<tr>
<td valign="middle" align="left">The medication was called <bold>Zuprexa</bold> I think.</td>
<td valign="middle" align="left">The medication was called <bold>Zyprexa</bold> I think.</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Reports</title>
<p>Inter-rater reliability was high overall. Across all 209 items and both report types combined, the mean Cohen&#x2019;s &#x3ba; was 0.80 (SD = 0.33), the mean percent agreement was 0.96 (SD = 0.07), and the mean Gwet&#x2019;s AC1 was 0.93 (SD = 0.12), indicating robust agreement beyond chance despite heterogeneous item prevalence (mean prevalence = 0.24, SD = 0.21). For AI reports, the mean Cohen&#x2019;s &#x3ba; was 0.75 (SD = 0.38), accompanied by a high mean percent agreement of 0.96 (SD = 0.09) and a mean Gwet&#x2019;s AC1 of 0.93 (SD = 0.14), indicating robust agreement. For manually written reports, inter-rater reliability was similarly high, with a mean Cohen&#x2019;s &#x3ba; of 0.81 (SD = 0.36), a mean percent agreement of 0.96 (SD = 0.09), and a mean Gwet&#x2019;s AC1 of 0.94 (SD = 0.15), reflecting slightly higher overall consistency between raters compared with AI-generated content.</p>
<p><xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref> shows accuracy, F1 scores, false negatives, and false positives for human and AI reports.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Global performance of human and AI-generated psychiatric reports compared to the gold standard. The left panel displays global accuracy and F1 scores, while the right panel shows the mean number of false positives (FP) and false negatives (FN) per case. Blue bars represent human-written reports, and orange bars represent AI-generated reports. Error bars indicate standard deviations across cases. Group differences between human and AI reports were tested using two-tailed paired t-tests. Statistically significant differences are marked with asterisks (**p &lt; .01, ***p &lt; .001).</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-17-1621532-g002.tif">
<alt-text content-type="machine-generated">Bar charts comparing human and AI reports. The left chart shows higher accuracy and F1 scores for human reports. The right chart shows lower false positives (FP) and false negatives (FN) for human reports compared to AI. Error bars indicate variability.</alt-text>
</graphic></fig>
<p>Across all categories and items, human-generated reports showed substantially higher agreement with the gold standard than AI-generated reports. Human reports achieved a mean accuracy of 0.94 (SD = 0.01), whereas AI reports reached a mean accuracy of 0.78 (SD = 0.08). This difference was statistically significant, t(5) = 6.33, p = .003. Similarly, mean F1 scores were significantly higher for human reports (M = 0.89, SD = 0.02) compared to AI reports (M = 0.55, SD = 0.13), t(5) = 7.38, p = .001. Error analysis revealed that AI reports produced more false negatives (M = 33.50, SD = 10.31) than human reports (M = 6.33, SD = 2.07), a difference that was statistically significant, t(5) = &#x2212;8.39, p &lt;.001. False positives did not differ significantly between human (M = 7.00, SD = 1.90) and AI reports (M = 13.17, SD = 7.03), t(5) = &#x2212;2.02, p = .092.</p>
<p><xref ref-type="fig" rid="f3"><bold>Figure 3</bold></xref> shows the performance of AI reports throughout the different categories and <xref ref-type="fig" rid="f4"><bold>Figure 4</bold></xref> shows F1 scores. AI performance was the closest to human documentation in categories with limited semantic ambiguity and clearly defined factual content. For family history, AI achieved perfect agreement with the gold standard, with a mean accuracy of 1.00 (SD = 0.00) and an F1 score of 1.00 (SD = 0.00), matching human reports exactly (accuracy = 1.00, SD = 0.00; F1 = 1.00, SD = 0.00). Similarly, in medical history, AI reports demonstrated high accuracy (M = 0.92, SD = 0.13) and F1 scores (M = 0.90, SD = 0.15), approaching the performance of human reports (accuracy: M = 0.98, SD = 0.04; F1: M = 0.98, SD = 0.05). Comparable patterns were observed in medication history, where AI achieved an accuracy of 0.88 (SD = 0.08) and an F1 score of 0.73 (SD = 0.17), while human reports remained superior but closer in magnitude (accuracy: M = 0.95, SD = 0.07; F1: M = 0.90, SD = 0.12). For describing current complaints, AI performance declined more noticeably. AI reports reached a mean accuracy of 0.60 (SD = 0.36) and an F1 score of 0.64 (SD = 0.31), substantially below those of human reports, which achieved perfect performance in this category (both accuracy and F1: M = 1.00, SD = 0.00). This discrepancy reflects AI difficulties in consistently capturing symptom onset, course, and emphasis as expressed in free-form patient narratives. More pronounced performance gaps emerged for psychiatric history, where AI reports reached a mean accuracy of 0.79 (SD = 0.17) and an F1 score of 0.58 (SD = 0.32), compared to substantially higher values in human reports (accuracy: M = 0.97, SD = 0.03; F1: M = 0.96, SD = 0.06). 
Similarly, for psychopathology, AI performance was markedly lower, with a mean accuracy of 0.74 (SD = 0.07) and an F1 score of 0.50 (SD = 0.11), whereas human reports achieved near-ceiling performance (accuracy: M = 0.96, SD = 0.03; F1: M = 0.94, SD = 0.04). For social history, AI reports demonstrated moderate performance (accuracy: M = 0.70, SD = 0.21; F1: M = 0.62, SD = 0.34), again falling short of human documentation (accuracy: M = 0.84, SD = 0.18; F1: M = 0.74, SD = 0.25). Notably, both AI and human reports showed lower performance in vegetative symptoms, often omitting several items. AI achieved an accuracy of 0.74 (SD = 0.12) and an F1 score of 0.45 (SD = 0.19), which was similar in accuracy but lower in F1 compared to human reports (accuracy: M = 0.73, SD = 0.13; F1: M = 0.56, SD = 0.28). Finally, preliminary diagnoses revealed a distinctive pattern. While AI accuracy was low (M = 0.42, SD = 0.49), its F1 score reached 1.00 (SD = 0.00), mirroring human reports (accuracy: M = 0.83, SD = 0.41; F1 = 1.00, SD = 0.00). This reflects the binary structure of the diagnosis items and the fact that, when diagnoses were named, AI tended to reproduce them consistently, despite frequent omissions reflected in reduced accuracy.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Accuracy by category for human and AI reports. The mean accuracy values for human-written and AI-generated psychiatric reports are shown separately for each diagnostic category, based on comparisons with the gold-standard annotations. Error bars represent standard deviations across cases. Accuracy reflects the proportion of correctly classified binary items within each category. Info, Patient Information; Compl, Current Complaints; PsyHist, Psychiatric History; MedHist, Medical History; Subst, Substance Use; Social, Social History; Family, Family History; Veget, Vegetative Symptoms; PsychPath, Psychopathology; Prelim, Preliminary Diagnosis.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-17-1621532-g003.tif">
<alt-text content-type="machine-generated">Bar chart comparing accuracy scores by category between human and AI reports. Categories include Info, Compl, PsyHis, MedHis, Med_cat, Subst, Social, Family, Veget, PsychPath, and Prelim. Human reports generally show higher scores across most categories. Error bars indicate variability.</alt-text>
</graphic></fig>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>F1 scores by category for human and AI reports. The mean F1 scores for human-written and AI-generated psychiatric reports are shown separately for each diagnostic category, based on comparisons with the gold-standard annotations. Error bars represent standard deviations across cases. The F1 score reflects the harmonic mean of precision and recall relative to the gold standard. Info, Patient Information; Compl, Current Complaints; PsyHist, Psychiatric History; MedHist, Medical History; Subst, Substance Use; Social, Social History; Family, Family History; Veget, Vegetative Symptoms; PsychPath, Psychopathology; Prelim, Preliminary Diagnosis.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-17-1621532-g004.tif">
<alt-text content-type="machine-generated">Bar chart comparing F1 scores by category for human and AI reports. Categories include Info, Compl, PsyHis, MedHis, Medicat, Subst, Social, Family, Veget, PsychPath, and Prelim. Blue bars represent human reports, and orange bars represent AI reports. The chart shows scores and error bars with human reports generally scoring higher across most categories.</alt-text>
</graphic></fig>
<p><xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref> shows examples of both human and AI reports, compared to the transcript, to visualize how information from the recorded conversation was converted to the report and to contextualize the quantitative performance differences across categories.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Side-by-side comparison of the human and AI reports next to excerpts of the transcript.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Transcript</th>
<th valign="top" align="center">Human report</th>
<th valign="middle" align="center">AI report</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="2" align="left"><italic>Doctor:</italic> Okay, and how long has this been going on?<break/><italic>Patient:</italic> It has actually been like this for about two or three years, but in the past few weeks, it has gotten much worse again. (&#x2026;) Then I basically spend the whole day either in bed or on the couch watching TV, but otherwise, I don&#x2019;t do anything else anymore.</td>
<td valign="middle" colspan="2" align="center">Current complaints (example of interview no. 1)</td>
</tr>
<tr>
<td valign="top" align="left">The symptoms have been present for 2&#x2013;3 years but have significantly worsened in recent weeks. There is no specific trigger. The patient spends a lot of time in bed (&#x2026;) and spends the rest of the day watching TV.</td>
<td valign="top" align="left">He has been feeling extremely exhausted and isolated for the past 2&#x2013;3 years, and his condition has worsened in recent weeks. His daily routine mainly consists of lying in bed or on the couch and watching TV.</td>
</tr>
<tr>
<td valign="middle" rowspan="2" align="left"><italic>Patient:</italic> First, (my psychiatrist) prescribed me Sertraline. But that caused stomach problems for me, so he stopped it, and now I take Mirtazapine every evening.<break/><italic>Doctor:</italic> Have there been any improvements?<break/><italic>Patient:</italic> I might be able to fall asleep a bit better, but I still feel really bad.<break/>&#x223c;<break/><italic>Patient:</italic> No, I&#x2019;ve never had psychotherapy. (My psychiatrist) said I should search for a psychotherapist, but I just couldn&#x2019;t manage it.<break/>&#x223c;<break/><italic>Patient:</italic> And now with the Mirtazapine, I&#x2019;ve maybe gained two kilos or so.<break/>&#x223c;<break/><italic>Doctor:</italic> And have you ever had an inpatient stay?<break/><italic>Patient:</italic> You mean psychiatric? No, I&#x2019;ve never been inside of a psychiatric hospital.</td>
<td valign="middle" colspan="2" align="center">Psychiatric history (example of interview no. 1)</td>
</tr>
<tr>
<td valign="top" align="left">His medication was switched from sertraline to mirtazapine due to gastrointestinal side effects.<break/>Under mirtazapine, there has been slight weight gain but improvement in sleep disturbances. No previous inpatient stays, no psychotherapy.</td>
<td valign="top" align="left">He previously took sertraline but stopped due to side effects.<break/>Psychotherapy has been recommended but has not yet been initiated.</td>
</tr>
<tr>
<td valign="middle" rowspan="2" align="left"><italic>Doctor:</italic> Do you take any other medication (&#x2026;) besides the Mirtazapine?<break/><italic>Patient:</italic> My general practitioner prescribes me something for blood pressure.<break/><italic>Doctor:</italic> What is it called?<break/><italic>Patient:</italic> It&#x2019;s called Ramipril, I think.<break/>&#x223c;<break/><italic>Doctor:</italic> But do you have anything else?<break/><italic>Patient:</italic> I had my appendix removed once.<break/><italic>Doctor:</italic> Okay. Do you have any allergies? You mentioned hay fever, but any allergies against medication?<break/><italic>Patient:</italic> I&#x2019;m allergic to penicillin.</td>
<td valign="middle" colspan="2" align="center">Medical history and medication (example of interview no. 1)</td>
</tr>
<tr>
<td valign="top" align="left">Hypertension, appendectomy<break/>Allergies: penicillin and hay fever<break/>Medication: mirtazapine and ramipril</td>
<td valign="top" align="left"><bold>n/c</bold><break/>Allergies: penicillin<break/>Medication: mirtazapine (for 1.5 years <bold>against sleep problem</bold>s) and ramipril (against hypertonia)</td>
</tr>
<tr>
<td valign="middle" rowspan="2" align="left"><italic>Patient:</italic> So, I&#x2019;m actually studying. But I haven&#x2019;t been able to attend my seminars in the last few months. And this semester, I haven&#x2019;t managed to do anything.<break/><italic>Doctor:</italic> What are you studying?<break/><italic>Patient:</italic> I&#x2019;m studying economics.<break/>&#x223c;<break/><italic>Doctor:</italic> Do you live alone or in a shared apartment?<break/><italic>Patient:</italic> I live in a shared apartment with another person, but they are also not home often. So, I am often alone.<break/><italic>Doctor:</italic> So, you also feel lonely often?<break/><italic>Patient:</italic> Yes, exactly.<break/>&#x223c;<break/><italic>Patient:</italic> I have a few friends, but they don&#x2019;t live nearby. So, I can&#x2019;t see them very often.<break/>&#x223c;<break/><italic>Doctor:</italic> And your family, where do they live?<break/><italic>Patient:</italic> Um, they live in Braunschweig. (&#x2026;) I talk to them on the phone now and then.<break/>&#x223c;<break/><italic>Doctor:</italic> Do you have a partner?<break/><italic>Patient:</italic> No, not anymore.<break/><italic>Doctor:</italic> When did the relationship end, if I may ask?<break/>That was five or six <bold>years</bold> ago.</td>
<td valign="middle" colspan="2" align="center">Social history (example of interview no. 2)</td>
</tr>
<tr>
<td valign="top" align="left">Studies economics, but could not continue for 4 months now. Had a job as a waitress, but had to quit due to a depressive episode. In Berlin, the patient has only sporadic contact with her friends. Her family lives in Braunschweig.</td>
<td valign="top" align="left">Lives in a shared apartment. Often feels lonely. Her family lives in Braunschweig, and they have contact by telephone. She has not had a partner for 5 to 6 years.</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Errors are highlighted in bold.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>An example of when AI performance was comparatively close to the human report and mirrored the interview content correctly is the current complaints section of interview no. 1. The patient reports a symptom duration of 2 to 3 years with marked worsening in recent weeks and describes spending most of the day in bed or on the couch watching television. Both the human report (&#x201c;The symptoms have been present for 2&#x2013;3 years but have significantly worsened in recent weeks&#x2026;&#x201d;) and the AI report (&#x201c;He has been feeling extremely exhausted and isolated for the past 2&#x2013;3 years&#x2026;&#x201d;) correctly captured symptom chronicity, functional impairment, and behavioral withdrawal.</p>
<p>In contrast, regarding their psychiatric history, in interview no. 1, the patient reports a switch from sertraline to mirtazapine due to gastrointestinal side effects, slight weight gain, improved sleep, absence of psychotherapy, and no prior inpatient treatment. While the human report explicitly integrated all these elements into a coherent longitudinal summary, the AI report selectively emphasized medication changes and side effects but omitted or fragmented contextual information, such as the lack of psychotherapy or inpatient stays. These omissions led to false negatives at the item level, reducing recall and F1 despite largely correct information.</p>
<p>In the medical history and medication categories, the excerpts show both the strengths and risks of AI enrichment. Compared with the human report, the AI sometimes appended additional medication context, such as an assumed indication or duration of treatment, even when the human report listed only the medication names. In interview no. 1, for example, the human report listed hypertension and appendectomy, penicillin and hay fever, and medication with mirtazapine and ramipril. The AI report, in contrast, included additional specifications, such as stating that mirtazapine had been prescribed for 1.5 years and assigning an indication (&#x201c;against sleep problems&#x201d;), and it also provided an indication for ramipril (&#x201c;against hypertonia&#x201d;). While indication and duration fields can be clinically useful, this excerpt also demonstrates the central failure mode: the assigned indication for mirtazapine was incorrect because the transcript indicates that mirtazapine was prescribed after sertraline for depressive symptoms, with sleep improvement reported as an effect rather than the primary reason for prescribing. This kind of plausible-sounding but incorrect attribute assignment would be expected to reduce precision and therefore depress F1, particularly in categories where correct attribution of diagnoses, indications, and historical treatments is central.</p>
<p>By contrast, social history errors more frequently involved abstraction and loss of specificity. In interview no. 2, the patient provides detailed information about studies, employment interruption, living situation, social isolation, geographic distance from friends, and family contact. The human report integrated these details into a structured narrative, whereas the AI report condensed the information into broader statements (&#x201c;Often feels lonely&#x201d; and &#x201c;family contact per telephone&#x201d;) and omitted temporal qualifiers such as the duration of unemployment or partnership status. These compressions are clinically plausible but failed to meet item-level criteria, leading to lower F1 scores despite superficially adequate summaries.</p>
<p>Sometimes, errors occurred due to faulty transcription. For example, once, the AI wrote that the patient had not been in a relationship for 5 to 6 years, although the patient stated that her relationship ended 5 to 6 <italic>months</italic> ago, missing a direct temporal link to the beginning of the current depressive episode, only because of the false transcription of <italic>years</italic> instead of <italic>months</italic>. In other cases, errors occurred, although the transcription was correct.</p>
<p>In some cases, the content of certain categories was misplaced altogether. <xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref> shows a few examples where a category consisted of almost no adequate information. We could not identify what caused this error.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Examples of unfitting or inadequate content within the AI reports.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">AI report</th>
</tr>
</thead>
<tbody>
<tr>
<th valign="middle" align="center">Medical History (example of interview no. 5)</th>
</tr>
<tr>
<td valign="middle" align="left">No obsessive&#x2013;compulsive disorders, phobias, delusions, or hallucinations.<break/>Eating disorder in their youth?<break/>Previous physical illnesses?<break/>Hospital stays?</td>
</tr>
<tr>
<th valign="middle" align="center">Vegetative symptoms (example of interview no. 5)</th>
</tr>
<tr>
<td valign="middle" align="left">No further medication other than lithium and quetiapine?<break/>Allergies?</td>
</tr>
<tr>
<th valign="middle" align="center">Psychiatric history (example of interview no. 4)</th>
</tr>
<tr>
<td valign="middle" align="left">No anxiety, phobias, compulsions, panic attacks, hallucinations, or suicidal thoughts. No experience with a false reality, mind reading, secret messages, feeling persecuted, or malevolent people. No experience with unreal body sensations, loud thoughts, or foreign voices. Brief thoughts about non-existence, but no desire for suicide or self-harm. Mother has depression, and brother has ADHD without medication.</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Finally, we show examples of AI reports for psychopathology, the most challenging category. As can be seen in <xref ref-type="table" rid="T5"><bold>Table&#xa0;5</bold></xref>, in some cases, AI described certain aspects of psychopathology correctly, especially mood and drive, but sometimes also suicidal thoughts, compulsions, phobias, and panic attacks. Impairments in concentration or memory were correctly identified some of the time, but sometimes incorrectly as well. Both formal and content thought disorders were incorrect most of the time. Interestingly, one time, the human report was wrong, and the AI report was correct: in interview no. 2, the human incorrectly stated that the patient was able to recall three out of three words after 10 minutes, documenting no impairment of memory retention, although the ground-truth transcript proved that the patient did not memorize all three words, which was correctly stated by the AI and apparently incorrectly memorized or documented by the physician.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Side-by-side comparison of the human and AI report.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Human report</th>
<th valign="middle" align="center">AI report</th>
</tr>
</thead>
<tbody>
<tr>
<th valign="middle" colspan="2" align="center">Interview no. 1</th>
</tr>
<tr>
<td valign="middle" align="left">In contact: awake, conscious, friendly, approachable, and communicative. Fully oriented to place, situation, and person; temporal orientation not tested. Memory retention (1 out of 3 terms recalled after 10 minutes) and concentration reduced (0/5 in serial subtraction and 5/5 in backward spelling). No amnesia. Comprehension and abstract thinking not impaired. Worries about the future. Social phobia with avoidant behavior. Panic attacks and derealization experiences are present. No compulsive thoughts or actions. Formal thought is slightly slowed with a tendency to ruminate. No content thought disorders. No ego disturbances. Depressed mood with reduced emotional fluctuation. Severe lack of drive with a loss of everyday competencies. Psychomotor activity is normal. Thoughts of being tired of life with passive death wishes. No suicidal thoughts or plans. Credibly distanced from acute suicidality. No acute self- or other-directed danger. Insight into the illness is present.</td>
<td valign="middle" align="left">(The patient) is awake and conscious. He is <bold>oriented to time</bold>, place, situation, and person. He <bold>shows comprehension</bold> and concentration disturbances, with <bold>several errors in backward spelling</bold> and serial subtraction. There are memory retention issues, as he could only remember a few terms after 10 minutes. There are indications of formal thought disorders such as ruminations and <bold>thought pressure</bold>. The patient exhibits <bold>mistrust</bold>, <bold>hypochondria</bold>, and <bold>obsessive thoughts</bold>. <bold>There are content-related thought disturbances such as depersonalization and hallucinations.</bold> The patient&#x2019;s mood is depressed, and his affect is blunted. His drive is greatly reduced, and <bold>psychomotor activity is slowed</bold>. <bold>Suicidality is present</bold>. Psychovegetative abnormalities are present. <bold>Insight into his condition is not clearly recognizable.</bold></td>
</tr>
<tr>
<th valign="middle" colspan="2" align="center">Interview no. 2</th>
</tr>
<tr>
<td valign="middle" align="left">In contact: awake, conscious, friendly, approachable, and communicative. Slightly disoriented in time (Monday, January 9, 2024). Fully oriented to place, situation, and person. <bold>Memory retention (3 out of 3 terms recalled after 10 minutes)</bold> and concentration (5/5 in serial subtraction and 5/5 in backward spelling) objectively intact but subjectively reduced. Memory, comprehension, and abstraction in proverb testing are unimpaired. Fear of loneliness, social phobia, acrophobia, and claustrophobia with avoidance behavior. Panic attacks with fear of death. Depersonalization experiences. Compulsive actions (checking the stove and checking emails). No compulsive thoughts. Formal thought is slightly slowed with a tendency to ruminate. No content thought disorders. No perceptual disturbances. No ego disturbances. Depressed mood with reduced emotional fluctuation. Reduced drive. Psychomotor activity is normal. Suicidal thoughts and methods (medication overdose), but credibly distanced from acute suicidality. No acute self- or other-directed danger. Insight into the illness is present.</td>
<td valign="middle" align="left">(The patient) is awake and conscious. She is <bold>oriented to time</bold>, place, situation, and person. <bold>Comprehension and concentration disturbances in backward spelling and serial subtraction were noted</bold>. She had memory retention issues, recalling only a few terms after 10 minutes. <bold>Formal thought disorders such as thought pressure and flight of ideas were observed</bold>. The patient shows fears of loneliness, heights, confined spaces, and social interactions. She suffers from panic attacks, compulsive actions, suicidal thoughts, identity loss, memory problems, sadness, lack of drive, and ruminations. The patient&#x2019;s mood is depressed, and her affect is blunted. Her drive is reduced, and <bold>psychomotor activity is slowed</bold>. Suicidality is present. Psychovegetative abnormalities such as sleep disturbances, loss of appetite, and weight loss were found. <bold>Insight into her condition is limited</bold>.</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Errors are highlighted in bold.</p></fn>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Limitations</title>
<p>This study has several limitations that should be acknowledged. First and most importantly, the work was based on simulated psychiatric interviews conducted under highly controlled conditions. Real-world clinical interviews show much greater variability in communication style, emotional tone, comorbidity, and the expression of psychopathology and often involve complex relational dynamics. It is therefore likely that model performance in actual clinical practice would be lower than in the present simulations. Future studies should therefore be conducted in diverse clinical environments to gather comprehensive data on how the AI performs across different psychiatric conditions and patient demographics.</p>
<p>Second, the sample size was extremely small, which means that all statistical analyses must be considered exploratory and that effect estimates were unstable. The results should thus be understood as preliminary signals that require confirmation in larger and more diverse samples of patients and clinicians.</p>
<p>Third, no systematic calibration analysis (e.g., through temperature scaling, Platt scaling, or reliability diagrams) was conducted. In the context of psychiatric interview transcription and summarization, mispredictions made with high confidence could be particularly problematic, as they may introduce clinically significant misunderstandings that remain unchecked. Our primary focus was on the qualitative evaluation of generated notes in collaboration with subject-matter experts, and calibration was therefore not included in the methodological scope. Future work should incorporate calibration assessments to better characterize and mitigate the risks of overconfident errors in clinical applications.</p>
<p>Finally, no automated metrics such as ROUGE or embedding-based similarity were applied for the evaluation of summarization quality. Psychiatric interview reports are inherently heterogeneous in structure, sequence, and focus, even between human clinicians, which reduces the interpretability and clinical relevance of such metrics. Small differences in phrasing or emphasis may result in low similarity scores despite equivalent clinical accuracy. To address this challenge, we instead adopted an approach with a structured codebook development and independent double coding, following recognized frameworks for quantitative content analysis.</p>
<p>Taken together, our findings should be viewed as an early feasibility and method study rather than as evidence for clinical implementation. The main contribution of this work is to demonstrate that it is technically possible to generate structured psychiatric documentation directly from recorded interviews and to outline a concrete evaluation framework that combines transcription accuracy with clinically meaningful content metrics. The study is not designed or powered to support strong claims about clinical effectiveness or safety, and it does not address how such a system would perform in the full heterogeneity of routine care.</p>
</sec>
<sec id="s5" sec-type="discussion">
<label>5</label>
<title>Discussion</title>
<p>While acknowledging the above-stated limitations of this study, our results suggest a significant potential for AI to streamline the documentation process and reduce documentation burden for clinicians in psychiatry. However, our findings regarding erroneous and redundant information point to the need for refining the AI to reduce inaccuracies and superfluous content to ensure reliable documentation.</p>
<p>Specifically for psychiatry, there was one great weakness of the AI: its performance in documenting psychopathology. This underscores a critical area for further development, as accurate and structured documentation of psychopathology is essential for psychiatric evaluations (<xref ref-type="bibr" rid="B32">32</xref>). Recognizing psychopathology can be quite a demanding task, with the need for professional training in the psychiatric field. There are several aspects of psychopathology that can only be assessed by professionals with enough experience and expertise, for instance, to identify and distinguish between delusions, partial delusions, and preoccupations. One solution would be further, specific training of the AI for psychopathology. In a future study, we plan to train the AI on a high number of psychopathology reports for it to learn the general structure and combine this with audio records or transcripts from the doctor&#x2013;patient conversations from which the psychopathology reports were obtained (<xref ref-type="bibr" rid="B33">33</xref>). However, this can be a difficult task to accomplish, as audio records from real doctor&#x2013;patient encounters in psychiatry are rare and highly sensitive and could sometimes even risk patients&#x2019; privacy (<xref ref-type="bibr" rid="B34">34</xref>). Another hurdle would be that some aspects of psychopathology are conveyed not exclusively by the content of what patients say but also by the intonation and volume of their speech (<xref ref-type="bibr" rid="B35">35</xref>). Meanwhile, the approach of the AI model that we examined focuses only on written transcripts, giving no insight into the sound or nature of speech. Additionally, some aspects of psychopathology are not apparent through speech at all. For example, psychomotor disorders can only be perceived visually (<xref ref-type="bibr" rid="B18">18</xref>). However, the AI guessed at those disorders and was wrong in many cases. 
AI, therefore, should be trained to only describe those aspects of psychopathology that can accurately be described by the content of an interview. Alternatively, developers would have to find additional ways to assess changes in tonality and volume of speech, and even to incorporate visual analysis via video.</p>
<p>However, the AI demonstrated a significant potential to reduce the documentation burden by automating substantial portions of the documentation process. This could free up clinicians&#x2019; time for direct patient care and other critical tasks, potentially improving the overall efficiency and quality of psychiatric care (<xref ref-type="bibr" rid="B10">10</xref>). Even with their current restrictions, we suppose that the AI would already significantly reduce the time spent on documentation, as even the brief correction of its mistakes would cost a lot less time than writing the whole report manually. To prove this hypothesis in future studies, we plan to include time measurements to compare the time spent on manual report creation and the time spent on revision and correction of AI reports. Only by obtaining exact time measurements will a quantitative assessment of the effectiveness of AI-driven scribes be possible, which is the key point in testing whether the documentation burden can be reduced.</p>
<p>Importantly, it is worth noting that some AI-generated inaccuracies were highly clinically relevant. For example, previous diagnoses were misunderstood, and the wrong names of medication were recorded. This underscores that human supervision remains a crucial component in ensuring the safety and accuracy of AI-generated documentation (<xref ref-type="bibr" rid="B36">36</xref>). Moreover, the involvement of clinicians in the review process can provide an additional layer of quality control and continuous improvement for the AI system. By systematically analyzing and correcting the AI&#x2019;s errors, clinicians can help refine the algorithms and contribute to the development of more accurate and reliable AI tools (<xref ref-type="bibr" rid="B12">12</xref>). This iterative feedback loop is essential for enhancing the AI&#x2019;s performance and ensuring that it evolves to meet the complex demands of real-world psychiatric practice.</p>
<p>However, integrating AI into clinical documentation processes raises important considerations about the clinician&#x2013;patient relationship (<xref ref-type="bibr" rid="B37">37</xref>, <xref ref-type="bibr" rid="B38">38</xref>). There is a potential risk that over-reliance on AI could depersonalize patient care, as clinicians may spend less time engaging directly with patients. To mitigate this, it is important to strike a balance where AI supports clinicians by reducing administrative burdens without diminishing the quality of interpersonal interactions and empathetic patient care. There is evidence that to ensure a positive impact on the clinician&#x2013;patient relationship, AI should not <italic>replace</italic> but rather <italic>assist</italic> clinicians in treatment (<xref ref-type="bibr" rid="B37">37</xref>). Future studies should also explore the impact of AI integration on patient satisfaction and the therapeutic alliance, ensuring that technological advancements complement rather than compromise the human aspects of psychiatric practice.</p>
<p>The deployment of AI in the psychiatric field also raises important ethical and legal questions, and ensuring patient confidentiality, data security, and adherence to legal standards will be of high importance (<xref ref-type="bibr" rid="B39">39</xref>&#x2013;<xref ref-type="bibr" rid="B41">41</xref>). Under the European General Data Protection Regulation (GDPR), audio recordings of psychiatric consultations and their automated analysis constitute the processing of particularly sensitive personal data (<xref ref-type="bibr" rid="B42">42</xref>). Any clinical deployment, therefore, requires a clearly defined legal basis, for example, within the framework of medical treatment and quality assurance, as well as strict adherence to the principles of data minimization, purpose limitation, and storage limitation. In practice, this implies that patients must be transparently informed about the nature and purpose of the recordings, the role of the AI system, and the limits of its use (<xref ref-type="bibr" rid="B43">43</xref>). They should understand that the clinician remains fully responsible for the content of the final report and for all diagnostic and therapeutic decisions and that AI-generated content is only a draft that requires critical review (<xref ref-type="bibr" rid="B44">44</xref>). The technical implementation of AI-supported documentation must also meet high standards of data protection and information security (<xref ref-type="bibr" rid="B45">45</xref>, <xref ref-type="bibr" rid="B46">46</xref>). Whenever possible, processing should occur within secure institutional infrastructures, and the use of external cloud services needs scrutiny, contractual safeguards, and data protection impact assessments. 
Current debates on AI-supported clinical documentation in psychiatry highlight concerns about potential misuse of sensitive data, unintended secondary uses, and the risk that automated summaries could be accessed by parties beyond the immediate treatment team (<xref ref-type="bibr" rid="B47">47</xref>, <xref ref-type="bibr" rid="B48">48</xref>). These concerns underline the importance of governance structures that involve data protection officers, ethics committees, clinicians, and patient representatives. Our proof-of-concept study deliberately avoided many of these issues by relying on simulated interviews and by processing the data in a controlled research environment. Future work that moves toward real patient data will need to integrate ethical and legal considerations from the outset and to demonstrate not only technical performance but also compliance with data protection law and acceptance by patients and clinicians.</p>
</sec>
<sec id="s6" sec-type="conclusions">
<label>6</label>
<title>Conclusions</title>
<p>In conclusion, this proof-of-concept study suggests that AI systems can generate usable draft documentation from psychiatric interviews but also reveals important limitations. The AI achieved high transcription accuracy and produced structured reports that could be systematically evaluated. The evaluation methodology itself demonstrated high inter-rater reliability across both human and AI-generated reports, supporting the robustness of the coding and comparison procedure. While AI-generated reports showed substantially lower overall performance than human reports, performance varied markedly across clinical domains. In more factual and demographically oriented categories, AI performance approached human-level agreement with the gold standard. Given the very small and simulated sample, these findings are preliminary and should be considered hypothesis-generating. The main contribution of this work is an evaluation framework and initial signal of feasibility, not evidence for clinical deployment. Future work in real-world settings, combined with robust safeguards for data protection and clinical oversight, will be essential. Nonetheless, AI-supported documentation has the potential to considerably reduce time demands and alleviate the documentation burden in psychiatric care.</p>
</sec>
</body>
<back>
<sec id="s7" sec-type="data-availability">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p></sec>
<sec id="s8" sec-type="ethics-statement">
<title>Ethics statement</title>
<p>The studies involving humans were approved by Ethics Committee of Charit&#xe9; Universit&#xe4;tsmedizin Berlin (application number EA4/215/23). The studies were conducted in accordance with the local legislation and institutional requirements. The participants provided their written informed consent to participate in this study. Written informed consent was obtained from the individual(s) for the publication of any potentially identifiable images or data included in this article.</p></sec>
<sec id="s9" sec-type="author-contributions">
<title>Author contributions</title>
<p>BG: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Project administration, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. RH: Data curation, Investigation, Writing &#x2013; review &amp; editing. ES: Software, Visualization, Writing &#x2013; original draft. SK: Conceptualization, Project administration, Supervision, Writing &#x2013; review &amp; editing.</p></sec>
<sec id="s11" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>ES was employed by Amnexis Digital Solutions GmbH.</p>
<p>The remaining author(s) declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s12" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was used in the creation of this manuscript. Generative artificial intelligence tools were used to support the organization and structuring of the manuscript, assist with code development for data analysis, and improve clarity and consistency of the written text. All study design decisions, data analyses, interpretation of results, and final content were conducted and verified by the authors, who take full responsibility for the manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s13" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gaffney</surname> <given-names>A</given-names></name>
<name><surname>Woolhandler</surname> <given-names>S</given-names></name>
<name><surname>Cai</surname> <given-names>C</given-names></name>
<name><surname>Bor</surname> <given-names>D</given-names></name>
<name><surname>Himmelstein</surname> <given-names>J</given-names></name>
<name><surname>McCormick</surname> <given-names>D</given-names></name>
<etal/>
</person-group>. 
<article-title>Medical documentation burden among US office-based physicians in 2019: A national study</article-title>. <source>JAMA Intern Med</source>. (<year>2022</year>) <volume>182</volume>:<fpage>564</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1001/jamainternmed.2022.0372</pub-id>, PMID: <pub-id pub-id-type="pmid">35344006</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<label>2</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Chivilgina</surname> <given-names>O</given-names></name>
<name><surname>Elger</surname> <given-names>BS</given-names></name>
<name><surname>Benichou</surname> <given-names>MM</given-names></name>
<name><surname>Jotterand</surname> <given-names>F</given-names></name>
</person-group>. 
<article-title>&#x201c;What&#x2019;s the best way to document information concerning psychiatric patients? I just don&#x2019;t know&#x201d;&#x2014;A qualitative study about recording psychiatric patients notes in the era of electronic health records</article-title>. <source>PLoS One</source>. (<year>2022</year>) <volume>17</volume>:<fpage>e0264255</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1371/journal.pone.0264255</pub-id>, PMID: <pub-id pub-id-type="pmid">35239698</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<label>3</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Sinsky</surname> <given-names>C</given-names></name>
<name><surname>Colligan</surname> <given-names>L</given-names></name>
<name><surname>Li</surname> <given-names>L</given-names></name>
<name><surname>Prgomet</surname> <given-names>M</given-names></name>
<name><surname>Reynolds</surname> <given-names>S</given-names></name>
<name><surname>Goeders</surname> <given-names>L</given-names></name>
<etal/>
</person-group>. 
<article-title>Allocation of physician time in ambulatory practice: A time and motion study in 4 specialties</article-title>. <source>Ann Internal Med</source>. (<year>2016</year>) <volume>165</volume>:<fpage>753</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.7326/M16-0961</pub-id>, PMID: <pub-id pub-id-type="pmid">27595430</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<label>4</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Moy</surname> <given-names>AJ</given-names></name>
<name><surname>Schwartz</surname> <given-names>JM</given-names></name>
<name><surname>Chen</surname> <given-names>R</given-names></name>
<name><surname>Sadri</surname> <given-names>S</given-names></name>
<name><surname>Lucas</surname> <given-names>E</given-names></name>
<name><surname>Cato</surname> <given-names>KD</given-names></name>
<etal/>
</person-group>. 
<article-title>Measurement of clinical documentation burden among physicians and nurses using electronic health records: a scoping review</article-title>. <source>J Am Med Inf Assoc</source>. (<year>2021</year>) <volume>28</volume>:<fpage>998</fpage>&#x2013;<lpage>1008</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/jamia/ocaa325</pub-id>, PMID: <pub-id pub-id-type="pmid">33434273</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<label>5</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gardner</surname> <given-names>RL</given-names></name>
<name><surname>Cooper</surname> <given-names>E</given-names></name>
<name><surname>Haskell</surname> <given-names>J</given-names></name>
<name><surname>Harris</surname> <given-names>DA</given-names></name>
<name><surname>Poplau</surname> <given-names>S</given-names></name>
<name><surname>Kroth</surname> <given-names>PJ</given-names></name>
<etal/>
</person-group>. 
<article-title>Physician stress and burnout: the impact of health information technology</article-title>. <source>J Am Med Inf Assoc</source>. (<year>2019</year>) <volume>26</volume>:<page-range>106&#x2013;14</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/jamia/ocy145</pub-id>, PMID: <pub-id pub-id-type="pmid">30517663</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<label>6</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Dyrbye</surname> <given-names>LN</given-names></name>
<name><surname>Shanafelt</surname> <given-names>TD</given-names></name><collab>Mayo Clinic</collab>
<name><surname>Sinsky</surname> <given-names>CA</given-names></name><collab>American Medical Association</collab>
<name><surname>Cipriano</surname> <given-names>PF</given-names></name>
<etal/>
</person-group>. 
<article-title>Burnout among health care professionals: A call to explore and address this underrecognized threat to safe, high-quality care</article-title>. <source>NAM Perspect</source>. <volume>7</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.31478/201707b</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<label>7</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ash</surname> <given-names>JS</given-names></name>
</person-group>. 
<article-title>Some unintended consequences of information technology in health care: the nature of patient care information system-related errors</article-title>. <source>J Am Med Inf Assoc</source>. (<year>2003</year>) <volume>11</volume>:<page-range>104&#x2013;12</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1197/jamia.M1471</pub-id>, PMID: <pub-id pub-id-type="pmid">14633936</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<label>8</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Tajirian</surname> <given-names>T</given-names></name>
<name><surname>Jankowicz</surname> <given-names>D</given-names></name>
<name><surname>Lo</surname> <given-names>B</given-names></name>
<name><surname>Sequeira</surname> <given-names>L</given-names></name>
<name><surname>Strudwick</surname> <given-names>G</given-names></name>
<name><surname>Almilaji</surname> <given-names>K</given-names></name>
<etal/>
</person-group>. 
<article-title>Tackling the burden of electronic health record use among physicians in a mental health setting: physician engagement strategy</article-title>. <source>J Med Internet Res</source>. (<year>2022</year>) <volume>24</volume>:<elocation-id>e32800</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.2196/32800</pub-id>, PMID: <pub-id pub-id-type="pmid">35258473</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<label>9</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Spie&#xdf;l</surname> <given-names>H</given-names></name>
<name><surname>Hausner</surname> <given-names>H</given-names></name>
</person-group>. 
<article-title>&#xc4;rztliche dokumentation in der psychiatrie</article-title>. <source>Fortschr Neurol Psychiatr</source>. (<year>2012</year>) <volume>80</volume>:<fpage>53</fpage>&#x2013;<lpage>60</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1055/s-0031-1282019</pub-id>, PMID: <pub-id pub-id-type="pmid">22234816</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<label>10</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Luh</surname> <given-names>JY</given-names></name>
<name><surname>Thompson</surname> <given-names>RF</given-names></name>
<name><surname>Lin</surname> <given-names>S</given-names></name>
</person-group>. 
<article-title>Clinical documentation and patient care using artificial intelligence in radiation oncology</article-title>. <source>J Am Coll Radiol</source>. (<year>2019</year>) <volume>16</volume>:<page-range>1343&#x2013;6</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jacr.2019.05.044</pub-id>, PMID: <pub-id pub-id-type="pmid">31238022</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<label>11</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Lin</surname> <given-names>SY</given-names></name>
<name><surname>Shanafelt</surname> <given-names>TD</given-names></name>
<name><surname>Asch</surname> <given-names>SM</given-names></name>
</person-group>. 
<article-title>Reimagining clinical documentation with artificial intelligence</article-title>. <source>Mayo Clinic Proc</source>. (<year>2018</year>) <volume>93</volume>:<page-range>563&#x2013;5</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.mayocp.2018.02.016</pub-id>, PMID: <pub-id pub-id-type="pmid">29631808</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<label>12</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Kunz</surname> <given-names>V</given-names></name>
<name><surname>Wildfeuer</surname> <given-names>V</given-names></name>
<name><surname>Bieck</surname> <given-names>R</given-names></name>
<name><surname>Sorge</surname> <given-names>M</given-names></name>
<name><surname>Zebralla</surname> <given-names>V</given-names></name>
<name><surname>Dietz</surname> <given-names>A</given-names></name>
<etal/>
</person-group>. 
<article-title>Keyword-augmented and semi-automatic generation of FESS reports: a proof-of-concept study</article-title>. <source>Int J CARS</source>. (<year>2022</year>) <volume>18</volume>:<page-range>961&#x2013;8</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11548-022-02791-0</pub-id>, PMID: <pub-id pub-id-type="pmid">36394797</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<label>13</label>
<mixed-citation publication-type="web"><source>Amnexis digital solutions gmbH</source>. Available online at: <uri xlink:href="https://amnexis.com/de/">https://amnexis.com/de/</uri> (Accessed <date-in-citation content-type="access-date">August 12, 2024</date-in-citation>).
</mixed-citation>
</ref>
<ref id="B14">
<label>14</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Sezgin</surname> <given-names>E</given-names></name>
<name><surname>Sirrianni</surname> <given-names>JW</given-names></name>
<name><surname>Kranz</surname> <given-names>K</given-names></name>
</person-group>. 
<article-title>Evaluation of a digital scribe: conversation summarization for emergency department consultation calls</article-title>. <source>Appl Clin Inform</source>. (<year>2024</year>) <volume>15</volume>:<page-range>600&#x2013;11</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1055/a-2327-4121</pub-id>, PMID: <pub-id pub-id-type="pmid">38749477</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<label>15</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Fogleman</surname> <given-names>BM</given-names></name>
<name><surname>Goldman</surname> <given-names>M</given-names></name>
<name><surname>Holland</surname> <given-names>AB</given-names></name>
<name><surname>Dyess</surname> <given-names>G</given-names></name>
<name><surname>Patel</surname> <given-names>A</given-names></name>
</person-group>. 
<article-title>Charting tomorrow&#x2019;s healthcare: A traditional literature review for an artificial intelligence-driven future</article-title>. <source>Cureus</source>. (<year>2024</year>) <volume>16</volume>:<elocation-id>e58032</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.7759/cureus.58032</pub-id>, PMID: <pub-id pub-id-type="pmid">38738104</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<label>16</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Radford</surname> <given-names>A</given-names></name>
<name><surname>Kim</surname> <given-names>JW</given-names></name>
<name><surname>Xu</surname> <given-names>T</given-names></name>
<name><surname>Brockman</surname> <given-names>G</given-names></name>
<name><surname>McLeavey</surname> <given-names>C</given-names></name>
<name><surname>Sutskever</surname> <given-names>I</given-names></name>
</person-group>. 
<article-title>Robust speech recognition via large-scale weak supervision</article-title>. In: <source>Proceedings of the 40th International Conference on Machine Learning (ICML 2023)</source>. <publisher-loc>Honolulu (HI)</publisher-loc>: 
<publisher-name>Proceedings of Machine Learning Research</publisher-name> (<year>2023</year>) p. <page-range>28492&#x2013;518</page-range>.
</mixed-citation>
</ref>
<ref id="B17">
<label>17</label>
<mixed-citation publication-type="web">
<person-group person-group-type="author"><collab>OpenAI</collab>
<name><surname>Achiam</surname> <given-names>J</given-names></name>
<name><surname>Adler</surname> <given-names>S</given-names></name>
<name><surname>Agarwal</surname> <given-names>S</given-names></name>
<name><surname>Ahmad</surname> <given-names>L</given-names></name>
<name><surname>Akkaya</surname> <given-names>I</given-names></name>
<etal/>
</person-group>. <source>GPT-4 technical report</source>. Available online at: <uri xlink:href="http://arxiv.org/abs/2303.08774">http://arxiv.org/abs/2303.08774</uri> (Accessed <date-in-citation content-type="access-date">May 28, 2024</date-in-citation>).
</mixed-citation>
</ref>
<ref id="B18">
<label>18</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author"><collab>Arbeitsgemeinschaft f&#xfc;r Methodik und Dokumentation in der Psychiatrie</collab>
</person-group>. 
<article-title>Das AMDP-System: Manual zur Dokumentation psychiatrischer Befunde</article-title>. In: <source>korrigierte auflage</source>. 
<publisher-name>Hogrefe</publisher-name>, <publisher-loc>G&#xf6;ttingen</publisher-loc> (<year>2018</year>). p. <fpage>10</fpage>.
</mixed-citation>
</ref>
<ref id="B19">
<label>19</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Makhoul</surname> <given-names>J</given-names></name>
<name><surname>Schwartz</surname> <given-names>R</given-names></name>
</person-group>. 
<article-title>State of the art in continuous speech recognition</article-title>. <source>Proc Natl Acad Sci U S A</source>. (<year>1995</year>) <volume>92</volume>:<page-range>9956&#x2013;63</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1073/pnas.92.22.9956</pub-id>, PMID: <pub-id pub-id-type="pmid">7479809</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<label>20</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Van Rossum</surname> <given-names>G</given-names></name>
<name><surname>Drake</surname> <given-names>FL</given-names></name>
</person-group>. <source>Python 3 reference manual</source>. <publisher-loc>Scotts Valley, CA</publisher-loc>: 
<publisher-name>CreateSpace</publisher-name> (<year>2009</year>).
</mixed-citation>
</ref>
<ref id="B21">
<label>21</label>
<mixed-citation publication-type="other">Speechmatics Python Client.
</mixed-citation>
</ref>
<ref id="B22">
<label>22</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Levenshtein</surname> <given-names>VI</given-names></name>
</person-group>. 
<article-title>Binary codes capable of correcting deletions, insertions and reversals</article-title>. <source>Soviet Phys Doklady</source>. (<year>1966</year>) <volume>10</volume>:<page-range>707&#x2013;10</page-range>.
</mixed-citation>
</ref>
<ref id="B23">
<label>23</label>
<mixed-citation publication-type="web">
<person-group person-group-type="author">
<name><surname>Halchenko</surname> <given-names>Y</given-names></name>
</person-group>. <source>python-levenshtein</source> (<year>2008</year>). Available online at: <uri xlink:href="https://github.com/ztane/python-Levenshtein">https://github.com/ztane/python-Levenshtein</uri> (Accessed <date-in-citation content-type="access-date">November 20, 2025</date-in-citation>).
</mixed-citation>
</ref>
<ref id="B24">
<label>24</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Cohen</surname> <given-names>J</given-names></name>
</person-group>. 
<article-title>A Coefficient of agreement for nominal scales</article-title>. <source>Educ Psychol Meas</source>. (<year>1960</year>) <volume>20</volume>:<fpage>37</fpage>&#x2013;<lpage>46</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1177/001316446002000104</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<label>25</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>McHugh</surname> <given-names>ML</given-names></name>
</person-group>. 
<article-title>Interrater reliability: the kappa statistic</article-title>. <source>Biochem Med (Zagreb)</source>. (<year>2012</year>) <volume>22</volume>:<page-range>276&#x2013;82</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.11613/BM.2012.031</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<label>26</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gwet</surname> <given-names>KL</given-names></name>
</person-group>. 
<article-title>Computing inter-rater reliability and its variance in the presence of high agreement</article-title>. <source>Br J Math Stat Psychol</source>. (<year>2008</year>) <volume>61</volume>:<fpage>29</fpage>&#x2013;<lpage>48</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1348/000711006X126600</pub-id>, PMID: <pub-id pub-id-type="pmid">18482474</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<label>27</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hicks</surname> <given-names>SA</given-names></name>
<name><surname>Str&#xfc;mke</surname> <given-names>I</given-names></name>
<name><surname>Thambawita</surname> <given-names>V</given-names></name>
<name><surname>Hammou</surname> <given-names>M</given-names></name>
<name><surname>Riegler</surname> <given-names>MA</given-names></name>
<name><surname>Halvorsen</surname> <given-names>P</given-names></name>
<etal/>
</person-group>. 
<article-title>On evaluation metrics for medical applications of artificial intelligence</article-title>. <source>Sci Rep</source>. (<year>2022</year>) <volume>12</volume>:<fpage>5979</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-022-09954-8</pub-id>, PMID: <pub-id pub-id-type="pmid">35395867</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<label>28</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Cianciulli</surname> <given-names>A</given-names></name>
<name><surname>Santoro</surname> <given-names>E</given-names></name>
<name><surname>Manente</surname> <given-names>R</given-names></name>
<name><surname>Pacifico</surname> <given-names>A</given-names></name>
<name><surname>Quagliarella</surname> <given-names>S</given-names></name>
<name><surname>Bruno</surname> <given-names>N</given-names></name>
<etal/>
</person-group>. 
<article-title>Artificial intelligence and digital technologies against health misinformation: A scoping review of public health responses</article-title>. <source>Healthcare</source>. (<year>2025</year>) <volume>13</volume>:<fpage>2623</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/healthcare13202623</pub-id>, PMID: <pub-id pub-id-type="pmid">41154301</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<label>29</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Moramarco</surname> <given-names>F</given-names></name>
<name><surname>Juric</surname> <given-names>D</given-names></name>
<name><surname>Savkov</surname> <given-names>A</given-names></name>
<name><surname>Reiter</surname> <given-names>E</given-names></name>
</person-group>. 
<article-title>Towards objectively evaluating the quality of generated medical summaries</article-title>. In: <source>Proceedings of the Workshop on Human Evaluation of NLP Systems (HumEval)</source>. (<year>2021</year>). p. <page-range>56&#x2013;61</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/ARXIV.2104.04412</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<label>30</label>
<mixed-citation publication-type="web">
<person-group person-group-type="author"><collab>Python Software Foundation</collab>
</person-group>. <source>Python: A programming language</source> (<year>2023</year>). Available online at: <uri xlink:href="https://www.python.org/">https://www.python.org/</uri> (Accessed <date-in-citation content-type="access-date">May 28, 2024</date-in-citation>).
</mixed-citation>
</ref>
<ref id="B31">
<label>31</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hunter</surname> <given-names>JD</given-names></name>
</person-group>. 
<article-title>Matplotlib: A 2D graphics environment</article-title>. <source>Comput Sci Eng</source>. (<year>2007</year>) <volume>9</volume>:<page-range>90&#x2013;5</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/MCSE.2007.55</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<label>32</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Baumann</surname> <given-names>U</given-names></name>
</person-group>. 
<article-title>Assessment and documentation of psychopathology</article-title>. <source>Psychopathology</source>. (<year>1995</year>) <volume>28</volume>:<fpage>13</fpage>&#x2013;<lpage>20</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1159/000284954</pub-id>, PMID: <pub-id pub-id-type="pmid">8903887</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<label>33</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Garfield</surname> <given-names>DAS</given-names></name>
<name><surname>Rapp</surname> <given-names>C</given-names></name>
<name><surname>Evens</surname> <given-names>M</given-names></name>
</person-group>. 
<article-title>Natural language processing in psychiatry: artificial intelligence technology and psychopathology</article-title>. <source>J Nerv Ment Dis</source>. (<year>1992</year>) <volume>180</volume>:<page-range>227&#x2013;37</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1097/00005053-199204000-00004</pub-id>, PMID: <pub-id pub-id-type="pmid">1556562</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<label>34</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hewson</surname> <given-names>T</given-names></name>
<name><surname>Abraham</surname> <given-names>S</given-names></name>
<name><surname>Randles</surname> <given-names>N</given-names></name>
<name><surname>Akinola</surname> <given-names>A</given-names></name>
<name><surname>Cliff</surname> <given-names>R</given-names></name>
<name><surname>Byrne</surname> <given-names>P</given-names></name>
<etal/>
</person-group>. 
<article-title>The recording of mental health consultations by patients: clinical, ethical and legal considerations</article-title>. <source>BJPsych Bull</source>. (<year>2022</year>) <volume>46</volume>:<page-range>133&#x2013;7</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1192/bjb.2021.89</pub-id>, PMID: <pub-id pub-id-type="pmid">34533115</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<label>35</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wallbott</surname> <given-names>H</given-names></name>
</person-group>. 
<article-title>Vocal behavior and psychopathology</article-title>. <source>Pharmacopsychiatry</source>. (<year>1989</year>) <volume>22</volume>:<page-range>13&#x2013;6</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1055/s-2007-1014618</pub-id>, PMID: <pub-id pub-id-type="pmid">2654966</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<label>36</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>de Filippis</surname> <given-names>R</given-names></name>
<name><surname>Al Foysal</surname> <given-names>A</given-names></name>
<name><surname>Rocco</surname> <given-names>V</given-names></name>
<name><surname>Guglielmo</surname> <given-names>R</given-names></name>
<name><surname>Sabatino</surname> <given-names>B</given-names></name>
<name><surname>Pietropaoli</surname> <given-names>A</given-names></name>
<etal/>
</person-group>. 
<article-title>The risk perspective of AI in healthcare: GDPR and GELSI framework (Governance, Ethical, Legal and Social Implications) and the new European AI Act</article-title>. <source>Ital J Psychiatry</source>. (<year>2024</year>) <volume>10</volume>:<page-range>12&#x2013;6</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.36180/2421-4469-2024-4</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<label>37</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Sauerbrei</surname> <given-names>A</given-names></name>
<name><surname>Kerasidou</surname> <given-names>A</given-names></name>
<name><surname>Lucivero</surname> <given-names>F</given-names></name>
<name><surname>Hallowell</surname> <given-names>N</given-names></name>
</person-group>. 
<article-title>The impact of artificial intelligence on the person-centred, doctor-patient relationship: some problems and solutions</article-title>. <source>BMC Med Inform Decis Mak</source>. (<year>2023</year>) <volume>23</volume>:<fpage>73</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s12911-023-02162-y</pub-id>, PMID: <pub-id pub-id-type="pmid">37081503</pub-id>
</mixed-citation>
</ref>
<ref id="B38">
<label>38</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wilhelmy</surname> <given-names>S</given-names></name>
<name><surname>Giupponi</surname> <given-names>G</given-names></name>
<name><surname>Gro&#xdf;</surname> <given-names>D</given-names></name>
<name><surname>Eisendle</surname> <given-names>K</given-names></name>
<name><surname>Conca</surname> <given-names>A</given-names></name>
</person-group>. 
<article-title>A shift in psychiatry through AI? Ethical challenges</article-title>. <source>Ann Gen Psychiatry</source>. (<year>2023</year>) <volume>22</volume>:<fpage>43</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s12991-023-00476-9</pub-id>, PMID: <pub-id pub-id-type="pmid">37919759</pub-id>
</mixed-citation>
</ref>
<ref id="B39">
<label>39</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Warrier</surname> <given-names>U</given-names></name>
<name><surname>Warrier</surname> <given-names>A</given-names></name>
<name><surname>Khandelwal</surname> <given-names>K</given-names></name>
</person-group>. 
<article-title>Ethical considerations in the use of artificial intelligence in mental health</article-title>. <source>Egypt J Neurol Psychiatry Neurosurg</source>. (<year>2023</year>) <volume>59</volume>:<fpage>139</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s41983-023-00735-2</pub-id>
</mixed-citation>
</ref>
<ref id="B40">
<label>40</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Rubeis</surname> <given-names>G</given-names></name>
</person-group>. 
<article-title>iHealth: The ethics of artificial intelligence and big data in mental healthcare</article-title>. <source>Internet Interventions</source>. (<year>2022</year>) <volume>28</volume>:<fpage>100518</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.invent.2022.100518</pub-id>, PMID: <pub-id pub-id-type="pmid">35257003</pub-id>
</mixed-citation>
</ref>
<ref id="B41">
<label>41</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Terra</surname> <given-names>M</given-names></name>
<name><surname>Baklola</surname> <given-names>M</given-names></name>
<name><surname>Ali</surname> <given-names>S</given-names></name>
<name><surname>El-Bastawisy</surname> <given-names>K</given-names></name>
</person-group>. 
<article-title>Opportunities, applications, challenges and ethical implications of artificial intelligence in psychiatry: a narrative review</article-title>. <source>Egypt J Neurol Psychiatry Neurosurg</source>. (<year>2023</year>) <volume>59</volume>:<fpage>80</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s41983-023-00681-z</pub-id>
</mixed-citation>
</ref>
<ref id="B42">
<label>42</label>
<mixed-citation publication-type="web">
<person-group person-group-type="author"><collab>European Parliament, Council of the European Union</collab>
</person-group>. <source>Regulation (EU) 2016/679 of the European Parliament and of the Council of 27 April 2016 on the protection of natural persons with regard to the processing of personal data and on the free movement of such data (General Data Protection Regulation)</source> (<year>2016</year>). Available online at: <uri xlink:href="https://eur-lex.europa.eu/eli/reg/2016/679/oj">https://eur-lex.europa.eu/eli/reg/2016/679/oj</uri> (Accessed <date-in-citation content-type="access-date">November 20, 2025</date-in-citation>).
</mixed-citation>
</ref>
<ref id="B43">
<label>43</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author"><collab>World Health Organization</collab>
</person-group>. <source>Ethics and governance of artificial intelligence for health: WHO guidance</source>. <publisher-loc>Geneva</publisher-loc>: 
<publisher-name>World Health Organization</publisher-name> (<year>2021</year>). Available online at: <uri xlink:href="https://www.who.int/publications/i/item/9789240029200">https://www.who.int/publications/i/item/9789240029200</uri> (Accessed <date-in-citation content-type="access-date">November 20, 2025</date-in-citation>).
</mixed-citation>
</ref>
<ref id="B44">
<label>44</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author"><collab>Central Ethics Committee at the German Medical Association</collab>
</person-group>. 
<article-title>Decision support by artificial intelligence in medical practice</article-title>. <source>Deutsches &#xc4;rzteblatt</source>. (<year>2021</year>) <volume>118</volume>:<fpage>A1</fpage>&#x2013;<lpage>A12</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3238/arztebl.zeko_sn_cdss_2021_en</pub-id>
</mixed-citation>
</ref>
<ref id="B45">
<label>45</label>
<mixed-citation publication-type="web">
<person-group person-group-type="author"><collab>European Parliament. Directorate General for Parliamentary Research Services</collab>
</person-group>. <source>The impact of the general data protection regulation on artificial intelligence</source>. <publisher-loc>LU</publisher-loc>: 
<publisher-name>Publications Office</publisher-name>.
</mixed-citation>
</ref>
<ref id="B46">
<label>46</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Riou</surname> <given-names>C</given-names></name>
<name><surname>El Azzouzi</surname> <given-names>M</given-names></name>
<name><surname>Hespel</surname> <given-names>A</given-names></name>
<name><surname>Guillou</surname> <given-names>E</given-names></name>
<name><surname>Coatrieux</surname> <given-names>G</given-names></name>
<name><surname>Cuggia</surname> <given-names>M</given-names></name>
</person-group>. 
<article-title>Ensuring general data protection regulation compliance and security in a clinical data warehouse from a university hospital: implementation study</article-title>. <source>JMIR Med Inform</source>. (<year>2025</year>) <volume>13</volume>:<fpage>e63754</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.2196/63754</pub-id>, PMID: <pub-id pub-id-type="pmid">40244890</pub-id>
</mixed-citation>
</ref>
<ref id="B47">
<label>47</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Iwaya</surname> <given-names>LH</given-names></name>
<name><surname>Babar</surname> <given-names>MA</given-names></name>
<name><surname>Rashid</surname> <given-names>A</given-names></name>
<name><surname>Wijayarathna</surname> <given-names>C</given-names></name>
</person-group>. 
<article-title>On the privacy of mental health apps: An empirical investigation and its implications for app development</article-title>. <source>Empir Softw Eng</source>. (<year>2023</year>) <volume>28</volume>:<fpage>2</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10664-022-10236-0</pub-id>, PMID: <pub-id pub-id-type="pmid">36407814</pub-id>
</mixed-citation>
</ref>
<ref id="B48">
<label>48</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>H</given-names></name>
<name><surname>Mao</surname> <given-names>Y</given-names></name>
<name><surname>Lin</surname> <given-names>Y</given-names></name>
<name><surname>Zhang</surname> <given-names>D</given-names></name>
</person-group>. 
<article-title>E-mental health in the age of AI: data safety, privacy regulations and recommendations</article-title>. <source>Alpha Psychiatry</source>. (<year>2025</year>) <volume>26</volume>:<fpage>44279</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.31083/AP44279</pub-id>, PMID: <pub-id pub-id-type="pmid">40630871</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/293195">Xuntao Yin</ext-link>, Guangzhou Medical University, China</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/543188">Francesco Monaco</ext-link>, Azienda Sanitaria Locale Salerno, Italy</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1817549">Arghya Datta</ext-link>, Amazon, United States</p></fn>
</fn-group>
</back>
</article>