<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Big Data</journal-id>
<journal-title-group>
<journal-title>Frontiers in Big Data</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Big Data</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2624-909X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fdata.2025.1651290</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Depression detection through dual-stream modeling with large language models: a fusion-based transfer learning framework integrating BERT and T5 representations</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Wang</surname> <given-names>Na</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<uri xlink:href="https://loop.frontiersin.org/people/3105595"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Zhang</surname> <given-names>Weijia</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Kamil</surname> <given-names>Raja</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Renner</surname> <given-names>Ian</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Abdul Rahman Al-Haddad</surname> <given-names>Syed</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Ibrahim</surname> <given-names>Normala</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<uri xlink:href="https://loop.frontiersin.org/people/1001979"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Zhao</surname> <given-names>Zhen</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<uri xlink:href="https://loop.frontiersin.org/people/2185443"/>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Faculty of Engineering, Universiti Putra Malaysia, UPM Serdang, Serdang</institution>, <city>Selangor</city>, <country country="MY">Malaysia</country></aff>
<aff id="aff2"><label>2</label><institution>School of Information and Physical Sciences, The University of Newcastle</institution>, <city>Callaghan, NSW</city>, <country country="AU">Australia</country></aff>
<aff id="aff3"><label>3</label><institution>School of Automation, Guangdong Polytechnic Normal University, Guangzhou</institution>, <city>Guangdong</city>, <country country="CN">China</country></aff>
<aff id="aff4"><label>4</label><institution>Faculty of Medicine and Health Sciences, Universiti Putra Malaysia, UPM Selangor</institution>, <city>Selangor</city>, <country country="MY">Malaysia</country></aff>
<aff id="aff5"><label>5</label><institution>Department of Electrical Engineering, Faculty of Engineering, Universiti Malaya</institution>, <city>Kuala Lumpur</city>, <country country="MY">Malaysia</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Weijia Zhang, <email xlink:href="mailto:weijia.zhang@newcastle.edu.au">weijia.zhang@newcastle.edu.au</email>; Raja Kamil, <email xlink:href="mailto:kamil@upm.edu.my">kamil@upm.edu.my</email>; Zhen Zhao, <email xlink:href="mailto:zhenzhao@scut.edu.cn">zhenzhao@scut.edu.cn</email>; Syed Abdul Rahman Al-Haddad, <email xlink:href="mailto:sar@upm.edu.my">sar@upm.edu.my</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-04">
<day>04</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>8</volume>
<elocation-id>1651290</elocation-id>
<history>
<date date-type="received">
<day>25</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>20</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>30</day>
<month>12</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Wang, Zhang, Kamil, Renner, Abdul Rahman Al-Haddad, Ibrahim and Zhao.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Wang, Zhang, Kamil, Renner, Abdul Rahman Al-Haddad, Ibrahim and Zhao</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-04">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Millions of people around the world suffer from depression. While early diagnosis is essential for timely intervention, it remains a significant challenge due to limited access to clinically diagnosed data and privacy restrictions on mental health records. These limitations hinder the training of robust AI models for depression detection. To tackle this, this article proposes a parallel transfer learning framework for depression detection that integrates BERT and T5 through a fusion mechanism, combining the complementary advantages of these two large language models (LLMs). By integrating their semantic embeddings, the method captures a broader range of linguistic cues from transcribed speech. These embeddings are processed through a model with two parallel branches: a one-dimensional convolutional neural network and a dense neural network are used to construct each branch for preliminary prediction, which are then fused for final prediction. Evaluations on the E-DAIC dataset demonstrate that the proposed method outperforms baseline models, achieving a 3.0% increase in accuracy (91.3%), a 6.9% increase in precision (95.2%), and a 1.7% improvement in F1-score (90.0%). The experimental results verify the effectiveness of BERT and T5 fusion in enhancing depression detection performance and highlight the potential of transfer learning for scalable and privacy-conscious mental health applications.</p></abstract>
<kwd-group>
<kwd>1DCNN</kwd>
<kwd>BERT</kwd>
<kwd>depression</kwd>
<kwd>E-DAIC</kwd>
<kwd>T5</kwd>
<kwd>text</kwd>
<kwd>transfer learning</kwd>
<kwd>transformer</kwd>
</kwd-group>
<funding-group>
  <funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="4"/>
<table-count count="8"/>
<equation-count count="15"/>
<ref-count count="34"/>
<page-count count="14"/>
<word-count count="7887"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Machine Learning and Artificial Intelligence</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Depression is also referred to as major depressive disorder (MDD). It is a widespread mental illness that impacts a significant percentage of people globally. According to the statistics of the World Health Organization, approximately 332 million people in the world have depression (<xref ref-type="bibr" rid="B13">Herrman et al., 2019</xref>). Depression is characterized by symptoms such as persistent sadness, loss of interest in daily activities, irritability, hopelessness, changes in appetite or weight, and low self-worth. In more severe cases, it may also result in suicidal thoughts or actions.</p>
<p>Diagnosing depression is inherently complex, typically requiring the expertise of trained mental health professionals. While early detection is essential for effective intervention, access to timely diagnosis remains limited due to high costs and inadequate access to mental health care, particularly in rural and underserved areas (<xref ref-type="bibr" rid="B23">Semrau et al., 2019</xref>). Artificial intelligence (AI) has shown promise in recent years in various medical applications (<xref ref-type="bibr" rid="B25">Uppal et al., 2023</xref>; <xref ref-type="bibr" rid="B29">Zafar et al., 2023</xref>; <xref ref-type="bibr" rid="B28">Wang et al., 2025</xref>; <xref ref-type="bibr" rid="B32">Zhao et al., 2023</xref>), including mental health assessment. However, the development of AI-based depression detection systems is hindered by several challenges, chief among them being the shortage of clinically validated data for model training.</p>
<p>Most datasets that are accessible to the general public for depression detection are relatively small and exhibit severe class imbalance. For instance, the Extended Distress Analysis Interview Corpus (E-DAIC) includes only 275 participants, of whom just 66 (24%) are diagnosed with MDD. Similarly, the Chinese Multimodal Depression Corpus (CMDC) and the Multimodal Open Dataset for Mental-health Analytics (MODMA) contain merely 78 and 52 participants, respectively, with fewer than half representing depressed individuals. The potential of AI models to generalize across various populations and therapeutic scenarios is severely limited by these dataset size and representativeness constraints.</p>
<p>Among various methods of detecting depression, the investigation of linguistic features in patient speech has been regarded as especially beneficial. Text features offer a more abundant level of interpretability compared to auditory or visual modalities. Linguistic markers, particularly those with absolutist terms such as &#x0201C;NEVER&#x0201D; indicated by phrases such as &#x0201C;I NEVER want to wake up&#x0201D; (<xref ref-type="bibr" rid="B1">Adam-Troian et al., 2022</xref>), and desperation or fatigue expressions, more commonly represented by &#x0201C;I am so tired all the time,&#x0201D; are found to correlate with depressive symptoms. Conventional clinical instruments, including the Beck Depression Inventory (<xref ref-type="bibr" rid="B5">Beck et al., 1996</xref>), Self-Rating Depression Scale (<xref ref-type="bibr" rid="B34">Zung, 1965</xref>), and Patient Health Questionnaire (PHQ; <xref ref-type="bibr" rid="B24">Spitzer et al., 1999</xref>), rely strongly on patient feedback in verbal or written format, thereby highlighting the central role that textual information plays in depression evaluation.</p>
<p>In spite of these strengths, the variation in patients&#x00027; language abilities, emotional expressions, and cultural contexts poses significant obstacles to the construction of representative textual datasets. Without sufficient diversity and sample populations, artificial intelligence systems are unable to successfully pick up on subtle indicators of depression and might not be as generalizable to real-life situations. It is necessary to overcome these limitations to fully harness the potential of artificial intelligence-based depression intervention systems.</p>
<p>To resolve limitations related to insufficient training data, we recommend using a transfer learning approach based upon deep pre-trained language models to strengthen performance on small, clinically annotated datasets. Specifically, our model utilizes two large language models (LLMs) to take advantage of their pre-trained knowledge and two different neural network designs, namely a one-dimensional convolutional neural network (1DCNN) and a fully connected neural network typified by dense layers combined with a dropout mechanism for automatic depression detection. By fine-tuning the model using diagnostic labels and harnessing the linguistic knowledge encoded in the LLMs, the proposed architecture enhances the system&#x00027;s ability to identify depressive symptoms from textual input.</p>
<p>The following is a summary of this research&#x00027;s main contributions.</p>
<list list-type="order">
<list-item><p>Introducing a dual-stream transfer learning fusion framework that leverages the complementary strengths of two pre-trained large language models (LLMs), BERT, and T5, combined with 1D convolutional and dense neural networks, enabling robust depression detection by capturing diverse linguistic representations.</p></list-item>
<list-item><p>Designing a lightweight logical &#x0201C;AND fusion&#x0201D; strategy for integrating the outputs of all four branches as a conservative agreement based decision rule, with the aim of enhancing prediction reliability and precision rather than introducing a novel ensemble mechanism.</p></list-item>
<list-item><p>Conducting comprehensive ablation studies to evaluate the individual and combined contributions of BERT and T5 embeddings, as well as different architectural variants, providing deeper insights into their effectiveness for clinical depression detection.</p></list-item>
<list-item><p>Benchmarking the proposed method against common machine learning models (including traditional machine learning models and deep learning models), as well as the state-of-the-art studies on the E-DAIC dataset. The findings of this study offer a reference for future study on the application of LLMs in related domains.</p></list-item>
</list>
<p>This paper&#x00027;s remaining sections are arranged as follows: Section 2 provides an overview of previous studies pertinent to this research; Section 3 describes the workflow, dataset, data processing steps, and model structure; Section 4 outlines the experiment configurations and evaluation criteria; Section 5 presents and discusses the experimental results; and finally, Section 6 concludes the study with final remarks.</p></sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<p>Speech text provides valuable insights for depression evaluation, which motivates researchers to develop automatic depression detection methods using textual data. Among the deep learning approaches, 1DCNN and variants of Recurrent Neural Networks (RNNs), such as Long Short-Term Memory (LSTM) and BiLSTM, have been widely employed for these tasks.</p>
<p>Convolutional neural networks, initially proposed for image processing (<xref ref-type="bibr" rid="B16">LeCun et al., 1989</xref>), were utilized for one-dimensional data in the mid-2010s, tackling difficulties in natural language processing, time-series analysis, and signal processing. An early milestone in this direction was the work of Kim, who applied 1DCNNs to text classification in 2014 (<xref ref-type="bibr" rid="B15">Kim, 2014</xref>).</p>
<p>Complementing CNNs, Long Short-Term Memory (LSTM) networks, proposed by <xref ref-type="bibr" rid="B14">Hochreiter and Schmidhuber (1997)</xref>, addressed the vanishing gradient issue of traditional RNNs, allowing the effective learning of long- and short-term correlations. Building on LSTM, <xref ref-type="bibr" rid="B11">Graves and Schmidhuber (2005)</xref> proposed BiLSTM, which combines forward and backward LSTMs to capture bidirectional dependencies, making it suitable for sequential data processing tasks, e.g., speech recognition, automatic translation, and classifications.</p>
<p>In 2023, Wani et al. explored depression screening using Word2Vec (<xref ref-type="bibr" rid="B7">Church, 2017</xref>) and TF-IDF (<xref ref-type="bibr" rid="B3">Baena-Garc&#x000ED;a et al., 2011</xref>) features combined with CNN and LSTM models (<xref ref-type="bibr" rid="B2">Ahmad Wani et al., 2023</xref>). They achieved an accuracy of 99.01% with Word2Vec (CNN &#x0002B; LSTM) on data sourced from Facebook, Twitter, and YouTube. However, one of the study&#x00027;s limitations is its dependence on non-clinically diagnosed data, raising concerns about its applicability in clinical settings.</p>
<p>To address limitations in data and feature extraction, transfer learning techniques have emerged as a promising approach across various research domains and have shown effectiveness in a range of language-related applications.</p>
<p>The BERT model (<xref ref-type="bibr" rid="B8">Devlin et al., 2019</xref>), pre-trained on massive corpora such as BooksCorpus (<xref ref-type="bibr" rid="B33">Zhu et al., 2015</xref>) and English Wikipedia, has become an essential building block of transfer learning techniques. Such proficiency in learning knowledge from long and uninterrupted texts through document-level training has been key to its applicability in tasks that need deep contextual understanding. For instance, <xref ref-type="bibr" rid="B19">Milintsevich et al. (2023)</xref> applied the RoBERTa (Liu et al., <xref ref-type="bibr" rid="B9">2019</xref>) model, an adaptation of BERT, for depression prediction based on clinical transcripts from the DAIC dataset. According to their study, depression detection revealed a macro-F1 score of 73.9 within a binary setting, citing the need for clinically validated datasets.</p>
<p><xref ref-type="bibr" rid="B30">Zhang and Guo (2024)</xref> presented the MDSD-FGPL algorithm, integrating BERT and T5 encoders in fine-grained prompt learning, through a research project carried out in 2024. The multi-tier detection approach yielded an F1-score of 0.8276 for binary classification, hence pointing out the advantages of encoder integration. <xref ref-type="bibr" rid="B12">Hadikhah Mozhdehi and Eftekhari Moghadam (2023)</xref> applied Emotional BERT toward tasks of emotion recognition on the Wang and MELD datasets through another investigation, further proving the effectiveness of transfer learning in deriving contextual knowledge from minimal datasets.</p>
<p>The T5 model, described by <xref ref-type="bibr" rid="B20">Raffel et al. (2020)</xref>, has been widely applied to a variety of text-centered analytical tasks (<xref ref-type="bibr" rid="B4">Bao et al., 2024</xref>). The model&#x00027;s flexibility is further demonstrated by the Sensory-T5 version presented by <xref ref-type="bibr" rid="B31">Zhao et al. (2025)</xref>, which incorporates sensory information to enhance emotion classification accuracy. This methodology has led to significant precision and F1 score improvement on various datasets.</p>
<p>Inspired by a similar effort, <xref ref-type="bibr" rid="B18">Lu et al. (2025)</xref> also fine-tuned T5 for aspect-based sentiment analysis (ABSA) and achieved remarkable improvement through their application of data augmentation techniques and implicit rationale-driven information management. <xref ref-type="bibr" rid="B6">Chawla et al. (2024)</xref> also identified emotional dimensions that influence the negotiation process, particularly with regard to outcome predictions such as satisfaction and perceptions of partners. Incorporating the CaSiNo dataset with emoticons, Linguistic Inquiry and Word Count, and T5 models, they conducted a comparison whereby T5-Reddit performed better than other models in detecting subtle emotional expressions.</p>
<p>As impressive as the breakthroughs achieved through leveraging pre-trained models have been for various text analytical applications, most available works tend to be based upon a single model developed for a specific architecture or a single configuration of tasks. Even though these models have delivered impressive performances, overreliance upon a solitary model has the potential to compromise generalizability, especially when applied in cases like detecting clinical depression, where annotated data are scarce and contextual understanding is critical.</p>
<p>Considering these limitations, the current work proposes an innovative dual-language-model integration: a methodological strategy that combines the benefits of two sizable pre-trained language models, BERT and T5. The models have complementary characteristics in structural design and learning methods. BERT is an encoder-based model that is particularly designed to handle bidirectional contexts. The use of contextual features in a sentence reinforces its ability to capture subtle semantic relationships and complex syntactic relationships. On the other hand, T5 is a text-to-text transformer model based on a combined encoder-decoder model that is ideally suited to generate and reconstruct text content, thus demonstrating a better ability to generalize and abstract across diverse tasks. By combining the models, the current approach leverages the accuracy of BERT in producing contextual embeddings (Devlin et al., <xref ref-type="bibr" rid="B8">2019</xref>) and the versatility of T5 in capturing task-agnostic patterns through its unified text-to-text framework (<xref ref-type="bibr" rid="B20">Raffel et al., 2020</xref>), ultimately leading to more accurate and efficient detection of depression from transcribed speech.</p></sec>
<sec id="s3">
<label>3</label>
<title>Methodology</title>
<p>The initial step involves pre-processing raw text transcripts, which is integrated into the proposed framework. As illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref>, the framework takes advantage of the collaborative strengths of transformer-based pre-trained large language models, specifically BERT and T5, both of which are highly reputed for their efficiencies in natural language processing. These models possess the capability to learn subtle linguistic patterns, grammatical structures, and other language attributes (<xref ref-type="bibr" rid="B22">Rodr&#x000ED;guez-Ib&#x000E1;nez et al., 2023</xref>), and are assumed to capture transferable knowledge.</p>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>The workflow of the proposed method.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-08-1651290-g0001.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a text classification model using BERT and T5 tokenizers. It shows tokenization, extraction of last hidden states, and an ensemble of 1D Convolutional Neural Networks (1DCNN) and dense neural networks. The ensemble combines outputs for prediction. Detailed diagrams of 1DCNN, showing convolutional operations, max pooling, and dense layers, and the dense neural network with 128 neurons are included.</alt-text>
</graphic>
</fig>
<p>To effectively utilize this pre-trained knowledge for our task, both models are fine-tuned using pre-processed data tailored to depression prediction. The processed text is fed into the fine-tuned BERT and T5 encoders, generating two distinct sets of embeddings. These embeddings are subsequently passed through parallel 1DCNN and Dense branches to extract task-relevant features and produce preliminary predictions.</p>
<p>The resulting predictions are then passed to a fusion module that performs a logical &#x0201C;AND&#x0201D; operation for the final output.</p>
<sec>
<label>3.1</label>
<title>Dataset and pre-processing</title>
<p>Clinically diagnosed datasets are scarce, as most publicly available datasets are derived from social media posts where labels are self-claimed. While these datasets offer valuable insights, they lack the reliability of clinically confirmed ground truth. Some datasets are built from samples collected from clinically diagnosed MDD patients; however, the majority of these are not publicly accessible. In contrast, the E-DAIC dataset is not only clinically reliable but also publicly available, with baseline models provided to enable fair comparisons of model performance.</p>
<p>The E-DAIC dataset, developed from clinical interviews, is designed to aid in diagnosing psychological disorders including anxiety, depression, and post-traumatic stress disorder (PTSD). It includes recordings of the conversations between the interviewers and the research subjects, along with transcripts capturing both parties&#x00027; spoken words, providing a rich linguistic context for analysis. Compared to the DAIC dataset, E-DAIC offers a larger sample size, which enhances its utility for research.</p>
<p>The dataset includes recordings from 275 subjects, 66 (35 males, 31 females) of whom were diagnosed with depression at the time of recording, while 209 (135 males, 74 females) were categorized as healthy controls (HCs). PHQ-8 scores for each subject are provided. A subset of the sessions in the E-DAIC dataset was collected semi-automatically, with a virtual interviewer controlled by a human. The rest were gathered using an AI-controlled agent, which uses automated modules for perception and behavior generation to function completely independently. The subjects comprised U.S. military veterans and recruits from the general public in the Los Angeles area.</p>
<p>Detailed summaries of the data collection methods and participant demographics are provided in <xref ref-type="table" rid="T1">Tables 1</xref>, <xref ref-type="table" rid="T2">2</xref>.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Dataset information of the E-DAIC.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Attribute</bold></th>
<th valign="top" align="left"><bold>Details</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Disorders</td>
<td valign="top" align="left">Depression, PTSD</td>
</tr>
<tr>
<td valign="top" align="left">Diagnosis basis</td>
<td valign="top" align="left">PHQ-8, PCL-C</td>
</tr>
<tr>
<td valign="top" align="left">Number of participants</td>
<td valign="top" align="left">275</td>
</tr>
<tr>
<td valign="top" align="left">Data modalities</td>
<td valign="top" align="left">Visual, Audio, Text</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>PCL-C, PTSD Checklist - Civilian Version.</p>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Gender and MDD proportion of the E-DAIC dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Gender</bold></th>
<th valign="top" align="center"><bold>HC</bold></th>
<th valign="top" align="center"><bold>MDD</bold></th>
<th valign="top" align="center"><bold>Positive ratio</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Male</td>
<td valign="top" align="center">135</td>
<td valign="top" align="center">35</td>
<td valign="top" align="center">20.6%</td>
</tr>
<tr>
<td valign="top" align="left">Female</td>
<td valign="top" align="center">74</td>
<td valign="top" align="center">31</td>
<td valign="top" align="center">29.5%</td>
</tr>
<tr>
<td valign="top" align="left">Total</td>
<td valign="top" align="center">209</td>
<td valign="top" align="center">66</td>
<td valign="top" align="center">24.0%</td>
</tr></tbody>
</table>
</table-wrap>
<p>The following preprocessing steps were implemented:</p>
<list list-type="bullet">
<list-item><p>Trimming irrelevant sections: the interview scripts&#x00027; initial 90 s and last 40 s were eliminated. These portions predominantly contained answers to introductory remarks, general questions (e.g., &#x0201C;Where are you from?&#x0201D;), greetings, and goodbyes, which were deemed irrelevant for the analysis.</p></list-item>
<list-item><p>Only participant utterances were retained for textual analysis, while interviewer speech was excluded.</p></list-item>
<list-item><p>Neutral sentence removal: sentences consisting of one neutral word only, e.g., &#x0201C;yes&#x0201D; or &#x0201C;ok,&#x0201D; were excluded, as they lacked meaningful information for classification.</p></list-item>
<list-item><p>Labeling: the scripts were assigned labels based on PHQ-8 binary scores, with &#x0201C;1&#x0201D; indicating MDD and &#x0201C;0&#x0201D; indicating HC.</p></list-item>
<list-item><p>Balancing classes: to achieve an equal distribution between MDD and HC samples and prevent model bias during training, we adopted a random undersampling strategy. Among the available options, we chose undersampling to preserve the full set of clinically diagnosed MDD cases. Ensuring that all MDD samples originate from genuine, clinically diagnosed patients is crucial in medically sensitive contexts such as ours (<xref ref-type="bibr" rid="B10">Fern&#x000E1;ndez et al., 2015</xref>). In contrast, upsampling methods typically introduce synthetic data, which may compromise the authenticity and clinical reliability of the dataset.</p></list-item>
</list>
<p><xref ref-type="table" rid="T3">Table 3</xref> provides examples of the preprocessed text and labels.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Examples of the pre-processed text data.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Text transcript</bold></th>
<th valign="top" align="center"><bold>Label</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">No, it&#x00027;s just too rough trying to pick up all the pieces</td>
<td valign="top" align="center">1</td>
</tr>
<tr>
<td valign="top" align="left">Sleeping all the time, eating too much, arguing, screaming at this</td>
<td valign="top" align="center">1</td>
</tr>
<tr>
<td valign="top" align="left">I just couldn&#x00027;t find work, and so I had to settle for doing that for right now</td>
<td valign="top" align="center">1</td>
</tr>
<tr>
<td valign="top" align="left">I applied from anywhere and everywhere</td>
<td valign="top" align="center">1</td>
</tr>
<tr>
<td valign="top" align="left">My parents just buried their daughter six months ago; they don&#x00027;t want to bury their other daughter</td>
<td valign="top" align="center">1</td>
</tr>
<tr>
<td valign="top" align="left">I play sports: volleyball, softball, biking, walking</td>
<td valign="top" align="center">0</td>
</tr>
<tr>
<td valign="top" align="left">I&#x00027;m a little more rigid than most people, but it was okay, not bad</td>
<td valign="top" align="center">0</td>
</tr>
<tr>
<td valign="top" align="left">Maybe more disciplined. I can take orders very well. I&#x00027;m not afraid of most situations</td>
<td valign="top" align="center">0</td>
</tr>
<tr>
<td valign="top" align="left">He knows exactly what I&#x00027;m going through, and he&#x00027;s one hundred percent behind me, and I love him very much</td>
<td valign="top" align="center">0</td>
</tr>
<tr>
<td valign="top" align="left">Yeah, I mean they&#x00027;ve always gave me great advice</td>
<td valign="top" align="center">0</td>
</tr></tbody>
</table>
</table-wrap>
<p>The E-DAIC dataset was selected for the current study since it is clinically trustworthy and publicly accessible. However, its small sample size makes it difficult to train deep learning models from scratch. To address this weakness, transfer learning was used to improve not only performance but also generalization ability. By leveraging knowledge embedded in pre-trained models, transfer learning considerably expands exploration of the E-DAIC dataset. Transfer learning is applied in the current project by fine-tuning transformer-based LLMs: BERT and T5.</p></sec>
<sec>
<label>3.2</label>
<title>BERT and T5</title>
<p>Both BERT and T5 are pre-trained large language models (LLMs) based on the Transformer architecture. They act as contextual feature extractors for input text, generating dense vector representations. <xref ref-type="table" rid="T4">Table 4</xref> summarizes the architectural and training differences between the original Transformer, B-base (BERT-base), and T-base (T5-base).</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Key differences between transformer, BERT, and T5.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Aspect</bold></th>
<th valign="top" align="left"><bold>Transformer base</bold></th>
<th valign="top" align="left"><bold>B base (BERT)</bold></th>
<th valign="top" align="left"><bold>T base (T5)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Architecture</td>
<td valign="top" align="left">Encoder-Decoder</td>
<td valign="top" align="left">Encoder-only</td>
<td valign="top" align="left">Encoder-Decoder</td>
</tr>
<tr>
<td valign="top" align="left">Contextualization</td>
<td valign="top" align="left">Unidirectional encoder and decoder</td>
<td valign="top" align="left">Bidirectional encoder</td>
<td valign="top" align="left">Bidirectional encoder, Unidirectional decoder</td>
</tr>
<tr>
<td valign="top" align="left">Pre-training objective</td>
<td valign="top" align="left">None (original)</td>
<td valign="top" align="left">MLM &#x0002B; NSP</td>
<td valign="top" align="left">Span Corruption</td>
</tr>
<tr>
<td valign="top" align="left">Model size (parameters)</td>
<td valign="top" align="left">65M</td>
<td valign="top" align="left">110M</td>
<td valign="top" align="left">220M</td>
</tr></tbody>
</table>
</table-wrap>
<p>Let the input sequence be represented as:</p>
<disp-formula id="EQ1"><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>X</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">{</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">}</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:mtext>&#x01D54B;</mml:mtext><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(1)</label></disp-formula>
<p>where &#x01D54B; denotes the token space. The contextual embeddings extracted by each model are denoted by:</p>
<disp-formula id="EQ2"><mml:math id="M2"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>E</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>B</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:mtext class="textrm" mathvariant="normal">BERT embeddings</mml:mtext><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(2)</label></disp-formula>
<disp-formula id="EQ3"><mml:math id="M3"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>E</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>T</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:mtext class="textrm" mathvariant="normal">T5 embeddings</mml:mtext><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(3)</label></disp-formula>
<p>where</p>
<disp-formula id="EQ4"><mml:math id="M4"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>E</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">{</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>h</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>h</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">}</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>h</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(4)</label></disp-formula>
<disp-formula id="EQ5"><mml:math id="M5"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>E</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">{</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>h</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>h</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">}</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>h</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msup><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(5)</label></disp-formula>
<p>To obtain fixed-size embeddings, we apply mean pooling across each token sequence:</p>
<disp-formula id="EQ6"><mml:math id="M6"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>e</mml:mtext></mml:mstyle></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>h</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(6)</label></disp-formula>
<disp-formula id="EQ7"><mml:math id="M7"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>e</mml:mtext></mml:mstyle></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>h</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(7)</label></disp-formula>
<p>where <inline-formula><mml:math id="M8"><mml:mrow><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>e</mml:mtext></mml:mstyle></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:msup><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> and <inline-formula><mml:math id="M9"><mml:mrow><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>e</mml:mtext></mml:mstyle></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> denote the sentence-level embeddings for the input <italic>X</italic> from BERT and T5, respectively.</p>
<p>These vectors are saved in NumPy format (<monospace>.npy</monospace>) and serve as inputs to downstream classifiers. The embedding extraction process is summarized in <xref ref-type="statement" rid="algo1">Algorithm 1</xref>.</p>
<statement content-type="algorithm" id="algo1">
<label>Algorithm 1</label>
<title>Embedding generation using pre-trained LLMs.</title>
<p>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-08-1651290-i0001.tif"/>
</p>
</statement>
</sec>
<sec>
<label>3.3</label>
<title>Classification and fusion</title>
<p>To classify the embeddings, we use two neural network classifiers:</p>
<list list-type="bullet">
<list-item><p><italic>M</italic><sub><italic>C</italic></sub>: a 1D Convolutional Neural Network (1DCNN),</p></list-item>
<list-item><p><italic>M</italic><sub><italic>D</italic></sub>: a fully connected Dense Neural Network (DNN).</p></list-item>
</list>
<p>The 1DCNN <italic>M</italic><sub><italic>C</italic></sub> operates on the input vector <italic>x</italic> &#x02208; &#x0211D;<sup><italic>d</italic></sup> with kernel <italic>w</italic> &#x02208; &#x0211D;<sup><italic>m</italic></sup>, producing an output <italic>y</italic><sub><italic>i</italic></sub> at position <italic>i</italic> as:</p>
<disp-formula id="EQ8"><mml:math id="M10"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>j</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000B7;</mml:mo><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(8)</label></disp-formula>
<p>Each model computes a probability score indicating the likelihood of depression.</p>
<disp-formula id="EQ9"><mml:math id="M11"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>e</mml:mtext></mml:mstyle></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>D</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>e</mml:mtext></mml:mstyle></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(9)</label></disp-formula>
<disp-formula id="EQ10"><mml:math id="M12"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>e</mml:mtext></mml:mstyle></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>D</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>e</mml:mtext></mml:mstyle></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(10)</label></disp-formula>
<p>A final prediction is made using a conservative logical AND fusion:</p>
<disp-formula id="EQ11"><mml:math id="M13"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x00177;</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">final</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x01D540;</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x0003E;</mml:mo><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02227;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x0003E;</mml:mo><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02227;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo>&#x0003E;</mml:mo><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02227;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msub><mml:mo>&#x0003E;</mml:mo><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(11)</label></disp-formula>
<p>where &#x003C4; &#x02208; [0, 1] is the decision threshold (default &#x003C4; &#x0003D; 0.5), and &#x01D540;[&#x000B7;] is the indicator function returning 1 if the condition is true. A sample is labeled as MDD only if all four models agree.</p>
<p>The classification and fusion procedure is summarized in <xref ref-type="statement" rid="algo2">Algorithm 2</xref>.</p>
<statement content-type="algorithm" id="algo2">
<label>Algorithm 2</label>
<title>Neural network fusion</title>
<p>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-08-1651290-i0002.tif"/>
</p>
</statement>
</sec></sec>
<sec id="s4">
<label>4</label>
<title>Experiment configurations and evaluation criteria</title>
<p>The experiment&#x00027;s hardware is a workstation with an AMD Ryzen 9 CPU and 64 GB RAM. The software environment is the Windows 11 operating system, Jupyter IDE, and Python 3.11. The models were trained using a CPU.</p>
<p>The experimental results are assessed using the following metrics: accuracy, precision, recall, and F1 score. These metrics offer a thorough comprehension of the model&#x00027;s functionality. The equations for these metrics are defined as follows:</p>
<disp-formula id="EQ12"><mml:math id="M19"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">Accuracy</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">TN</mml:mtext></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">FP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">TN</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">FN</mml:mtext></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(12)</label></disp-formula>
<disp-formula id="EQ13"><mml:math id="M20"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">Precision</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">FP</mml:mtext></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(13)</label></disp-formula>
<disp-formula id="EQ14"><mml:math id="M21"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">Recall</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">FN</mml:mtext></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(14)</label></disp-formula>
<disp-formula id="EQ15"><mml:math id="M22"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">F1</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mtext class="textrm" mathvariant="normal">precision</mml:mtext><mml:mo>&#x000D7;</mml:mo><mml:mtext class="textrm" mathvariant="normal">recall</mml:mtext></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">precision</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">recall</mml:mtext></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(15)</label></disp-formula>
<p>where TP, TN, FP, and FN represent the counts of true positives, true negatives, false positives, and false negatives in the predictions, respectively. Additionally, the study involves plotting the Receiver Operating Characteristic (ROC) curves for the learning models for comparison purposes. The ROC curve depicts the true positive rate (TPR) vs. the false positive rate (FPR) at various threshold settings. Furthermore, the area under the curve (AUC) was calculated. An increased AUC signifies improved overall performance.</p></sec>
<sec id="s5">
<label>5</label>
<title>Experimental results</title>
<p>In this section, we examine and compare the traditional text feature extraction algorithms and transfer learning-based text embeddings using both conventional ML and deep learning models. The study evaluates six conventional machine learning models&#x02013;Logistic Regression (LR), Support Vector Machine (SVM), Random Forest (RF), Naive Bayes (NB), K-Nearest Neighbor (KNN), and Decision Tree (DT)&#x02013;alongside six deep learning models designed for one-dimensional text data analysis. The deep learning models include a fully connected dense neural network, a 1DCNN, a recurrent neural network (RNN), a gated recurrent unit (GRU), a long short-term memory (LSTM) network, and a bidirectional long short-term memory network (BiLSTM).</p>
<sec>
<label>5.1</label>
<title>Experiments of traditional text feature extraction algorithms</title>
<p>To evaluate transfer learning&#x00027;s efficacy in improving models&#x00027; performance in this task, we first adopted the commonly used traditional text feature extraction algorithms, namely term frequency-inverse document frequency (TF-IDF), Keras frequency-based tokenizer (KFT), Bag of Words (BoW), and N-grams on both conventional MLs and deep learning models.</p>
<p><xref ref-type="fig" rid="F2">Figure 2</xref> presents the ROC curves generated using traditional text features for both conventional ML models and deep learning models. Among the conventional ML models, RF and DT demonstrated superior performance, with RF achieving the highest AUC of 94.5% when using KFT features. In contrast, the deep learning models generally underperformed, with the notable exception of the fully connected Dense Neural Network, which achieved an AUC of 94.0% using BoW features.</p>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>ROC of models using traditional text feature extraction algorithms. The upper panel shows results from conventional machine learning models, while the lower panel shows results from deep learning models.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-08-1651290-g0002.tif">
<alt-text content-type="machine-generated">The image contains two groups of ROC curve comparisons, each with four panels. The top group, labeled &#x0201C;Traditional text feature with conventional ML models,&#x0201D; compares logistic regression, SVM, random forest, naive Bayes, KNN, and decision tree models using TF-IDF, BoW, KFT, and n-gram methods. The bottom group, labeled &#x0201C;Traditional text feature with deep learning models,&#x0201D; compares Dense, 1D CNN, RNN, GRU, LSTM, and BiLSTM models using the same methods. Each panel shows true positive rates versus false positive rates, with AUC scores indicated for each model.</alt-text>
</graphic>
</fig>
<p>The RF model excels with a higher true positive rate (TPR) at lower false positive rates (FPR), reflecting its efficiency in correctly identifying positive samples. This balance between sensitivity and specificity underscores its effectiveness as a classifier. Furthermore, RF consistently achieves the highest AUC across various feature extraction methods, demonstrating its robustness to diverse data representations and its superior ability to distinguish between classes.</p>
<p>The deep learning models, with the exception of dense neural networks constructed solely with dense layers, showed poor performance when tested with traditional text feature extraction algorithms. The results of the other deep learning models are not reported, as their poor performance suggests that combining traditional text feature extraction algorithms with these models is not an effective approach for this task. Among the tested models, the dense neural network using BoW achieved an accuracy of 85.9%, outperforming all other deep learning models.</p>
<p><xref ref-type="table" rid="T5">Table 5</xref> reports the accuracy, precision, recall, and F1-score of the learning models using traditional text feature extraction algorithms. As shown, the highest accuracy was achieved by RF using TF-IDF, with an accuracy of 88.3%. This method (TF-IDF &#x0002B; RF) is selected as the baseline for evaluating the effectiveness of transfer learning. Overall, the results indicate that conventional ML models outperform deep learning models when traditional extraction algorithms are applied.</p>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>The performance of learning models with traditional text feature algorithms (Non-TL).</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="center"><bold>Feature</bold></th>
<th valign="top" align="center"><bold>Accuracy (%)</bold></th>
<th valign="top" align="center"><bold>Precision (%)</bold></th>
<th valign="top" align="center"><bold>Recall (%)</bold></th>
<th valign="top" align="center"><bold>F1 (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">LR</td>
<td valign="top" align="center">TF-IDF</td>
<td valign="top" align="center">73.0</td>
<td valign="top" align="center">72.8</td>
<td valign="top" align="center">72.7</td>
<td valign="top" align="center">72.7</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">KFT</td>
<td valign="top" align="center">65.1</td>
<td valign="top" align="center">64.9</td>
<td valign="top" align="center">64.6</td>
<td valign="top" align="center">64.6</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">BoW</td>
<td valign="top" align="center">77.9</td>
<td valign="top" align="center">77.9</td>
<td valign="top" align="center">77.5</td>
<td valign="top" align="center">77.6</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">N-gram</td>
<td valign="top" align="center"><bold>81.0</bold></td>
<td valign="top" align="center"><bold>80.9</bold></td>
<td valign="top" align="center"><bold>80.8</bold></td>
<td valign="top" align="center"><bold>80.8</bold></td>
</tr>
<tr>
<td valign="top" align="left">SVM</td>
<td valign="top" align="center">TF-IDF</td>
<td valign="top" align="center">75.7</td>
<td valign="top" align="center">75.7</td>
<td valign="top" align="center">75.8</td>
<td valign="top" align="center">75.6</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">KFT</td>
<td valign="top" align="center">64.9</td>
<td valign="top" align="center">64.7</td>
<td valign="top" align="center">64.6</td>
<td valign="top" align="center">64.7</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">BoW</td>
<td valign="top" align="center">81.2</td>
<td valign="top" align="center">81.2</td>
<td valign="top" align="center">81.0</td>
<td valign="top" align="center">81.0</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">N-gram</td>
<td valign="top" align="center"><bold>83.5</bold></td>
<td valign="top" align="center"><bold>83.4</bold></td>
<td valign="top" align="center"><bold>83.3</bold></td>
<td valign="top" align="center"><bold>83.4</bold></td>
</tr>
<tr>
<td valign="top" align="left">RF</td>
<td valign="top" align="center">TF-IDF</td>
<td valign="top" align="center"><bold>88.3</bold><sup>&#x0002A;</sup></td>
<td valign="top" align="center"><bold>88.3</bold><sup>&#x0002A;</sup></td>
<td valign="top" align="center"><bold>88.3</bold><sup>&#x0002A;</sup></td>
<td valign="top" align="center"><bold>88.3</bold><sup>&#x0002A;</sup></td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">KFT</td>
<td valign="top" align="center">86.8</td>
<td valign="top" align="center">86.8</td>
<td valign="top" align="center">86.8</td>
<td valign="top" align="center">86.8</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">BoW</td>
<td valign="top" align="center">86.1</td>
<td valign="top" align="center">86.1</td>
<td valign="top" align="center">86.1</td>
<td valign="top" align="center">86.1</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">N-gram</td>
<td valign="top" align="center">86.7</td>
<td valign="top" align="center">86.6</td>
<td valign="top" align="center">86.7</td>
<td valign="top" align="center">86.6</td>
</tr>
<tr>
<td valign="top" align="left">NB</td>
<td valign="top" align="center">TF-IDF</td>
<td valign="top" align="center"><bold>74.6</bold></td>
<td valign="top" align="center"><bold>74.6</bold></td>
<td valign="top" align="center"><bold>74.7</bold></td>
<td valign="top" align="center"><bold>74.5</bold></td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">KFT</td>
<td valign="top" align="center">59.2</td>
<td valign="top" align="center">65.0</td>
<td valign="top" align="center">56.3</td>
<td valign="top" align="center">50.5</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">BoW</td>
<td valign="top" align="center">74.1</td>
<td valign="top" align="center">74.1</td>
<td valign="top" align="center">74.3</td>
<td valign="top" align="center">74.1</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">N-gram</td>
<td valign="top" align="center">73.7</td>
<td valign="top" align="center">73.5</td>
<td valign="top" align="center">73.5</td>
<td valign="top" align="center">73.5</td>
</tr>
<tr>
<td valign="top" align="left">KNN</td>
<td valign="top" align="center">TF-IDF</td>
<td valign="top" align="center">57.1</td>
<td valign="top" align="center">63.6</td>
<td valign="top" align="center">53.9</td>
<td valign="top" align="center">45.6</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">KFT</td>
<td valign="top" align="center">66.3</td>
<td valign="top" align="center">66.1</td>
<td valign="top" align="center"><bold>65.8</bold></td>
<td valign="top" align="center"><bold>65.8</bold></td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">BoW</td>
<td valign="top" align="center"><bold>66.9</bold></td>
<td valign="top" align="center"><bold>68.2</bold></td>
<td valign="top" align="center">65.4</td>
<td valign="top" align="center">64.9</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">N-gram</td>
<td valign="top" align="center">65.4</td>
<td valign="top" align="center">66.9</td>
<td valign="top" align="center">63.8</td>
<td valign="top" align="center">62.9</td>
</tr>
<tr>
<td valign="top" align="left">DT</td>
<td valign="top" align="center">TF-IDF</td>
<td valign="top" align="center">85.3</td>
<td valign="top" align="center">85.6</td>
<td valign="top" align="center">85.7</td>
<td valign="top" align="center">85.3</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">KFT</td>
<td valign="top" align="center">86.1</td>
<td valign="top" align="center">86.1</td>
<td valign="top" align="center">86.4</td>
<td valign="top" align="center">86.1</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">BoW</td>
<td valign="top" align="center">86.0</td>
<td valign="top" align="center">86.1</td>
<td valign="top" align="center">86.3</td>
<td valign="top" align="center">85.9</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">N-gram</td>
<td valign="top" align="center"><bold>86.7</bold></td>
<td valign="top" align="center"><bold>86.7</bold></td>
<td valign="top" align="center"><bold>86.7</bold></td>
<td valign="top" align="center"><bold>86.5</bold></td>
</tr>
<tr>
<td valign="top" align="left">Dense</td>
<td valign="top" align="center">TF-IDF</td>
<td valign="top" align="center">85.2</td>
<td valign="top" align="center">85.4</td>
<td valign="top" align="center">85.3</td>
<td valign="top" align="center">85.2</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">KFT</td>
<td valign="top" align="center">67.0</td>
<td valign="top" align="center">70.1</td>
<td valign="top" align="center">67.6</td>
<td valign="top" align="center">67.0</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">BoW</td>
<td valign="top" align="center"><bold>85.9</bold></td>
<td valign="top" align="center"><bold>86.0</bold></td>
<td valign="top" align="center"><bold>85.9</bold></td>
<td valign="top" align="center"><bold>85.9</bold></td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">N-gram</td>
<td valign="top" align="center">85.5</td>
<td valign="top" align="center">85.5</td>
<td valign="top" align="center">85.5</td>
<td valign="top" align="center">85.5</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>For each model, the highest scores are highlighted in bold.</p>
<p><sup>&#x0002A;</sup>denotes the highest performance across all learning models and text features.</p>
</table-wrap-foot>
</table-wrap></sec>
<sec>
<label>5.2</label>
<title>Experiments of transfer learning methods</title>
<p>Next, we investigate the effect of transfer learning by adopting two pre-trained models: BERT and T5. BERT represents an encoder-only model optimized for bidirectional contextual understanding, while T5 follows a sequence-to-sequence, text-to-text pre-training paradigm based on span corruption. Including both models allows us to examine whether generative-style pre-training yields complementary representations for depression-related language beyond encoder-only architectures. For a fair comparison, the embeddings generated by BERT and T5 were used to train the same conventional ML and deep learning models.</p>
<p>In <xref ref-type="table" rid="T6">Table 6</xref>, we observe that the Dense model performs best across most metrics when using BERT embeddings, while the 1DCNN outperforms all models with T5 embeddings, achieving the highest accuracy, F1-score, and AUC. Overall, the 1DCNN emerges as the most robust model, excelling particularly with T5 embeddings, while the dense layer consistently delivers strong performance with both embeddings, showing a slight edge with BERT. In contrast, RNN, LSTM, and BiLSTM models consistently underperform compared to other models regardless of the embedding type.</p>
<table-wrap position="float" id="T6">
<label>Table 6</label>
<caption><p>Comparison of the models using transfer learning.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="center"><bold>Embedding</bold></th>
<th valign="top" align="center"><bold>Accuracy (%)</bold></th>
<th valign="top" align="center"><bold>Precision (%)</bold></th>
<th valign="top" align="center"><bold>Recall (%)</bold></th>
<th valign="top" align="center"><bold>F1 (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">LR</td>
<td valign="top" align="center">BERT</td>
<td valign="top" align="center">69.04</td>
<td valign="top" align="center">66.31</td>
<td valign="top" align="center">67.03</td>
<td valign="top" align="center">66.67</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">T5</td>
<td valign="top" align="center"><bold>72.47</bold>&#x02193;</td>
<td valign="top" align="center"><bold>70.01</bold>&#x02193;</td>
<td valign="top" align="center"><bold>70.68</bold>&#x02193;</td>
<td valign="top" align="center"><bold>70.34</bold>&#x02193;</td>
</tr>
<tr>
<td valign="top" align="left">SVM</td>
<td valign="top" align="center">BERT</td>
<td valign="top" align="center">69.79</td>
<td valign="top" align="center">67.25</td>
<td valign="top" align="center">67.43</td>
<td valign="top" align="center">67.34</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">T5</td>
<td valign="top" align="center"><bold>71.29</bold>&#x02193;</td>
<td valign="top" align="center"><bold>68.87</bold>&#x02193;</td>
<td valign="top" align="center"><bold>69.05</bold>&#x02193;</td>
<td valign="top" align="center"><bold>68.96</bold>&#x02193;</td>
</tr>
<tr>
<td valign="top" align="left">RF</td>
<td valign="top" align="center">BERT</td>
<td valign="top" align="center">86.10</td>
<td valign="top" align="center">84.25</td>
<td valign="top" align="center">86.76</td>
<td valign="top" align="center">85.49</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">T5</td>
<td valign="top" align="center"><bold>87.27</bold>&#x02193;</td>
<td valign="top" align="center"><bold>85.17</bold>&#x02193;</td>
<td valign="top" align="center"><bold>87.70</bold>&#x02193;</td>
<td valign="top" align="center"><bold>86.42</bold>&#x02193;</td>
</tr>
<tr>
<td valign="top" align="left">NB</td>
<td valign="top" align="center">BERT</td>
<td valign="top" align="center">61.55</td>
<td valign="top" align="center">57.23</td>
<td valign="top" align="center">66.35</td>
<td valign="top" align="center">61.45</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">T5</td>
<td valign="top" align="center"><bold>63.17</bold>&#x02193;</td>
<td valign="top" align="center"><bold>58.87</bold>&#x02193;</td>
<td valign="top" align="center"><bold>67.30</bold>&#x02193;</td>
<td valign="top" align="center"><bold>62.80</bold>&#x02193;</td>
</tr>
<tr>
<td valign="top" align="left">KNN</td>
<td valign="top" align="center">BERT</td>
<td valign="top" align="center"><bold>70.66</bold></td>
<td valign="top" align="center"><bold>68.24</bold></td>
<td valign="top" align="center">68.24</td>
<td valign="top" align="center"><bold>68.24</bold></td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">T5</td>
<td valign="top" align="center">69.41</td>
<td valign="top" align="center">66.32</td>
<td valign="top" align="center"><bold>68.65</bold></td>
<td valign="top" align="center">67.46</td>
</tr>
<tr>
<td valign="top" align="left">DT</td>
<td valign="top" align="center">BERT</td>
<td valign="top" align="center"><bold>86.83</bold>&#x02191;</td>
<td valign="top" align="center"><bold>82.53</bold>&#x02193;</td>
<td valign="top" align="center"><bold>90.68</bold>&#x02191;</td>
<td valign="top" align="center"><bold>86.41</bold>&#x02193;</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">T5</td>
<td valign="top" align="center">85.58</td>
<td valign="top" align="center">80.99</td>
<td valign="top" align="center">89.86</td>
<td valign="top" align="center">85.20</td>
</tr>
<tr>
<td valign="top" align="left">Dense</td>
<td valign="top" align="center">BERT</td>
<td valign="top" align="center"><bold>87.33</bold>&#x02191;</td>
<td valign="top" align="center"><bold>84.38</bold>&#x02193;</td>
<td valign="top" align="center"><bold>89.05</bold> &#x02191;</td>
<td valign="top" align="center"><bold>86.65</bold>&#x02191;</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">T5</td>
<td valign="top" align="center">86.14</td>
<td valign="top" align="center">83.29</td>
<td valign="top" align="center">87.57</td>
<td valign="top" align="center">85.38</td>
</tr>
<tr>
<td valign="top" align="left">1DCNN</td>
<td valign="top" align="center">BERT</td>
<td valign="top" align="center"><bold>89.45</bold><sup>&#x0002A;</sup></td>
<td valign="top" align="center"><bold>88.02</bold><sup>&#x0002A;</sup></td>
<td valign="top" align="center"><bold>89.32</bold></td>
<td valign="top" align="center"><bold>88.67</bold><sup>&#x0002A;</sup></td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">T5</td>
<td valign="top" align="center">88.45</td>
<td valign="top" align="center">86.56</td>
<td valign="top" align="center">88.78</td>
<td valign="top" align="center">87.66</td>
</tr>
<tr>
<td valign="top" align="left">RNN</td>
<td valign="top" align="center">BERT</td>
<td valign="top" align="center"><bold>85.46</bold></td>
<td valign="top" align="center">80.88</td>
<td valign="top" align="center"><bold>89.73</bold></td>
<td valign="top" align="center"><bold>85.07</bold></td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">T5</td>
<td valign="top" align="center">77.65</td>
<td valign="top" align="center"><bold>82.15</bold></td>
<td valign="top" align="center">65.95</td>
<td valign="top" align="center">73.16</td>
</tr>
<tr>
<td valign="top" align="left">GRU</td>
<td valign="top" align="center">BERT</td>
<td valign="top" align="center"><bold>85.21</bold></td>
<td valign="top" align="center">78.74</td>
<td valign="top" align="center"><bold>93.11</bold><sup>&#x0002A;</sup></td>
<td valign="top" align="center"><bold>85.33</bold></td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">T5</td>
<td valign="top" align="center">85.02</td>
<td valign="top" align="center"><bold>81.65</bold></td>
<td valign="top" align="center">87.16</td>
<td valign="top" align="center">84.31</td>
</tr>
<tr>
<td valign="top" align="left">LSTM</td>
<td valign="top" align="center">BERT</td>
<td valign="top" align="center">67.04</td>
<td valign="top" align="center">64.64</td>
<td valign="top" align="center"><bold>63.24</bold></td>
<td valign="top" align="center"><bold>63.93</bold></td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">T5</td>
<td valign="top" align="center"><bold>67.29</bold></td>
<td valign="top" align="center"><bold>66.12</bold></td>
<td valign="top" align="center">59.86</td>
<td valign="top" align="center">62.84</td>
</tr>
<tr>
<td valign="top" align="left">BiLSTM</td>
<td valign="top" align="center">BERT</td>
<td valign="top" align="center">52.25</td>
<td valign="top" align="center">48.33</td>
<td valign="top" align="center">48.78</td>
<td valign="top" align="center">48.55</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">T5</td>
<td valign="top" align="center"><bold>72.97</bold></td>
<td valign="top" align="center"><bold>67.79</bold></td>
<td valign="top" align="center"><bold>79.05</bold></td>
<td valign="top" align="center"><bold>72.99</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The highest value for each model is highlighted in bold.</p>
<p>Arrows (&#x02191;/&#x02193;) indicate the increase or decrease compared to the highest score of the corresponding non-TL method for the same learning model. <sup>&#x0002A;</sup>denotes the highest performance across all learning models using TL embeddings.</p>
</table-wrap-foot>
</table-wrap>
<p>Traditional machine learning methods are routinely outperformed by deep learning models, especially 1DCNN and dense. 1DCNN (BERT) achieves the highest accuracy (89.45%) and F1-score (88.67%), highlighting its robustness in this task. Among traditional ML models, RF and DT perform better than the other models but do not show an increase compared to non-transfer learning models. While GRU (BERT) excels in recall (93.11%), minimizing false negatives, models like RNN and LSTM show moderate performance. Simpler models such as Naive Bayes (NB) exhibit significant drops in F1-score.</p>
<p>The comparison between non-TL and TL models reveals distinct performance patterns. Traditional models such as RF and DT show competitive results in the non-TL setting, particularly when combined with TF-IDF features, with RF achieving the highest scores in accuracy, precision, recall, and F1-score. This suggests these models are well-suited to conventional feature representations.</p>
<p>In contrast, TL models using BERT and T5 embeddings consistently outperform non-TL approaches. Among them, 1DCNN with BERT embeddings achieves the highest accuracy (89.45%) and F1-score (88.67%), surpassing the best non-TL setup (RF &#x0002B; TF-IDF: 88.3%). This highlights the benefit of leveraging contextual embeddings for improved classification accuracy.</p>
<p>As illustrated in <xref ref-type="fig" rid="F3">Figure 3</xref>, TL-based models produce ROC curves that shift further toward the upper-left corner, corresponding to higher AUC scores and improved true positive rates, especially at low false positive rates. This behavior underscores the increased sensitivity and robustness of transfer learning in text-based depression detection.</p>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>Performance of transfer learning using traditional machine learning models and deep learning models.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-08-1651290-g0003.tif">
<alt-text content-type="machine-generated">Four ROC curve comparison graphs showing model performance. Top left compares models with AUC scores: LR 0.775, SVM 0.710, RF 0.937, NB 0.655, KNN 0.771, DT 0.874. Top right compares Dense 0.943, 1DCNN 0.947, RNN 0.926, GRU 0.915, LSTM 0.798, BiLSTM 0.793. Bottom left compares LR 0.788, SVM 0.779, RF 0.933, NB 0.735, KNN 0.757, DT 0.861. Bottom right compares Dense 0.925, 1DCNN 0.954, RNN 0.926, GRU 0.811, LSTM 0.779, BiLSTM 0.795. Each graph plots True Positive Rate against False Positive Rate, including a diagonal reference line.</alt-text>
</graphic>
</fig>
<p>Given the effectiveness of 1DCNN and dense models using BERT and T5 embeddings in the above experiments, these models emerged as promising candidates for developing a superior approach to this task. As a result, we selected them for further experiments.</p>
<p>As illustrated in <xref ref-type="fig" rid="F4">Figure 4</xref>, the confusion matrix of the proposed dual-stream model demonstrates relatively balanced performance across both classes. Notably, the number of false positives (13) is substantially lower than that of false negatives (113), indicating that the model prioritizes precision. This helps minimize unnecessary referrals, although the comparatively larger number of false negatives implies a modest sacrifice in recall, which is often critical in early-stage depression screening.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>The confusion matrix and ROC of the proposed dual-stream transfer learning.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-08-1651290-g0004.tif">
<alt-text content-type="machine-generated">Confusion matrix and ROC curves comparing different models. The confusion matrix shows model performance with 849 true negatives, 627 true positives, 13 false positives, and 113 false negatives. The ROC curves display comparisons of several models: BERT &#x0002B; 1DCNN (AUC = 0.950), T5 &#x0002B; 1DCNN (AUC = 0.954), BERT &#x0002B; Dense (AUC = 0.916), T5 &#x0002B; Dense (AUC = 0.927), and Logical AND Fusion Proposed (AUC = 0.956), showing high performance with values close to the top-left corner.</alt-text>
</graphic>
</fig>
<p>In terms of ROC performance, the fusion approach&#x02013;integrating predictions from four branches (BERT &#x0002B; Dense, BERT &#x0002B; 1DCNN, T5 &#x0002B; Dense, and T5 &#x0002B; 1DCNN)&#x02013;achieved the highest AUC of 0.956, as shown in <xref ref-type="fig" rid="F4">Figure 4</xref>, outperforming each individual model. By shifting the ROC curve further toward the top-left corner, the fusion method yields a higher true positive rate at a lower false positive rate.</p></sec>
<sec>
<label>5.3</label>
<title>Ablation study</title>
<p>As observed, integrating BERT and T5 embeddings with 1DCNN and Dense architectures demonstrates strong discriminative ability, achieving high scores across all evaluation metrics and highlighting their potential as a promising approach. To further optimize the architecture, an ablation study was carried out to identify the best structure for the task.</p>
<p>When training BERT and T5 for this predictive task, baseline models were created to test the effectiveness of the embeddings without introducing architectural bias. BERT achieved impressive performance, recording an accuracy of 88.8%, an F1-score of 88.0%, and an AUC of 92.8%. Its strong recall of 89.5% indicates good proficiency in correctly classifying positive examples, making it well-suited to settings that require minimizing false negatives. In contrast, T5 had much lower performance scores, recording an accuracy of 64.1%, an F1-score of 63.1%, and an AUC of 65.6%. This performance gap highlights the limited ability of T5 embeddings on their own to distinguish classes effectively, motivating the consideration of additional architectures for overall quality improvement.</p>
<p>The addition of one-dimensional convolutional neural network (1DCNN) layers greatly enhanced the performance of the T5 model. The BERT &#x0002B; 1DCNN achieved an accuracy rate of 89.5%, which outperformed the 88.8% accuracy rate of the BERT model, and an equally impressive F1-score of 88.7%. Though its recall rate of 89.3% happened to fall marginally lower compared to several other models, it had an AUC of 95.1% and thereby lent indication of an impressive overall performance. Similarly, the T5 &#x0002B; 1DCNN achieved an accuracy of 88.5% and an F1-score of 87.7%, reflecting a level of performance that is overall quite similar to that of the BERT model. The T5 &#x0002B; 1DCNN model achieved an AUC of 95.5%, which demonstrated a small improvement from its BERT counterpart for that specific metric.</p>
<p>The use of dense neural networks, though not outperforming 1D CNN models on all metrics considered, still produced competitive outputs. The BERT &#x0002B; Dense model had an accuracy of 87.3%, an AUC of 93.1%, and an F1-score of 86.7%. On the other hand, T5 &#x0002B; Dense attained an accuracy of 86.1%, an AUC of 93.0%, and an F1-score of 85.4%. Lastly, our proposed approach sends embeddings from BERT and T5 through two parallel streams, each comprising two branches, one using a 1DCNN and another using a dense network, before their outputs are combined.</p>
<p>The integration of four model combinations: BERT &#x0002B; 1DCNN, T5 &#x0002B; 1DCNN, BERT &#x0002B; Dense, and T5 &#x0002B; Dense, demonstrated superior performance compared to any single configuration. The combined model achieved an accuracy of 91.3%, an F1-score of 90.0%, and an AUC of 95.6%. Notably, its precision reached 95.2%, indicating a strong ability to reduce false positives and enhance the reliability of detected depressive cases. This emphasis on precision helps minimize unnecessary referrals and ensures that individuals identified as depressed are highly likely to require clinical attention. We acknowledge that the moderate decline in recall (from 89.5 to 86.4%) represents the cost of this gain in precision; however, the overall improvement across the other metrics, particularly F1 and AUC, reflects balanced and discriminative performance.</p>
<p>The confusion matrix and ROC curves of the proposed method are shown in <xref ref-type="fig" rid="F4">Figure 4</xref>, while the accuracy, precision, recall, and F1-score are reported in <xref ref-type="table" rid="T7">Table 7</xref>. To provide further context on computational feasibility, we report training time and model size for each branch in <xref ref-type="table" rid="T7">Table 7</xref>. These values quantify the per-epoch cost and the number of trainable parameters. For the ensemble, TM and MS depend on the training strategy: if branches are trained sequentially, the cost is the sum of all branches, whereas in a parallel setup the cost corresponds to the maximum branch. Detailed runtime per epoch and reproducibility logs are available in the accompanying GitHub repository.</p>
<table-wrap position="float" id="T7">
<label>Table 7</label>
<caption><p>Ablation study.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="center"><bold>Acc</bold>.</th>
<th valign="top" align="center"><bold>Prec</bold>.</th>
<th valign="top" align="center"><bold>Rec</bold>.</th>
<th valign="top" align="center"><bold>F1</bold></th>
<th valign="top" align="center"><bold>AUC</bold></th>
<th valign="top" align="center"><bold>TM (s)</bold></th>
<th valign="top" align="center"><bold>MS (M)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">BERT</td>
<td valign="top" align="center">88.8</td>
<td valign="top" align="center">86.6</td>
<td valign="top" align="center"><bold>89.5</bold></td>
<td valign="top" align="center">88.0</td>
<td valign="top" align="center">92.8</td>
<td valign="top" align="center">1844.8</td>
<td valign="top" align="center">110</td>
</tr>
<tr>
<td valign="top" align="left">T5</td>
<td valign="top" align="center">64.1</td>
<td valign="top" align="center">68.3</td>
<td valign="top" align="center">64.1</td>
<td valign="top" align="center">63.1</td>
<td valign="top" align="center">65.6</td>
<td valign="top" align="center">4519.0</td>
<td valign="top" align="center">223</td>
</tr>
<tr>
<td valign="top" align="left">BERT &#x0002B; Dense</td>
<td valign="top" align="center">87.3</td>
<td valign="top" align="center">84.4</td>
<td valign="top" align="center">89.1</td>
<td valign="top" align="center">86.7</td>
<td valign="top" align="center">93.1</td>
<td valign="top" align="center">161.5<sup>a</sup></td>
<td valign="top" align="center">116</td>
</tr>
<tr>
<td valign="top" align="left">T5 &#x0002B; Dense</td>
<td valign="top" align="center">86.1</td>
<td valign="top" align="center">83.3</td>
<td valign="top" align="center">87.6</td>
<td valign="top" align="center">85.4</td>
<td valign="top" align="center">93.0</td>
<td valign="top" align="center">161.2<sup>a</sup></td>
<td valign="top" align="center">229</td>
</tr>
<tr>
<td valign="top" align="left">BERT &#x0002B; 1DCNN</td>
<td valign="top" align="center">89.5</td>
<td valign="top" align="center">88.0</td>
<td valign="top" align="center">89.3</td>
<td valign="top" align="center">88.7</td>
<td valign="top" align="center">95.1</td>
<td valign="top" align="center">49.0<sup>a</sup></td>
<td valign="top" align="center">111</td>
</tr>
<tr>
<td valign="top" align="left">T5 &#x0002B; 1DCNN</td>
<td valign="top" align="center">88.5</td>
<td valign="top" align="center">86.6</td>
<td valign="top" align="center">88.8</td>
<td valign="top" align="center">87.7</td>
<td valign="top" align="center">95.5</td>
<td valign="top" align="center">48.8<sup>a</sup></td>
<td valign="top" align="center">223</td>
</tr>
<tr>
<td valign="top" align="left">(Proposed)</td>
<td valign="top" align="center"><bold>91.3</bold></td>
<td valign="top" align="center"><bold>95.2</bold></td>
<td valign="top" align="center">86.4</td>
<td valign="top" align="center"><bold>90.0</bold></td>
<td valign="top" align="center"><bold>95.6</bold></td>
<td valign="top" align="center">N/A</td>
<td valign="top" align="center">N/A</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The highest value for each model is highlighted in bold.</p>
<p>TM, training time; MS, model size.</p>
<p><sup>a</sup>Training time of Dense or 1DCNN using extracted embeddings from BERT or T5.</p>
</table-wrap-foot>
</table-wrap></sec>
<sec>
<label>5.4</label>
<title>Comparison with the state-of-the-art methods</title>
<p>To provide a more comprehensive assessment of the proposed approaches, we additionally experimented with a Generative Pre-trained Transformer (GPT)-based model and a Temporal Graph Convolutional Network (TGCN) model, while also replicating several state-of-the-art baselines for direct comparison. <xref ref-type="table" rid="T8">Table 8</xref> provides a summary of the evaluation metrics.</p>
<table-wrap position="float" id="T8">
<label>Table 8</label>
<caption><p>Comparison with the state-of-the-art methods.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Study</bold></th>
<th valign="top" align="center"><bold>Method</bold></th>
<th valign="top" align="center"><bold>Accuracy (%)</bold></th>
<th valign="top" align="center"><bold>Precision (%)</bold></th>
<th valign="top" align="center"><bold>Recall (%)</bold></th>
<th valign="top" align="center"><bold>F1 (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B30">Zhang and Guo (2024)</xref></td>
<td valign="top" align="center">BERT, T5, Dense</td>
<td valign="top" align="center">89.13</td>
<td valign="top" align="center">80.0</td>
<td valign="top" align="center">85.71</td>
<td valign="top" align="center">82.76</td>
</tr>
<tr>
<td valign="top" align="left">Ours</td>
<td valign="top" align="center">BERT, T5, Dense</td>
<td valign="top" align="center">88.2</td>
<td valign="top" align="center">88.2</td>
<td valign="top" align="center"><bold>88.4</bold></td>
<td valign="top" align="center">88.2</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B19">Milintsevich et al. (2023)</xref></td>
<td valign="top" align="center">S-RoBERTa, BiLSTM</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">80.6</td>
</tr>
<tr>
<td valign="top" align="left">Ours</td>
<td valign="top" align="center">S-RoBERTa, BiLSTM</td>
<td valign="top" align="center">86.1</td>
<td valign="top" align="center">86.1</td>
<td valign="top" align="center">86.3</td>
<td valign="top" align="center">86.1</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B27">Villatoro-Tello et al. (2021)</xref></td>
<td valign="top" align="center">Multi-layer Perceptron</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">87.0</td>
<td valign="top" align="center">81.0</td>
<td valign="top" align="center">83.0</td>
</tr>
<tr>
<td valign="top" align="left">Ours</td>
<td valign="top" align="center">Multi-layer Perceptron</td>
<td valign="top" align="center">84.9</td>
<td valign="top" align="center">84.8</td>
<td valign="top" align="center">84.9</td>
<td valign="top" align="center">84.8</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B21">Rai et al. (2024)</xref></td>
<td valign="top" align="center">BERT, BiLSTM</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">83.0</td>
<td valign="top" align="center">83.0</td>
<td valign="top" align="center">83.0</td>
</tr>
<tr>
<td valign="top" align="left">Ours</td>
<td valign="top" align="center">BERT, BiLSTM</td>
<td valign="top" align="center">73.5</td>
<td valign="top" align="center">72.0</td>
<td valign="top" align="center">69.9</td>
<td valign="top" align="center">71.0</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B17">Li et al. (2024)</xref></td>
<td valign="top" align="center">Heterogeneous graph Att.</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">79.0</td>
<td valign="top" align="center">80.0</td>
<td valign="top" align="center">79.0</td>
</tr>
<tr>
<td valign="top" align="left">Ours</td>
<td valign="top" align="center">Heterogeneous graph Att.</td>
<td valign="top" align="center">77.7</td>
<td valign="top" align="center">78.4</td>
<td valign="top" align="center">77.0</td>
<td valign="top" align="center">77.2</td>
</tr>
<tr>
<td valign="top" align="left">Ours</td>
<td valign="top" align="center">LLaMA (GPT-based)</td>
<td valign="top" align="center">86.3</td>
<td valign="top" align="center">83.1</td>
<td valign="top" align="center">88.0</td>
<td valign="top" align="center">85.5</td>
</tr>
<tr>
<td valign="top" align="left">Ours</td>
<td valign="top" align="center">TGCN &#x0002B; BERT</td>
<td valign="top" align="center">74.3</td>
<td valign="top" align="center">71.2</td>
<td valign="top" align="center">74.6</td>
<td valign="top" align="center">72.8</td>
</tr>
<tr>
<td valign="top" align="left">Ours</td>
<td valign="top" align="center">TGCN &#x0002B; T5</td>
<td valign="top" align="center">77.8</td>
<td valign="top" align="center">75.5</td>
<td valign="top" align="center">76.9</td>
<td valign="top" align="center">76.2</td>
</tr>
<tr>
<td valign="top" align="left">Proposed</td>
<td valign="top" align="center">BERT, T5, Dense, 1DCNN</td>
<td valign="top" align="center"><bold>91.3</bold></td>
<td valign="top" align="center"><bold>95.2</bold></td>
<td valign="top" align="center">85.4</td>
<td valign="top" align="center"><bold>90.0</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Att.: Attention mechanism.</p>
</table-wrap-foot>
</table-wrap>
<p>Compared to previous works, our dual-stream fusion approach demonstrates stronger generalization by effectively combining contextual textual embeddings (from BERT and T5) with convolutional representations that capture local semantic patterns. The GPT-based model (LLaMA) also achieved competitive results (F1 = 85.5%), highlighting the potential of generative large language models for depression detection. The temporal graph (TGCN) variants incorporating BERT and T5 embeddings achieved moderate but consistent performance (F1 = 72.8 and 76.2%, respectively), suggesting that temporal graph reasoning can capture sequential dependencies in dialogue-level or time-series contexts.</p>
<p>When comparing against other state-of-the-art methods, the following two observations can be drawn:</p>
<list list-type="bullet">
<list-item><p>Transformers dominate: models such as BERT, T5, and S-RoBERTa continue to lead in performance owing to their superior contextual representation and transfer-learning capabilities.</p></list-item>
<list-item><p>Hybrid architectures remain strong: many competitive systems combine transformers with BiLSTM, dense, or convolutional layers to exploit both contextual depth and sequential or local feature learning.</p></list-item>
</list></sec></sec>
<sec id="s6">
<label>6</label>
<title>Discussion and conclusion</title>
<p>This study presents a dual-stream transfer learning framework for text-based depression detection, leveraging transformer-based large language models.</p>
<p>Transfer learning, particularly through BERT and T5, offers substantial advantages over non-TL approaches&#x02014;especially in tasks requiring nuanced interpretation of linguistic patterns. Although models such as RF and Dense perform competitively in the non-TL setting, they are outperformed by TL-based models in terms of recall and F1. This highlights the strength of contextual embeddings in capturing subtle depressive cues.</p>
<p>When T5 embeddings were processed through a 1DCNN architecture, performance improved significantly. The T5 &#x0002B; 1DCNN model achieved an accuracy of 88.5%, an F1 of 87.7%, and an AUC of 95.5%&#x02014;a substantial increase over standalone T5, which showed weaker performance (accuracy: 64.1%, F1: 63.1%, AUC: 65.6%). These results suggest that convolutional layers effectively complement T5 embeddings by extracting local sequential patterns that improve representation quality.</p>
<p>The integration of BERT and T5 embeddings through parallel Dense and 1DCNN branches led to further performance improvements. The resulting fusion model, which combines all four branches, achieved the best results across all evaluation metrics, including accuracy (91.3%), F1 score (90.0%), and AUC (95.6%). Its high precision (95.2%) indicates strong reliability in minimizing false positive predictions. Although high recall is critical for depression screening, the proposed model balances sensitivity with improved precision to reduce false alarms in decision-support use. It is important to emphasize that the combination of BERT and T5 is not intended to assert theoretical optimality of a specific model pairing. Instead, this design aims to reduce reliance on a single pre-training bias by leveraging heterogeneous semantic encoders. Such an agreement-oriented fusion strategy enhances robustness when modeling subtle and implicitly expressed linguistic markers of depression.</p>
<p>Compared to the best-performing non-TL model (RF &#x0002B; TF-IDF, accuracy: 88.3%), the proposed fusion method improved accuracy by 3.0%, demonstrating its superior effectiveness in real-world, low-resource clinical datasets.</p>
<p>In summary, transfer learning with pre-trained LLMs significantly enhances the capability of automated depression detection systems. By combining BERT and T5 within a dual-stream architecture, this study demonstrates a robust approach that outperforms both traditional and deep learning baselines. Importantly, the proposed framework has potential clinical relevance: it may support early-stage depression screening, reduce the burden on clinicians by providing automated pre-assessment tools, and enable scalable integration into telehealth and digital platforms for mental health monitoring.</p>
<p>Looking ahead, future research could explore the generalizability of this method to other mental health conditions, its extension to multimodal settings (e.g., combining text, audio, and video), and its deployment in clinical environments for scalable and early-stage mental health assessment.</p></sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>NW: Conceptualization, Methodology, Software, Writing &#x02013; original draft. WZ: Conceptualization, Methodology, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. RK: Supervision, Writing &#x02013; review &#x00026; editing. IR: Writing &#x02013; review &#x00026; editing. SA: Supervision, Writing &#x02013; review &#x00026; editing. NI: Conceptualization, Methodology, Writing &#x02013; original draft. ZZ: Software, Writing &#x02013; review &#x00026; editing.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s10">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Adam-Troian</surname> <given-names>J.</given-names></name> <name><surname>Bonetto</surname> <given-names>E.</given-names></name> <name><surname>Arciszewski</surname> <given-names>T.</given-names></name></person-group> (<year>2022</year>). <article-title>Using absolutist word frequency from online searches to measure population mental health dynamics</article-title>. <source>Sci. Rep</source>. <volume>12</volume>:<fpage>2619</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-022-06392-4</pub-id><pub-id pub-id-type="pmid">35173219</pub-id></mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ahmad Wani</surname> <given-names>M.</given-names></name> <name><surname>ELAffendi</surname> <given-names>M. A.</given-names></name> <name><surname>Shakil</surname> <given-names>K. A.</given-names></name> <name><surname>Shariq Imran</surname> <given-names>A.</given-names></name> <name><surname>Abd El-Latif</surname> <given-names>A. A.</given-names></name></person-group> (<year>2023</year>). <article-title>Depression screening in humans with ai and deep learning techniques</article-title>. <source>IEEE Trans. Comput. Soc. Syst</source>. <volume>10</volume>, <fpage>2074</fpage>&#x02013;<lpage>2089</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TCSS.2022.3200213</pub-id></mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Baena-Garc&#x000ED;a</surname> <given-names>M.</given-names></name> <name><surname>Carmona-Cejudo</surname> <given-names>J. M.</given-names></name> <name><surname>Castillo</surname> <given-names>G.</given-names></name> <name><surname>Morales-Bueno</surname> <given-names>R.</given-names></name></person-group> (<year>2011</year>). <article-title>&#x0201C;TF-SIDF: term frequency, sketched inverse document frequency,&#x0201D;</article-title> in <source>2011 11th International Conference on Intelligent Systems Design and Applications</source> (<publisher-loc>Cordoba</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1044</fpage>&#x02013;<lpage>1049</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ISDA.2011.6121796</pub-id></mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bao</surname> <given-names>E.</given-names></name> <name><surname>P&#x000E9;rez</surname> <given-names>A.</given-names></name> <name><surname>Parapar</surname> <given-names>J.</given-names></name></person-group> (<year>2024</year>). <article-title>Explainable depression symptom detection in social media</article-title>. <source>Health Inform. Sci. Syst.</source> <volume>12</volume>:<fpage>47</fpage>. doi: <pub-id pub-id-type="doi">10.1007/s13755-024-00303-9</pub-id><pub-id pub-id-type="pmid">39247905</pub-id></mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Beck</surname> <given-names>A. T.</given-names></name> <name><surname>Steer</surname> <given-names>R. A.</given-names></name> <name><surname>Ball</surname> <given-names>R.</given-names></name> <name><surname>Ranieri</surname> <given-names>W. F.</given-names></name></person-group> (<year>1996</year>). <article-title>Comparison of beck depression inventories-IA and-II in psychiatric outpatients</article-title>. <source>J. Pers. Assess</source>. <volume>67</volume>, <fpage>588</fpage>&#x02013;<lpage>597</lpage>. doi: <pub-id pub-id-type="doi">10.1207/s15327752jpa6703_13</pub-id></mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chawla</surname> <given-names>K.</given-names></name> <name><surname>Clever</surname> <given-names>R.</given-names></name> <name><surname>Ramirez</surname> <given-names>J.</given-names></name> <name><surname>Lucas</surname> <given-names>G. M.</given-names></name> <name><surname>Gratch</surname> <given-names>J.</given-names></name></person-group> (<year>2024</year>). <article-title>Towards emotion-aware agents for improved user satisfaction and partner perception in negotiation dialogues</article-title>. <source>IEEE Trans. Affect, Comput</source>. <volume>15</volume>, <fpage>433</fpage>&#x02013;<lpage>444</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TAFFC.2023.3238007</pub-id></mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Church</surname> <given-names>K. W.</given-names></name></person-group> (<year>2017</year>). <article-title>Word2Vec</article-title>. <source>Nat. Lang. Eng</source>. <volume>23</volume>, <fpage>155</fpage>&#x02013;<lpage>162</lpage>. doi: <pub-id pub-id-type="doi">10.1017/S1351324916000334</pub-id></mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>[Dataset] Devlin</surname> <given-names>J.</given-names></name> <name><surname>Chang</surname> <given-names>M.-W.</given-names></name> <name><surname>Lee</surname> <given-names>K.</given-names></name> <name><surname>Toutanova</surname> <given-names>K.</given-names></name></person-group> (<year>2019</year>). <source>BERT: pre-training of deep bidirectional transformers for language understanding</source>.</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>[Dataset] Liu</surname> <given-names>Y.</given-names></name> <name><surname>Ott</surname> <given-names>M.</given-names></name> <name><surname>Goyal</surname> <given-names>N.</given-names></name> <name><surname>Du</surname> <given-names>J.</given-names></name> <name><surname>Joshi</surname> <given-names>M.</given-names></name> <name><surname>Chen</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2019</year>). <source>RoBERTa: a robustly optimized BERT pretraining approach</source>.</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Fern&#x000E1;ndez</surname> <given-names>A.</given-names></name> <name><surname>Garc&#x000ED;a</surname> <given-names>S.</given-names></name> <name><surname>Galar</surname> <given-names>M.</given-names></name> <name><surname>Prati</surname> <given-names>R. C.</given-names></name> <name><surname>Krawczyk</surname> <given-names>B.</given-names></name> <name><surname>Herrera</surname> <given-names>F.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;Learning from imbalanced data sets,&#x0201D;</article-title> in <source>Springer Handbook of Computational Intelligence</source> (<publisher-loc>Heidelberg</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>993</fpage>&#x02013;<lpage>1014</lpage>.</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Graves</surname> <given-names>A.</given-names></name> <name><surname>Schmidhuber</surname> <given-names>J.</given-names></name></person-group> (<year>2005</year>). <article-title>Framewise phoneme classification with bidirectional LSTM and other neural network architectures</article-title>. <source>Neural Netw</source>. <volume>18</volume>, <fpage>602</fpage>&#x02013;<lpage>610</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neunet.2005.06.042</pub-id><pub-id pub-id-type="pmid">16112549</pub-id></mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hadikhah Mozhdehi</surname> <given-names>M.</given-names></name> <name><surname>Eftekhari Moghadam</surname> <given-names>A.</given-names></name></person-group> (<year>2023</year>). <article-title>Textual emotion detection utilizing a transfer learning approach</article-title>. <source>J. Supercomput</source>. <volume>79</volume>, <fpage>13075</fpage>&#x02013;<lpage>13089</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11227-023-05168-5</pub-id><pub-id pub-id-type="pmid">37359334</pub-id></mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Herrman</surname> <given-names>H.</given-names></name> <name><surname>Kieling</surname> <given-names>C.</given-names></name> <name><surname>McGorry</surname> <given-names>P.</given-names></name> <name><surname>Horton</surname> <given-names>R.</given-names></name> <name><surname>Sargent</surname> <given-names>J.</given-names></name> <name><surname>Patel</surname> <given-names>V.</given-names></name></person-group> (<year>2019</year>). <article-title>Reducing the global burden of depression: a lancet-world psychiatric association commission</article-title>. <source>Lancet</source>. <volume>393</volume>, <fpage>e42</fpage>&#x02013;<lpage>e43</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S0140-6736(18)32408-5</pub-id><pub-id pub-id-type="pmid">30482607</pub-id></mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hochreiter</surname> <given-names>S.</given-names></name> <name><surname>Schmidhuber</surname> <given-names>J.</given-names></name></person-group> (<year>1997</year>). <article-title>Long short-term memory</article-title>. <source>Neural Comput</source>. <volume>9</volume>, <fpage>1735</fpage>&#x02013;<lpage>1780</lpage>. doi: <pub-id pub-id-type="doi">10.1162/neco.1997.9.8.1735</pub-id></mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kim</surname> <given-names>Y.</given-names></name></person-group> (<year>2014</year>). <article-title>&#x0201C;Convolutional neural networks for sentence classification,&#x0201D;</article-title> in <source>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</source>, eds A. Moschitti, B. Pang, and W. Daelemans (Doha: Association for Computational Linguistics), <fpage>1746</fpage>&#x02013;<lpage>1751</lpage>. doi: <pub-id pub-id-type="doi">10.3115/v1/D14-1181</pub-id></mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>LeCun</surname> <given-names>Y.</given-names></name> <name><surname>Boser</surname> <given-names>B.</given-names></name> <name><surname>Denker</surname> <given-names>J.</given-names></name> <name><surname>Henderson</surname> <given-names>D.</given-names></name> <name><surname>Howard</surname> <given-names>R.</given-names></name> <name><surname>Hubbard</surname> <given-names>W.</given-names></name> <etal/></person-group>. (<year>1989</year>). <article-title>Handwritten digit recognition with a back-propagation network</article-title>. <source>Adv. Neural Inf. Process. Syst</source>. <volume>2</volume>, <fpage>396</fpage>&#x02013;<lpage>404</lpage>.</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>M.</given-names></name> <name><surname>Sun</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>M.</given-names></name></person-group> (<year>2024</year>). <article-title>Detecting depression with heterogeneous graph neural network in clinical interview transcript</article-title>. <source>IEEE Trans. Comput. Soc. Syst</source>. <volume>11</volume>, <fpage>1315</fpage>&#x02013;<lpage>1324</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TCSS.2023.3263056</pub-id></mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lu</surname> <given-names>H.</given-names></name> <name><surname>Liu</surname> <given-names>T.</given-names></name> <name><surname>Cong</surname> <given-names>R.</given-names></name> <name><surname>Yang</surname> <given-names>J.</given-names></name> <name><surname>Gan</surname> <given-names>Q.</given-names></name> <name><surname>Fang</surname> <given-names>W.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>QAIE: LLM-based quantity augmentation and information enhancement for few-shot aspect-based sentiment analysis</article-title>. <source>Inform. Process. Manag</source>. <volume>62</volume>:<fpage>103917</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ipm.2024.103917</pub-id></mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Milintsevich</surname> <given-names>K.</given-names></name> <name><surname>Sirts</surname> <given-names>K.</given-names></name> <name><surname>Dias</surname> <given-names>G.</given-names></name></person-group> (<year>2023</year>). <article-title>Towards automatic text-based estimation of depression through symptom prediction</article-title>. <source>Brain Inform</source>. <volume>10</volume>:<fpage>4</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s40708-023-00185-9</pub-id><pub-id pub-id-type="pmid">36780049</pub-id></mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Raffel</surname> <given-names>C.</given-names></name> <name><surname>Shazeer</surname> <given-names>N.</given-names></name> <name><surname>Roberts</surname> <given-names>A.</given-names></name> <name><surname>Lee</surname> <given-names>K.</given-names></name> <name><surname>Narang</surname> <given-names>S.</given-names></name> <name><surname>Matena</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Exploring the limits of transfer learning with a unified text-to-text transformer</article-title>. <source>J. Machine Learn. Res</source>. <volume>21</volume>, <fpage>1</fpage>&#x02013;<lpage>67</lpage>.</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rai</surname> <given-names>B. K.</given-names></name> <name><surname>Jain</surname> <given-names>I.</given-names></name> <name><surname>Tiwari</surname> <given-names>B.</given-names></name> <name><surname>Saxena</surname> <given-names>A.</given-names></name></person-group> (<year>2024</year>). <article-title>Multimodal mental state analysis</article-title>. <source>Health Serv. Outcomes Res. Methodol</source>. <volume>25</volume>, <fpage>85</fpage>&#x02013;<lpage>112</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10742-024-00329-2</pub-id></mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rodr&#x000ED;guez-Ib&#x000E1;nez</surname> <given-names>M.</given-names></name> <name><surname>Cas&#x000E1;nez-Ventura</surname> <given-names>A.</given-names></name> <name><surname>Castej&#x000F3;n-Mateos</surname> <given-names>F.</given-names></name> <name><surname>Cuenca-Jim&#x000E9;nez</surname> <given-names>P.-M.</given-names></name></person-group> (<year>2023</year>). <article-title>A review on sentiment analysis from social media platforms</article-title>. <source>Expert Syst. Appl</source>. <volume>223</volume>:<fpage>119862</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.eswa.2023.119862</pub-id></mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Semrau</surname> <given-names>M.</given-names></name> <name><surname>Alem</surname> <given-names>A.</given-names></name> <name><surname>Ayuso-Mateos</surname> <given-names>J. L.</given-names></name> <name><surname>Chisholm</surname> <given-names>D.</given-names></name> <name><surname>Gureje</surname> <given-names>O.</given-names></name> <name><surname>Hanlon</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Strengthening mental health systems in low-and middle-income countries: recommendations from the emerald programme</article-title>. <source>BJPsych Open</source> <volume>5</volume>:<fpage>e73</fpage>. doi: <pub-id pub-id-type="doi">10.1192/bjo.2018.90</pub-id></mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Spitzer</surname> <given-names>R. L.</given-names></name> <name><surname>Kroenke</surname> <given-names>K.</given-names></name> <name><surname>Williams</surname> <given-names>J. B.</given-names></name></person-group> (<year>1999</year>). <article-title>Validation and utility of a self-report version of prime-MD: the PHQ primary care study</article-title>. <source>JAMA</source> <volume>282</volume>, <fpage>1737</fpage>&#x02013;<lpage>1744</lpage>. doi: <pub-id pub-id-type="doi">10.1001/jama.282.18.1737</pub-id></mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Uppal</surname> <given-names>M.</given-names></name> <name><surname>Gupta</surname> <given-names>D.</given-names></name> <name><surname>Juneja</surname> <given-names>S.</given-names></name> <name><surname>Gadekallu</surname> <given-names>T. R.</given-names></name> <name><surname>Bayoumy</surname> <given-names>I. E.</given-names></name> <name><surname>Hussain</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Enhancing accuracy in brain stroke detection: multi-layer perceptron with adadelta, RMSProp and AdaMax optimizers</article-title>. <source>Front. Bioeng. Biotechnol</source>. <volume>11</volume>:<fpage>1257591</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fbioe.2023.1257591</pub-id><pub-id pub-id-type="pmid">37823024</pub-id></mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Vaswani</surname> <given-names>A.</given-names></name> <name><surname>Shazeer</surname> <given-names>N.</given-names></name> <name><surname>Parmar</surname> <given-names>N.</given-names></name> <name><surname>Uszkoreit</surname> <given-names>J.</given-names></name> <name><surname>Jones</surname> <given-names>L.</given-names></name> <name><surname>Gomez</surname> <given-names>A. N.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Attention is all you need</article-title>. <source>Adv. Neural Inf. Process. Syst</source>. <volume>30</volume>, <fpage>1</fpage>&#x02013;<lpage>15</lpage>.</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Villatoro-Tello</surname> <given-names>E.</given-names></name> <name><surname>Ram&#x000ED;rez-de-la Rosa</surname> <given-names>G.</given-names></name> <name><surname>G&#x000E1;tica-P&#x000E9;rez</surname> <given-names>D.</given-names></name> <name><surname>Magimai-Doss</surname> <given-names>M.</given-names></name> <name><surname>Jim&#x000E9;nez-Salazar</surname> <given-names>H.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Approximating the mental lexicon from clinical interviews as a support tool for depression detection,&#x0201D;</article-title> in <source>Proceedings of the 2021 International Conference on Multimodal Interaction, ICMI &#x00027;21</source> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>), <fpage>557</fpage>&#x02013;<lpage>566</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3462244.3479896</pub-id></mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>N.</given-names></name> <name><surname>Kamil</surname> <given-names>R.</given-names></name> <name><surname>Al-Haddad</surname> <given-names>S. A. R.</given-names></name> <name><surname>Ibrahim</surname> <given-names>N.</given-names></name> <name><surname>Zhao</surname> <given-names>Z.</given-names></name></person-group> (<year>2025</year>). <article-title>Enhancing AI depression detection using transfer learning</article-title>. <source>Contemp. Math</source>. <volume>6</volume>, <fpage>3054</fpage>&#x02013;<lpage>3080</lpage>. doi: <pub-id pub-id-type="doi">10.37256/cm.6320256184</pub-id></mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zafar</surname> <given-names>A.</given-names></name> <name><surname>Hussain</surname> <given-names>S. J.</given-names></name> <name><surname>Ali</surname> <given-names>M. U.</given-names></name> <name><surname>Lee</surname> <given-names>S. W.</given-names></name></person-group> (<year>2023</year>). <article-title>Metaheuristic optimization-based feature selection for imagery and arithmetic tasks: an fNIRS study</article-title>. <source>Sensors</source>. <volume>23</volume>:<fpage>3714</fpage>. doi: <pub-id pub-id-type="doi">10.3390/s23073714</pub-id><pub-id pub-id-type="pmid">37050774</pub-id></mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>J.</given-names></name> <name><surname>Guo</surname> <given-names>Y.</given-names></name></person-group> (<year>2024</year>). <article-title>Multilevel depression status detection based on fine-grained prompt learning</article-title>. <source>Pattern Recognit. Lett</source>. <volume>178</volume>, <fpage>167</fpage>&#x02013;<lpage>173</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.patrec.2024.01.005</pub-id></mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>Q.</given-names></name> <name><surname>Xia</surname> <given-names>Y.</given-names></name> <name><surname>Long</surname> <given-names>Y.</given-names></name> <name><surname>Xu</surname> <given-names>G.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name></person-group> (<year>2025</year>). <article-title>Leveraging sensory knowledge into text-to-text transfer transformer for enhanced emotion analysis</article-title>. <source>Inform. Process. Manag</source>. <volume>62</volume>:<fpage>103876</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ipm.2024.103876</pub-id></mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>Z.</given-names></name> <name><surname>Chuah</surname> <given-names>J. H.</given-names></name> <name><surname>Lai</surname> <given-names>K. W.</given-names></name> <name><surname>Chow</surname> <given-names>C.-O.</given-names></name> <name><surname>Gochoo</surname> <given-names>M.</given-names></name> <name><surname>Dhanalakshmi</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Conventional machine learning and deep learning in Alzheimer&#x00027;s disease diagnosis using neuroimaging: a review</article-title>. <source>Front. Comput. Neurosci</source>. <volume>17</volume>:<fpage>1038636</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fncom.2023.1038636</pub-id><pub-id pub-id-type="pmid">36814932</pub-id></mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>Y.</given-names></name> <name><surname>Kiros</surname> <given-names>R.</given-names></name> <name><surname>Zemel</surname> <given-names>R.</given-names></name> <name><surname>Salakhutdinov</surname> <given-names>R.</given-names></name> <name><surname>Urtasun</surname> <given-names>R.</given-names></name> <name><surname>Torralba</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>&#x0201C;Aligning books and movies: towards story-like visual explanations by watching movies and reading books,&#x0201D;</article-title> in <source>Proceedings of the IEEE International Conference on Computer Vision</source> (<publisher-loc>Santiago</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>19</fpage>&#x02013;<lpage>27</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ICCV.2015.11</pub-id></mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zung</surname> <given-names>W. W.</given-names></name></person-group> (<year>1965</year>). <article-title>A self-rating depression scale</article-title>. <source>Arch. Gen. Psychiatry</source>. <volume>12</volume>, <fpage>63</fpage>&#x02013;<lpage>70</lpage>. doi: <pub-id pub-id-type="doi">10.1001/archpsyc.1965.01720310065008</pub-id><pub-id pub-id-type="pmid">14221692</pub-id></mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1377782/overview">Hanqi Zhuang</ext-link>, Florida Atlantic University, United States</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2868464/overview">Himanshu Sharma</ext-link>, NIMS University, India</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3171378/overview">K.M. Poonam</ext-link>, Indian Institute of Technology Kharagpur, India</p>
</fn>
</fn-group>
</back>
</article>