<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Psychiatry</journal-id>
<journal-title>Frontiers in Psychiatry</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Psychiatry</abbrev-journal-title>
<issn pub-type="epub">1664-0640</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpsyt.2025.1602650</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Psychiatry</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Predicting depression by using a novel deep learning model and video-audio-text multimodal data</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Yifu</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3019886/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yang</surname>
<given-names>Xueping</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2630003/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhao</surname>
<given-names>Meng</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Jiangtao</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yao</surname>
<given-names>Yudong</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1100020/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Qian</surname>
<given-names>Wei</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Qi</surname>
<given-names>Shouliang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/283101/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>College of Medicine and Biological Information Engineering, Northeastern University</institution>, <addr-line>Shenyang</addr-line>,&#xa0;<country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Key Laboratory of Intelligent Computing in Medical Image, Ministry of Education, Northeastern University</institution>, <addr-line>Shenyang</addr-line>,&#xa0;<country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Department of Psychology, The People&#x2019;s Hospital of Liaoning Province</institution>, <addr-line>Shenyang</addr-line>,&#xa0;<country>China</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Department of Electrical and Computer Engineering, Stevens Institute of Technology</institution>, <addr-line>Hoboken, NJ</addr-line>,&#xa0;<country>United States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1773809/overview">Bing Liu</ext-link>, Beijing Normal University, China</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1161503/overview">Yong Li</ext-link>, Nanjing University of Science and Technology, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1640296/overview">Alwin Poulose</ext-link>, Indian Institute of Science Education and Research, India</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2825756/overview">Kuldeep Singh</ext-link>, Guru Nanak Dev University, India</p>
<p>&#xc7;a&#x11f;lar Uyulan, Izmir Katip Celebi University, T&#xfc;rkiye</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Shouliang Qi, <email xlink:href="mailto:qisl@bmie.neu.edu.cn">qisl@bmie.neu.edu.cn</email>
</p>
</fn>
<fn fn-type="other" id="fn003">
<p>&#x2020;ORCID: Yifu Li, <uri xlink:href="https://orcid.org/0009-0007-5991-9926">orcid.org/0009-0007-5991-9926</uri>; Xueping Yang, <uri xlink:href="https://orcid.org/0000-0002-0681-5812">orcid.org/0000-0002-0681-5812</uri>; Meng Zhao, <uri xlink:href="https://orcid.org/0009-0006-8449-8935">orcid.org/0009-0006-8449-8935</uri>; Jiangtao Wang, <uri xlink:href="https://orcid.org/0009-0006-1095-0458">orcid.org/0009-0006-1095-0458</uri>; Yudong Yao, <uri xlink:href="https://orcid.org/0000-0003-3868-0593">orcid.org/0000-0003-3868-0593</uri>; Wei Qian, <uri xlink:href="https://orcid.org/0000-0002-9563-721X">orcid.org/0000-0002-9563-721X</uri>; Shouliang Qi, <uri xlink:href="https://orcid.org/0000-0003-0977-1939">orcid.org/0000-0003-0977-1939</uri>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>24</day>
<month>09</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>16</volume>
<elocation-id>1602650</elocation-id>
<history>
<date date-type="received">
<day>30</day>
<month>03</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>26</day>
<month>08</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Li, Yang, Zhao, Wang, Yao, Qian and Qi.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Li, Yang, Zhao, Wang, Yao, Qian and Qi</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Objective</title>
<p>Depression is a prevalent mental health disorder affecting millions of people. Traditional diagnostic methods primarily rely on self-reported questionnaires and clinical interviews, which can be subjective and vary significantly between individuals. This paper introduces the Integrative Multimodal Depression Detection Network (IMDD-Net), a novel deep-learning framework designed to enhance the accuracy of depression evaluation by leveraging both local and global features from video, audio, and text cues.</p>
</sec>
<sec>
<title>Methods</title>
<p>The IMDD-Net integrates these multimodal data streams using the Kronecker product for multimodal fusion, facilitating deep interactions between modalities. Within the audio modality, Mel Frequency Cepstrum Coefficient (MFCC) and extended Geneva Minimalistic Acoustic Parameter Set (eGeMAPS) features capture local and global acoustic properties, respectively. For video data, the TimeSformer network extracts both fine-grained and broad temporal features, while the text modality utilizes a pre-trained BERT model to obtain comprehensive contextual information. The IMDD-Net&#x2019;s architecture effectively combines these diverse data types to provide a holistic analysis of depressive symptoms.</p>
</sec>
<sec>
<title>Results</title>
<p>Experimental results on the AVEC 2014 dataset demonstrate that the IMDD-Net achieves state-of-the-art performance in predicting Beck Depression Inventory-II (BDI-II) scores, with a Root Mean Square Error (RMSE) of 7.55 and a Mean Absolute Error (MAE) of 5.75. A classification task to identify potential depression subjects achieves an accuracy of 0.79.</p>
</sec>
<sec>
<title>Conclusion</title>
<p>These results underscore the robustness and precision of the IMDD-Net, highlighting the importance of integrating local and global features across multiple modalities for accurate depression prediction.</p>
</sec>
</abstract>
<kwd-group>
<kwd>deep learning</kwd>
<kwd>depression</kwd>
<kwd>multimedia</kwd>
<kwd>information fusion</kwd>
<kwd>local and global features</kwd>
</kwd-group>
<counts>
<fig-count count="10"/>
<table-count count="2"/>
<equation-count count="5"/>
<ref-count count="64"/>
<page-count count="16"/>
<word-count count="8550"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Digital Mental Health</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Depression is the primary mental health disorder contributing to the disease burden, and it impacts roughly 300 million individuals worldwide (<xref ref-type="bibr" rid="B1">1</xref>). Depression is a widespread, costly, and debilitating condition that significantly increases the risk of suicide (<xref ref-type="bibr" rid="B2">2</xref>). More than 80% of individuals fail to receive appropriate treatment due to the lack of early intervention services and treatments for depression, and researchers estimate that approximately one in five individuals will experience depression at some point in their lifetime (<xref ref-type="bibr" rid="B3">3</xref>). Consequently, it is evident that the diagnosis and screening of depression are essential.</p>
<p>Current approaches to evaluating depression rely predominantly on the verbal accounts provided by patients, their families, or caregivers, whether through clinical interviews or questionnaires (<xref ref-type="bibr" rid="B4">4</xref>). However, these traditional methods have certain limitations because the subjectivity of individuals can affect responses to questions, and symptoms of depression may manifest differently across individuals (<xref ref-type="bibr" rid="B5">5</xref>). Traditionally, accurate diagnosis of depression severity requires comprehensive information and extensive clinical training (<xref ref-type="bibr" rid="B6">6</xref>). Fortunately, advanced computing methods, such as machine learning, deep learning, and artificial intelligence, are ideally suited to enhance the assessment of mental health outcomes for individuals (<xref ref-type="bibr" rid="B7">7</xref>).</p>
<p>Utilizing audio and video methods for detecting depression offers distinct advantages, including the ability to capture direct cues and subtle behavioral changes that may not be evident in traditional assessments, while numerous indicators are used to detect depression, such as hormonal imbalances (<xref ref-type="bibr" rid="B8">8</xref>), changes in sleep patterns (<xref ref-type="bibr" rid="B9">9</xref>), cognitive performance assessments (<xref ref-type="bibr" rid="B10">10</xref>), resting-state functional magnetic resonance imaging (fMRI) data (<xref ref-type="bibr" rid="B11">11</xref>), EEG data (<xref ref-type="bibr" rid="B12">12</xref>, <xref ref-type="bibr" rid="B13">13</xref>) and other physiological data (<xref ref-type="bibr" rid="B14">14</xref>). In recent years, a variety of automatic depression estimation (ADE) systems have emerged (<xref ref-type="bibr" rid="B7">7</xref>, <xref ref-type="bibr" rid="B15">15</xref>, <xref ref-type="bibr" rid="B16">16</xref>). These systems automatically assess the severity of depression using audiovisual cues, employing advanced techniques from machine learning and deep learning (<xref ref-type="bibr" rid="B17">17</xref>). Research indicates that speech exhibits numerous unique characteristics that can be used to identify an individual&#x2019;s mental state (<xref ref-type="bibr" rid="B18">18</xref>&#x2013;<xref ref-type="bibr" rid="B20">20</xref>). With the aid of various gestures involving the eyes, mouth, nose, and hands, emotions such as anger, happiness, sadness, and neutrality can be identified through depression detection systems that utilize image and video processing (<xref ref-type="bibr" rid="B21">21</xref>). 
Similarly, textual information can also be analyzed to extract features relevant to depression (<xref ref-type="bibr" rid="B22">22</xref>, <xref ref-type="bibr" rid="B23">23</xref>). Many researchers have also explored diagnosing depression through social networks using textual information (<xref ref-type="bibr" rid="B24">24</xref>&#x2013;<xref ref-type="bibr" rid="B26">26</xref>). While unimodal approaches can be effective to some extent in detecting depression, multimodal or hybrid modalities often exhibit superior performance. However, the selection of different modalities, the choice of network architectures, and the methods of fusion all significantly impact the effectiveness of depression detection (<xref ref-type="bibr" rid="B17">17</xref>). Many depression detection networks utilize multimodal architectures, yet few of these systems effectively incorporate both local and global features within various modalities.</p>
<p>Consequently, this study introduces a novel deep learning network architecture for multimodal depression detection (<xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>), termed the Integrative Multimodal Depression Detection Network (IMDD-Net). This advanced framework not only integrates data from video, text, and audio modalities but also considers both local and global information within each modality. By doing so, the IMDD-Net enhances the estimation efficacy by capturing a more comprehensive representation of depressive symptoms. Specifically, within the audio modality, we preprocessed the audio signals and extracted both time-frame level and global statistical features. Both sets of features were then fed into the IMDD-Net through separate channels. For video data, we sampled the video at regular intervals and processed each frame. Then we utilized the specific network to extract both global and local features. In the text modality, we employed a pre-trained Bert-base-german-cased (<xref ref-type="bibr" rid="B27">27</xref>) network. The high-dimensional features obtained from these three modalities were fused. This IMDD-Net was evaluated by using the AVEC2014 dataset (<xref ref-type="bibr" rid="B28">28</xref>), demonstrating the effectiveness of our approach in the practical assessment of depression.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Brief diagram of multimodal depression detection by IMDD-Net.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1602650-g001.tif">
<alt-text content-type="machine-generated">Diagram showing a person connected by arrows to a microphone, camcorder, film strip, and cassette tape. These connect to a microchip, leading to emoticons showing happy, neutral, and sad expressions.</alt-text>
</graphic>
</fig>
<p>The primary innovations and contributions of this paper are summarized as follows:</p>
<list list-type="order">
<list-item>
<p>A novel multimodal network architecture has been proposed for the identification of depression;</p>
</list-item>
<list-item>
<p>This network effectively integrates and analyzes both global and local features across multiple modalities, including text, video, and audio;</p>
</list-item>
<list-item>
<p>Using the Kronecker product for multimodal fusion explores deep interactions between modalities and enhances the detection accuracy of depression.</p>
</list-item>
<list-item>
<p>The IMDD-Net achieves state-of-the-art performance in depression assessment areas.</p>
</list-item>
</list>
<p>The remainder of this paper is organized as follows. Section 2 reviews the related work on automatic depression estimation. Section 3 describes the dataset used in this study, details the data preparation and preprocessing steps, and provides a comprehensive explanation of the proposed IMDD-Net architecture. Section 4 presents the experimental results obtained using IMDD-Net and provides an analysis of the model&#x2019;s performance across regression and classification tasks. Section 5 discusses the implications of our findings, outlines current limitations, and considers directions for future improvement and clinical applicability. Section 6 concludes the paper and summarizes the main contributions of this work.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related works</title>
<p>Recent years have witnessed significant advancements in the field of automatic depression estimation through deep learning. In 2021, Dong et&#xa0;al. (<xref ref-type="bibr" rid="B19">19</xref>) developed an automatic depression estimation method using speech signals. This approach combines deep speaker recognition and speech emotion recognition features from pre-trained models to utilize complementary vocal and emotional data. In 2020, Li et&#xa0;al. (<xref ref-type="bibr" rid="B29">29</xref>) introduced DRR_DepressionNet to predict depression severity from facial expressions. This method enhances facial images to enlarge the training dataset and uses a modified ResNet divided into C_M block, Resblock, and global average pooling. It employs Euclidean loss instead of traditional cross-entropy loss for training. Compared to static images, videos often contain more information and Uddin et&#xa0;al. (<xref ref-type="bibr" rid="B30">30</xref>) introduced a two-stream deep spatiotemporal network to assess depression levels from video data. This framework employs the Inception-ResNet-v2 for spatial data and a volume local directional number descriptor for facial motion analysis, enhanced by convolutional neural network (CNN) processing. It also features a multilayer bidirectional long short-term memory (Bi-LSTM) with temporal median pooling to integrate spatial and temporal features effectively. Then, He et&#xa0;al. (<xref ref-type="bibr" rid="B31">31</xref>) introduced an end-to-end trainable system for depression detection, utilizing a 3D CNN with a spatiotemporal feature aggregation module. This system utilizes a 3D DEP-NetVLAD aggregation method to effectively identify depression. In 2023, Rajawat et&#xa0;al. (<xref ref-type="bibr" rid="B32">32</xref>) introduced a fusion fuzzy logic model combined with deep learning to identify depression through facial expressions from image and video files. 
Their model uses a fuzzy algorithm and unordered fuzzy rule initiation for depression recognition, transforming facial expressions into detectable indicators of depression.</p>
<p>Furthermore, many multimodal networks based on audio and video have also achieved excellent results in the field of depression detection. Niu et&#xa0;al. (<xref ref-type="bibr" rid="B33">33</xref>) introduced a novel Spatio-Temporal Attention network combined with a Multimodal Attention Feature Fusion strategy for predicting depression levels by capturing multimodal cues in 2020. In addition, Sun et&#xa0;al. (<xref ref-type="bibr" rid="B34">34</xref>) developed a multi-modal adaptive fusion transformer network and it is tailored to extract long-term context from uni-modal data. This network employs an adaptive fusion technique to integrate multimodal features effectively. Bucur et&#xa0;al. (<xref ref-type="bibr" rid="B35">35</xref>) introduced a unique time-enriched multimodal transformer architecture that leverages pre-trained models to extract image and text embeddings from social media posts in 2023. Operating at the user level, their model integrates time2vec positional embeddings to account for the timing of posts. Furthermore, they developed a variant designed to handle randomly sampled, unordered post sets, thereby increasing robustness against dataset noise. Li et&#xa0;al. (<xref ref-type="bibr" rid="B36">36</xref>) proposed a Decoupled Multimodal Distillation (DMD) framework to address modality heterogeneity in emotion recognition by separating each modality&#x2019;s representation into modality-exclusive and modality-irrelevant components. A graph-based distillation unit (GD-Unit) enables dynamic, adaptive knowledge transfer between modalities via learned edge weights. This flexible structure improves feature discrimination and crossmodal alignment, achieving superior performance on standard multimodal emotion recognition benchmarks.</p>
<p>In summary, recent advancements in the field of automatic depression estimation have showcased the potential of deep learning and multimodal approaches in improving the accuracy and reliability of depression detection. Various methods utilizing speech signals, facial expressions, and video data have been developed, each contributing unique strengths and innovations. As the field continues to evolve, future research should focus on refining these models, addressing their limitations, and improving the effectiveness of depression detection.</p>
</sec>
<sec id="s3" sec-type="materials|methods">
<label>3</label>
<title>Materials and methods</title>
<sec id="s3_1">
<label>3.1</label>
<title>Dataset and preprocessing methods</title>
<p>This chapter provides an overview of the datasets utilized in this study, as well as the preprocessing methods applied to the raw data, encompassing audio, video, and text modalities.</p>
<sec id="s3_1_1">
<label>3.1.1</label>
<title>AVEC 2014 dataset</title>
<p>In this study, we utilize the Audio-Visual Emotion Recognition Challenge (AVEC) 2014 dataset (<xref ref-type="bibr" rid="B28">28</xref>). This dataset is among the few that offer unprocessed audio and video data, which are critical for analyzing nuanced behavioral cues and expressions associated with depression. The AVEC 2014 dataset includes 150 German participants (96 female and 54 male) and the subjects had a mean age of 31.5 years, with a standard deviation of 12.3 years, ranging from 18 to 63 years. Each of them completed two tasks to generate differentiated audiovisual data. The tasks selected are as follows: (1) Northwind dataset: Participants recite a passage from the fable &#x201c;Die Sonne und der Wind&#x201d; (The North Wind and the Sun) in German. (2) Freeform dataset: Participants express themselves spontaneously by answering various prompts, such as: &#x201c;What is your favorite dish?&#x201d;; &#x201c;What was your best gift, and why?&#x201d;; or &#x201c;Discuss a sad childhood memory.&#x201d; All responses are answered in German. Therefore, these tasks have been specifically selected to enable a comprehensive analysis of both verbal and nonverbal cues associated with depression. Additionally, each participant&#x2019;s depression severity is quantified using the Beck Depression Inventory-II (BDI-II) (<xref ref-type="bibr" rid="B37">37</xref>) scores, which serve as labels for the dataset. According to standard interpretation guidelines, BDI-II scores can be categorized as follows: 0&#x2013;13 indicates minimal depression, 14&#x2013;19 mild depression, 20&#x2013;28 moderate depression, and 29&#x2013;63 severe depression.</p>
</sec>
<sec id="s3_1_2">
<label>3.1.2</label>
<title>Video preprocessing</title>
<p>In this section, we detail the preprocessing steps applied to the video data of the AVEC 2014 dataset, and the process is depicted in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>The preprocessing steps applied to the video data (MTCNN indicates Multi-task Cascaded Convolutional Neural Networks).</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1602650-g002.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a process of face detection and analysis from video. It begins with a video frame, which is sampled into multiple frames. These frames are processed using an MTCNN algorithm, resulting in cropped facial images. The images undergo an image pyramid transformation, then pass sequentially through P-Net, R-Net, and O-Net modules for further analysis, with a final processed facial image as output.</alt-text>
</graphic>
</fig>
<p>First, we perform sampling on the raw videos. Since the original videos vary in length, we employ an adaptive sampling interval approach. This method calculates the sampling interval based on the length of the video and the number of frames, ensuring that each video yields exactly one hundred frames after sampling.</p>
<p>To minimize the interference from the background and other irrelevant information, each frame is processed through Multi-task Cascaded Convolutional Neural Networks (MTCNN) (<xref ref-type="bibr" rid="B38">38</xref>). First, MTCNN employs an image pyramid (<xref ref-type="bibr" rid="B39">39</xref>) to handle various face sizes within the frame, allowing it to perform detection across multiple scales. Then, the processed images are sent to the Proposal Network (P-Net). It generates candidate facial regions by rapidly scanning the resized images, proposing potential areas that likely contain faces. After that, the outputs of the P-Net are sent to the Refine Network (R-Net), and this stage refines these candidates, filtering out false positives. Finally, the outputs of R-Net are sent to the Output Network (O-Net), and this stage also provides final bounding boxes and associated confidence scores to confirm the presence of facial features and output the final processed frames.</p>
<p>Following this preprocessing procedure, we have successfully processed the video data for 150 participants, with each set consisting of 100 frames.</p>
</sec>
<sec id="s3_1_3">
<label>3.1.3</label>
<title>Audio preprocessing</title>
<p>The processing of audio data in this study focuses on both global and local features, utilizing eGeMAPS (<xref ref-type="bibr" rid="B40">40</xref>) and MFCC (<xref ref-type="bibr" rid="B41">41</xref>) respectively. This section will detail the preprocessing steps used to extract these features, ensuring a comprehensive analysis for depression detection.</p>
<p>To control variables that might affect experimental outcomes, the extraction of audio features employed the Northwind dataset from AVEC2014. This ensures consistency in the content spoken by all participants. After extracting audio from original video files, the audio undergoes a noise reduction process. This involves spectral gating and wavelet denoising to remove noise while preserving essential aspects of the voice. Following the denoising, MFCC and eGeMAPS are extracted and the overall process is shown in the upper part of <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>The preprocessing steps applied to the audio data and text data.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1602650-g003.tif">
<alt-text content-type="machine-generated">Audio preprocessing involves extracting and denoising audio from videos, then framing it into multiple sections. These frames generate eGeMAPS and MFCC features. Text preprocessing extracts audio, converts it to raw text, and cleans and normalizes the text. The process uses data from AVEC2014 Northwind and Freeform datasets.</alt-text>
</graphic>
</fig>
<p>MFCCs are widely used in speech and audio processing and they play a pivotal role in capturing the local features of audio for our analysis. MFCCs are derived from the Fourier transform of a signal, with a focus on the Mel scale, which approximates the human auditory system&#x2019;s response more closely than the linearly-spaced frequency bands used in the standard cepstral analysis. The Mel-scale frequency mapping used in the MFCC computation is given by <xref ref-type="disp-formula" rid="eq1">Equation&#xa0;1</xref>:</p>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>2595</mml:mn>
<mml:msub>
<mml:mrow>
<mml:mi>log</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>+</mml:mo>
<mml:mfrac>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mn>700</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Initially, each audio track is segmented into 1,000 frames and for each frame, we extract the first 20 MFCCs. To further enhance the descriptive power of our features, we compute both the first and second derivatives of these coefficients, effectively capturing the dynamic changes in the cepstral features over time. As a result, each frame is represented by 60 MFCC features (20 coefficients plus their first and second derivatives), culminating in a 1000 * 60 feature matrix for each audio sample. This feature set allows us to analyze the local characteristics of the speech.</p>
<p>Meanwhile, we utilize the eGeMAPS (<xref ref-type="bibr" rid="B40">40</xref>) to represent the global features of speech. Unlike the MFCCs which focus on capturing fine-grained local properties of audio, eGeMAPS is designed to encapsulate the overall statistical characteristics of speech. This feature set is particularly effective for its comprehensive coverage of voice attributes that are commonly implicated in emotional states and psychological conditions.</p>
<p>eGeMAPS is an acoustic parameter set built on the Geneva Minimalistic Acoustic Parameter Set (GeMAPS). eGeMAPS consists of 88 features derived from basic acoustic descriptors (Low-Level Descriptors, LLDs) through various statistical methods, and these statistical features are known as high-level statistical functionals (HSFs). The features cover multiple acoustic aspects including frequency, energy, and spectral properties. Each audio sample in our dataset is processed to extract a complete set of these eGeMAPS features, ultimately transforming the entire speech signal into a single 1 * 88 vector. This representation captures the global attributes of the speech.</p>
</sec>
<sec id="s3_1_4">
<label>3.1.4</label>
<title>Text preprocessing</title>
<p>To capture the diverse semantic information expressed by participants during interviews, our study begins by extracting textual data from the AVEC 2014 Freeform dataset, and the process is shown in the lower part of <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>. The first step involves extracting audio from the video recordings, followed by a noise reduction process to ensure the clarity of human voices. Several preprocessing techniques are employed here, including spectral gating to reduce background noise and dynamic range compression to maintain a consistent audio level.</p>
<p>Once the audio is cleaned, it is converted into German text. Following transcription, the German text undergoes text cleaning and normalization. This involves transforming all text to lowercase, stripping away punctuation and special characters, and discarding stopwords that do not add significant value to the depression detection.</p>
<p>Thus, we have thoroughly and meticulously preprocessed the data for 150 participants, obtaining refined final datasets across three modalities: audio, video, and text. This comprehensive preparation ensures data well-suited for the subsequent stages of our analysis.</p>
</sec>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Framework of IMDD-Net</title>
<p>This chapter is dedicated to detailing the architecture of the IMDD-Net, a multimodal network designed for assessing depression. The IMDD-Net integrates audio, text, and video modalities, using both local and global information within each modality to enhance estimation accuracy. The network processes these modalities through four specialized channels and a multimodal fusion and inference process, culminating in the output of BDI-II scores to evaluate the severity of depressive disorders. The architecture of the network is illustrated in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>. In the following sections, the composition and functionalities of the IMDD-Net&#x2019;s architecture will be introduced.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Framework of the proposed Integrative Multimodal Depression Detection Network (IMDD-Net).</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1602650-g004.tif">
<alt-text content-type="machine-generated">Diagram of a multimodal model for analyzing video, text, and audio data. Video frames undergo patch embedding and attention via TimeSformer, outputting Feature A. Text is tokenized, embedded, and processed through a BERT-based network to yield Feature B. Audio is split into MFCC and eGeMAPS features, processed, producing Features C and D. All features undergo fusion via Kronecker product. The result passes through residual blocks, culminating in a BDI-II score prediction.</alt-text>
</graphic>
</fig>
<sec id="s3_2_1">
<label>3.2.1</label>
<title>Video feature extraction subnetwork</title>
<p>In order to capture both local and global features from video data effectively, the TimeSformer (<xref ref-type="bibr" rid="B42">42</xref>) network has been selected for the video modality channel of the IMDD-Net. The TimeSformer builds upon the Vision Transformer (ViT) (<xref ref-type="bibr" rid="B43">43</xref>) by integrating temporal processing capabilities, allowing it to effectively address the dynamic nature of video data. Specifically, the TimeSformer extends the Vision Transformer architecture by incorporating temporal dimensions into the self-attention mechanism. This enables the model to process sequences of video frames rather than static images.</p>
<p>Initially, each frame (64 * 64 pixels) of the video is divided into patches of size 8 * 8 pixels. These patches are then linearly embedded into 512-dimensional vectors. After patch embedding, each patch is concatenated with learned spatial (position within a frame) and temporal (position within the sequence) embeddings. Space and time embeddings enhance the model&#x2019;s ability to interpret the positional context of each patch both within individual frames and across the sequence. The embedded patches are processed through multiple layers of spatial and temporal self-attention mechanisms. Each layer consists of 12 blocks (depth=12), where both local interactions within frames and global interactions across frames are captured (<xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>). Each attention block is followed by a multilayer perceptron (MLP) and the output from the final layer of the TimeSformer is passed through a linear layer that shapes the output into a high-dimensional feature vector of size 5 * 1 per video. This vector represents the key features extracted and processed from the video which contain both local and global features.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Local &amp; global features extracted by space &amp; time attention of TimeSformer.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1602650-g005.tif">
<alt-text content-type="machine-generated">Three grid-like frames show a woman's face labeled 'Frame t - 0', 'Frame t', and 'Frame t + 0'. Each frame features different lighting with green, blue, and red shades. The subject appears to be speaking.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3_2_2">
<label>3.2.2</label>
<title>Textual feature extraction subnetwork</title>
<p>In the text modality channel of the IMDD-Net, we leverage the capabilities of the Bidirectional Encoder Representations from Transformers (BERT) (<xref ref-type="bibr" rid="B27">27</xref>) to analyze textual data. BERT is particularly well-suited for this task due to its ability to capture both local and global contextual information from text. Unlike directional models, which read the text input sequentially (left-to-right or right-to-left), the BERT model reads the entire sequence of words at once. This characteristic allows BERT to capture the meaning of a specific word in different contexts, providing a deeper understanding of the text as a whole.</p>
<p>For our study, we utilize the German variant of BERT, specifically the Bert-base-german-cased model, which has been pre-trained on a large corpus of German text. The model features an embedding layer that converts tokenized text into numerical vectors. Text data is initially processed using the BertTokenizer, and adjusts the length of these sentences to a uniform 350 tokens by padding. The subsequent transformer blocks process the input using self-attention mechanisms, followed by an MLP layer at the end to transform the token embeddings into a single vector.</p>
<p>The final output is a high-dimensional feature vector sized 5 * 1 for each piece of text, encapsulating both local and global contextual information.</p>
</sec>
<sec id="s3_2_3">
<label>3.2.3</label>
<title>Audio feature extraction subnetwork</title>
<p>The audio modality channel in the IMDD-Net processes audio data to capture both local and global features.</p>
<p>Local Feature Extraction with MFCC: The audio files are first processed to extract MFCC, which focus on local information at the frame level. Each audio file is segmented into frames, and for each frame, 60 MFCC features are computed, resulting in a feature matrix of size 60 * 1000. These MFCC features are then processed through a Transformer model with 8 attention heads and a hidden dimension of 64. The Transformer encoder is composed of 3 layers and the 5 * 1 feature vector representing local speech information is outputted.</p>
<p>Global Feature Extraction with eGeMAPS: In parallel, eGeMAPS features are extracted to capture the global acoustic properties of the audio file and the size of input is 88 * 1. The eGeMAPS features are processed using a network structured similar to ResNet, which is adapted here for one-dimensional audio signal processing. This network is composed of 18 residual blocks and each block includes layers of linear transformations with skip connections. After processing through an MLP, the 5 * 1 high-dimensional feature vector representing global audio information is outputted.</p>
<p>At this point, we have obtained two vectors that respectively represent the global and local features of the audio information.</p>
</sec>
<sec id="s3_2_4">
<label>3.2.4</label>
<title>Multimodal feature fusion and inference</title>
<p>In this final phase of the IMDD-Net, we converge the extracted features from different modalities to form a unified representation, leveraging the power of multimodal data fusion to enhance estimation precision.</p>
<p>The use of the Kronecker product in our fusion process allows for a detailed interaction between features from different modalities. Unlike simpler fusion techniques such as concatenation or averaging, the Kronecker product facilitates a richer and more expressive combination by mathematically intertwining the feature sets, thus capturing both inter-modal and intra-modal dependencies.</p>
<p>The Kronecker product, denoted by <inline-formula>
<mml:math display="inline" id="im1">
<mml:mo>&#x2297;</mml:mo>
</mml:math>
</inline-formula>, is a mathematical operation on two matrices. For matrices <inline-formula>
<mml:math display="inline" id="im2">
<mml:mi>A</mml:mi>
</mml:math>
</inline-formula> of size <inline-formula>
<mml:math display="inline" id="im3">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>*</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im4">
<mml:mi>B</mml:mi>
</mml:math>
</inline-formula> of size <inline-formula>
<mml:math display="inline" id="im5">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>*</mml:mo>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, the Kronecker product <inline-formula>
<mml:math display="inline" id="im6">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>&#x2297;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a block matrix of size <inline-formula>
<mml:math display="inline" id="im7">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>p</mml:mi>
<mml:mo>*</mml:mo>
<mml:mi>n</mml:mi>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. The operation is defined as <xref ref-type="disp-formula" rid="eq2">Equation&#xa0;2</xref>:</p>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>&#x2297;</mml:mo>
<mml:mi>B</mml:mi>
<mml:mo>=</mml:mo>
<mml:mo>[</mml:mo>
<mml:mtable equalrows="true" equalcolumns="true">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mn>11</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:mtd>
<mml:mtd>
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
<mml:mtd>
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
<mml:mtd>
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:mtd>
<mml:mtd>
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
<mml:mo>]</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where each element <inline-formula>
<mml:math display="inline" id="im8">
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> of matrix <inline-formula>
<mml:math display="inline" id="im9">
<mml:mi>A</mml:mi>
</mml:math>
</inline-formula> is multiplied by matrix <inline-formula>
<mml:math display="inline" id="im10">
<mml:mi>B</mml:mi>
</mml:math>
</inline-formula>.</p>
<p>In the context of our IMDD-Net, the 5 * 1 feature vectors from each modality (representing both global and local information) undergo a transformation through the Kronecker product, resulting in a combined feature matrix. When these 5 * 1 vectors from the three modalities (audio, video, text) are subjected to the Kronecker product sequentially, the dimensionality of the resulting feature matrix expands to 625 * 1. This expansion not only increases the feature space but also preserves the unique characteristics of each modality.</p>
<p>The 625 * 1 feature vector is then processed through a deep residual network consisting of 18 layers and the output from the ResNet is a predictive value that correlates with the Beck Depression Inventory-II (BDI-II) scores.</p>
<p>In summary, the architecture of the IMDD-Net is designed for multimodal data. By integrating and processing local and global features from video, audio, and text inputs through advanced models, the system improves the accuracy of depression estimation and provides new insights for detecting depression.</p>
</sec>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Training methodology</title>
<p>The training and validation strategy of the IMDD-Net is designed to ensure a robust and comprehensive assessment of the model&#x2019;s performance.</p>
<p>The dataset consists of data from 150 participants. To validate the effectiveness and stability of the model, five-fold cross-validation is employed. The dataset is divided into five equal parts randomly. In each fold, one part is held out as a validation set while the other four parts are used for training. This process is repeated five times, with each of the five parts used exactly once as the validation set.</p>
<p>The Huber loss function is selected because it is an error metric that combines the best aspects of L1 norm (mean absolute error) and L2 norm (mean squared error) loss functions, making it particularly effective for regression problems.</p>
<p>The Huber loss is defined by the <xref ref-type="disp-formula" rid="eq3">Equation&#xa0;3</xref>:</p>
<disp-formula id="eq3">
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>&#x3b4;</mml:mi>
</mml:msub>
<mml:mo>(</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mo>)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mo>{</mml:mo>
<mml:mtable equalrows="true" equalcolumns="true">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:msup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>f</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>|</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mo>|</mml:mo>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
<mml:mo>(</mml:mo>
<mml:mo>|</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mo>|</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:mi>&#x3b4;</mml:mi>
<mml:mo>)</mml:mo>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>f</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mo>|</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mo>|</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&gt;</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im11">
<mml:mi>y</mml:mi>
</mml:math>
</inline-formula> represents the true value, <inline-formula>
<mml:math display="inline" id="im12">
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
</mml:math>
</inline-formula> is the predicted value and <inline-formula>
<mml:math display="inline" id="im13">
<mml:mi>&#x3b4;</mml:mi>
</mml:math>
</inline-formula> is a threshold parameter that defines the boundary between the quadratic loss and the linear loss. This threshold is adjusted as 1 (<inline-formula>
<mml:math display="inline" id="im14">
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) in our experiment.</p>
<p>The optimizer used is AdamW (Initial learning rate = 0.001), which combines the advantages of AdaGrad and RMSProp optimization methods and includes weight decay (weight_decay=0.01) regularization to prevent overfitting.</p>
<p>To evaluate the performance of our model, we adopt Mean Absolute Error (MAE) and Root Mean Square Error (RMSE) as primary evaluation metrics. These two measures are widely used in the field of automatic depression estimation, particularly on the AVEC 2014 dataset, allowing for direct and fair comparisons with prior state-of-the-art methods. MAE provides a straightforward interpretation of average error, while RMSE penalizes larger deviations more heavily, offering insight into prediction stability. RMSE and MAE are defined as <xref ref-type="disp-formula" rid="eq4">Equations&#xa0;4</xref> and <xref ref-type="disp-formula" rid="eq5">5</xref>, respectively:</p>
<disp-formula id="eq4">
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>M</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>=</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mi>N</mml:mi>
</mml:mfrac>
<mml:mo>&#x2211;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
<mml:mo>'</mml:mo>
</mml:msubsup>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq5">
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mi>N</mml:mi>
</mml:mfrac>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mo>|</mml:mo>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
<mml:mo>'</mml:mo>
</mml:msubsup>
<mml:mo>|</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where N is the total number of observations, <inline-formula>
<mml:math display="inline" id="im15">
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the prediction from the model, and <inline-formula>
<mml:math display="inline" id="im16">
<mml:mrow>
<mml:msubsup>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
<mml:mo>'</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> is the actual observed value.</p>
<p>The model was trained for 300 epochs, and <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref> presents the Huber loss, MAE, and RMSE curves averaged across the five cross-validation folds.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Averaged training curves of Huber loss, MAE, and RMSE over 300 epochs across five cross-validation folds.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1602650-g006.tif">
<alt-text content-type="machine-generated">Two line graphs depict training performance over 300 epochs. The left graph shows the Huber loss decreasing from about 18 to below 6. The right graph shows Mean Absolute Error (MAE) and Root Mean Square Error (RMSE) both decreasing, with MAE starting around 17.5 and finishing below 10, and RMSE starting above 22.5 and finishing around 10.5.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="s4" sec-type="results">
<label>4</label>
<title>Results</title>
<sec id="s4_1">
<label>4.1</label>
<title>Experiments result of IMDD-Net</title>
<p>In this section, we present the experimental results of the IMDD-Net, comparing its performance against state-of-the-art (SOTA) methods and the baseline from the AVEC 2014 challenge. The evaluation metrics used to assess the performance are RMSE and MAE.</p>
<p>As shown in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>, our model has achieved state-of-the-art performance with an RMSE of 7.55 and an MAE of 5.75. The AVEC 2014 baseline has an RMSE of 9.89 and an MAE of 7.89 (<xref ref-type="bibr" rid="B28">28</xref>). Cai et&#xa0;al. (<xref ref-type="bibr" rid="B44">44</xref>) proposed an end-to-end time-domain channel attention network (TDCA-Net) for depression detection and on the AVEC 2014 dataset, their model achieved an RMSE of 8.90 and an MAE of 7.08. Additionally, Dong et&#xa0;al. (<xref ref-type="bibr" rid="B19">19</xref>) proposed a hierarchical depression detection model combining deep speaker recognition and speech emotion recognition features, achieving an RMSE of 8.73 and an MAE of 7.32 on the AVEC 2014 dataset. Moreover, Uddin et&#xa0;al. (<xref ref-type="bibr" rid="B30">30</xref>) developed a two-stream deep spatiotemporal network for depression level estimation, resulting in an RMSE of 8.78 and an MAE of 6.86. Furthermore, He et&#xa0;al. (<xref ref-type="bibr" rid="B31">31</xref>) introduced an end-to-end trainable intelligent system utilizing a 3D convolutional neural network with a spatiotemporal feature aggregation module achieving an RMSE of 8.42 and an MAE of 6.78 on the AVEC 2014 dataset and Shang et&#xa0;al. (<xref ref-type="bibr" rid="B45">45</xref>) proposed a method called Local Quaternion and Global Deep Network, which integrates local quaternion and global deep features for facial depression recognition, achieving an RMSE of 7.84 and an MAE of 6.08. Moreover, Melo et&#xa0;al. (<xref ref-type="bibr" rid="B46">46</xref>) introduced the Maximization and Differentiation Network to represent facial expression variations relevant for depression assessment, achieving an RMSE of 7.90 and an MAE of 6.19 on the AVEC 2014 dataset. Additionally, Niu et&#xa0;al. 
(<xref ref-type="bibr" rid="B47">47</xref>) proposed a multi-scale and multi-region facial dynamic representation method for depression prediction, achieving an RMSE of 7.98 and an MAE of 6.14 and Melo et&#xa0;al. (<xref ref-type="bibr" rid="B48">48</xref>) proposed a two-stream model with a novel temporal pooling method for capturing spatio-temporal dynamics in video clips, achieving an RMSE of 7.94 and an MAE of 6.20 on the AVEC 2014 dataset. Pan et&#xa0;al. (<xref ref-type="bibr" rid="B49">49</xref>) proposed the Spatial-Temporal Attention Depression Recognition Network, which enhances feature extraction by capturing global and local spatial-temporal information, achieving an RMSE of 7.75 and an MAE of 6.00 and Song et&#xa0;al. (<xref ref-type="bibr" rid="B50">50</xref>) proposed a method for video-based automatic depression analysis using multi-scale video-level features and novel spectral representations, achieving an RMSE of 7.15 and an MAE of 5.95 on the AVEC 2014 dataset.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>The comparison of different methods and their structures on the AVEC 2014 dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Method</th>
<th valign="middle" align="center">Network structure</th>
<th valign="middle" align="center">Modality</th>
<th valign="middle" align="center">RMSE&#x2193;</th>
<th valign="middle" align="center">MAE&#x2193;</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">AVEC 2014 baseline (<xref ref-type="bibr" rid="B28">28</xref>)</td>
<td valign="middle" align="center">Support Vector Regression</td>
<td valign="middle" align="center">Audio-Video</td>
<td valign="middle" align="center">9.89</td>
<td valign="middle" align="center">7.89</td>
</tr>
<tr>
<td valign="middle" align="center">(<xref ref-type="bibr" rid="B44">44</xref>)</td>
<td valign="middle" align="center">An end-to-end time-domain channel attention network (TDCA-Net)</td>
<td valign="middle" align="center">Audio</td>
<td valign="middle" align="center">8.90</td>
<td valign="middle" align="center">7.08</td>
</tr>
<tr>
<td valign="middle" align="center">(<xref ref-type="bibr" rid="B19">19</xref>)</td>
<td valign="middle" align="center">A hierarchical model combining deep speaker recognition and speech emotion recognition features</td>
<td valign="middle" align="center">Audio</td>
<td valign="middle" align="center">8.73</td>
<td valign="middle" align="center">7.32</td>
</tr>
<tr>
<td valign="middle" align="center">(<xref ref-type="bibr" rid="B30">30</xref>)</td>
<td valign="middle" align="center">A two-stream deep spatiotemporal network</td>
<td valign="middle" align="center">Video</td>
<td valign="middle" align="center">8.78</td>
<td valign="middle" align="center">6.96</td>
</tr>
<tr>
<td valign="middle" align="center">(<xref ref-type="bibr" rid="B31">31</xref>)</td>
<td valign="middle" align="center">An end-to-end trainable system utilizing a 3D convolutional neural network with a spatiotemporal feature aggregation module</td>
<td valign="middle" align="center">Audio-Video</td>
<td valign="middle" align="center">8.42</td>
<td valign="middle" align="center">6.78</td>
</tr>
<tr>
<td valign="middle" align="center">(<xref ref-type="bibr" rid="B45">45</xref>)</td>
<td valign="middle" align="center">Local Quaternion and Global Deep Network</td>
<td valign="middle" align="center">Video</td>
<td valign="middle" align="center">7.84</td>
<td valign="middle" align="center">6.08</td>
</tr>
<tr>
<td valign="middle" align="center">(<xref ref-type="bibr" rid="B46">46</xref>)</td>
<td valign="middle" align="center">Maximization and Differentiation Network</td>
<td valign="middle" align="center">Video</td>
<td valign="middle" align="center">7.90</td>
<td valign="middle" align="center">6.19</td>
</tr>
<tr>
<td valign="middle" align="center">(<xref ref-type="bibr" rid="B47">47</xref>)</td>
<td valign="middle" align="center">A multi-scale and multi-region facial dynamic representation method</td>
<td valign="middle" align="center">Video</td>
<td valign="middle" align="center">7.98</td>
<td valign="middle" align="center">6.14</td>
</tr>
<tr>
<td valign="middle" align="center">(<xref ref-type="bibr" rid="B48">48</xref>)</td>
<td valign="middle" align="center">A two-stream model with a novel temporal pooling method for capturing spatio-temporal dynamics in video clips</td>
<td valign="middle" align="center">Video</td>
<td valign="middle" align="center">7.94</td>
<td valign="middle" align="center">6.20</td>
</tr>
<tr>
<td valign="middle" align="center">(<xref ref-type="bibr" rid="B49">49</xref>)</td>
<td valign="middle" align="center">Spatial-Temporal Attention Depression Recognition Network</td>
<td valign="middle" align="center">Video</td>
<td valign="middle" align="center">7.75</td>
<td valign="middle" align="center">6.00</td>
</tr>
<tr>
<td valign="middle" align="center">(<xref ref-type="bibr" rid="B50">50</xref>)</td>
<td valign="middle" align="center">Using multi-scale video-level features and novel spectral representations</td>
<td valign="middle" align="center">Video</td>
<td valign="middle" align="center">
<bold>7.15</bold>
</td>
<td valign="middle" align="center">5.95</td>
</tr>
<tr>
<td valign="middle" align="center">
<bold>Ours</bold>
</td>
<td valign="middle" align="center">Integrative Multimodal Depression Detection Network by using local and global multimodality features</td>
<td valign="middle" align="center">Audio-Video-Text</td>
<td valign="middle" align="center">7.55</td>
<td valign="middle" align="center">
<bold>5.75</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The symbol "&#x2193;" indicates that lower values represent better performance (as for MAE and RMSE).</p>
</fn>
<fn>
<p>Bold values denote the best performance across methods.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>In summary, our model has achieved state-of-the-art performance, demonstrating its effectiveness in depression detection. Notably, our model achieved the lowest MAE among all compared models, with a score of 5.75, indicating its superior accuracy in predicting depression severity. Furthermore, our model attained the second-lowest RMSE, with a score of 7.55, closely following the RMSE of 7.15 achieved by Song et&#xa0;al. (<xref ref-type="bibr" rid="B50">50</xref>). These results underscore the robustness and precision of our IMDD-Net in integrating multimodal data and global and local features to enhance depression detection.</p>
<p>In our study, we generated a BDI-II value comparison bar plot and an error histogram to evaluate the prediction performance of the IMDD-Net visually (<xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>). <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7A</bold>
</xref> presents a bar plot comparing the real and predicted BDI-II values for each sample. The x-axis corresponds to the sample indices, and the y-axis represents the BDI-II values. Each sample is represented by two bars: one for the real value and one for the predicted value. The red dashed line in the figure represents the threshold at a BDI-II score of 13. Typically, BDI-II scores below 13 are considered indicative of no depression, while scores above 13 suggest the presence of depressive symptoms. Analysis of <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7A</bold>
</xref> reveals that the IMDD-Net tends to underestimate the actual BDI-II scores of individuals with depression (BDI-II scores greater than 13; 43 of 73 are underestimated), while overestimating the BDI-II scores of those without depression (BDI-II scores less than 13; 45 of 77 are overestimated).</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Differences between real and predicted BDI-II values and histogram of prediction errors of IMDD-Net. <bold>(A)</bold> The comparison of real and predicted BDI-II values; <bold>(B)</bold> Histogram of prediction errors.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1602650-g007.tif">
<alt-text content-type="machine-generated">Panel A shows a bar graph with real and predicted BDI-II values plotted against sample index. A red dashed line at BDI-II equals thirteen is highlighted. Panel B presents a histogram of IMDD prediction errors, with frequencies peaking around zero and distributed between negative twenty and fifteen.</alt-text>
</graphic>
</fig>
<p>As shown in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7B</bold>
</xref>, the error histogram illustrates the distribution of the prediction errors, which are calculated as the difference between the real and predicted BDI-II values. The histogram provides a visual representation of how closely the predicted values align with the actual values. The x-axis represents the prediction error, while the y-axis represents the frequency of each error value. The majority of the errors are concentrated around zero, and 57.33% of the samples are located in the error range of &#xb1; 5, indicating that the IMDD-Net&#x2019;s predictions are generally accurate.</p>
<p>To further illustrate the experimental results, we conducted a Bland-Altman analysis and performed a regression analysis. The Bland-Altman scatter plot and the regression plot are presented in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref> to illustrate the agreement and predictive accuracy of the IMDD-Net.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>The Bland-Altman plot and regression analysis for IMDD-Net predictions.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1602650-g008.tif">
<alt-text content-type="machine-generated">Bland-Altman plot and regression plot for depression predictions depicted. The left graph shows differences between real and predicted BDI-II scores with bias and standard deviation lines. The right graph shows a regression line and data points comparing real and predicted BDI-II scores, with metrics such as mean absolute error, root mean square error, and R-squared value listed.</alt-text>
</graphic>
</fig>
<p>The Bland-Altman plot (the left part of <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>) compares the differences between the predicted BDI-II scores and the actual BDI-II scores against their averages. The mean difference (bias) is calculated to be &#x2212;0.97 with a standard deviation of 7.61. The limits of agreement, defined as the mean difference &#xb1; 1.96 times the standard deviation, range from &#x2212;15.89 to 13.95.</p>
<p>The regression analysis (the right part of <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>) shows the relationship between the predicted BDI-II scores and the actual BDI-II scores. The regression equation is given by: Predicted BDI-II = 0.85 &#xd7; Actual BDI-II + 3.22. The coefficient of determination (R<sup>2</sup>) is calculated to be 0.65, indicating that approximately 65% of the variance in the actual BDI-II scores can be explained by the IMDD-Net&#x2019;s predictions.</p>
<p>In summary, the Bland-Altman analysis shows good agreement between the predicted and actual BDI-II scores, while the regression analysis confirms the predictive accuracy of the IMDD-Net. These results collectively highlight the effectiveness of the IMDD-Net in assessing depression severity.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Classification performance analysis</title>
<p>To further validate the effectiveness of IMDD-Net, we performed a classification analysis using a threshold of 13 on the BDI-II scores. Participants with scores above 13 were classified as depressed, while those with scores of 13 or below were classified as non-depressed. The confusion matrix (The left part of <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>) provides a detailed breakdown of the classification results, showing that the number of true positives (TP) is 62, false positives (FP) is 20, true negatives (TN) is 57, and false negatives (FN) is 11. Therefore, the performance metrics are calculated as follows: Accuracy (ACC) is approximately 79.3%, Sensitivity (SEN) is 84.9%, Specificity (SPE) is 74.0%, Positive Predictive Value (PPV) is 75.6%, and Negative Predictive Value (NPV) is 83.8% (The right part of <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>).</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>The confusion matrix and classification performance of IMDD-Net.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1602650-g009.tif">
<alt-text content-type="machine-generated">Confusion matrix and bar chart for depression classification performance. The confusion matrix shows true positive values for normal (57) and depressed (62) instances. The bar chart presents metrics: accuracy (0.79), sensitivity (0.85), specificity (0.74), positive predictive value (0.76), and negative predictive value (0.84).</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Ablation experiment</title>
<p>To further emphasize the importance of multimodal data in the detection of depression, we conducted an ablation study. In this study, we evaluated the performance of the IMDD-Net using individual modalities and combinations of two modalities and compared the results with the complete multimodal configuration.</p>
<p>For the ablation study, we considered the following configurations: (a) Audio Only (MFCC and eGeMAPS); (b) Video Only; (c) Text Only; (d) Audio + Video; (e) Audio + Text; (f) Video + Text; (g) IMDD (Audio + Video + Text). Each configuration was used to train and test the model independently, following the same training methodology as described in the previous sections.</p>
<p>The performance of each configuration was evaluated by RMSE and MAE. The results are summarized in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>. Among the single modalities, the text modality achieves the best performance with the lowest RMSE (7.81) and MAE (6.36). For dual-modality combinations, the Video + Text configuration yields the best results with an RMSE of 7.66 and an MAE of 5.84. Previous clinical psychology studies have indicated that the relationship between language users (e.g., speakers or writers) and their texts is meaningful and shows considerable promise for depression detection (<xref ref-type="bibr" rid="B51">51</xref>). A study (<xref ref-type="bibr" rid="B51">51</xref>) also suggests that it may be possible to identify individuals at risk for depression through text-based analysis, which aligns with the results of our ablation experiments.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>The result of the ablation experiment of IMDD-Net.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model variant</th>
<th valign="middle" align="center">RMSE&#x2193;</th>
<th valign="middle" align="center">MAE&#x2193;</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Audio Only</td>
<td valign="middle" align="center">8.49</td>
<td valign="middle" align="center">6.88</td>
</tr>
<tr>
<td valign="middle" align="left">Video Only</td>
<td valign="middle" align="center">8.27</td>
<td valign="middle" align="center">6.63</td>
</tr>
<tr>
<td valign="middle" align="left">Text Only</td>
<td valign="middle" align="center">7.81</td>
<td valign="middle" align="center">6.36</td>
</tr>
<tr>
<td valign="middle" align="left">Audio + Video</td>
<td valign="middle" align="center">7.72</td>
<td valign="middle" align="center">6.05</td>
</tr>
<tr>
<td valign="middle" align="left">Audio + Text</td>
<td valign="middle" align="center">7.68</td>
<td valign="middle" align="center">5.99</td>
</tr>
<tr>
<td valign="middle" align="left">Video + Text</td>
<td valign="middle" align="center">7.66</td>
<td valign="middle" align="center">5.84</td>
</tr>
<tr>
<td valign="middle" align="left">IMDD (Audio + Video + Text)</td>
<td valign="middle" align="center">
<bold>7.55</bold>
</td>
<td valign="middle" align="center">
<bold>5.75</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The symbol "&#x2193;" indicates that lower values represent better performance (as for RMSE and MAE).</p>
</fn>
<fn>
<p>Bold values denote the best performance across methods.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>The ablation experiments and corresponding statistical analysis results are also shown in <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10</bold>
</xref>, where the standard deviation from five-fold cross-validation is also included and paired t-tests are conducted on the RMSE and MAE metrics. The RMSE of the Audio Only and Video Only configurations is significantly higher than that of the other configurations (p&lt;0.05). The Text Only configuration presents a significantly lower RMSE than the other two single-modality configurations (p&lt;0.05), indicating that the information underlying the text might be more valuable for depression detection or easier for our IMDD-Net to extract. The MAE of the IMDD (Audio + Video + Text) is significantly lower than that of the single-modality and Audio + Video configurations (p&lt;0.05).</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>The result of the ablation experiment and statistical analysis. <bold>(A)</bold> RMSE and MAE. <bold>(B)</bold> Pairwise p-value heatmap of RMSE. <bold>(C)</bold> Pairwise p-value heatmap of MAE. (* indicates a significant difference.).</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1602650-g010.tif">
<alt-text content-type="machine-generated">Panel A shows a bar graph comparing RMSE and MAE scores for different modality variants, with higher values for RMSE. Panels B and C display heatmaps of pairwise p-values for RMSE and MAE, indicating statistical significance between modality pairs through color gradients.</alt-text>
</graphic>
</fig>
<p>The ablation study clearly illustrates the critical role of multimodal data integration in the IMDD-Net. Each modality contributes unique and valuable information and enhances the network&#x2019;s ability to detect depression accurately. The superior performance of the complete multimodal configuration underscores the necessity of leveraging diverse data sources in the field of depression detection.</p>
<p>An important experimental discovery in this study is the identification of the critical role played by multimodal data integration, specifically emphasizing the complementary nature of text, audio, and video modalities. Through ablation experiments, we observed that integrating text modality consistently improved performance more than audio or video alone, indicating that linguistic patterns provide highly discriminative signals for depression. This finding is consistent with clinical research, which suggests that linguistic expressions offer reliable and sensitive indicators of depressive states, capturing nuanced cognitive and emotional disturbances that might be less prominently reflected through facial expressions or vocal characteristics alone.</p>
<p>Furthermore, the comparative analysis of prediction errors and Bland-Altman agreement plots revealed specific behavioral patterns in our model&#x2019;s predictions: the IMDD-Net tended to underestimate depressive severity in participants with higher BDI-II scores, possibly due to subtle or suppressed emotional cues in severely depressed individuals that are difficult to capture comprehensively from video or audio data alone. Conversely, it slightly overestimated scores in non-depressed individuals, suggesting a potential sensitivity toward ambiguous or transient emotional cues. These nuanced error patterns highlight opportunities for further refining modality-specific feature extraction and fusion techniques.</p>
<p>Overall, these experimental insights underscore the necessity of multimodal fusion in depression detection frameworks, particularly emphasizing the distinct contribution and sensitivity of linguistic information. This emphasizes the need for future research to prioritize advanced linguistic feature extraction methods and adaptive multimodal fusion strategies, ultimately improving the clinical utility and interpretability of automated depression detection systems.</p>
</sec>
</sec>
<sec id="s5" sec-type="discussion">
<label>5</label>
<title>Discussion</title>
<p>This study introduced IMDD-Net, a multimodal deep learning model designed to enhance depression detection by integrating video, audio, and text data. Our findings demonstrate that incorporating multiple modalities provides a more comprehensive and robust assessment of depressive symptoms. By leveraging both local and global feature extraction, IMDD-Net effectively captures short-term behavioral cues as well as long-term patterns in facial expressions, vocal tone, and linguistic structures. The model achieved an RMSE of 7.55 and an MAE of 5.75 for depression severity estimation, along with a classification accuracy of 79.3%, demonstrating high sensitivity (84.9%) and specificity (74.0%).</p>
<p>This study underscores the potential of multimodal deep learning in advancing objective, scalable, and accessible mental health assessments. By addressing current challenges, models like IMDD-Net could significantly contribute to the development of AI-driven tools for depression screening and monitoring, bridging the gap between computational psychiatry and clinical practice.</p>
<sec id="s5_1">
<label>5.1</label>
<title>Effectiveness of multimodal data in depression prediction</title>
<p>The use of multimodal data, particularly the combination of audio, video, and text, has proven to be highly effective in depression prediction. Depression is a disorder that manifests through various behavioral, cognitive, and linguistic patterns (<xref ref-type="bibr" rid="B52">52</xref>). Studies have consistently shown that individuals with depression exhibit distinct facial expressions, speech patterns, and language use, which together provide a more comprehensive and reliable means of assessment (<xref ref-type="bibr" rid="B53">53</xref>, <xref ref-type="bibr" rid="B54">54</xref>).</p>
<p>Facial expression analysis plays a crucial role in depression detection, as individuals suffering from depression often exhibit reduced facial variability, diminished smiling, and increased negative affect (<xref ref-type="bibr" rid="B55">55</xref>). Similarly, speech-based features such as prosody, pitch variation, articulation rate, and speech fluency offer valuable indicators of depressive states (<xref ref-type="bibr" rid="B56">56</xref>). Depressed individuals often demonstrate monotonous speech, slower articulation, increased hesitations, and longer response latencies (<xref ref-type="bibr" rid="B57">57</xref>), all of which can be extracted using acoustic analysis techniques such as MFCC and eGeMAPS. In addition to audio-visual cues, the linguistic patterns of individuals with depression often reflect negative sentiment, cognitive distortions, self-referential focus, and reduced syntactic complexity (<xref ref-type="bibr" rid="B51">51</xref>). By analyzing transcribed speech or written text, natural language processing (NLP) models can identify markers such as higher usage of first-person pronouns, excessive expressions of sadness or hopelessness, and reduced use of complex sentence structures (<xref ref-type="bibr" rid="B58">58</xref>).</p>
<p>While unimodal approaches relying on either facial expressions, speech, or text have demonstrated promising results, they often suffer from limited accuracy due to the heterogeneous nature of depressive symptoms (<xref ref-type="bibr" rid="B59">59</xref>). For instance, an individual may suppress facial expressions while still exhibiting changes in speech tone and linguistic patterns, or their speech may remain neutral while textual markers indicate distress. By integrating these three modalities, deep learning models can compensate for the weaknesses of each individual modality, leading to more robust and reliable depression prediction.</p>
</sec>
<sec id="s5_2">
<label>5.2</label>
<title>The role of global and local features in multimodal fusion</title>
<p>The integration of global and local features in multimodal learning plays a crucial role in capturing the complex manifestations of depression. Depression affects individuals in both momentary expressions and long-term behavioral patterns (<xref ref-type="bibr" rid="B60">60</xref>), making it essential for predictive models to analyze information across multiple temporal scales. Local features focus on fine-grained, short-term behavioral signals such as micro-expressions, brief tonal fluctuations in speech, and word-level linguistic markers, which provide immediate but transient indicators of depressive symptoms (<xref ref-type="bibr" rid="B61">61</xref>). On the other hand, global features capture long-term dependencies, trends in speech fluency, sustained emotional states in facial expressions, and discourse-level linguistic patterns, which reflect the broader psychological state of an individual over an extended period (<xref ref-type="bibr" rid="B62">62</xref>, <xref ref-type="bibr" rid="B63">63</xref>).</p>
<p>Depression manifests at multiple temporal and behavioral levels, necessitating an approach that jointly models local variations and global trends. Future research should continue to explore adaptive multimodal fusion strategies that dynamically balance local and global information, ensuring that automated depression detection models remain both effective and clinically interpretable.</p>
</sec>
<sec id="s5_3">
<label>5.3</label>
<title>Limitations and future directions</title>
<p>In this paper, the IMDD-Net is introduced as a novel deep learning model designed to enhance the accuracy of depression detection by leveraging local and global features from video, audio, and text cues. This model effectively integrates multimodal data to provide a comprehensive analysis of depressive symptoms. By using the Kronecker product for multimodal fusion, our network explores deep interactions between modalities, further enhancing assessment accuracy. Experimental results demonstrate that the IMDD-Net achieves state-of-the-art performance. The robustness and precision of IMDD-Net are proved in identifying depression by extra experiment. In summary, via deep learning approach and video-audio-text multimodal data, IMDD-Net might be a useful tool of estimating depression risk in an undisturbed and convenient manner.</p>
<p>While the IMDD-Net has demonstrated state-of-the-art performance, there are several limitations that need further investigation. First, the IMDD-Net does not effectively address the issue of missing modalities. If any one of the input modalities (audio, video, or text) is absent, the model&#x2019;s performance may be impacted. But it also means that depression can manifest through various modalities, encompassing diverse behavioral, emotional, and physiological indicators. Second, the integration of multimodal data and the use of advanced feature extraction techniques increase the computational complexity and resource requirements. This may hinder the deployment of the IMDD-Net in real-world, resource-constrained environments. Third, the current implementation of the IMDD-Net focuses on accuracy rather than real-time processing capabilities. For practical applications in clinical settings, the model needs to be optimized for faster inference without compromising accuracy. Furthermore, the current version of IMDD-Net has not been optimized for real-time deployment. Due to the model&#x2019;s reliance on high-dimensional features, multiple deep architectures and Kronecker-based multimodal fusion, the computational load is relatively high. This may pose challenges for real-time inference, particularly in resource-constrained environments such as mobile platforms or telehealth applications. Finally, each modality (audio, video, text) comes with its own set of challenges, such as background noise in audio data, varying lighting conditions in video data, and the need for accurate transcription in text data. Addressing these issues comprehensively remains a challenge.</p>
<p>The ethical considerations are of crucial importance. While the IMDD-Net shows potential for use in clinical or diagnostic settings, its deployment raises important ethical considerations. First, the collection and analysis of audio, video, and textual data involve sensitive personal information, necessitating strict data privacy protection and informed consent protocols. Second, false positives may lead to unnecessary anxiety or stigmatization, while false negatives could result in missed opportunities for early intervention. Therefore, the model should not function as a standalone diagnostic tool but rather as an assistive system to support clinical decision-making. Additionally, bias in training data&#x2014;such as demographic imbalance&#x2014;could propagate disparities in predictions. Ensuring fairness, transparency, and human oversight will be essential in any future clinical deployment of the IMDD-Net.</p>
<p>Future work should focus on training and validating the IMDD-Net on larger and more diverse datasets, including data from various cultural backgrounds and different age groups. This would enhance the model&#x2019;s generalizability and robustness. As a future direction, we also plan to collaborate with clinical partners to conduct prospective validation studies or pilot trials, aiming to assess the model&#x2019;s reliability and generalizability in diverse patient populations. More efforts should be made to optimize the IMDD-Net for real-time processing through reducing computational complexity and improving the efficiency of the feature extraction and inference processes. Developing robust methods to handle missing modalities is crucial. This could involve creating imputation techniques or designing the model to be more resilient to incomplete input data, ensuring consistent performance regardless of data availability. Recovering missing modalities also represents a viable approach (<xref ref-type="bibr" rid="B64">64</xref>). By addressing these limitations and exploring these future directions, the IMDD-Net can provide a more reliable and efficient tool for the detection of depression.</p>
</sec>
</sec>
<sec id="s6" sec-type="conclusions">
<label>6</label>
<title>Conclusion</title>
<p>In this study, we introduced the Integrative Multimodal Depression Detection Network (IMDD-Net), a novel deep-learning framework that effectively integrates video, audio, and textual modalities to enhance depression detection accuracy. By systematically capturing both local and global features within each modality and employing an advanced multimodal fusion strategy based on the Kronecker product, IMDD-Net provides a robust representation of depressive symptoms.</p>
<p>Experimental results conducted on the AVEC 2014 dataset demonstrate the superior performance of IMDD-Net, achieving state-of-the-art outcomes with a Root Mean Square Error (RMSE) of 7.55 and a Mean Absolute Error (MAE) of 5.75 in predicting BDI-II scores. The classification analysis further validates the model&#x2019;s practical utility, yielding an accuracy of approximately 79.3% in distinguishing depressed individuals from non-depressed ones. Ablation studies underscore the critical contribution of each modality and reinforce the necessity of incorporating multimodal data.</p>
<p>Despite these promising outcomes, IMDD-Net faces several limitations that merit attention. The computational complexity associated with multimodal fusion and high-dimensional feature extraction poses challenges for real-time and resource-constrained applications. Moreover, the model currently lacks comprehensive clinical validation, limiting immediate deployment in clinical practice. Future studies will address these limitations by optimizing the computational efficiency, developing methods to handle incomplete modality inputs, and rigorously evaluating clinical validity through prospective trials.</p>
<p>Ultimately, by bridging advanced computational methods with multimodal behavioral indicators, IMDD-Net represents a significant step toward objective, accurate, and accessible depression screening, paving the way for enhanced mental health assessment practices.</p>
</sec>
</body>
<back>
<sec id="s7" sec-type="data-availability">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <uri xlink:href="https://doi.org/10.1145/2647868.2647869">https://doi.org/10.1145/2647868.2647869</uri>.</p>
</sec>
<sec id="s8" sec-type="ethics-statement">
<title>Ethics statement</title>
<p>The studies involving humans were approved by the Ethics Committee of Northeastern University. The studies were conducted in accordance with the local legislation and institutional requirements. The participants provided their written informed consent to participate in this study. Written informed consent was obtained from the individual(s) for the publication of any identifiable images or data included in this article.</p>
</sec>
<sec id="s9" sec-type="author-contributions">
<title>Author contributions</title>
<p>YL: Data curation, Formal Analysis, Investigation, Methodology, Software, Validation, Visualization, Writing &#x2013; original draft. XY: Conceptualization, Data curation, Investigation, Validation, Visualization, Writing &#x2013; review &amp; editing. MZ: Data curation, Investigation, Software, Writing &#x2013; review &amp; editing. JW: Data curation, Software, Writing &#x2013; review &amp; editing. YY: Conceptualization, Validation, Visualization, Writing &#x2013; review &amp; editing. WQ: Conceptualization, Funding acquisition, Project administration, Resources, Supervision, Validation, Writing &#x2013; review &amp; editing. SQ: Conceptualization, Funding acquisition, Methodology, Project administration, Resources, Supervision, Validation, Visualization, Writing &#x2013; review &amp; editing.</p>
</sec>
<sec id="s10" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research and/or publication of this article. This study was supported by the National Natural Science Foundation of China (62271131) and the Fundamental Research Funds for the Central Universities (N25BJD013).</p>
</sec>
<sec id="s11" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s12" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec id="s13" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Herrman</surname> <given-names>H</given-names>
</name>
<name>
<surname>Kieling</surname> <given-names>C</given-names>
</name>
<name>
<surname>McGorry</surname> <given-names>P</given-names>
</name>
<name>
<surname>Horton</surname> <given-names>R</given-names>
</name>
<name>
<surname>Sargent</surname> <given-names>J</given-names>
</name>
<name>
<surname>Patel</surname> <given-names>V</given-names>
</name>
</person-group>. <article-title>Reducing the global burden of depression: a Lancet-World Psychiatric Association Commission</article-title>. <source>Lancet</source>. (<year>2019</year>) <volume>393</volume>:<page-range>e42&#x2013;e3</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/S0140-6736(18)32408-5</pub-id>, PMID: <pub-id pub-id-type="pmid">30482607</pub-id></citation></ref>
<ref id="B2">
<label>2</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Marwaha</surname> <given-names>S</given-names>
</name>
<name>
<surname>Palmer</surname> <given-names>E</given-names>
</name>
<name>
<surname>Suppes</surname> <given-names>T</given-names>
</name>
<name>
<surname>Cons</surname> <given-names>E</given-names>
</name>
<name>
<surname>Young</surname> <given-names>AH</given-names>
</name>
<name>
<surname>Upthegrove</surname> <given-names>R</given-names>
</name>
</person-group>. <article-title>Novel and emerging treatments for major depression</article-title>. <source>Lancet</source>. (<year>2023</year>) <volume>401</volume>:<page-range>141&#x2013;53</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/S0140-6736(22)02080-3</pub-id>, PMID: <pub-id pub-id-type="pmid">36535295</pub-id></citation></ref>
<ref id="B3">
<label>3</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vandana</surname>
</name>
<name>
<surname>Marriwala</surname> <given-names>N</given-names>
</name>
<name>
<surname>Chaudhary</surname> <given-names>D</given-names>
</name>
</person-group>. <article-title>A hybrid model for depression detection using deep learning</article-title>. <source>Measur: Sens</source>. (<year>2023</year>) <volume>25</volume>:<fpage>100587</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.measen.2022.100587</pub-id>
</citation></ref>
<ref id="B4">
<label>4</label>
<citation citation-type="confproc">
<person-group person-group-type="editor">
<name>
<surname>Cohn</surname> <given-names>JF</given-names>
</name>
<name>
<surname>Kruez</surname> <given-names>TS</given-names>
</name>
<name>
<surname>Matthews</surname> <given-names>I</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Nguyen</surname> <given-names>MH</given-names>
</name>
<name>
<surname>Padilla</surname> <given-names>MT</given-names>
</name>
<etal/>
</person-group> eds. <article-title>"Detecting depression from facial actions and vocal prosody,"</article-title> In: <conf-name>2009 3rd international conference on affective computing and intelligent interaction and workshops</conf-name>, <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>, (<year>2009</year>) pp. <page-range>1&#x2013;7</page-range>.</citation></ref>
<ref id="B5">
<label>5</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Beck</surname> <given-names>AT</given-names>
</name>
<name>
<surname>Alford</surname> <given-names>BA</given-names>
</name>
</person-group>. <source>Depression: Causes and treatment</source>. <publisher-loc>Philadelphia, Pennsylvania, USA</publisher-loc>: <publisher-name>University of Pennsylvania Press</publisher-name> (<year>2009</year>).</citation></ref>
<ref id="B6">
<label>6</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mundt</surname> <given-names>JC</given-names>
</name>
<name>
<surname>Snyder</surname> <given-names>PJ</given-names>
</name>
<name>
<surname>Cannizzaro</surname> <given-names>MS</given-names>
</name>
<name>
<surname>Chappie</surname> <given-names>K</given-names>
</name>
<name>
<surname>Geralts</surname> <given-names>DS</given-names>
</name>
</person-group>. <article-title>Voice acoustic measures of depression severity and treatment response collected via interactive voice response (IVR) technology</article-title>. <source>J Neurolinguistics</source>. (<year>2007</year>) <volume>20</volume>:<fpage>50</fpage>&#x2013;<lpage>64</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jneuroling.2006.04.001</pub-id>, PMID: <pub-id pub-id-type="pmid">21253440</pub-id></citation></ref>
<ref id="B7">
<label>7</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bzdok</surname> <given-names>D</given-names>
</name>
<name>
<surname>Meyer-Lindenberg</surname> <given-names>A</given-names>
</name>
</person-group>. <article-title>Machine learning for precision psychiatry: opportunities and challenges</article-title>. <source>Biol Psychiatry Cognit Neurosci Neuroimaging</source>. (<year>2018</year>) <volume>3</volume>:<page-range>223&#x2013;30</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.bpsc.2017.11.007</pub-id>, PMID: <pub-id pub-id-type="pmid">29486863</pub-id></citation></ref>
<ref id="B8">
<label>8</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nunes</surname> <given-names>SO</given-names>
</name>
<name>
<surname>Reiche</surname> <given-names>EM</given-names>
</name>
<name>
<surname>Morimoto</surname> <given-names>HK</given-names>
</name>
<name>
<surname>Matsuo</surname> <given-names>T</given-names>
</name>
<name>
<surname>Itano</surname> <given-names>EN</given-names>
</name>
<name>
<surname>Xavier</surname> <given-names>EC</given-names>
</name>
<etal/>
</person-group>. <article-title>Immune and hormonal activity in adults suffering from depression</article-title>. <source>Braz J Med Biol Res</source>. (<year>2002</year>) <volume>35</volume>:<page-range>581&#x2013;7</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1590/S0100-879X2002000500011</pub-id>, PMID: <pub-id pub-id-type="pmid">12011944</pub-id></citation></ref>
<ref id="B9">
<label>9</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kupfer</surname> <given-names>DJ</given-names>
</name>
<name>
<surname>Foster</surname> <given-names>FG</given-names>
</name>
</person-group>. <article-title>Interval between onset of sleep and rapid-eye-movement sleep as an indicator of depression</article-title>. <source>Lancet</source>. (<year>1972</year>) <volume>2</volume>:<page-range>684&#x2013;6</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/S0140-6736(72)92090-9</pub-id>, PMID: <pub-id pub-id-type="pmid">4115821</pub-id></citation></ref>
<ref id="B10">
<label>10</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>McDermott</surname> <given-names>LM</given-names>
</name>
<name>
<surname>Ebmeier</surname> <given-names>KP</given-names>
</name>
</person-group>. <article-title>A meta-analysis of depression severity and cognitive function</article-title>. <source>J Affect Disord</source>. (<year>2009</year>) <volume>119</volume>:<fpage>1</fpage>&#x2013;<lpage>8</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jad.2009.04.022</pub-id>, PMID: <pub-id pub-id-type="pmid">19428120</pub-id></citation></ref>
<ref id="B11">
<label>11</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname> <given-names>J</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>J</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>B</given-names>
</name>
<name>
<surname>Dong</surname> <given-names>S</given-names>
</name>
<name>
<surname>Peng</surname> <given-names>B</given-names>
</name>
<etal/>
</person-group>. <article-title>Spatio-temporal learning and explaining for dynamic functional connectivity analysis: Application to depression</article-title>. <source>J Affect Disord</source>. (<year>2024</year>) <volume>364</volume>:<page-range>266&#x2013;73</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jad.2024.08.014</pub-id>, PMID: <pub-id pub-id-type="pmid">39137835</pub-id></citation></ref>
<ref id="B12">
<label>12</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tian</surname> <given-names>X</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>L</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>M</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>S</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>X</given-names>
</name>
<etal/>
</person-group>. <article-title>Social anxiety prediction based on ERP features: A deep learning approach</article-title>. <source>J Affect Disord</source>. (<year>2024</year>) <volume>367</volume>:<page-range>545&#x2013;53</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jad.2024.09.006</pub-id>, PMID: <pub-id pub-id-type="pmid">39236887</pub-id></citation></ref>
<ref id="B13">
<label>13</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiao</surname> <given-names>W</given-names>
</name>
<name>
<surname>Moncy</surname> <given-names>JC</given-names>
</name>
<name>
<surname>Ghazi-Noori</surname> <given-names>A-R</given-names>
</name>
<name>
<surname>Woodham</surname> <given-names>RD</given-names>
</name>
<name>
<surname>Rezaei</surname> <given-names>H</given-names>
</name>
<name>
<surname>Bramon</surname> <given-names>E</given-names>
</name>
<etal/>
</person-group>. <article-title>Enhanced network synchronization connectivity following transcranial direct current stimulation (tDCS) in bipolar depression: effects on EEG oscillations and deep learning-based predictors of clinical remission</article-title>. <source>J Affect Disord</source>. (<year>2025</year>) <volume>369</volume>:<page-range>576&#x2013;87</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jad.2024.09.054</pub-id>, PMID: <pub-id pub-id-type="pmid">39293596</pub-id></citation></ref>
<ref id="B14">
<label>14</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jin</surname> <given-names>H</given-names>
</name>
<name>
<surname>Nath</surname> <given-names>SS</given-names>
</name>
<name>
<surname>Schneider</surname> <given-names>S</given-names>
</name>
<name>
<surname>Junghaenel</surname> <given-names>D</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>S</given-names>
</name>
<name>
<surname>Kaplan</surname> <given-names>C</given-names>
</name>
</person-group>. <article-title>An informatics approach to examine decision-making impairments in the daily life of individuals with depression</article-title>. <source>J BioMed Inform</source>. (<year>2021</year>) <volume>122</volume>:<fpage>103913</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jbi.2021.103913</pub-id>, PMID: <pub-id pub-id-type="pmid">34487888</pub-id></citation></ref>
<ref id="B15">
<label>15</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Moura</surname> <given-names>I</given-names>
</name>
<name>
<surname>Teles</surname> <given-names>A</given-names>
</name>
<name>
<surname>Viana</surname> <given-names>D</given-names>
</name>
<name>
<surname>Marques</surname> <given-names>J</given-names>
</name>
<name>
<surname>Coutinho</surname> <given-names>L</given-names>
</name>
<name>
<surname>Silva</surname> <given-names>F</given-names>
</name>
</person-group>. <article-title>Digital Phenotyping of Mental Health using multimodal sensing of multiple situations of interest: A Systematic Literature Review</article-title>. <source>J BioMed Inform</source>. (<year>2023</year>) <volume>138</volume>:<fpage>104278</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jbi.2022.104278</pub-id>, PMID: <pub-id pub-id-type="pmid">36586498</pub-id></citation></ref>
<ref id="B16">
<label>16</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Thai</surname> <given-names>M</given-names>
</name>
<name>
<surname>Nair</surname> <given-names>AU</given-names>
</name>
<name>
<surname>Klimes-Dougan</surname> <given-names>B</given-names>
</name>
<name>
<surname>Albott</surname> <given-names>CS</given-names>
</name>
<name>
<surname>Silamongkol</surname> <given-names>T</given-names>
</name>
<name>
<surname>Corkrum</surname> <given-names>M</given-names>
</name>
<etal/>
</person-group>. <article-title>Deep transcranial magnetic stimulation for adolescents with treatment-resistant depression: A preliminary dose-finding study exploring safety and clinical effectiveness</article-title>. <source>J Affect Disord</source>. (<year>2024</year>) <volume>354</volume>:<fpage>589</fpage>&#x2013;<lpage>600</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jad.2024.03.061</pub-id>, PMID: <pub-id pub-id-type="pmid">38484878</pub-id></citation></ref>
<ref id="B17">
<label>17</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>L</given-names>
</name>
<name>
<surname>Niu</surname> <given-names>M</given-names>
</name>
<name>
<surname>Tiwari</surname> <given-names>P</given-names>
</name>
<name>
<surname>Marttinen</surname> <given-names>P</given-names>
</name>
<name>
<surname>Su</surname> <given-names>R</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>J</given-names>
</name>
<etal/>
</person-group>. <article-title>Deep learning for depression recognition with audiovisual cues: A review</article-title>. <source>Inf Fusion</source>. (<year>2022</year>) <volume>80</volume>:<fpage>56</fpage>&#x2013;<lpage>86</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.inffus.2021.10.012</pub-id>
</citation></ref>
<ref id="B18">
<label>18</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Churi</surname> <given-names>H</given-names>
</name>
<name>
<surname>Keshri</surname> <given-names>P</given-names>
</name>
<name>
<surname>Khamkar</surname> <given-names>S</given-names>
</name>
<name>
<surname>Sankhe</surname> <given-names>A</given-names>
</name>
</person-group>. <source>A deep learning approach for depression classification using audio features</source>. <publisher-loc>Tamilnadu</publisher-loc>: <publisher-name>Fast Track Publications</publisher-name> (<year>2021</year>).</citation></ref>
<ref id="B19">
<label>19</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dong</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>X</given-names>
</name>
</person-group>. <article-title>A hierarchical depression detection model based on vocal and emotional cues</article-title>. <source>Neurocomputing</source>. (<year>2021</year>) <volume>441</volume>:<page-range>279&#x2013;90</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.neucom.2021.02.019</pub-id>
</citation></ref>
<ref id="B20">
<label>20</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>L</given-names>
</name>
<name>
<surname>Cao</surname> <given-names>C</given-names>
</name>
</person-group>. <article-title>Automated depression analysis using convolutional neural networks from speech</article-title>. <source>J BioMed Inform</source>. (<year>2018</year>) <volume>83</volume>:<page-range>103&#x2013;11</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jbi.2018.05.007</pub-id>, PMID: <pub-id pub-id-type="pmid">29852317</pub-id></citation></ref>
<ref id="B21">
<label>21</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Joshi</surname> <given-names>ML</given-names>
</name>
<name>
<surname>Kanoongo</surname> <given-names>N</given-names>
</name>
</person-group>. <article-title>Depression detection using emotional artificial intelligence and machine learning: A closer review</article-title>. <source>Mater Today: Proc</source>. (<year>2022</year>) <volume>58</volume>:<page-range>217&#x2013;26</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.matpr.2022.01.467</pub-id>
</citation></ref>
<ref id="B22">
<label>22</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Trotzek</surname> <given-names>M</given-names>
</name>
<name>
<surname>Koitka</surname> <given-names>S</given-names>
</name>
<name>
<surname>Friedrich</surname> <given-names>CM</given-names>
</name>
</person-group>. <article-title>Utilizing neural networks and linguistic metadata for early detection of depression indications in text sequences</article-title>. <source>IEEE Trans Knowledge Data Eng</source>. (<year>2020</year>) <volume>32</volume>:<fpage>588</fpage>&#x2013;<lpage>601</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TKDE.2018.2885515</pub-id>
</citation></ref>
<ref id="B23">
<label>23</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Saffar</surname> <given-names>AH</given-names>
</name>
<name>
<surname>Mann</surname> <given-names>TK</given-names>
</name>
<name>
<surname>Ofoghi</surname> <given-names>B</given-names>
</name>
</person-group>. <article-title>Textual emotion detection in health: Advances and applications</article-title>. <source>J BioMed Inform</source>. (<year>2023</year>) <volume>137</volume>:<fpage>104258</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jbi.2022.104258</pub-id>, PMID: <pub-id pub-id-type="pmid">36528329</pub-id></citation></ref>
<ref id="B24">
<label>24</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dalal</surname> <given-names>S</given-names>
</name>
<name>
<surname>Jain</surname> <given-names>S</given-names>
</name>
<name>
<surname>Dave</surname> <given-names>M</given-names>
</name>
</person-group>. <article-title>DepressionFeature: Underlying ontology for user-specific depression analysis</article-title>. <source>J Supercomput</source>. (<year>2025</year>) <volume>81</volume>:<fpage>1</fpage>&#x2013;<lpage>21</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11227-024-06585-w</pub-id>
</citation></ref>
<ref id="B25">
<label>25</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>Y</given-names>
</name>
</person-group>. <article-title>Depression detection via a Chinese social media platform: a novel causal relation-aware deep learning approach</article-title>. <source>J Supercomput</source>. (<year>2024</year>) <volume>80</volume>:<page-range>10327&#x2013;56</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11227-023-05830-y</pub-id>
</citation></ref>
<ref id="B26">
<label>26</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>Y</given-names>
</name>
</person-group>. <article-title>Depression clinical detection model based on social media: a federated deep learning approach</article-title>. <source>J Supercomput</source>. (<year>2024</year>) <volume>80</volume>:<page-range>7931&#x2013;54</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11227-023-05754-7</pub-id>
</citation></ref>
<ref id="B27">
<label>27</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Devlin</surname> <given-names>J</given-names>
</name>
<name>
<surname>Chang</surname> <given-names>M-W</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>K</given-names>
</name>
<name>
<surname>Toutanova</surname> <given-names>K</given-names>
</name>
</person-group>. <source>Bert: Pre-training of deep bidirectional transformers for language understanding</source>. <publisher-loc>Minneapolis, Minnesota</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name> (<year>2018</year>).</citation></ref>
<ref id="B28">
<label>28</label>
<citation citation-type="confproc">
<person-group person-group-type="editor">
<name>
<surname>Valstar</surname> <given-names>M</given-names>
</name>
<name>
<surname>Schuller</surname> <given-names>B</given-names>
</name>
<name>
<surname>Smith</surname> <given-names>K</given-names>
</name>
<name>
<surname>Almaev</surname> <given-names>T</given-names>
</name>
<name>
<surname>Eyben</surname> <given-names>F</given-names>
</name>
<name>
<surname>Krajewski</surname> <given-names>J</given-names>
</name>
<etal/>
</person-group>. <source>AVEC '14: Proceedings of the 4th International Workshop on Audio/Visual Emotion Challenge</source>. <publisher-loc>New York, NY, United States</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>, (<year>2014</year>).</citation></ref>
<ref id="B29">
<label>29</label>
<citation citation-type="confproc">
<person-group person-group-type="editor">
<name>
<surname>Li</surname> <given-names>X</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>W</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>H</given-names>
</name>
</person-group>, eds. <article-title>"Depression severity prediction from facial expression based on the DRR_DepressionNet network,"</article-title> In: <conf-name>2020 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)</conf-name>, <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>, (<year>2020</year>), pp. <page-range>2757&#x2013;64</page-range>.</citation></ref>
<ref id="B30">
<label>30</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Uddin</surname> <given-names>MA</given-names>
</name>
<name>
<surname>Joolee</surname> <given-names>JB</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>Y-K</given-names>
</name>
</person-group>. <article-title>Depression level prediction using deep spatiotemporal features and multilayer bi-LTSM</article-title>. <source>IEEE Trans Affect Comput</source>. (<year>2022</year>) <volume>13</volume>:<page-range>864&#x2013;70</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TAFFC.2020.2970418</pub-id>
</citation></ref>
<ref id="B31">
<label>31</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>L</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>C</given-names>
</name>
<name>
<surname>Tiwari</surname> <given-names>P</given-names>
</name>
<name>
<surname>Pandey</surname> <given-names>HM</given-names>
</name>
<name>
<surname>Dang</surname> <given-names>W</given-names>
</name>
</person-group>. <article-title>Intelligent system for depression scale estimation with facial expressions and case study in industrial intelligence</article-title>. <source>Int J Intelligent Syst</source>. (<year>2021</year>) <volume>37</volume>:<page-range>10140&#x2013;56</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/int.22426</pub-id>
</citation></ref>
<ref id="B32">
<label>32</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rajawat</surname> <given-names>AS</given-names>
</name>
<name>
<surname>Bedi</surname> <given-names>P</given-names>
</name>
<name>
<surname>Goyal</surname> <given-names>SB</given-names>
</name>
<name>
<surname>Bhaladhare</surname> <given-names>P</given-names>
</name>
<name>
<surname>Aggarwal</surname> <given-names>A</given-names>
</name>
<name>
<surname>Singhal</surname> <given-names>RS</given-names>
</name>
</person-group>. <article-title>Fusion fuzzy logic and deep learning for depression detection using facial expressions</article-title>. <source>Proc Comput Sci</source>. (<year>2023</year>) <volume>218</volume>:<page-range>2795&#x2013;805</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.procs.2023.01.251</pub-id>
</citation></ref>
<ref id="B33">
<label>33</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Niu</surname> <given-names>M</given-names>
</name>
<name>
<surname>Tao</surname> <given-names>J</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>B</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>J</given-names>
</name>
<name>
<surname>Lian</surname> <given-names>Z</given-names>
</name>
</person-group>. <article-title>Multimodal spatiotemporal representation for automatic depression level detection</article-title>. <source>IEEE Trans Affect Comput</source>. (<year>2023</year>) <volume>14</volume>:<fpage>294</fpage>&#x2013;<lpage>307</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TAFFC.2020.3031345</pub-id>
</citation></ref>
<ref id="B34">
<label>34</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname> <given-names>H</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>J</given-names>
</name>
<name>
<surname>Chai</surname> <given-names>S</given-names>
</name>
<name>
<surname>Qiu</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>L</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>X</given-names>
</name>
<etal/>
</person-group>. <article-title>Multi-modal adaptive fusion transformer network for the estimation of depression level</article-title>. <source>Sens (Basel)</source>. (<year>2021</year>) <volume>21</volume>:<fpage>4764</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s21144764</pub-id>, PMID: <pub-id pub-id-type="pmid">34300504</pub-id></citation></ref>
<ref id="B35">
<label>35</label>
<citation citation-type="confproc">
<person-group person-group-type="editor">
<name>
<surname>Bucur</surname> <given-names>A-M</given-names>
</name>
<name>
<surname>Cosma</surname> <given-names>A</given-names>
</name>
<name>
<surname>Rosso</surname> <given-names>P</given-names>
</name>
<name>
<surname>Dinu</surname> <given-names>LP</given-names>
</name>
</person-group>, eds. <article-title>"It&#x2019;s just a matter of time: Detecting depression with time-enriched multimodal transformers</article-title>, " In: <conf-name>European Conference on Information Retrieval</conf-name>, <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>, (<year>2023</year>), pp. <page-range>200&#x2013;15</page-range>.</citation></ref>
<ref id="B36">
<label>36</label>
<citation citation-type="confproc">
<person-group person-group-type="editor">
<name>
<surname>Li</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Cui</surname> <given-names>Z</given-names>
</name>
</person-group>, eds. <article-title>"Decoupled multimodal distilling for emotion recognition,"</article-title> In: <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, <publisher-loc>Los Alamitos</publisher-loc>: <publisher-name>IEEE Computer Society</publisher-name>, (<year>2023</year>), pp. <page-range>6631&#x2013;40</page-range>.</citation></ref>
<ref id="B37">
<label>37</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Beck</surname> <given-names>AT</given-names>
</name>
<name>
<surname>Steer</surname> <given-names>RA</given-names>
</name>
<name>
<surname>Brown</surname> <given-names>GK</given-names>
</name>
</person-group>. <source>BDI-II: Beck depression inventory</source>. <publisher-loc>Springer, NY, USA</publisher-loc>: <publisher-name>Pearson</publisher-name> (<year>1996</year>).</citation></ref>
<ref id="B38">
<label>38</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>K</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Qiao</surname> <given-names>Y</given-names>
</name>
</person-group>. <article-title>Joint face detection and alignment using multitask cascaded convolutional networks</article-title>. <source>IEEE Signal Process Lett</source>. (<year>2016</year>) <volume>23</volume>:<page-range>1499&#x2013;503</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/LSP.2016.2603342</pub-id>
</citation></ref>
<ref id="B39">
<label>39</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Adelson</surname> <given-names>EH</given-names>
</name>
<name>
<surname>Anderson</surname> <given-names>CH</given-names>
</name>
<name>
<surname>Bergen</surname> <given-names>JR</given-names>
</name>
<name>
<surname>Burt</surname> <given-names>PJ</given-names>
</name>
<name>
<surname>Ogden</surname> <given-names>J</given-names>
</name>
</person-group>. <article-title>Pyramid methods in image processing</article-title>. <source>RCA engineer</source>. (<year>1984</year>) <volume>29</volume>:<fpage>33</fpage>&#x2013;<lpage>41</lpage>.</citation></ref>
<ref id="B40">
<label>40</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Eyben</surname> <given-names>F</given-names>
</name>
<name>
<surname>Scherer</surname> <given-names>KR</given-names>
</name>
<name>
<surname>Schuller</surname> <given-names>BW</given-names>
</name>
<name>
<surname>Sundberg</surname> <given-names>J</given-names>
</name>
<name>
<surname>Andre</surname> <given-names>E</given-names>
</name>
<name>
<surname>Busso</surname> <given-names>C</given-names>
</name>
<etal/>
</person-group>. <article-title>The Geneva minimalistic acoustic parameter set (GeMAPS) for voice research and affective computing</article-title>. <source>IEEE Trans Affect Comput</source>. (<year>2016</year>) <volume>7</volume>:<fpage>190</fpage>&#x2013;<lpage>202</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TAFFC.2015.2457417</pub-id>
</citation></ref>
<ref id="B41">
<label>41</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tiwari</surname> <given-names>V</given-names>
</name>
</person-group>. <article-title>MFCC and its applications in speaker recognition</article-title>. <source>Int J Emerg Technol</source>. (<year>2010</year>) <volume>1</volume>:<fpage>19</fpage>&#x2013;<lpage>22</lpage>.</citation></ref>
<ref id="B42">
<label>42</label>
<citation citation-type="book">
<person-group person-group-type="editor">
<name>
<surname>Bertasius</surname> <given-names>G</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>H</given-names>
</name>
<name>
<surname>Torresani</surname> <given-names>L</given-names>
</name>
</person-group>, eds. <source>Is space-time attention all you need for video understanding? In ICML</source> <publisher-loc>Red Hook, NY</publisher-loc>: <publisher-name>Curran Associates, Inc</publisher-name>, (<year>2021</year>) <volume>2</volume>(<issue>3</issue>):<fpage>4</fpage>.</citation></ref>
<ref id="B43">
<label>43</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dosovitskiy</surname> <given-names>A</given-names>
</name>
<name>
<surname>Beyer</surname> <given-names>L</given-names>
</name>
<name>
<surname>Kolesnikov</surname> <given-names>A</given-names>
</name>
<name>
<surname>Weissenborn</surname> <given-names>D</given-names>
</name>
<name>
<surname>Zhai</surname> <given-names>X</given-names>
</name>
<name>
<surname>Unterthiner</surname> <given-names>T</given-names>
</name>
<etal/>
</person-group>. <article-title>An image is worth 16x16 words: Transformers for image recognition at scale</article-title>. <source>arXiv preprint arXiv:2010.11929</source> (<year>2020</year>).</citation></ref>
<ref id="B44">
<label>44</label>
<citation citation-type="book">
<person-group person-group-type="editor">
<name>
<surname>Cai</surname> <given-names>C</given-names>
</name>
<name>
<surname>Niu</surname> <given-names>M</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>B</given-names>
</name>
<name>
<surname>Tao</surname> <given-names>J</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>X</given-names>
</name>
</person-group>, eds. <source>TDCA-Net: Time-Domain Channel Attention Network for Depression Detection</source>. <publisher-loc>Red Hook, NY, USA</publisher-loc>: <publisher-name>Interspeech Curran Associates, Inc</publisher-name>, (<year>2021</year>).</citation></ref>
<ref id="B45">
<label>45</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Pan</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>X</given-names>
</name>
<name>
<surname>Shao</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>G</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>T</given-names>
</name>
<etal/>
</person-group>. <article-title>LQGDNet: A local quaternion and global deep network for facial depression recognition</article-title>. <source>IEEE Trans Affect Comput</source>. (<year>2023</year>) <volume>14</volume>:<page-range>2557&#x2013;63</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TAFFC.2021.3139651</pub-id>
</citation></ref>
<ref id="B46">
<label>46</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>de Melo</surname> <given-names>WC</given-names>
</name>
<name>
<surname>Granger</surname> <given-names>E</given-names>
</name>
<name>
<surname>L&#xf3;pez</surname> <given-names>MB</given-names>
</name>
</person-group>. <article-title>MDN: A deep maximization-differentiation network for spatio-temporal depression detection</article-title>. <source>IEEE Trans Affect Comput</source>. (<year>2023</year>) <volume>14</volume>:<page-range>578&#x2013;90</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TAFFC.2021.3072579</pub-id>
</citation></ref>
<ref id="B47">
<label>47</label>
<citation citation-type="confproc">
<person-group person-group-type="editor">
<name>
<surname>Niu</surname> <given-names>M</given-names>
</name>
<name>
<surname>Tao</surname> <given-names>J</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>B</given-names>
</name>
</person-group>, eds. <article-title>"Multi-scale and multi-region facial discriminative representation for automatic depression level prediction"</article-title>, In: <conf-name>ICASSP 2021&#x2013;2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</conf-name>, <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>, (<year>2021</year>), pp. <page-range>1325&#x2013;9</page-range>.</citation></ref>
<ref id="B48">
<label>48</label>
<citation citation-type="confproc">
<person-group person-group-type="editor">
<name>
<surname>De Melo</surname> <given-names>WC</given-names>
</name>
<name>
<surname>Granger</surname> <given-names>E</given-names>
</name>
<name>
<surname>Lopez</surname> <given-names>MB</given-names>
</name>
</person-group>, eds. <article-title>"Encoding temporal information for automatic depression recognition from facial analysis"</article-title>, In: <conf-name>ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</conf-name>, <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>, (<year>2020</year>), pp. <page-range>1080&#x2013;4</page-range>.</citation></ref>
<ref id="B49">
<label>49</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pan</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Shang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>T</given-names>
</name>
<name>
<surname>Shao</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>G</given-names>
</name>
<name>
<surname>Ding</surname> <given-names>H</given-names>
</name>
<etal/>
</person-group>. <article-title>Spatial&#x2013;Temporal Attention Network for Depression Recognition from facial videos</article-title>. <source>Expert Syst Appl</source>. (<year>2024</year>) <volume>237</volume>:<fpage>121410</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.eswa.2023.121410</pub-id>
</citation></ref>
<ref id="B50">
<label>50</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Song</surname> <given-names>S</given-names>
</name>
<name>
<surname>Jaiswal</surname> <given-names>S</given-names>
</name>
<name>
<surname>Shen</surname> <given-names>L</given-names>
</name>
<name>
<surname>Valstar</surname> <given-names>M</given-names>
</name>
</person-group>. <article-title>Spectral representation of behaviour primitives for depression analysis</article-title>. <source>IEEE Trans Affect Comput</source>. (<year>2022</year>) <volume>13</volume>:<page-range>829&#x2013;44</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TAFFC.2020.2970712</pub-id>
</citation></ref>
<ref id="B51">
<label>51</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Havigerov&#xe1;</surname> <given-names>JM</given-names>
</name>
<name>
<surname>Haviger</surname> <given-names>J</given-names>
</name>
<name>
<surname>Ku&#x10d;era</surname> <given-names>D</given-names>
</name>
<name>
<surname>Hoffmannov&#xe1;</surname> <given-names>P</given-names>
</name>
</person-group>. <article-title>Text-based detection of the risk of depression</article-title>. <source>Front Psychol</source>. (<year>2019</year>) <volume>10</volume>:<elocation-id>513</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpsyg.2019.00513</pub-id>, PMID: <pub-id pub-id-type="pmid">30936845</pub-id></citation></ref>
<ref id="B52">
<label>52</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Buchwald</surname> <given-names>AM</given-names>
</name>
<name>
<surname>Rudick-Davis</surname> <given-names>D</given-names>
</name>
</person-group>. <article-title>The symptoms of major depression</article-title>. <source>J Abnormal Psychol</source>. (<year>1993</year>) <volume>102</volume>:<fpage>197</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1037/0021-843X.102.2.197</pub-id>
</citation></ref>
<ref id="B53">
<label>53</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bourke</surname> <given-names>C</given-names>
</name>
<name>
<surname>Douglas</surname> <given-names>K</given-names>
</name>
<name>
<surname>Porter</surname> <given-names>R</given-names>
</name>
</person-group>. <article-title>Processing of facial emotion expression in major depression: a review</article-title>. <source>Aust New Z J Psychiatry</source>. (<year>2010</year>) <volume>44</volume>:<page-range>681&#x2013;96</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.3109/00048674.2010.496359</pub-id>, PMID: <pub-id pub-id-type="pmid">20636189</pub-id></citation></ref>
<ref id="B54">
<label>54</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Balsters</surname> <given-names>MJH</given-names>
</name>
<name>
<surname>Krahmer</surname> <given-names>EJ</given-names>
</name>
<name>
<surname>Swerts</surname> <given-names>MGJ</given-names>
</name>
<name>
<surname>Vingerhoets</surname> <given-names>AJJM</given-names>
</name>
</person-group>. <article-title>Verbal and nonverbal correlates for depression: a review</article-title>. <source>Curr Psychiatry Rev</source>. (<year>2012</year>) <volume>8</volume>:<page-range>227&#x2013;34</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.2174/157340012800792966</pub-id>
</citation></ref>
<ref id="B55">
<label>55</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hale</surname> <given-names>WW</given-names>
<suffix>III</suffix>
</name>
</person-group>. <article-title>Judgment of facial expressions and depression persistence</article-title>. <source>Psychiatry Res</source>. (<year>1998</year>) <volume>80</volume>:<page-range>265&#x2013;74</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/S0165-1781(98)00070-5</pub-id>, PMID: <pub-id pub-id-type="pmid">9796942</pub-id></citation></ref>
<ref id="B56">
<label>56</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Koops</surname> <given-names>S</given-names>
</name>
<name>
<surname>Brederoo</surname> <given-names>SG</given-names>
</name>
<name>
<surname>de Boer</surname> <given-names>JN</given-names>
</name>
<name>
<surname>Nadema</surname> <given-names>FG</given-names>
</name>
<name>
<surname>Voppel</surname> <given-names>AE</given-names>
</name>
<name>
<surname>Sommer</surname> <given-names>IE</given-names>
</name>
</person-group>. <article-title>Speech as a biomarker for depression</article-title>. <source>CNS Neurol Disorders Drug Targets (CNS Neurol Disorders)</source>. (<year>2023</year>) <volume>22</volume>:<page-range>152&#x2013;60</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.2174/1871527320666211213125847</pub-id>, PMID: <pub-id pub-id-type="pmid">34961469</pub-id></citation></ref>
<ref id="B57">
<label>57</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cummins</surname> <given-names>N</given-names>
</name>
<name>
<surname>Scherer</surname> <given-names>S</given-names>
</name>
<name>
<surname>Krajewski</surname> <given-names>J</given-names>
</name>
<name>
<surname>Schnieder</surname> <given-names>S</given-names>
</name>
<name>
<surname>Epps</surname> <given-names>J</given-names>
</name>
<name>
<surname>Quatieri</surname> <given-names>TF</given-names>
</name>
</person-group>. <article-title>A review of depression and suicide risk assessment using speech analysis</article-title>. <source>Speech Commun</source>. (<year>2015</year>) <volume>71</volume>:<fpage>10</fpage>&#x2013;<lpage>49</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.specom.2015.03.004</pub-id>
</citation></ref>
<ref id="B58">
<label>58</label>
<citation citation-type="confproc">
<person-group person-group-type="editor">
<name>
<surname>Wolohan</surname> <given-names>J</given-names>
</name>
<name>
<surname>Hiraga</surname> <given-names>M</given-names>
</name>
<name>
<surname>Mukherjee</surname> <given-names>A</given-names>
</name>
<name>
<surname>Sayyed</surname> <given-names>ZA</given-names>
</name>
<name>
<surname>Millard</surname> <given-names>M</given-names>
</name>
</person-group>, eds. <article-title>"Detecting linguistic traces of depression in topic-restricted text: Attending to self-stigmatized depression with NLP"</article-title>, In: <conf-name>Proceedings of the first international workshop on language cognition and computational models</conf-name>, <publisher-loc>Santa Fe, New Mexico, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>, (<year>2018</year>), pp. <page-range>11&#x2013;21</page-range>.</citation></ref>
<ref id="B59">
<label>59</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Angst</surname> <given-names>J</given-names>
</name>
<name>
<surname>Merikangas</surname> <given-names>KR</given-names>
</name>
</person-group>. <article-title>Multi-dimensional criteria for the diagnosis of depression</article-title>. <source>J Affect Disord</source>. (<year>2001</year>) <volume>62</volume>:<fpage>7</fpage>&#x2013;<lpage>15</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/S0165-0327(00)00346-3</pub-id>, PMID: <pub-id pub-id-type="pmid">11172869</pub-id></citation></ref>
<ref id="B60">
<label>60</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wichers</surname> <given-names>M</given-names>
</name>
</person-group>. <article-title>The dynamic nature of depression: a new micro-level perspective of mental disorder that meets current challenges</article-title>. <source>Psychol Med</source>. (<year>2014</year>) <volume>44</volume>:<page-range>1349&#x2013;60</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1017/S0033291713001979</pub-id>, PMID: <pub-id pub-id-type="pmid">23942140</pub-id></citation></ref>
<ref id="B61">
<label>61</label>
<citation citation-type="confproc">
<person-group person-group-type="editor">
<name>
<surname>Pampouchidou</surname> <given-names>A</given-names>
</name>
<name>
<surname>Simantiraki</surname> <given-names>O</given-names>
</name>
<name>
<surname>Fazlollahi</surname> <given-names>A</given-names>
</name>
<name>
<surname>Pediaditis</surname> <given-names>M</given-names>
</name>
<name>
<surname>Manousos</surname> <given-names>D</given-names>
</name>
<name>
<surname>Roniotis</surname> <given-names>A</given-names>
</name>
<etal/>
</person-group>. <article-title>"Depression assessment by fusing high and low level features from audio, video, and text"</article-title>, In: <conf-name>Proceedings of the 6th International Workshop on Audio/Visual Emotion Challenge</conf-name>, <publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>, (<year>2016</year>), pp. <page-range>27&#x2013;34</page-range>.</citation></ref>
<ref id="B62">
<label>62</label>
<citation citation-type="confproc">
<person-group person-group-type="editor">
<name>
<surname>de Melo</surname> <given-names>WC</given-names>
</name>
<name>
<surname>Granger</surname> <given-names>E</given-names>
</name>
<name>
<surname>Hadid</surname> <given-names>A</given-names>
</name>
</person-group>, eds. <article-title>"Combining global and local convolutional 3d networks for detecting depression from facial expressions"</article-title>, In: <conf-name>2019 14th ieee international conference on automatic face &amp; gesture recognition (fg 2019)</conf-name>, <publisher-loc>Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>, (<year>2019</year>), pp. <page-range>1&#x2013;8</page-range>.</citation></ref>
<ref id="B63">
<label>63</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Pan</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>X</given-names>
</name>
<name>
<surname>Shao</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>G</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>T</given-names>
</name>
<etal/>
</person-group>. <article-title>LQGDNet: A local quaternion and global deep network for facial depression recognition</article-title>. <source>IEEE Trans Affect Comput</source>. (<year>2023</year>) <volume>14</volume>:<page-range>2557&#x2013;63</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TAFFC.2021.3139651</pub-id>
</citation></ref>
<ref id="B64">
<label>64</label>
<citation citation-type="confproc">
<person-group person-group-type="editor">
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Cui</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Y</given-names>
</name>
</person-group>, eds. <article-title>"Distribution-consistent modal recovering for incomplete multimodal learning"</article-title>, In: <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, <publisher-loc>Los Alamitos</publisher-loc>: <publisher-name>IEEE Computer Society</publisher-name>, (<year>2023</year>), pp. <page-range>22025&#x2013;34</page-range>.</citation></ref>
</ref-list>
</back>
</article>